normalize-transcript.py
#!/usr/bin/env python3
"""
build-whisper-prompt.py
Extracts primary keys (plus any titles and English names attached to an
entry) from a TranscriptOMatic YAML meta file and writes them one per line
to a .prompt file for use as a whisper vocabulary hint.
The output is a starting point — edit it manually to remove common words
that don't benefit from hinting and to stay within Whisper's ~224 token limit.
Usage:
python3 build-whisper-prompt.py --game <slug>
YAML: resolved from <script-dir>/../meta/<slug>.yaml
Output: <script-dir>/../meta/<slug>.prompt
Run from anywhere; paths are relative to the script location.
Options:
--game SLUG Game slug (required)
--force Overwrite existing .prompt file (default: abort if exists)
"""
import sys
import yaml
import argparse
from pathlib import Path
# Sections to extract primary keys from, in priority order.
# Terms and locations (most phonetically unusual for Whisper) sit at the
# front of the list, with surnames between them.
# Order matters: extract_keys() walks this list front to back and keeps only
# the first occurrence of a duplicate term, so a term's position in the
# output is decided by the earliest section that contains it.
SECTIONS = ["terms", "surnames", "locations", "characters", "groups", "phrases", "players", "gm"]
def load_yaml(path):
    """Read the YAML file at *path* (decoded as UTF-8) and return the parsed document.

    Parsing is delegated to ``yaml.safe_load``; whatever it produces for the
    file's contents is handed back unchanged.
    """
    with open(path, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    return document
def extract_keys(data, sections=None):
    """Extract primary keys, titles and English names from the relevant sections.

    Each requested section of *data* is visited in order. For every entry in
    a section the mapping key is collected; when the entry's value is itself
    a mapping, its ``titles`` and ``name_en`` values are collected too.
    Each collected term is cut at the first ``(`` and stripped of surrounding
    whitespace. Empty results and duplicates are dropped, so only the first
    occurrence of a term is kept.

    Args:
        data: Parsed YAML document. Anything that is not a mapping (for
            example ``None`` from an empty file) yields an empty result.
        sections: Optional iterable of section names to visit, in order.
            Defaults to the module-level ``SECTIONS`` list.

    Returns:
        A list of unique, cleaned term strings in first-seen order.
    """
    # An empty YAML file parses to None; treat any non-mapping as "no sections"
    # instead of failing on ``.get``.
    if not isinstance(data, dict):
        return []
    if sections is None:
        sections = SECTIONS

    keys = []
    seen = set()

    def add(term):
        # A null key or list item would otherwise be emitted as the text "None".
        if term is None:
            return
        clean = str(term).split("(")[0].strip()
        if clean and clean not in seen:
            keys.append(clean)
            seen.add(clean)

    for section in sections:
        block = data.get(section)
        # Missing, empty or non-mapping sections contribute nothing.
        if not isinstance(block, dict):
            continue
        for key, entry in block.items():
            add(key)
            if not isinstance(entry, dict):
                continue
            # Titles: phonetically unusual, benefit from hinting.
            titles = entry.get("titles") or []
            if isinstance(titles, str):
                # A single scalar title must not be iterated character by character.
                titles = [titles]
            for title in titles:
                add(title)
            # English name: intentional alternate identity, include for Whisper awareness.
            name_en = entry.get("name_en")
            if name_en:
                add(name_en)
    return keys
def main():
    """Command-line entry point: write ``meta/<slug>.prompt`` from ``meta/<slug>.yaml``.

    Both paths are resolved relative to this script's directory
    (``<script-dir>/../meta``), so the current working directory is irrelevant.

    Exits with status 1, after printing a message to stderr, when:
      * the YAML file does not exist,
      * the prompt file already exists and ``--force`` was not given,
      * the YAML file cannot be parsed, or
      * the YAML document is empty or its top level is not a mapping.
    """
    script_dir = Path(__file__).resolve().parent
    meta_dir = (script_dir / ".." / "meta").resolve()

    parser = argparse.ArgumentParser(
        description="Generate a whisper vocabulary prompt file from a YAML meta file."
    )
    parser.add_argument("--game", required=True, metavar="SLUG",
                        help="Game slug — resolves to meta/<slug>.yaml")
    parser.add_argument("--force", action="store_true",
                        help="Overwrite existing .prompt file")
    args = parser.parse_args()

    yaml_path = meta_dir / f"{args.game}.yaml"
    prompt_path = meta_dir / f"{args.game}.prompt"

    if not yaml_path.exists():
        print(f"❌ YAML not found: {yaml_path}", file=sys.stderr)
        sys.exit(1)
    if prompt_path.exists() and not args.force:
        print(f"❌ Prompt file already exists: {prompt_path}", file=sys.stderr)
        print(" Use --force to overwrite.", file=sys.stderr)
        sys.exit(1)

    # Report malformed YAML as a normal CLI error rather than a traceback.
    try:
        data = load_yaml(yaml_path)
    except yaml.YAMLError as err:
        print(f"❌ Could not parse YAML: {yaml_path}", file=sys.stderr)
        print(f" {err}", file=sys.stderr)
        sys.exit(1)

    # An empty file parses to None and a top-level list/scalar has no sections;
    # neither can yield a vocabulary, so stop before touching the prompt file.
    if not isinstance(data, dict):
        print(f"❌ YAML is empty or not a mapping: {yaml_path}", file=sys.stderr)
        sys.exit(1)

    keys = extract_keys(data)
    prompt_path.write_text("\n".join(keys) + "\n", encoding="utf-8")

    print(f"📋 YAML: {yaml_path}")
    print(f"✅ Written: {prompt_path}")
    print(f" {len(keys)} terms — edit to remove unproblematic entries")
    print(" then check token count (target: <224 tokens)")


if __name__ == "__main__":
    main()