Skip to main content

normalize-transcript.py

#!/usr/bin/env python3
"""
build-whisper-prompt.py
Extracts primary keys (plus any listed titles and English names) from a
TranscriptOMatic YAML meta file and writes them one per line to a .prompt
file for use as a whisper vocabulary hint.

The output is a starting point — edit it manually to remove common words
that don't benefit from hinting and to stay within Whisper's ~224 token limit.

Usage:
    python3 build-whisper-prompt.py --game <slug>

    YAML:   resolved from <script-dir>/../meta/<slug>.yaml
    Output: <script-dir>/../meta/<slug>.prompt

    Run from anywhere; paths are relative to the script location.

Options:
    --game SLUG     Game slug (required)
    --force         Overwrite existing .prompt file (default: abort if exists)
"""

import sys
import yaml
import argparse
from pathlib import Path


# Sections to extract primary keys from, in priority order: sections listed
# earlier are processed first, so their entries appear first in the output.
# Terms lead, followed by surnames and locations — presumably the vocabulary
# that is most phonetically unusual for Whisper.
SECTIONS = ["terms", "surnames", "locations", "characters", "groups", "phrases", "players", "gm"]


def load_yaml(path):
    """Parse the UTF-8 encoded YAML file at *path* and return the result.

    The file is parsed with ``yaml.safe_load``; whatever object that call
    produces is returned unchanged.
    """
    with open(path, mode="r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document


def extract_keys(data, sections=None):
    """Collect unique vocabulary terms from the relevant sections of *data*.

    For every section (in order) whose value is a mapping, each key is
    emitted, followed by that entry's ``titles`` and ``name_en`` when the
    entry is itself a mapping. Anything from the first ``(`` onwards is
    dropped from a term and surrounding whitespace is stripped. Terms are
    de-duplicated while preserving first-seen order.

    Args:
        data: Parsed YAML document. Anything that is not a mapping (for
            example ``None`` from an empty file) yields an empty result.
        sections: Optional iterable of section names to scan, in priority
            order. Defaults to the module-level ``SECTIONS``.

    Returns:
        A list of cleaned, unique term strings.
    """
    if not isinstance(data, dict):
        # An empty or scalar/list-rooted document has no sections to scan;
        # return nothing rather than failing on ``data.get``.
        return []
    if sections is None:
        sections = SECTIONS

    keys = []
    seen = set()

    def add(term):
        # A null key or list item would otherwise be emitted as "None".
        if term is None:
            return
        clean = str(term).partition("(")[0].strip()
        if clean and clean not in seen:
            seen.add(clean)
            keys.append(clean)

    for section in sections:
        block = data.get(section)
        if not isinstance(block, dict):
            continue
        for key, entry in block.items():
            add(key)
            if not isinstance(entry, dict):
                continue
            # Titles: phonetically unusual, benefit from hinting.
            titles = entry.get("titles") or []
            if isinstance(titles, str):
                # A scalar title must be treated as one term; iterating the
                # string directly would emit its individual characters.
                titles = [titles]
            for title in titles:
                add(title)
            # English name: intentional alternate identity, include for
            # Whisper awareness.
            name_en = entry.get("name_en")
            if name_en:
                add(name_en)
    return keys


def main():
    """Command-line entry point.

    Parses ``--game`` and ``--force``, locates ``<slug>.yaml`` and
    ``<slug>.prompt`` in the ``meta`` directory that sits beside this
    script's own directory, and writes the extracted terms one per line.
    Exits with status 1 when the YAML file is missing, or when the prompt
    file already exists and ``--force`` was not given.
    """
    here = Path(__file__).resolve().parent
    meta_root = (here.parent / "meta").resolve()

    cli = argparse.ArgumentParser(
        description="Generate a whisper vocabulary prompt file from a YAML meta file."
    )
    cli.add_argument(
        "--game",
        required=True,
        metavar="SLUG",
        help="Game slug — resolves to meta/<slug>.yaml",
    )
    cli.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing .prompt file",
    )
    options = cli.parse_args()
    slug = options.game

    source_yaml = meta_root / f"{slug}.yaml"
    target_prompt = meta_root / f"{slug}.prompt"

    # Refuse to continue without an input file.
    if not source_yaml.exists():
        print(f"❌ YAML not found: {source_yaml}", file=sys.stderr)
        sys.exit(1)

    # Protect manual edits: an existing prompt is only replaced on request.
    if target_prompt.exists():
        if not options.force:
            print(f"❌ Prompt file already exists: {target_prompt}", file=sys.stderr)
            print("   Use --force to overwrite.", file=sys.stderr)
            sys.exit(1)

    terms = extract_keys(load_yaml(source_yaml))

    body = "\n".join(terms) + "\n"
    target_prompt.write_text(body, encoding="utf-8")

    summary = (
        f"📋 YAML:   {source_yaml}",
        f"✅ Written: {target_prompt}",
        f"   {len(terms)} terms — edit to remove unproblematic entries",
        "   then check token count (target: <224 tokens)",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()