#!/usr/bin/env python3
"""
normalize-transcript.py
Replaces alias variants in a whisper transcript with their canonical primary keys,
based on a TranscriptOMatic YAML meta file.

Usage:
    python3 normalize-transcript.py <transcript.txt> --game <slug> [--min-length N]

    Meta file is resolved relative to the transcript:
    <transcript-dir>/../../meta/<slug>.yaml
    Works on any machine regardless of the base directory name.

    If a matching .srt file exists alongside the .txt, it is normalized
    in sync. Borderline report is always generated from .txt only.

Output:
    <transcript_base>_normalized.txt     — cleaned transcript
    <transcript_base>_normalized.srt     — cleaned SRT (if srt exists)
    <transcript_base>_borderline.txt     — borderline replacements for manual review

Options:
    --game SLUG      Game slug to resolve meta file (required)
    --min-length N   Minimum alias length to auto-replace (default: 5)
    --dry-run        Show what would be replaced without writing output
"""

import re
import sys
import yaml
import argparse
from pathlib import Path


# Aliases shorter than this many characters are not auto-replaced;
# they go to the borderline report for manual review instead.
MIN_LENGTH_DEFAULT = 5


def load_yaml(path):
    """Read the UTF-8 YAML file at *path* and return the parsed data."""
    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))


def extract_replacements(data, min_length=5):
    """
    Build two lists from the YAML:
    - replacements: [(alias, target), ...] for aliases >= min_length
    - borderline:   [(alias, target), ...] for aliases < min_length

    The replacement target is the entry's short: value if present,
    otherwise the primary key. This prevents partial-match duplication
    when primary keys contain substrings of each other.

    Covers: characters, groups, locations, terms, phrases, players, gm, surnames
    """
    replacements = []
    borderline = []

    sections = [
        data.get("characters", {}),
        data.get("groups", {}),
        data.get("locations", {}),
        data.get("terms", {}),
        data.get("phrases", {}),
        data.get("players", {}),
        data.get("gm", {}),
        data.get("surnames", {}),
    ]

    for section in sections:
        if not isinstance(section, dict):
            continue
        for primary_key, entry in section.items():
            if not isinstance(entry, dict):
                continue
            aliases = entry.get("aliases", []) or []
            # Use short name as replacement target if available, else primary key.
            # This prevents partial-match duplication e.g. "Louis-Adrien de Bailly-Adrien de Bailly".
            target = str(entry.get("short", primary_key) or primary_key)
            # safe: true   → force auto-replace, bypasses length check
            # safe: false  → borderline only, never auto-replace
            # safe: ignore → skip entirely, never replaced or reported
            # safe absent  → auto-replace if alias >= min_length, else borderline
            safe = entry.get("safe", None)
            for alias in aliases:
                if not alias or alias == target:
                    continue
                pair = (str(alias), target)
                if safe == "ignore":
                    continue
                elif safe is False:
                    borderline.append(pair)
                elif safe is True or len(str(alias)) >= min_length:
                    replacements.append(pair)
                else:
                    borderline.append(pair)

    # Roles section is a flat dict: role_name → character(s)
    # No aliases to replace here, skip.

    return replacements, borderline


def build_pattern(alias):
    """Word-boundary aware, case-insensitive regex for alias."""
    escaped = re.escape(alias)
    return re.compile(r'\b' + escaped + r'\b', re.IGNORECASE | re.UNICODE)


def normalize(text, replacements):
    """Apply all (alias, target) replacements to text; return the new text.

    Longer aliases are processed first. Each match is immediately replaced
    with a unique placeholder so subsequent regexes cannot re-match already
    substituted text. Placeholders are resolved to their targets at the end.
    """
    sorted_replacements = sorted(replacements, key=lambda x: len(x[0]), reverse=True)
    # Use Private Use Area characters as placeholder delimiters —
    # vanishingly unlikely to appear in any real transcript.
    # They must be non-empty: with bare numeric placeholders, the final
    # str.replace pass would clobber literal digits in the transcript.
    OPEN = "\ue000"
    CLOSE = "\ue001"
    protected = []  # list of target strings, indexed by placeholder number

    for alias, target in sorted_replacements:
        pattern = build_pattern(alias)

        # Bind target as a default arg so each closure keeps its own value.
        def replacer(m, t=target):
            idx = len(protected)
            protected.append(t)
            return f"{OPEN}{idx}{CLOSE}"

        text = pattern.sub(replacer, text)

    # Resolve placeholders in order
    for idx, target in enumerate(protected):
        text = text.replace(f"{OPEN}{idx}{CLOSE}", target)

    return text


def find_borderline_matches(lines, borderline):
    """Find lines containing borderline aliases and return report entries.

    Returns a list of (lineno, stripped_line, alias, primary_key) tuples,
    with line numbers starting at 1.
    """
    findings = []
    # Compile each alias pattern once, not once per (line, alias) pair.
    patterns = [(build_pattern(alias), alias, primary_key)
                for alias, primary_key in borderline]
    for lineno, line in enumerate(lines, 1):
        for pattern, alias, primary_key in patterns:
            if pattern.search(line):
                findings.append((lineno, line.rstrip(), alias, primary_key))
    return findings


def main():
    """CLI entry point: parse args, normalize transcript (and SRT), write report."""
    parser = argparse.ArgumentParser(
        description="Normalize transcript using YAML meta file.")
    parser.add_argument("transcript", help="Path to transcript .txt file")
    parser.add_argument("--game", required=True, metavar="SLUG",
                        help="Game slug — resolves to meta/<slug>.yaml")
    parser.add_argument("--min-length", type=int, default=MIN_LENGTH_DEFAULT,
                        help=f"Minimum alias length for auto-replacement (default: {MIN_LENGTH_DEFAULT})")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show replacements without writing output")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    # Resolve meta dir relative to transcript: <session>/ → ../../meta/
    # Works on any machine regardless of the base directory name.
    meta_path = (transcript_path.parent / ".." / ".." / "meta" / f"{args.game}.yaml").resolve()

    if not transcript_path.exists():
        print(f"❌ Transcript not found: {transcript_path}", file=sys.stderr)
        sys.exit(1)
    if not meta_path.exists():
        print(f"❌ Meta file not found: {meta_path}", file=sys.stderr)
        print(f"   Expected: {meta_path}", file=sys.stderr)
        sys.exit(1)

    print(f"📄 Transcript: {transcript_path}")
    print(f"📋 Game:       {args.game}")
    print(f"📋 Meta:       {meta_path}")
    print(f"🔤 Min alias length for auto-replace: {args.min_length}")
    print("----")

    data = load_yaml(str(meta_path))
    replacements, borderline = extract_replacements(data, args.min_length)

    print(f"✅ {len(replacements)} aliases will be auto-replaced")
    print(f"⚠️  {len(borderline)} short/flagged aliases skipped (see report below)")
    print("----")

    # --- TXT ---
    text = transcript_path.read_text(encoding="utf-8")
    lines = text.splitlines()
    normalized_txt = normalize(text, replacements)

    # --- SRT (optional, normalized in sync with TXT) ---
    srt_path = transcript_path.with_suffix(".srt")
    srt_out_path = transcript_path.with_name(transcript_path.stem + "_normalized.srt")
    has_srt = srt_path.exists()
    if has_srt:
        normalized_srt = normalize(srt_path.read_text(encoding="utf-8"), replacements)

    # --- Write output ---
    if args.dry_run:
        print("🔍 Dry run — no files written.")
    else:
        txt_out_path = transcript_path.with_name(transcript_path.stem + "_normalized.txt")
        txt_out_path.write_text(normalized_txt, encoding="utf-8")
        print(f"✅ Written: {txt_out_path}")
        if has_srt:
            srt_out_path.write_text(normalized_srt, encoding="utf-8")
            print(f"✅ Written: {srt_out_path}")
        else:
            print("ℹ️  No matching .srt found alongside transcript — skipped.")

    # --- Borderline report (from TXT only) ---
    report_path = transcript_path.with_name(transcript_path.stem + "_borderline.txt")
    if borderline:
        findings = find_borderline_matches(lines, borderline)
        if findings:
            header = (
                f"{'Line':<6} {'Alias':<20} {'Primary Key':<30} Context\n"
                f"{'----':<6} {'-----':<20} {'-----------':<30} -------\n"
            )
            rows = []
            for lineno, line, alias, primary_key in findings:
                # Truncate long lines so the report stays readable.
                context = line[:80] + ("…" if len(line) > 80 else "")
                rows.append(f"{lineno:<6} {alias:<20} {primary_key:<30} {context}")
            report_text = header + "\n".join(rows) + "\n"

            if not args.dry_run:
                report_path.write_text(report_text, encoding="utf-8")
                print(f"⚠️  Borderline report: {report_path} ({len(findings)} entries)")
            else:
                print("⚠️  Borderline replacements (dry run — not written):")
                print(header + "\n".join(rows))
        else:
            print("✅ No borderline matches found in transcript.")


if __name__ == "__main__":
    main()