# normalize-transcript.py

#!/usr/bin/env python3
"""
normalize-transcript.py
Replaces alias variants in a whisper transcript with their canonical primary keys,
based on a TranscriptOMatic YAML meta file.

Usage:
    python3 normalize-transcript.py <transcript.txt> --game <slug> [--min-length N] [--dry-run]

    Meta file is resolved relative to the transcript:
    <transcript-dir>/../../meta/<slug>.yaml
    Works on any machine regardless of the base directory name.

Output:
    <transcript_base>_normalized.txt  — cleaned transcript
    Borderline cases printed to stdout for manual review

Options:
    --game SLUG      Game slug to resolve meta file (required)
    --min-length N   Minimum alias length to auto-replace (default: 5)
    --dry-run        Show what would be replaced without writing output
"""

import re
import sys
import yaml
import argparse
from pathlib import Path


# Default for --min-length: aliases shorter than this many characters are
# listed for manual review instead of being replaced automatically.
MIN_LENGTH_DEFAULT = 5


def load_yaml(path):
    """Read the UTF-8 file at ``path`` and return it parsed by ``yaml.safe_load``."""
    with open(path, encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    return parsed


def extract_replacements(data, min_length=5):
    """
    Split the aliases declared in the meta YAML into two lists.

    Args:
        data: Parsed YAML content. Anything that is not a mapping (for
            example ``None`` from an empty file) yields two empty lists.
        min_length: Aliases with fewer characters than this are never
            auto-replaced.

    Returns:
        A ``(replacements, borderline)`` tuple of ``(alias, primary_key)``
        string pairs:
        - replacements: aliases with at least ``min_length`` characters
        - borderline:   aliases shorter than ``min_length``, plus every alias
          of an entry marked ``safe: false``

    Sections scanned: characters, groups, locations, terms, phrases, players, gm.
    The ``roles`` section is a flat dict (role_name → character(s)) without
    aliases, so it is not scanned.

    The primary key is the YAML key itself (e.g. "Séamus MacGregor").
    The short name is NOT used here — normalization targets the transcript,
    not the summary output.
    """
    replacements = []
    borderline = []

    # An empty YAML file parses to None and a malformed one may parse to a
    # list or scalar; neither can hold alias sections.
    if not isinstance(data, dict):
        return replacements, borderline

    section_names = (
        "characters",
        "groups",
        "locations",
        "terms",
        "phrases",
        "players",
        "gm",
    )

    for section_name in section_names:
        section = data.get(section_name)
        if not isinstance(section, dict):
            continue
        for primary_key, entry in section.items():
            if not isinstance(entry, dict):
                continue

            aliases = entry.get("aliases") or []
            if isinstance(aliases, str):
                # `aliases: Shay` written as a scalar means ONE alias; iterating
                # the string would turn every character into an alias.
                aliases = [aliases]
            elif not isinstance(aliases, (list, tuple, set)):
                continue

            safe = entry.get("safe", True)  # safe: false skips auto-replacement
            key_text = str(primary_key)
            for alias in aliases:
                if not alias:
                    continue
                alias_text = str(alias)
                # Compare the stringified forms so a numeric alias equal to a
                # string key (or vice versa) is still recognised as a no-op.
                if alias_text == key_text:
                    continue
                pair = (alias_text, key_text)
                if safe and len(alias_text) >= min_length:
                    replacements.append(pair)
                else:
                    borderline.append(pair)

    return replacements, borderline


def build_pattern(alias):
    """
    Compile a case-insensitive, whole-word regex that matches ``alias`` literally.

    The alias must not be directly preceded or followed by a word character.
    Lookarounds are used instead of ``\\b`` because ``\\b`` needs a word
    character on one side of the boundary: an alias that starts or ends with
    punctuation (e.g. "Dr.") would otherwise fail to match when followed by
    whitespace. For aliases that start and end with word characters the two
    forms match exactly the same text.

    Args:
        alias: Literal text to search for; regex metacharacters are escaped.

    Returns:
        A compiled ``re.Pattern``.
    """
    escaped = re.escape(alias)
    return re.compile(r'(?<!\w)' + escaped + r'(?!\w)', re.IGNORECASE | re.UNICODE)


def normalize(text, replacements):
    """
    Replace every alias in ``text`` with its primary key in a single pass.

    All aliases are combined into one case-insensitive, whole-word alternation
    ordered longest first, so at any position the longest alias wins. Doing
    the work in one pass means already-inserted primary keys are never
    re-scanned: an alias that also occurs inside a primary key (alias
    "Gandalf" for "Gandalf the Grey") can no longer duplicate text.

    Occurrences of a primary key that are already present in the transcript
    are matched as well and left untouched, which shields them from shorter
    aliases they contain.

    Args:
        text: Transcript text.
        replacements: Iterable of ``(alias, primary_key)`` pairs.

    Returns:
        The text with aliases replaced. Primary keys are inserted literally;
        backslashes in them are not treated as regex escapes.
    """
    pairs = [(str(alias), str(primary_key)) for alias, primary_key in replacements if alias]
    if not pairs:
        return text

    # Canonical names already in the text are consumed as "keep as is" matches
    # (target None). Aliases are listed first so they win length ties, because
    # sorted() is stable even with reverse=True.
    protected = [(key, None) for key in dict.fromkeys(key for _, key in pairs) if key]
    candidates = sorted(pairs + protected, key=lambda item: len(item[0]), reverse=True)

    targets = [target for _, target in candidates]
    # One capturing group per candidate: escaped literals contain no groups of
    # their own, so match.lastindex identifies the candidate that matched.
    alternation = "|".join("(" + re.escape(needle) + ")" for needle, _ in candidates)
    pattern = re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)", re.IGNORECASE)

    def _substitute(match):
        target = targets[match.lastindex - 1]
        return match.group(0) if target is None else target

    # A callable replacement is inserted verbatim, unlike a template string.
    return pattern.sub(_substitute, text)


def find_borderline_matches(lines, borderline):
    """
    Report every line that contains a borderline alias.

    Args:
        lines: Transcript lines, in order.
        borderline: ``(alias, primary_key)`` pairs that were not auto-replaced.

    Returns:
        A list of ``(lineno, line, alias, primary_key)`` tuples, where
        ``lineno`` is 1-based and ``line`` has trailing whitespace removed.
        A line yields one entry per alias it contains.
    """
    # Build each pattern once up front instead of once per (line, alias) pair.
    compiled = [
        (build_pattern(alias), alias, primary_key)
        for alias, primary_key in borderline
    ]

    findings = []
    for lineno, line in enumerate(lines, 1):
        for pattern, alias, primary_key in compiled:
            if pattern.search(line):
                findings.append((lineno, line.rstrip(), alias, primary_key))
    return findings


def main():
    """
    Command-line entry point.

    Parses the arguments, locates the meta file relative to the transcript
    (<transcript-dir>/../../meta/<slug>.yaml), writes
    <transcript_stem>_normalized.txt next to the transcript (unless --dry-run
    is given) and prints a report of borderline aliases for manual review.

    Exits with status 1 when the transcript or meta file is missing or the
    meta file does not contain a mapping.
    """
    parser = argparse.ArgumentParser(description="Normalize transcript using YAML meta file.")
    parser.add_argument("transcript", help="Path to transcript .txt file")
    parser.add_argument("--game", required=True, metavar="SLUG",
                        help="Game slug — resolves to <transcript-dir>/../../meta/<slug>.yaml")
    parser.add_argument("--min-length", type=int, default=MIN_LENGTH_DEFAULT,
                        help=f"Minimum alias length for auto-replacement (default: {MIN_LENGTH_DEFAULT})")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show replacements without writing output")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    # Resolve meta dir relative to transcript: <session>/ → ../../meta/
    meta_path = (transcript_path.parent / ".." / ".." / "meta" / f"{args.game}.yaml").resolve()

    # is_file() rather than exists(): a directory at either path would pass an
    # existence check and then fail with a traceback when read.
    if not transcript_path.is_file():
        print(f"❌ Transcript not found: {transcript_path}", file=sys.stderr)
        sys.exit(1)
    if not meta_path.is_file():
        print(f"❌ Meta file not found: {meta_path}", file=sys.stderr)
        print(f"   Expected: {meta_path}", file=sys.stderr)
        sys.exit(1)

    print(f"📄 Transcript: {transcript_path}")
    print(f"📋 Game:       {args.game}")
    print(f"📋 Meta:       {meta_path}")
    print(f"🔤 Min alias length for auto-replace: {args.min_length}")
    print("----")

    data = load_yaml(str(meta_path))
    if data is None:
        # An empty YAML file parses to None; treat it as "no aliases defined".
        data = {}
    if not isinstance(data, dict):
        print(f"❌ Meta file is not a YAML mapping: {meta_path}", file=sys.stderr)
        sys.exit(1)
    replacements, borderline = extract_replacements(data, args.min_length)

    print(f"✅ {len(replacements)} aliases will be auto-replaced")
    print(f"⚠️  {len(borderline)} short/flagged aliases skipped (see report below)")
    print("----")

    text = transcript_path.read_text(encoding="utf-8")
    lines = text.splitlines()

    normalized = normalize(text, replacements)

    if args.dry_run:
        print("🔍 Dry run — no file written.")
        # --dry-run is documented as showing what would be replaced, so list
        # the aliases that occur in the transcript. Each alias is counted on
        # its own against the original text, so overlapping aliases may both
        # be listed.
        hits = []
        for alias, primary_key in replacements:
            count = len(build_pattern(alias).findall(text))
            if count:
                hits.append((count, alias, primary_key))
        if hits:
            print("   Aliases found in transcript:")
            for count, alias, primary_key in hits:
                print(f"   {count:>5}× {alias} → {primary_key}")
        else:
            print("   No auto-replaceable aliases found in transcript.")
    else:
        out_path = transcript_path.with_name(transcript_path.stem + "_normalized.txt")
        out_path.write_text(normalized, encoding="utf-8")
        print(f"✅ Written: {out_path}")

    # Borderline report
    if borderline:
        findings = find_borderline_matches(lines, borderline)
        if findings:
            print()
            print("⚠️  Borderline replacements — review manually:")
            print(f"   {'Line':<6} {'Alias':<20} {'Primary Key':<30} Context")
            print(f"   {'----':<6} {'-----':<20} {'-----------':<30} -------")
            for lineno, line, alias, primary_key in findings:
                # Truncate long lines for readability
                context = line[:80] + ("…" if len(line) > 80 else "")
                print(f"   {lineno:<6} {alias:<20} {primary_key:<30} {context}")
        else:
            print("✅ No borderline matches found in transcript.")


if __name__ == "__main__":
    main()