normalize-transcript.py
#!/usr/bin/env python3
"""
normalize-transcript.py
Replaces alias variants in a whisper transcript with their canonical primary keys,
based on a TranscriptOMatic YAML meta file.
Usage:
python3 normalize-transcript.py <transcript.txt> --game <slug> [--min-length N]
Meta file is resolved relative to the transcript:
<transcript-dir>/../../meta/<slug>.yaml
Works on any machine regardless of the base directory name.
If a matching .srt file exists alongside the .txt, it is normalized
in sync. Borderline report is always generated from .txt only.
Output:
<transcript_base>_normalized.txt — cleaned transcript
<transcript_base>_normalized.srt — cleaned SRT (if .srt exists)
Borderline cases printed to stdout for manual review
Options:
--game SLUG Game slug to resolve meta file (required)
--min-length N Minimum alias length to auto-replace (default: 5)
--dry-run Print the alias counts and the borderline report without writing any output files
"""
import re
import sys
import yaml
import argparse
from pathlib import Path
# Default minimum alias length (in characters) for automatic replacement.
# Shorter aliases are only listed in the borderline report for manual review.
# Used as the default of the --min-length command-line option.
MIN_LENGTH_DEFAULT = 5
def load_yaml(path):
    """Read the UTF-8 encoded YAML file at ``path`` and return the parsed object.

    Parsing is done with ``yaml.safe_load``; whatever it produces is returned
    unchanged.
    """
    with open(path, encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed
def extract_replacements(data, min_length=5):
    """
    Split the aliases declared in the YAML meta data into two lists:
      - replacements: [(alias, primary_key), ...] aliases that are auto-replaced
      - borderline:   [(alias, primary_key), ...] aliases left for manual review

    An alias is borderline when its entry sets ``safe: false`` or when the
    alias is shorter than ``min_length`` characters.

    Scanned sections: characters, groups, locations, terms, phrases, players, gm.
    The roles section is a flat mapping (role name -> character(s)) without
    aliases, so it is not scanned.

    The primary key is the YAML key itself (e.g. "Séamus MacGregor").
    The short name is NOT used here — normalization targets the transcript,
    not the summary output.

    Args:
        data: Parsed meta file. Anything that is not a mapping (for example
            ``None`` from an empty file) yields two empty lists.
        min_length: Minimum alias length for automatic replacement.

    Returns:
        Tuple ``(replacements, borderline)`` of lists of ``(alias, primary_key)``
        string pairs, in the order the aliases appear in the meta data.
    """
    replacements = []
    borderline = []

    # An empty YAML document parses to None; treat every non-mapping as "no entries".
    if not isinstance(data, dict):
        return replacements, borderline

    section_names = (
        "characters",
        "groups",
        "locations",
        "terms",
        "phrases",
        "players",
        "gm",
    )
    for section_name in section_names:
        section = data.get(section_name)
        if not isinstance(section, dict):
            continue
        for primary_key, entry in section.items():
            if not isinstance(entry, dict):
                continue
            aliases = entry.get("aliases") or []
            # A lone scalar (``aliases: Shaymus``) is one alias, not a sequence
            # of single-character aliases.
            if isinstance(aliases, str):
                aliases = [aliases]
            safe = entry.get("safe", True)  # safe: false skips auto-replacement
            canonical = str(primary_key)
            for alias in aliases:
                if not alias:
                    continue
                # Compare as text so a non-string alias equal to the key is
                # skipped too; surrounding whitespace never belongs to a name.
                alias_text = str(alias).strip()
                if not alias_text or alias_text == canonical:
                    continue
                pair = (alias_text, canonical)
                if safe and len(alias_text) >= min_length:
                    replacements.append(pair)
                else:
                    borderline.append(pair)
    return replacements, borderline
def build_pattern(alias):
    """Compile a case-insensitive, whole-word regex that matches ``alias`` literally.

    A word boundary is required only on a side where the alias itself starts
    or ends with a word character. ``\\b`` next to punctuation would demand a
    word character on the other side, so an alias such as "Dr." could never
    match "Dr. Smith".

    Args:
        alias: Literal text to search for.

    Returns:
        A compiled ``re.Pattern``. An empty alias yields a pattern that never
        matches.
    """
    alias = str(alias)
    if not alias:
        # An empty lookahead-negation can never succeed; this avoids matching
        # at every word boundary of the text.
        return re.compile(r'(?!)')
    leading = r'\b' if re.match(r'\w', alias[0]) else ''
    trailing = r'\b' if re.match(r'\w', alias[-1]) else ''
    return re.compile(leading + re.escape(alias) + trailing, re.IGNORECASE | re.UNICODE)
def normalize(text, replacements):
    """Replace every alias in ``text`` with its primary key in a single pass.

    All aliases are combined into one alternation, longest first, so that at a
    given position the longest alias wins. Matches are found left to right.
    Working in one pass (instead of one substitution per alias) means inserted
    primary keys are never scanned again by a later alias.

    Primary keys themselves take part in the alternation as "keep as is"
    entries: an occurrence of "Séamus MacGregor" is consumed as a whole and
    left untouched, so an alias such as "MacGregor" cannot match inside it and
    produce "Séamus Séamus MacGregor".

    Args:
        text: Transcript (or SRT) content.
        replacements: Iterable of ``(alias, primary_key)`` pairs.

    Returns:
        The text with aliases replaced by their primary keys.
    """
    # (literal, replacement) — replacement None means "leave the match unchanged".
    entries = []
    seen = set()
    for alias, primary_key in replacements:
        if alias and alias not in seen:
            seen.add(alias)
            entries.append((alias, primary_key))
    # Aliases are listed before primary keys so that, after the stable sort,
    # an alias still wins over a canonical name of the same length.
    for _, primary_key in replacements:
        if primary_key and primary_key not in seen:
            seen.add(primary_key)
            entries.append((primary_key, None))
    if not entries:
        return text
    entries.sort(key=lambda entry: len(entry[0]), reverse=True)

    # One capturing group per entry; build_pattern's own regex has no groups,
    # so Match.lastindex identifies the entry that matched.
    combined = re.compile(
        "|".join("(" + build_pattern(literal).pattern + ")" for literal, _ in entries),
        re.IGNORECASE,
    )

    def _swap(match):
        replacement = entries[match.lastindex - 1][1]
        # Returned verbatim: a function result is not treated as a template,
        # so backslashes in a primary key are safe.
        return match.group(0) if replacement is None else replacement

    return combined.sub(_swap, text)
def find_borderline_matches(lines, borderline):
    """Find lines containing borderline aliases and return report entries.

    Args:
        lines: Transcript lines; line numbers in the report start at 1.
        borderline: Iterable of ``(alias, primary_key)`` pairs.

    Returns:
        List of ``(lineno, line, alias, primary_key)`` tuples, one per alias
        found on a line, with trailing whitespace stripped from ``line``.
        Entries are ordered by line, then by alias order in ``borderline``.
    """
    # Build each pattern once instead of once per (line, alias) combination.
    compiled = [
        (build_pattern(alias), alias, primary_key)
        for alias, primary_key in borderline
    ]
    findings = []
    for lineno, line in enumerate(lines, 1):
        for pattern, alias, primary_key in compiled:
            if pattern.search(line):
                findings.append((lineno, line.rstrip(), alias, primary_key))
    return findings
def main():
    """Command-line entry point.

    Parses the arguments, locates the meta file relative to the transcript
    (``<transcript-dir>/../../meta/<slug>.yaml``), normalizes the transcript
    and, if present, the .srt file next to it, writes the ``_normalized``
    outputs unless ``--dry-run`` is given, and prints the borderline report.

    Exits with status 1 when the transcript or meta file is missing or the
    meta file cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Normalize transcript using YAML meta file.")
    parser.add_argument("transcript", help="Path to transcript .txt file")
    parser.add_argument("--game", required=True, metavar="SLUG",
                        help="Game slug — resolves to <transcript-dir>/../../meta/<slug>.yaml")
    parser.add_argument("--min-length", type=int, default=MIN_LENGTH_DEFAULT,
                        help=f"Minimum alias length for auto-replacement (default: {MIN_LENGTH_DEFAULT})")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show replacements without writing output")
    args = parser.parse_args()

    transcript_path = Path(args.transcript)
    # Resolve meta dir relative to transcript: <session>/ → ../../meta/
    meta_path = (transcript_path.parent / ".." / ".." / "meta" / f"{args.game}.yaml").resolve()

    # is_file() rather than exists(): a directory would pass exists() and then
    # fail with a traceback when read.
    if not transcript_path.is_file():
        print(f"❌ Transcript not found: {transcript_path}", file=sys.stderr)
        sys.exit(1)
    if not meta_path.is_file():
        print(f"❌ Meta file not found: {meta_path}", file=sys.stderr)
        print(f" Expected: {meta_path}", file=sys.stderr)
        sys.exit(1)

    print(f"📄 Transcript: {transcript_path}")
    print(f"📋 Game: {args.game}")
    print(f"📋 Meta: {meta_path}")
    print(f"🔤 Min alias length for auto-replace: {args.min_length}")
    print("----")

    try:
        # An empty meta file parses to None; treat it as "no entries".
        data = load_yaml(str(meta_path)) or {}
    except yaml.YAMLError as err:
        print(f"❌ Could not parse meta file: {meta_path}", file=sys.stderr)
        print(f" {err}", file=sys.stderr)
        sys.exit(1)

    replacements, borderline = extract_replacements(data, args.min_length)
    print(f"✅ {len(replacements)} aliases will be auto-replaced")
    print(f"⚠️ {len(borderline)} short/flagged aliases skipped (see report below)")
    print("----")

    # --- TXT ---
    text = transcript_path.read_text(encoding="utf-8")
    lines = text.splitlines()
    normalized_txt = normalize(text, replacements)

    # --- SRT (optional, normalized in sync with TXT) ---
    srt_path = transcript_path.with_suffix(".srt")
    srt_out_path = transcript_path.with_name(transcript_path.stem + "_normalized.srt")
    has_srt = srt_path.exists()
    if has_srt:
        normalized_srt = normalize(srt_path.read_text(encoding="utf-8"), replacements)

    # --- Write output ---
    if args.dry_run:
        print("🔍 Dry run — no files written.")
    else:
        txt_out_path = transcript_path.with_name(transcript_path.stem + "_normalized.txt")
        txt_out_path.write_text(normalized_txt, encoding="utf-8")
        print(f"✅ Written: {txt_out_path}")
        if has_srt:
            srt_out_path.write_text(normalized_srt, encoding="utf-8")
            print(f"✅ Written: {srt_out_path}")
        else:
            print("ℹ️ No matching .srt found alongside transcript — skipped.")

    # --- Borderline report (from TXT only) ---
    if borderline:
        findings = find_borderline_matches(lines, borderline)
        if findings:
            print()
            print("⚠️ Borderline replacements — review manually:")
            print(f" {'Line':<6} {'Alias':<20} {'Primary Key':<30} Context")
            print(f" {'----':<6} {'-----':<20} {'-----------':<30} -------")
            for lineno, line, alias, primary_key in findings:
                # Keep the report readable: show at most 80 characters of context.
                context = line[:80] + ("…" if len(line) > 80 else "")
                print(f" {lineno:<6} {alias:<20} {primary_key:<30} {context}")
        else:
            print("✅ No borderline matches found in transcript.")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()