Skip to main content

transcribe_audio.sh - With Context

#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# ------------------------------------------------------------
# transcribe-audio.sh
# Transcribes a recorded session on a Mac with whisper-cli.
#
# Usage: transcribe-audio.sh [--de|--en|--auto] [--game <slug>] <audio.wav>
# ------------------------------------------------------------

# The whisper-cli binary and both model files sit under one whisper.cpp tree.
WHISPER_ROOT="$HOME/Transkriptionen/whisper.cpp"

WHISPER="$WHISPER_ROOT/build/bin/whisper-cli"
MODEL="$WHISPER_ROOT/models/ggml-large-v3-turbo.bin"
VAD_MODEL="$WHISPER_ROOT/models/ggml-silero-v6.2.0.bin"

# Directory searched for the per-game "<slug>.yaml" meta files.
META_DIR="$HOME/Syncthing/TranscriptOMatic/meta"

# ------------------------------------------------------------
# Argument parsing
#
#   --de | --en | --auto   language mode (default: en)
#   --game <slug>          slug of the game whose meta file should be used
#   <audio.wav>            audio file to transcribe; if several positional
#                          arguments are given, the last one is used
#
# Exit status: 2 for an unknown option or a --game without a value,
#              1 when the audio file is missing or does not exist.
# ------------------------------------------------------------

USAGE="Usage: transcribe-audio [--de|--en|--auto] [--game <slug>] <audio.wav>"

LANG_MODE="en"
AUDIO=""
GAME_SLUG=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --de)    LANG_MODE="de"; shift ;;
    --en)    LANG_MODE="en"; shift ;;
    --auto)  LANG_MODE="auto"; shift ;;
    --game)
      # A trailing "--game" has no "$2"; under `set -u` reading it would
      # abort with "unbound variable" instead of a helpful message.
      if [[ $# -lt 2 ]]; then
        echo "❌ --game requires a <slug> argument." >&2
        echo "   $USAGE" >&2
        exit 2
      fi
      GAME_SLUG="$2"; shift 2 ;;
    -*)
      echo "$USAGE" >&2
      exit 2
      ;;
    *)
      AUDIO="$1"; shift ;;
  esac
done

if [[ -z "$AUDIO" ]]; then
  echo "❌ No audio file specified." >&2
  echo "   $USAGE" >&2
  exit 1
fi

if [[ ! -f "$AUDIO" ]]; then
  echo "❌ File not found: $AUDIO" >&2
  exit 1
fi

# ------------------------------------------------------------
# Language options
# "en" and "de" become a "-l <code>" argument pair for whisper-cli;
# "auto" contributes no arguments at all.
# ------------------------------------------------------------

if [[ "$LANG_MODE" == "auto" ]]; then
  LANG_OPTS=()
elif [[ "$LANG_MODE" == "en" || "$LANG_MODE" == "de" ]]; then
  LANG_OPTS=(-l "$LANG_MODE")
fi

# ------------------------------------------------------------
# Build prompt from YAML meta file (if --game given)
#
# Reads $META_DIR/<slug>.yaml and joins the keys of its characters,
# locations, terms, phrases and groups sections into one comma-separated
# string that is passed to whisper-cli via --prompt.
# PROMPT_OPTS stays empty when no slug was given, when the meta file does
# not exist, or when the file yields no keys.
# ------------------------------------------------------------

PROMPT_OPTS=()

if [[ -z "$GAME_SLUG" ]]; then
  # Warnings go to stderr, like the "No meta file found" warning below.
  echo "⚠️  No --game specified. Running without vocabulary prompt." >&2
  echo "   Tip: use --game <slug> for better transcription of proper nouns." >&2
else
  META_FILE="$META_DIR/${GAME_SLUG}.yaml"
  if [[ -f "$META_FILE" ]]; then
    # Extract all primary keys from characters, locations, terms, phrases, groups
    # using Python — handles Unicode correctly
    PROMPT_TEXT="$(uv run --with pyyaml python3 - "$META_FILE" <<'PYEOF'
import sys, yaml

with open(sys.argv[1], encoding="utf-8") as f:
    data = yaml.safe_load(f)

# An empty file loads as None, and a top-level list or scalar has no
# sections either: treat all of these as a file without vocabulary
# instead of crashing on data.get().
if not isinstance(data, dict):
    data = {}

keys = []
for section in ["characters", "locations", "terms", "phrases", "groups"]:
    block = data.get(section)
    if isinstance(block, dict):
        for key in block:
            # YAML keys can parse as numbers or booleans, so stringify first.
            # Strip parenthetical suffixes e.g. "Muiris Doyle (Ó Dubhghaill)"
            clean = str(key).split("(")[0].strip()
            if clean:
                keys.append(clean)

print(", ".join(keys))
PYEOF
)"
    if [[ -n "$PROMPT_TEXT" ]]; then
      PROMPT_OPTS=(--prompt "$PROMPT_TEXT" --carry-initial-prompt)
      echo "📋 Game:       $GAME_SLUG"
      echo "💬 Prompt:     $PROMPT_TEXT"
    fi
  else
    echo "⚠️  No meta file found for slug '$GAME_SLUG' in $META_DIR" >&2
  fi
fi

# ------------------------------------------------------------
# Output path: the transcript lives next to the audio file and is
# named after it (a trailing ".wav" removed) plus "_transcript".
# ------------------------------------------------------------

AUDIO_DIR="$(dirname "$AUDIO")"
AUDIO_BASE="$(basename "$AUDIO" .wav)"
TRANSCRIPT_BASE="${AUDIO_DIR}/${AUDIO_BASE}_transcript"

# Summary of the run that is about to start, one line per item.
printf '%s\n' \
  "🎧 Audio:      $AUDIO" \
  "🗣️  Language:   $LANG_MODE" \
  "📝 Transcript: ${TRANSCRIPT_BASE}.txt" \
  "----"

# ------------------------------------------------------------
# Run whisper-cli
#
# Requests .txt and .srt output under the $TRANSCRIPT_BASE prefix, then
# cleans up the .txt file in place.
# ------------------------------------------------------------

# LANG_OPTS (--auto) and PROMPT_OPTS (no usable --game) can be empty arrays.
# Bash older than 4.4 (including the 3.2 that macOS ships as /bin/bash)
# treats "${arr[@]}" of an empty array as an unbound variable under
# `set -u`, so both are expanded through the ${arr[@]+...} guard, which
# produces no words at all when the array is empty.
"$WHISPER" \
  -m "$MODEL" \
  ${LANG_OPTS[@]+"${LANG_OPTS[@]}"} \
  --vad \
  -vm "$VAD_MODEL" \
  --output-txt \
  --output-srt \
  -of "$TRANSCRIPT_BASE" \
  ${PROMPT_OPTS[@]+"${PROMPT_OPTS[@]}"} \
  -f "$AUDIO"

# Strip ANSI escape sequences and carriage returns from txt output.
# The ESC and CR bytes are generated by bash ($'...') rather than left to
# sed, because POSIX does not define \x1b or \r inside a sed regex and not
# every sed interprets them. `-i ''` is the BSD/macOS in-place form.
sed -i '' $'s/\x1b\\[[0-9;]*[mGKH]//g; s/\r//g' "${TRANSCRIPT_BASE}.txt"

echo "✅ Done: ${TRANSCRIPT_BASE}.txt"