# trax/simple_transcribe.py

#!/usr/bin/env python3
"""Simple transcription script using faster-whisper directly."""

import time
from pathlib import Path
from faster_whisper import WhisperModel

def _join_segment_text(segments):
    """Return the text of *segments* joined by single spaces.

    Each segment's text is stripped before joining so that whitespace a
    segment already carries at its edges (faster-whisper segment text
    typically starts with a space) does not become a doubled space in the
    combined transcript. Segments that are empty after stripping are skipped.
    """
    stripped = (seg.text.strip() for seg in segments)
    return " ".join(text for text in stripped if text)


def main():
    """Transcribe the BAP South meeting recording using faster-whisper directly.

    Reads ``BAP_South_Meeting_Clean.wav`` from the current working directory,
    transcribes it on CPU, then writes ``BAP_South_Meeting_Transcript.txt``
    (header plus full text) and ``BAP_South_Meeting_Transcript.json``
    (full text, per-segment details, language info and timings), and prints
    a short preview of the first segments.

    Returns:
        None. A missing audio file is reported and the function returns
        early. Any exception raised while loading the model, transcribing
        or writing the outputs is caught, reported together with its
        traceback, and not re-raised.
    """
    import json
    import traceback

    # Single source of truth for the model identifier: it is used for
    # loading and recorded in both output files.
    model_name = "distil-large-v3"
    audio_file = Path("BAP_South_Meeting_Clean.wav")

    if not audio_file.exists():
        print(f"❌ Audio file not found: {audio_file}")
        return

    print(f"🎵 Transcribing: {audio_file.name}")
    print(f"📁 File size: {audio_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        print(f"🚀 Loading Whisper model ({model_name})...")
        start_time = time.time()

        # Load the model directly
        model = WhisperModel(
            model_name,
            device="cpu",
            compute_type="int8_float32",
        )

        model_load_time = time.time() - start_time
        print(f"✅ Model loaded in {model_load_time:.1f} seconds")

        print("🎯 Starting transcription...")
        transcription_start = time.time()

        segments, info = model.transcribe(
            str(audio_file),
            language=None,  # Auto-detect
            temperature=0.0,  # Deterministic
            beam_size=1,
            best_of=1,
        )

        # `segments` is consumed here, before the timers are stopped, so the
        # measured transcription time covers producing every segment.
        segments_list = list(segments)
        full_text = _join_segment_text(segments_list)

        transcription_time = time.time() - transcription_start
        total_time = time.time() - start_time

        print("✅ Transcription completed!")
        print(f"📝 Text length: {len(full_text)} characters")
        print(f"⏱️  Transcription time: {transcription_time:.1f} seconds")
        print(f"⏱️  Total time (including model load): {total_time:.1f} seconds")
        print(f"🎯 Language detected: {info.language}")
        print(f"📊 Segments: {len(segments_list)}")

        # Save to text file: metadata header, separator rule, then the text.
        output_file = Path("BAP_South_Meeting_Transcript.txt")
        header_lines = [
            "BAP South Meeting - August 28, 2025",
            f"Transcription completed at: {time.strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model: {model_name}",
            f"Language: {info.language}",
            f"Transcription time: {transcription_time:.1f} seconds",
            f"Total time: {total_time:.1f} seconds",
            f"Segments: {len(segments_list)}",
            "=" * 80,
        ]
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(header_lines) + "\n\n")
            f.write(full_text)

        print(f"💾 Transcript saved to: {output_file}")

        # Also save as JSON for detailed analysis. Per-segment text is stored
        # exactly as returned by the model (not stripped).
        json_output = {
            "text": full_text,
            "segments": [
                {
                    "start": seg.start,
                    "end": seg.end,
                    "text": seg.text,
                    "avg_logprob": seg.avg_logprob,
                    "no_speech_prob": seg.no_speech_prob,
                }
                for seg in segments_list
            ],
            "info": {
                "language": info.language,
                "language_probability": info.language_probability,
                "all_language_probs": info.all_language_probs,
            },
            "processing_time": transcription_time,
            "total_time": total_time,
            "model": model_name,
            "segments_count": len(segments_list),
        }

        json_file = Path("BAP_South_Meeting_Transcript.json")
        with open(json_file, "w", encoding="utf-8") as f:
            json.dump(json_output, f, indent=2, ensure_ascii=False)

        print(f"📊 Detailed data saved to: {json_file}")

        # Show first few segments as preview
        preview_count = 3
        print("\n📋 Preview (first 3 segments):")
        for number, seg in enumerate(segments_list[:preview_count], start=1):
            print(f"  {number}. [{seg.start:.1f}s - {seg.end:.1f}s] {seg.text.strip()}")

        if len(segments_list) > preview_count:
            print(f"  ... and {len(segments_list) - preview_count} more segments")

    except Exception as e:
        # Top-level boundary of the script: report the failure with its full
        # traceback instead of letting it propagate.
        print(f"❌ Transcription failed: {e}")
        traceback.print_exc()

# Run the transcription only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()