trax/transcribe_bap.py

88 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""Simple script to transcribe the BAP South meeting recording."""
import asyncio
import json
from pathlib import Path
from src.services.local_transcription_service import create_local_transcription_service, LocalTranscriptionConfig
from src.repositories.transcription_repository import create_transcription_repository
async def main():
    """Transcribe the BAP South meeting recording and save the results.

    Looks for ``BAP_South_Meeting_Clean.wav`` relative to the current working
    directory, runs it through the local transcription service, prints a
    short summary, and writes two files (also relative to the working
    directory):

    * ``BAP_South_Meeting_Transcript.txt`` - metadata header plus the text.
    * ``BAP_South_Meeting_Transcript.json`` - text, segments and metrics.

    Returns:
        None. A missing audio file is reported on stdout and the function
        returns early. Any exception raised while transcribing or saving is
        caught, printed together with its traceback, and not re-raised.
    """
    # Use the clean WAV file instead of the problematic M4A
    audio_file = Path("BAP_South_Meeting_Clean.wav")
    if not audio_file.exists():
        print(f"❌ Audio file not found: {audio_file}")
        return

    print(f"🎵 Transcribing: {audio_file.name}")
    print(f"📁 File size: {audio_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        # Create repository and service, then initialize the service
        repository = await create_transcription_repository()
        service = create_local_transcription_service(repository)
        await service.initialize()

        # Configure transcription
        config = LocalTranscriptionConfig(
            model="distil-large-v3",
            language=None,  # Auto-detect
            temperature=0.0,  # Deterministic
            chunk_size_seconds=600,  # 10 minutes
            enable_m3_preprocessing=True,
        )

        print("🚀 Starting transcription...")
        result = await service.transcribe_audio(audio_file, config)

        print("✅ Transcription completed!")
        print(f"📝 Text length: {len(result.text_content)} characters")
        print(f"⏱️ Processing time: {result.processing_time_ms / 1000:.1f} seconds")
        print(f"🎯 Accuracy estimate: {result.accuracy_estimate * 100:.1f}%")

        # Save to text file. The content is built completely before the file
        # is opened, so a formatting error cannot leave a half-written file.
        output_file = Path("BAP_South_Meeting_Transcript.txt")
        output_file.write_text(_format_transcript_text(result), encoding="utf-8")
        print(f"💾 Transcript saved to: {output_file}")

        # Also save as JSON for detailed analysis
        json_output = {
            "text": result.text_content,
            "segments": result.segments,
            "accuracy": result.accuracy_estimate,
            "processing_time": result.processing_time_ms / 1000.0,
            "word_count": result.word_count,
            "model_used": result.model_used,
            "completed_at": result.completed_at,
            "quality_warnings": result.quality_warnings,
        }
        # Serialize to a string first: if an object is not serializable the
        # TypeError is raised before the file is touched, instead of leaving a
        # truncated, invalid JSON document behind.
        json_text = json.dumps(
            json_output, indent=2, ensure_ascii=False, default=_json_default
        )
        json_file = Path("BAP_South_Meeting_Transcript.json")
        json_file.write_text(json_text, encoding="utf-8")
        print(f"📊 Detailed data saved to: {json_file}")
    except Exception as e:
        # Top-level boundary of the script: report the failure and its
        # traceback rather than propagating it.
        print(f"❌ Transcription failed: {e}")
        import traceback
        traceback.print_exc()


def _format_transcript_text(result):
    """Return the plain-text transcript: metadata header, a rule, then the text."""
    header_lines = [
        "BAP South Meeting - August 28, 2025",
        f"Transcription completed at: {result.completed_at}",
        f"Model: {result.model_used}",
        f"Accuracy: {result.accuracy_estimate * 100:.1f}%",
        f"Processing time: {result.processing_time_ms / 1000:.1f} seconds",
        f"Word count: {result.word_count}",
        "=" * 80,
    ]
    return "\n".join(header_lines) + "\n\n" + result.text_content


def _json_default(value):
    """Convert date/datetime values to ISO 8601 strings for ``json.dumps``.

    NOTE(review): ``completed_at`` is presumably a datetime; its type is not
    visible from this script, so confirm against the result model.

    Raises:
        TypeError: For every other type, so unsupported objects still fail
            loudly instead of being silently stringified.
    """
    from datetime import date  # local import keeps this block self-contained

    if isinstance(value, date):  # datetime is a subclass of date
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )
if __name__ == "__main__":
    # Script entry point: create the coroutine and let asyncio drive it to
    # completion; nothing runs when the module is merely imported.
    transcription_job = main()
    asyncio.run(transcription_job)