88 lines
3.4 KiB
Python
88 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""Simple script to transcribe the BAP South meeting recording."""
|
|
|
|
import asyncio
|
|
import json
|
|
from pathlib import Path
|
|
from src.services.local_transcription_service import create_local_transcription_service, LocalTranscriptionConfig
|
|
from src.repositories.transcription_repository import create_transcription_repository
|
|
|
|
async def main() -> int:
    """Transcribe the BAP South meeting recording and save the results.

    Looks for ``BAP_South_Meeting_Clean.wav`` relative to the current working
    directory, passes it to the local transcription service, and writes the
    transcript to ``BAP_South_Meeting_Transcript.txt`` and a JSON dump of the
    result to ``BAP_South_Meeting_Transcript.json``.

    Returns:
        ``0`` when both output files were written, ``1`` when the audio file
        is missing or any step raised an exception (the exception and its
        traceback are printed rather than propagated).
    """
    # Use the clean WAV file instead of the problematic M4A
    audio_file = Path("BAP_South_Meeting_Clean.wav")

    if not audio_file.exists():
        print(f"❌ Audio file not found: {audio_file}")
        return 1

    print(f"🎵 Transcribing: {audio_file.name}")
    print(f"📁 File size: {audio_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        # Create repository and service
        repository = await create_transcription_repository()
        service = create_local_transcription_service(repository)

        # Initialize service
        await service.initialize()

        # Configure transcription
        config = LocalTranscriptionConfig(
            model="distil-large-v3",
            language=None,  # Auto-detect
            temperature=0.0,  # Deterministic
            chunk_size_seconds=600,  # 10 minutes
            enable_m3_preprocessing=True,
        )

        print("🚀 Starting transcription...")

        # Transcribe the audio
        result = await service.transcribe_audio(audio_file, config)

        print("✅ Transcription completed!")
        print(f"📝 Text length: {len(result.text_content)} characters")
        print(f"⏱️ Processing time: {result.processing_time_ms / 1000:.1f} seconds")
        print(f"🎯 Accuracy estimate: {result.accuracy_estimate * 100:.1f}%")

        # Save to text file
        output_file = Path("BAP_South_Meeting_Transcript.txt")
        _write_text_report(result, output_file)
        print(f"💾 Transcript saved to: {output_file}")

        # Also save as JSON for detailed analysis
        json_file = Path("BAP_South_Meeting_Transcript.json")
        _write_json_report(result, json_file)
        print(f"📊 Detailed data saved to: {json_file}")

    except Exception as e:
        # Top-level boundary of the script: report the problem in full and
        # signal failure through the return value instead of re-raising.
        print(f"❌ Transcription failed: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


def _write_text_report(result, output_file):
    """Write the human-readable transcript (metadata header + text) to *output_file*.

    *result* is the object returned by ``service.transcribe_audio``; only the
    attributes read below are required. The whole report is assembled in
    memory first so that a missing attribute cannot leave a half-written file.
    """
    header = [
        "BAP South Meeting - August 28, 2025\n",
        f"Transcription completed at: {result.completed_at}\n",
        f"Model: {result.model_used}\n",
        f"Accuracy: {result.accuracy_estimate * 100:.1f}%\n",
        f"Processing time: {result.processing_time_ms / 1000:.1f} seconds\n",
        f"Word count: {result.word_count}\n",
        "=" * 80 + "\n\n",
    ]
    output_file.write_text("".join(header) + result.text_content, encoding="utf-8")


def _write_json_report(result, json_file):
    """Write the detailed transcription data to *json_file* as indented JSON.

    Raises:
        TypeError: if a field of *result* is not JSON-serializable. The
            payload is serialized before the file is opened, so in that case
            no truncated JSON file is left behind.
    """
    payload = {
        "text": result.text_content,
        "segments": result.segments,
        "accuracy": result.accuracy_estimate,
        "processing_time": result.processing_time_ms / 1000.0,
        "word_count": result.word_count,
        "model_used": result.model_used,
        "completed_at": result.completed_at,
        "quality_warnings": result.quality_warnings,
    }
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    json_file.write_text(serialized, encoding="utf-8")


if __name__ == "__main__":
    # Use main()'s return value as the process exit status so shells and
    # schedulers can tell a failed run from a successful one.
    raise SystemExit(asyncio.run(main()))
|