youtube-summarizer/test_faster_whisper_speech.py

171 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Test script for faster-whisper with a speech-heavy video and VAD disabled for comparison.
"""
import asyncio
import sys
import logging
from pathlib import Path
# Put the sibling "backend" directory (next to this script) at the front of the import path.
# NOTE(review): the import below uses the "backend." package prefix, which resolves from
# this script's own directory when run as a script rather than from the entry added here;
# presumably the insertion exists for imports made inside backend modules — confirm.
sys.path.insert(0, str(Path(__file__).parent / "backend"))
from backend.services.faster_whisper_transcript_service import FasterWhisperTranscriptService
# Configure root logging at INFO so the progress messages emitted by the tests are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by both test coroutines and the __main__ block.
logger = logging.getLogger(__name__)
async def test_with_vad_disabled():
    """Transcribe the sample video with voice-activity detection turned off.

    Builds a ``FasterWhisperTranscriptService`` with ``vad_filter=False``,
    transcribes the hard-coded test video, and logs timing, metadata and the
    first few segments.

    Returns:
        bool: ``True`` when transcription and cleanup both finish without
        raising (an empty segment list is only logged as a warning);
        ``False`` when either step raises an ``Exception``.
    """
    import time
    import traceback

    # Same video as the other faster-whisper tests, but with VAD disabled.
    test_video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Rick Roll
    test_video_id = "dQw4w9WgXcQ"

    logger.info("🚀 Testing FasterWhisperTranscriptService with VAD DISABLED")
    logger.info(f"📹 Test video: {test_video_url}")

    async def progress_callback(message: str):
        """Forward progress messages from the service to the log."""
        logger.info(f"📊 {message}")

    # `service` stays None if the constructor itself raises, so the finally
    # clause knows whether there is anything to clean up.
    service = None
    passed = False
    try:
        service = FasterWhisperTranscriptService(
            model_size="large-v3-turbo",
            device="auto",
            compute_type="auto",
            beam_size=5,
            vad_filter=False,  # DISABLE VAD for testing
            word_timestamps=True,
            temperature=0.0,
            best_of=5,
        )
        logger.info(f"⚙️ Configuration: VAD={service.vad_filter}, Model={service.model_size}")

        logger.info("🎬 Starting transcription with VAD disabled...")
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments during a long transcription.
        start_time = time.perf_counter()
        segments, metadata = await service.transcribe_video(
            video_id=test_video_id,
            video_url=test_video_url,
            progress_callback=progress_callback,
        )
        total_time = time.perf_counter() - start_time

        logger.info("✅ Transcription completed!")
        logger.info(f"⏱️ Total time: {total_time:.2f} seconds")
        logger.info("📊 Results:")
        logger.info(f" Segments: {len(segments)}")
        logger.info(f" Word count: {metadata.word_count}")
        logger.info(f" Quality: {metadata.quality_score:.3f}")
        logger.info(f" Confidence: {metadata.confidence_score:.3f}")
        logger.info(f" Processing time: {metadata.processing_time_seconds:.2f}s")

        if segments:
            logger.info("📝 First 5 segments:")
            for segment in segments[:5]:
                logger.info(f" [{segment.start_time:.1f}s - {segment.end_time:.1f}s] {segment.text}")

            # The last segment's end time stands in for the video duration.
            # Skip the ratio when either value is missing or zero to avoid
            # a ZeroDivisionError.
            video_duration = segments[-1].end_time
            processing_seconds = metadata.processing_time_seconds
            if video_duration and processing_seconds:
                speed_ratio = video_duration / processing_seconds
                logger.info(f"🚀 Speed: {speed_ratio:.1f}x faster than realtime")
        else:
            logger.warning("⚠️ No segments generated even with VAD disabled")

        passed = True
    except Exception as e:
        logger.error(f"❌ Test failed: {e}")
        traceback.print_exc()
    finally:
        # Release the service on the failure path as well.
        if service is not None:
            try:
                await service.cleanup()
            except Exception as cleanup_error:
                # A failing cleanup still fails the test, but must not
                # escape as an exception: callers expect a bool.
                logger.error(f"❌ Test failed: {cleanup_error}")
                traceback.print_exc()
                passed = False

    if passed:
        logger.info("🎉 Test completed!")
    return passed
async def test_with_different_video():
    """Transcribe a second video with VAD enabled, using a smaller model.

    Builds a ``FasterWhisperTranscriptService`` with the ``base`` model,
    ``vad_filter=True`` and ``beam_size=3``, transcribes the hard-coded video,
    and logs the elapsed time, segment count and up to three sample segments.

    Returns:
        bool: ``True`` when transcription and cleanup both finish without
        raising; ``False`` when either step raises an ``Exception``.
    """
    import time

    # Intended to be a video with clear speech.
    # NOTE(review): the original comment labels this ID as a "TED talk (example)";
    # confirm it really points to speech-heavy content before trusting VAD results.
    test_video_url = "https://www.youtube.com/watch?v=9bZkp7q19f0"  # TED talk (example)
    test_video_id = "9bZkp7q19f0"

    logger.info("🚀 Testing with speech-heavy video")
    logger.info(f"📹 Test video: {test_video_url}")

    async def progress_callback(message: str):
        """Forward progress messages from the service to the log."""
        logger.info(f"📊 {message}")

    # `service` stays None if the constructor itself raises, so the finally
    # clause knows whether there is anything to clean up.
    service = None
    passed = False
    try:
        service = FasterWhisperTranscriptService(
            model_size="base",  # Use smaller model for faster testing
            device="auto",
            compute_type="auto",
            vad_filter=True,
            beam_size=3,  # Faster beam search
        )
        logger.info(f"⚙️ Using {service.model_size} model with VAD enabled")

        # perf_counter is monotonic, which makes it the right clock for intervals.
        start_time = time.perf_counter()
        segments, metadata = await service.transcribe_video(
            video_id=test_video_id,
            video_url=test_video_url,
            progress_callback=progress_callback,
        )
        elapsed = time.perf_counter() - start_time

        logger.info("✅ Speech video test completed!")
        logger.info(f"⏱️ Time: {elapsed:.2f}s")
        logger.info(f"📊 Segments: {len(segments)}")

        if segments:
            logger.info("📝 Sample segments:")
            for segment in segments[:3]:
                logger.info(f" [{segment.start_time:.1f}s] {segment.text[:100]}...")

        passed = True
    except Exception as e:
        # logger.exception records the traceback too, matching the detail the
        # VAD-disabled test prints on failure.
        logger.exception(f"❌ Speech video test failed: {e}")
    finally:
        # Release the service on the failure path as well.
        if service is not None:
            try:
                await service.cleanup()
            except Exception as cleanup_error:
                # A failing cleanup still fails the test, but must not
                # escape as an exception: callers expect a bool.
                logger.exception(f"❌ Speech video test failed: {cleanup_error}")
                passed = False

    return passed
if __name__ == "__main__":
    # Run both scenarios in sequence, each in its own event loop, then turn the
    # combined outcome into the process exit status (0 = all passed, 1 = any failed).
    banner = "=" * 60

    # First test with VAD disabled
    logger.info(banner)
    logger.info("TEST 1: VAD DISABLED")
    logger.info(banner)
    success1 = asyncio.run(test_with_vad_disabled())

    # Then test with a speech video
    logger.info("\n" + banner)
    logger.info("TEST 2: SPEECH VIDEO")
    logger.info(banner)
    success2 = asyncio.run(test_with_different_video())

    # Both results count: previously the second test's outcome was computed
    # but never consulted, so its failures could not affect the exit code.
    if success1 and success2:
        logger.info("✅ faster-whisper integration is working!")
        sys.exit(0)
    else:
        failed = [
            label
            for label, ok in (("VAD DISABLED", success1), ("SPEECH VIDEO", success2))
            if not ok
        ]
        logger.error("❌ faster-whisper integration has issues")
        logger.error(f"Failed tests: {', '.join(failed)}")
        sys.exit(1)