171 lines
5.7 KiB
Python
171 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test script for the faster-whisper transcript service: runs one transcription
with VAD disabled and, for comparison, one with VAD enabled on a second video.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Add backend to path
|
|
sys.path.insert(0, str(Path(__file__).parent / "backend"))
|
|
|
|
from backend.services.faster_whisper_transcript_service import FasterWhisperTranscriptService
|
|
|
|
# Module-level logger used by every test below; the root logger is set to
# INFO so the progress and result messages are actually emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
async def test_with_vad_disabled():
    """Transcribe a fixed video with VAD disabled and log the outcome.

    Builds a ``FasterWhisperTranscriptService`` with ``vad_filter=False``,
    calls ``transcribe_video`` on a hard-coded YouTube video, then logs the
    wall-clock time, the returned metadata and the first five segments.

    The service's ``cleanup()`` is awaited whether or not transcription
    succeeded, so a failed run does not leave the service un-released.

    Returns:
        bool: ``True`` if transcription and cleanup both completed without
        raising, ``False`` otherwise. Exceptions are logged, not propagated.
    """
    import time

    # Use same video but disable VAD for testing
    test_video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Rick Roll
    test_video_id = "dQw4w9WgXcQ"

    logger.info("🚀 Testing FasterWhisperTranscriptService with VAD DISABLED")
    logger.info(f"📹 Test video: {test_video_url}")

    service = None
    succeeded = False
    try:
        # Create service with VAD disabled
        service = FasterWhisperTranscriptService(
            model_size="large-v3-turbo",
            device="auto",
            compute_type="auto",
            beam_size=5,
            vad_filter=False,  # DISABLE VAD for testing
            word_timestamps=True,
            temperature=0.0,
            best_of=5,
        )

        logger.info(f"⚙️ Configuration: VAD={service.vad_filter}, Model={service.model_size}")

        # Progress callback: forwards service progress messages to the log.
        async def progress_callback(message: str):
            logger.info(f"📊 {message}")

        # Run transcription
        logger.info("🎬 Starting transcription with VAD disabled...")
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments during a long transcription.
        start_time = time.perf_counter()

        segments, metadata = await service.transcribe_video(
            video_id=test_video_id,
            video_url=test_video_url,
            progress_callback=progress_callback,
        )

        total_time = time.perf_counter() - start_time

        # Report results
        logger.info("✅ Transcription completed!")
        logger.info(f"⏱️ Total time: {total_time:.2f} seconds")
        logger.info("📊 Results:")
        logger.info(f" Segments: {len(segments)}")
        logger.info(f" Word count: {metadata.word_count}")
        logger.info(f" Quality: {metadata.quality_score:.3f}")
        logger.info(f" Confidence: {metadata.confidence_score:.3f}")
        logger.info(f" Processing time: {metadata.processing_time_seconds:.2f}s")

        # Show segments
        if segments:
            logger.info("📝 First 5 segments:")
            for segment in segments[:5]:
                logger.info(f" [{segment.start_time:.1f}s - {segment.end_time:.1f}s] {segment.text}")

            # Calculate speed. The end of the last segment stands in for the
            # video duration; skip the ratio when either side is missing or
            # zero so a near-instant run cannot raise ZeroDivisionError.
            video_duration = segments[-1].end_time
            processing_seconds = metadata.processing_time_seconds
            if video_duration and processing_seconds:
                speed_ratio = video_duration / processing_seconds
                logger.info(f"🚀 Speed: {speed_ratio:.1f}x faster than realtime")
        else:
            logger.warning("⚠️ No segments generated even with VAD disabled")

        succeeded = True

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"❌ Test failed: {e}")

    finally:
        # Always release the service, including on the failure path.
        if service is not None:
            try:
                await service.cleanup()
            except Exception as e:
                # A failing cleanup counts as a failed test, as it did when
                # cleanup ran inside the main try block.
                logger.exception(f"❌ Test failed: {e}")
                succeeded = False

    if succeeded:
        logger.info("🎉 Test completed!")
    return succeeded
|
|
|
|
|
|
async def test_with_different_video():
    """Transcribe a second fixed video with VAD enabled and log the outcome.

    Builds a ``FasterWhisperTranscriptService`` using the ``base`` model with
    ``vad_filter=True`` and ``beam_size=3``, calls ``transcribe_video`` on a
    hard-coded YouTube video, then logs the elapsed time, the segment count
    and up to three sample segments (text truncated to 100 characters).

    The service's ``cleanup()`` is awaited whether or not transcription
    succeeded.

    Returns:
        bool: ``True`` if transcription and cleanup both completed without
        raising, ``False`` otherwise. Exceptions are logged, not propagated.
    """
    import time

    # Intended to be a video with clear, continuous speech.
    # NOTE(review): the ID below was labelled "TED talk (example)"; confirm it
    # actually points at a speech-heavy video before trusting the results.
    test_video_url = "https://www.youtube.com/watch?v=9bZkp7q19f0"
    test_video_id = "9bZkp7q19f0"

    logger.info("🚀 Testing with speech-heavy video")
    logger.info(f"📹 Test video: {test_video_url}")

    service = None
    succeeded = False
    try:
        # Use VAD enabled for speech video
        service = FasterWhisperTranscriptService(
            model_size="base",  # Use smaller model for faster testing
            device="auto",
            compute_type="auto",
            vad_filter=True,
            beam_size=3,  # Faster beam search
        )

        logger.info(f"⚙️ Using {service.model_size} model with VAD enabled")

        # Forwards service progress messages to the log.
        async def progress_callback(message: str):
            logger.info(f"📊 {message}")

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments while the transcription runs.
        start_time = time.perf_counter()

        segments, metadata = await service.transcribe_video(
            video_id=test_video_id,
            video_url=test_video_url,
            progress_callback=progress_callback,
        )

        elapsed = time.perf_counter() - start_time

        logger.info("✅ Speech video test completed!")
        logger.info(f"⏱️ Time: {elapsed:.2f}s")
        logger.info(f"📊 Segments: {len(segments)}")

        if segments:
            logger.info("📝 Sample segments:")
            for segment in segments[:3]:
                logger.info(f" [{segment.start_time:.1f}s] {segment.text[:100]}...")

        succeeded = True

    except Exception as e:
        # Include the traceback so failures here are as diagnosable as in
        # the VAD-disabled test.
        logger.exception(f"❌ Speech video test failed: {e}")

    finally:
        # Always release the service, including on the failure path.
        if service is not None:
            try:
                await service.cleanup()
            except Exception as e:
                # A failing cleanup counts as a failed test, as it did when
                # cleanup ran inside the main try block.
                logger.exception(f"❌ Speech video test failed: {e}")
                succeeded = False

    return succeeded
|
|
|
|
|
|
def main() -> int:
    """Run both transcription tests in sequence and return an exit code.

    Each test coroutine is run with its own ``asyncio.run`` call, first the
    VAD-disabled test and then the speech-video test.

    Returns:
        int: ``0`` when both tests returned ``True``, ``1`` otherwise.
    """
    banner = "=" * 60

    # First test with VAD disabled
    logger.info(banner)
    logger.info("TEST 1: VAD DISABLED")
    logger.info(banner)
    success1 = asyncio.run(test_with_vad_disabled())

    # Then test with a speech video
    logger.info("\n" + banner)
    logger.info("TEST 2: SPEECH VIDEO")
    logger.info(banner)
    success2 = asyncio.run(test_with_different_video())

    logger.info(f"TEST 1 passed: {success1} | TEST 2 passed: {success2}")

    # Both results feed the exit status, so a failure in the second test is
    # not reported as overall success.
    if success1 and success2:
        logger.info("✅ faster-whisper integration is working!")
        return 0

    logger.error("❌ faster-whisper integration has issues")
    return 1


if __name__ == "__main__":
    sys.exit(main())