# youtube-summarizer/test_faster_whisper_speech.py

#!/usr/bin/env python3
"""
Test script for faster-whisper with a speech-heavy video and VAD disabled for comparison.
"""

import asyncio
import sys
import logging
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).parent / "backend"))

from backend.services.faster_whisper_transcript_service import FasterWhisperTranscriptService

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def test_with_vad_disabled():
    """Transcribe the test video with VAD disabled and log the results.

    Builds a ``FasterWhisperTranscriptService`` with ``vad_filter=False``,
    runs ``transcribe_video`` on a fixed YouTube video, and logs segment
    count, metadata scores, the first five segments and the speed ratio
    relative to the end time of the last segment.

    Returns:
        bool: ``True`` if transcription, reporting and cleanup all finished
        without raising; ``False`` if any exception was raised (it is
        logged together with its traceback).
    """
    # Local imports: the module header does not import these.
    import time
    import traceback

    # Use same video but disable VAD for testing
    test_video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Rick Roll
    test_video_id = "dQw4w9WgXcQ"

    logger.info("🚀 Testing FasterWhisperTranscriptService with VAD DISABLED")
    logger.info(f"📹 Test video: {test_video_url}")

    try:
        # Create service with VAD disabled
        service = FasterWhisperTranscriptService(
            model_size="large-v3-turbo",
            device="auto",
            compute_type="auto",
            beam_size=5,
            vad_filter=False,  # DISABLE VAD for testing
            word_timestamps=True,
            temperature=0.0,
            best_of=5,
        )

        # From here on the service exists, so it must be cleaned up even if
        # transcription or reporting raises.
        try:
            logger.info(
                f"⚙️  Configuration: VAD={service.vad_filter}, Model={service.model_size}"
            )

            # Progress callback
            async def progress_callback(message: str):
                logger.info(f"📊 {message}")

            # Run transcription
            logger.info("🎬 Starting transcription with VAD disabled...")
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments during a long transcription.
            start_time = time.perf_counter()

            segments, metadata = await service.transcribe_video(
                video_id=test_video_id,
                video_url=test_video_url,
                progress_callback=progress_callback,
            )

            total_time = time.perf_counter() - start_time

            # Report results
            logger.info("✅ Transcription completed!")
            logger.info(f"⏱️  Total time: {total_time:.2f} seconds")
            logger.info("📊 Results:")
            logger.info(f"   Segments: {len(segments)}")
            logger.info(f"   Word count: {metadata.word_count}")
            logger.info(f"   Quality: {metadata.quality_score:.3f}")
            logger.info(f"   Confidence: {metadata.confidence_score:.3f}")
            logger.info(f"   Processing time: {metadata.processing_time_seconds:.2f}s")

            # Show segments
            if segments:
                logger.info("📝 First 5 segments:")
                for segment in segments[:5]:
                    logger.info(
                        f"   [{segment.start_time:.1f}s - {segment.end_time:.1f}s] {segment.text}"
                    )

                # Calculate speed. The end of the last segment stands in for
                # the video duration; skip the ratio when the reported
                # processing time is zero to avoid a ZeroDivisionError.
                video_duration = segments[-1].end_time
                if video_duration and metadata.processing_time_seconds > 0:
                    speed_ratio = video_duration / metadata.processing_time_seconds
                    logger.info(f"🚀 Speed: {speed_ratio:.1f}x faster than realtime")
            else:
                logger.warning("⚠️  No segments generated even with VAD disabled")
        finally:
            # A failure inside cleanup() propagates to the outer handler and
            # is reported as a test failure.
            await service.cleanup()

        logger.info("🎉 Test completed!")
        return True

    except Exception as e:
        # Top-level boundary of the test: report and convert to a boolean.
        logger.error(f"❌ Test failed: {e}")
        traceback.print_exc()
        return False


async def test_with_different_video():
    """Transcribe a second video with the ``base`` model and VAD enabled.

    Builds a ``FasterWhisperTranscriptService`` with ``vad_filter=True`` and
    a smaller beam, runs ``transcribe_video`` on a fixed YouTube video, and
    logs the elapsed time, segment count and up to three sample segments.

    Returns:
        bool: ``True`` if transcription, reporting and cleanup all finished
        without raising; ``False`` if any exception was raised (it is
        logged together with its traceback).
    """
    # Local imports: the module header does not import these.
    import time
    import traceback

    # Intended to be a short video with clear speech.
    # NOTE(review): this ID appears to point at a music video rather than a
    # TED talk — confirm and swap in a genuinely speech-heavy clip if so.
    test_video_url = "https://www.youtube.com/watch?v=9bZkp7q19f0"  # TED talk (example)
    test_video_id = "9bZkp7q19f0"

    logger.info("🚀 Testing with speech-heavy video")
    logger.info(f"📹 Test video: {test_video_url}")

    try:
        # Use VAD enabled for speech video
        service = FasterWhisperTranscriptService(
            model_size="base",  # Use smaller model for faster testing
            device="auto",
            compute_type="auto",
            vad_filter=True,
            beam_size=3,  # Faster beam search
        )

        # From here on the service exists, so it must be cleaned up even if
        # transcription or reporting raises.
        try:
            logger.info(f"⚙️  Using {service.model_size} model with VAD enabled")

            async def progress_callback(message: str):
                logger.info(f"📊 {message}")

            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments during the run.
            start_time = time.perf_counter()

            segments, metadata = await service.transcribe_video(
                video_id=test_video_id,
                video_url=test_video_url,
                progress_callback=progress_callback,
            )

            elapsed = time.perf_counter() - start_time

            logger.info("✅ Speech video test completed!")
            logger.info(f"⏱️  Time: {elapsed:.2f}s")
            logger.info(f"📊 Segments: {len(segments)}")

            if segments:
                logger.info("📝 Sample segments:")
                for segment in segments[:3]:
                    # Text is cut to 100 characters; the ellipsis is always
                    # appended, even for shorter segments.
                    logger.info(f"   [{segment.start_time:.1f}s] {segment.text[:100]}...")
        finally:
            # A failure inside cleanup() propagates to the outer handler and
            # is reported as a test failure.
            await service.cleanup()

        return True

    except Exception as e:
        # Top-level boundary of the test: report and convert to a boolean.
        logger.error(f"❌ Speech video test failed: {e}")
        # Print the traceback so failures here are as diagnosable as in the
        # VAD-disabled test.
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Script entry point: run both tests in sequence, each in its own event
    # loop, and exit 0 only when both succeed.

    # First test with VAD disabled
    logger.info("=" * 60)
    logger.info("TEST 1: VAD DISABLED")
    logger.info("=" * 60)
    success1 = asyncio.run(test_with_vad_disabled())

    # Then test with a speech video
    logger.info("\n" + "=" * 60)
    logger.info("TEST 2: SPEECH VIDEO")
    logger.info("=" * 60)
    success2 = asyncio.run(test_with_different_video())

    # Both results feed the exit status; previously the outcome of TEST 2
    # was computed but ignored, so a failing speech test still exited 0.
    if success1 and success2:
        logger.info("✅ faster-whisper integration is working!")
        sys.exit(0)
    else:
        if not success1:
            logger.error("❌ TEST 1 (VAD disabled) failed")
        if not success2:
            logger.error("❌ TEST 2 (speech video) failed")
        logger.error("❌ faster-whisper integration has issues")
        sys.exit(1)