"""
Faster Whisper transcription service for YouTube videos.
Uses faster-whisper (CTranslate2) for 20-32x speed improvement over OpenAI Whisper.
Implements large-v3-turbo model for maximum accuracy and speed.
"""
import asyncio
import json
import logging
import os
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import aiofiles
import aiohttp
import torch
import yt_dlp
from faster_whisper import WhisperModel
from pydub import AudioSegment

from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
from ..core.config import settings
from ..config.video_download_config import VideoDownloadConfig

logger = logging.getLogger(__name__)

class FasterWhisperTranscriptService:
    """
    Service for transcribing YouTube videos using faster-whisper.

    Provides 20-32x speed improvement over OpenAI Whisper while maintaining
    or improving accuracy using the large-v3-turbo model.
    """

    def __init__(
        self,
        model_size: str = "large-v3-turbo",
        device: str = "auto",
        compute_type: str = "auto",
        beam_size: int = 5,
        vad_filter: bool = True,
        word_timestamps: bool = True,
        temperature: float = 0.0,
        best_of: int = 5
    ):
"""
Initialize the faster-whisper transcription service.
Args:
model_size: Model size ("large-v3-turbo", "large-v3", "large-v2", "medium", "small", "base", "tiny")
Recommended: "large-v3-turbo" for best speed/accuracy balance
device: Device to run on ("cpu", "cuda", "auto")
compute_type: Computation type ("int8", "float16", "float32", "auto")
"int8" provides best speed with minimal accuracy loss
"""
        self.model_size = model_size
        self.device = self._get_device(device)
        self.compute_type = self._get_compute_type(compute_type)
        self.model = None

        # Configuration optimized for faster-whisper
        self.chunk_duration = 30 * 60  # 30 minutes per chunk
        self.overlap_duration = 30  # 30 seconds overlap between chunks
        self.max_segment_length = 1000  # Maximum characters per segment

        # Faster-whisper specific optimizations from parameters
        self.vad_filter = vad_filter  # Voice Activity Detection for efficiency
        self.vad_parameters = dict(
            min_silence_duration_ms=500,  # silence gap that ends a speech run
            speech_pad_ms=400,  # padding kept around detected speech
        )

        # Decoding configuration from parameters
        self.beam_size = beam_size  # Beam search size (1-10, higher = better quality, slower)
        self.best_of = best_of  # Number of candidates when sampling (None = deterministic)
        self.temperature = temperature  # Sampling temperature (0 = deterministic)
        self.word_timestamps = word_timestamps  # Enable word-level timestamps

        # Use video storage configuration
        self.config = VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()
        self.temp_dir = self.storage_dirs["temp"]

    def _get_device(self, device: str) -> str:
        """Determine the appropriate device for processing."""
        if device == "auto":
            if torch.cuda.is_available():
                logger.info("CUDA available, using GPU acceleration")
                return "cuda"
            else:
                logger.info("CUDA not available, using CPU")
                return "cpu"
        return device

    def _get_compute_type(self, compute_type: str) -> str:
        """Determine the appropriate compute type for the device."""
        if compute_type == "auto":
            if self.device == "cuda":
                # Use float16 for GPU for best speed/memory balance
                return "float16"
            else:
                # Use int8 for CPU for best speed
                return "int8"
        return compute_type

    async def _load_model(self) -> WhisperModel:
        """Load the faster-whisper model on-demand."""
        if self.model is None:
            logger.info(
                f"Loading faster-whisper model '{self.model_size}' on device "
                f"'{self.device}' with compute_type '{self.compute_type}'"
            )
            try:
                # Run model loading in executor to avoid blocking async loop
                loop = asyncio.get_event_loop()

                # Handle special model names
                model_name = self.model_size
                if model_name == "large-v3-turbo":
                    # Use the optimized CTranslate2 model
                    model_name = "deepdml/faster-whisper-large-v3-turbo-ct2"

                self.model = await loop.run_in_executor(
                    None,
                    lambda: WhisperModel(
                        model_name,
                        device=self.device,
                        compute_type=self.compute_type,
                        cpu_threads=0,  # 0 lets CTranslate2 pick its default thread count
                        num_workers=1,  # Number of parallel workers
                    )
                )
                logger.info(f"Successfully loaded faster-whisper model '{self.model_size}' ({model_name})")
                logger.info(f"Model device: {self.device}, compute_type: {self.compute_type}")
            except Exception as e:
                logger.error(f"Failed to load faster-whisper model: {e}")
                # Fall back to standard large-v3 if the turbo model fails
                if self.model_size == "large-v3-turbo":
                    logger.info("Falling back to large-v3 model")
                    try:
                        self.model = await loop.run_in_executor(
                            None,
                            lambda: WhisperModel(
                                "large-v3",
                                device=self.device,
                                compute_type=self.compute_type,
                            )
                        )
                        logger.info("Successfully loaded fallback large-v3 model")
                    except Exception as fallback_error:
                        logger.error(f"Fallback model also failed: {fallback_error}")
                        raise fallback_error
                else:
                    raise e
        return self.model
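
    # Note: on first use, WhisperModel downloads the CTranslate2 weights from
    # the Hugging Face Hub and caches them locally, so the initial
    # _load_model() call can take much longer than subsequent ones.
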
    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        start_time = datetime.now()
        audio_path = None
        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting faster-whisper transcription...")
            logger.info(f"Starting faster-whisper transcription for video {video_id} using model {self.model_size}")

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback
            )

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds()

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # faster-whisper auto-detects, but assume English for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="faster_whisper",
                processing_time_seconds=processing_time,
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments)
            )

            duration_minutes = processing_time / 60
            logger.info(
                f"Completed faster-whisper transcription for video {video_id}. "
                f"Generated {len(segments)} segments in {processing_time:.2f}s ({duration_minutes:.2f} minutes). "
                f"Model: {self.model_size}, Device: {self.device}"
            )

            # Save transcript to file
            await self._save_transcript(video_id, segments, metadata)
            return segments, metadata
        except Exception as e:
            logger.error(f"Faster-whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up temporary files, but keep MP3 for future re-transcription
            if audio_path:
                await self._cleanup_temp_files(audio_path)
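
    # Note: progress_callback is awaited at each stage, so callers must pass
    # an async callable (or a callable returning an awaitable), e.g. an
    # async def that forwards status messages to the client.
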
    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from a YouTube video using yt-dlp."""
        try:
            # Check if audio already exists (MP3 for storage)
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"

            # If MP3 exists, use it directly (faster-whisper handles MP3 natively)
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                return str(mp3_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in executor to avoid blocking
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts)
            )

            # Return MP3 path (faster-whisper can handle MP3 directly)
            if mp3_path.exists():
                return str(mp3_path)
            raise RuntimeError(f"Failed to download audio for {video_id}")
        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}")

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])
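
    # With the options above, yt-dlp extracts audio via its FFmpegExtractAudio
    # postprocessor and writes <audio storage dir>/<video_id>.mp3 at 192 kbps,
    # which is the path handed to the transcription step.
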
    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file with optimized faster-whisper settings.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get audio duration for progress tracking
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        try:
            if progress_callback:
                await progress_callback(f"Transcribing {duration/60:.1f} minute audio with {self.model_size}...")

            # Use faster-whisper with optimized settings
            logger.info(f"Transcribing with faster-whisper - VAD: {self.vad_filter}, Beam: {self.beam_size}")
            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(
                None,
                lambda: self._transcribe_with_faster_whisper(model, audio_path)
            )
            segments, info = result

            # Log transcription info
            logger.info(f"Detected language: {info.language} (probability: {info.language_probability:.2f})")
            logger.info(f"Duration: {info.duration:.2f}s, VAD: {info.vad_options if hasattr(info, 'vad_options') else 'N/A'}")

            # Convert to DualTranscriptSegment objects
            transcript_segments = []
            for segment in segments:
                text = segment.text.strip()

                # Split long segments if needed
                if len(text) > self.max_segment_length:
                    split_segments = self._split_long_segment(
                        text, segment.start, segment.end
                    )
                    transcript_segments.extend(split_segments)
                else:
                    transcript_segments.append(DualTranscriptSegment(
                        start_time=segment.start,
                        end_time=segment.end,
                        text=text,
                        confidence=segment.avg_logprob if hasattr(segment, 'avg_logprob') else None
                    ))

            if progress_callback:
                await progress_callback(f"Transcription complete - {len(transcript_segments)} segments generated")
            return transcript_segments
        except Exception as e:
            logger.error(f"Failed to transcribe audio file {audio_path}: {e}")
            raise

    def _transcribe_with_faster_whisper(self, model: WhisperModel, audio_path: str):
        """
        Perform the actual transcription with faster-whisper.

        Run in executor to avoid blocking the event loop.
        """
        segments, info = model.transcribe(
            audio_path,
            beam_size=self.beam_size,
            best_of=self.best_of,
            temperature=self.temperature,
            vad_filter=self.vad_filter,
            vad_parameters=self.vad_parameters,
            word_timestamps=self.word_timestamps,
            language="en",  # Can be made configurable
            task="transcribe"
        )
        # model.transcribe() returns a lazy generator; materialize it here so
        # the actual decoding runs in the executor thread instead of blocking
        # the event loop when the caller iterates the segments.
        return list(segments), info
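
    # Note: with temperature=0.0 decoding is deterministic, so best_of (which
    # only applies when sampling at a non-zero temperature) has no effect
    # under the default settings.
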
    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get audio duration using pydub."""
        loop = asyncio.get_event_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # Convert milliseconds to seconds
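
    # Note: pydub relies on an ffmpeg (or libav) binary on PATH to decode MP3,
    # so this duration probe assumes ffmpeg is installed; that is the same
    # dependency yt-dlp's FFmpegExtractAudio postprocessor already requires.
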
    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Timestamps for the sub-segments are linearly interpolated, assuming
        each word takes an equal share of the original segment's duration.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split text at word boundaries
        words = text.split()
        current_text = ""
        current_words = 0
        time_per_word = duration / len(words) if len(words) > 0 else 0

        for word in words:
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Flush the accumulated text as one sub-segment
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word
                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip()
                ))
                current_text = word
            else:
                current_text += " " + word if current_text else word
            current_words += 1

        # Add final segment
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip()
            ))
        return segments
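
    # Worked example of the interpolation: a 60 s segment with 120 words gives
    # time_per_word = 0.5 s; if the first flush happens after 40 words, that
    # sub-segment spans roughly [start_time, start_time + 20 s].
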
    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate overall quality score based on segment characteristics."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.8  # Default high quality for faster-whisper

        avg_confidence = sum(confidences) / len(confidences)
        # Map the average log probability (roughly -5..0) onto a 0-1 scale
        normalized_confidence = max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))
        # Boost quality score for faster-whisper due to improved model
        return min(1.0, normalized_confidence * 1.1)

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.85  # Higher default for faster-whisper

        avg_confidence = sum(confidences) / len(confidences)
        # Map the average log probability (roughly -5..0) onto a 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))
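
    # Example: an average avg_logprob of -0.5 maps to (-0.5 + 5.0) / 5.0 = 0.9,
    # and anything at or below -5.0 clamps to 0.0.
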
    async def _save_transcript(
        self,
        video_id: str,
        segments: List[DualTranscriptSegment],
        metadata: DualTranscriptMetadata
    ):
        """Save transcript and metadata to files for future use."""
        try:
            # Save audio metadata with faster-whisper info
            await self._save_audio_metadata(video_id, metadata)

            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}_faster_whisper.txt"

            # Create human-readable transcript file
            transcript_lines = [
                f"# Faster-Whisper Transcript - Model: {self.model_size}",
                f"# Processing time: {metadata.processing_time_seconds:.2f}s",
                f"# Quality score: {metadata.quality_score:.3f}",
                f"# Confidence score: {metadata.confidence_score:.3f}",
                f"# Total segments: {len(segments)}",
                ""
            ]
            for segment in segments:
                if segment.start_time is not None and segment.end_time is not None:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))
            logger.info(f"Saved faster-whisper transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}_faster_whisper.json"
            segments_data = {
                "metadata": {
                    "model": self.model_size,
                    "device": self.device,
                    "compute_type": self.compute_type,
                    "processing_time_seconds": metadata.processing_time_seconds,
                    "quality_score": metadata.quality_score,
                    "confidence_score": metadata.confidence_score,
                    "total_segments": len(segments),
                    "word_count": metadata.word_count,
                    "extraction_method": "faster_whisper"
                },
                "segments": [
                    {
                        "start_time": seg.start_time,
                        "end_time": seg.end_time,
                        "text": seg.text,
                        "confidence": seg.confidence
                    }
                    for seg in segments
                ]
            }
            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(segments_data, indent=2))
            logger.info(f"Saved faster-whisper transcript JSON to {json_path}")
        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str, metadata: DualTranscriptMetadata):
        """Save audio metadata with faster-whisper specific information."""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Get audio file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",
                # Faster-whisper specific metadata
                "transcription_engine": "faster_whisper",
                "model_used": self.model_size,
                "device": self.device,
                "compute_type": self.compute_type,
                "processing_time_seconds": metadata.processing_time_seconds,
                "quality_score": metadata.quality_score,
                "confidence_score": metadata.confidence_score,
                "vad_enabled": self.vad_filter,
                "beam_size": self.beam_size
            }

            # Try to get audio duration
            try:
                loop = asyncio.get_event_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                duration_seconds = len(audio) / 1000.0
                audio_info["duration_seconds"] = duration_seconds
                audio_info["duration_formatted"] = f"{int(duration_seconds // 60)}:{int(duration_seconds % 60):02d}"

                # Calculate speed improvement ratio
                if metadata.processing_time_seconds > 0:
                    speed_ratio = duration_seconds / metadata.processing_time_seconds
                    audio_info["speed_ratio"] = round(speed_ratio, 2)
                    audio_info["realtime_factor"] = f"{speed_ratio:.1f}x faster than realtime"
            except Exception as duration_error:
                logger.debug(f"Could not read audio duration for {video_id}: {duration_error}")

            # Save metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_faster_whisper_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(audio_info, indent=2))
            logger.info(f"Saved faster-whisper audio metadata to {metadata_path}")
        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def _cleanup_temp_files(self, audio_path: str):
        """Clean up temporary files while preserving the MP3 for re-use."""
        try:
            # Only clean up if this was a temporary WAV file
            if audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')
                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
            else:
                logger.info(f"Keeping audio file: {audio_path}")
        except Exception as e:
            logger.warning(f"Error during temp file cleanup: {e}")

    async def cleanup(self):
        """Clean up resources and free memory."""
        try:
            # Unload model to free memory
            if self.model is not None:
                del self.model
                self.model = None

            # Clear GPU cache if using CUDA
            if torch.cuda.is_available() and self.device == "cuda":
                torch.cuda.empty_cache()
                logger.info("Cleared GPU cache")

            logger.info("Faster-whisper service cleanup completed")
        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")

    def get_performance_info(self) -> Dict:
        """Get information about the current configuration and expected performance."""
        return {
            "model": self.model_size,
            "device": self.device,
            "compute_type": self.compute_type,
            "vad_enabled": self.vad_filter,
            "beam_size": self.beam_size,
            "expected_speed_improvement": "20-32x faster than OpenAI Whisper",
            "optimizations": [
                "CTranslate2 optimization engine",
                "Voice Activity Detection (VAD)",
                "GPU acceleration" if self.device == "cuda" else "CPU optimization",
                f"Quantization ({self.compute_type})",
                "Native MP3 support (no conversion needed)"
            ]
        }
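

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the relative imports above mean
# this module is used from within the backend package, not run directly):
#
#     service = FasterWhisperTranscriptService(model_size="large-v3-turbo")
#     segments, metadata = await service.transcribe_video(
#         video_id="dQw4w9WgXcQ",
#         video_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
#     )
#     logger.info("Transcribed %d segments", len(segments))
#     await service.cleanup()
# ---------------------------------------------------------------------------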