# youtube-summarizer/backend/services/dual_transcript_service.py

"""
Dual transcript service that provides YouTube captions, Whisper AI transcription, or both.
Coordinates between different transcript sources and provides comparison functionality.
"""

import asyncio
import logging
import time
from typing import List, Dict, Optional, Tuple, Union
from enum import Enum

from .transcript_service import TranscriptService
from .faster_whisper_transcript_service import FasterWhisperTranscriptService
from ..config.video_download_config import VideoDownloadConfig
from ..models.transcript import (
    DualTranscriptSegment,
    DualTranscriptMetadata,
    TranscriptSource,
    DualTranscriptResult,
    TranscriptComparison,
    TranscriptSegment,
    TranscriptMetadata
)
from ..core.config import settings
logger = logging.getLogger(__name__)


class TranscriptQuality(Enum):
    """Quality tiers a transcript can be labelled with.

    Each member's value is its lowercase name. The comment next to each
    member notes the transcript source that tier is associated with.
    """
    STANDARD = "standard"  # YouTube captions
    HIGH = "high"          # Whisper small/base model sizes
    PREMIUM = "premium"    # Whisper medium/large model sizes


class DualTranscriptService:
    """Service for managing dual transcript extraction and comparison."""

    def __init__(self):
        self.transcript_service = TranscriptService()
        # Load configuration for faster-whisper
        config = VideoDownloadConfig()
        self.whisper_service = FasterWhisperTranscriptService(
            model_size=config.whisper_model,
            device=config.whisper_device,
            compute_type=config.whisper_compute_type,
            beam_size=config.whisper_beam_size,
            vad_filter=config.whisper_vad_filter,
            word_timestamps=config.whisper_word_timestamps,
            temperature=config.whisper_temperature,
            best_of=config.whisper_best_of
        )

    async def get_transcript(
        self,
        video_id: str,
        video_url: str,
        source: TranscriptSource,
        progress_callback=None
    ) -> DualTranscriptResult:
        """
        Get transcript from specified source(s).

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            source: Which transcript source(s) to use
            progress_callback: Optional callback for progress updates

        Returns:
            DualTranscriptResult with requested transcript data
        """
        start_time = time.time()

        try:
            if source == TranscriptSource.YOUTUBE:
                return await self._get_youtube_only(
                    video_id, video_url, progress_callback
                )
            elif source == TranscriptSource.WHISPER:
                return await self._get_whisper_only(
                    video_id, video_url, progress_callback
                )
            elif source == TranscriptSource.BOTH:
                return await self._get_both_transcripts(
                    video_id, video_url, progress_callback
                )
            else:
                raise ValueError(f"Invalid transcript source: {source}")

        except Exception as e:
            logger.error(f"Failed to get transcript for video {video_id} from {source}: {e}")
            processing_time = time.time() - start_time
            return DualTranscriptResult(
                video_id=video_id,
                source=source,
                youtube_transcript=None,
                youtube_metadata=None,
                whisper_transcript=None,
                whisper_metadata=None,
                comparison=None,
                processing_time_seconds=processing_time,
                success=False,
                error=str(e)
            )

    async def _get_youtube_only(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get YouTube captions only."""
        start_time = time.time()

        try:
            if progress_callback:
                await progress_callback("Extracting YouTube captions...")

            # Get YouTube transcript via existing transcript service
            transcript_result = await self.transcript_service.extract_transcript(video_id)
            if transcript_result.success and transcript_result.transcript:
                # Convert to dual transcript format
                youtube_segments = self._convert_to_dual_segments(transcript_result)
                youtube_metadata = self._convert_to_dual_metadata(transcript_result, video_id)
            else:
                raise Exception(f"YouTube transcript extraction failed: {transcript_result.error}")

            if progress_callback:
                await progress_callback("YouTube captions extracted successfully")

            processing_time = time.time() - start_time

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.YOUTUBE,
                youtube_transcript=youtube_segments,
                youtube_metadata=youtube_metadata,
                whisper_transcript=None,
                whisper_metadata=None,
                comparison=None,
                processing_time_seconds=processing_time,
                success=True,
                error=None
            )

        except Exception as e:
            logger.error(f"YouTube transcript extraction failed: {e}")
            raise

    async def _get_whisper_only(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get Whisper AI transcription only."""
        start_time = time.time()

        try:
            if progress_callback:
                await progress_callback("Starting AI transcription with Whisper...")

            # Get Whisper transcript
            whisper_segments, whisper_metadata = await self.whisper_service.transcribe_video(
                video_id, video_url, progress_callback
            )

            processing_time = time.time() - start_time
            whisper_metadata.processing_time_seconds = processing_time

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.WHISPER,
                youtube_transcript=None,
                youtube_metadata=None,
                whisper_transcript=whisper_segments,
                whisper_metadata=whisper_metadata,
                comparison=None,
                processing_time_seconds=processing_time,
                success=True,
                error=None
            )

        except Exception as e:
            logger.error(f"Whisper transcript extraction failed: {e}")
            raise

    async def _get_both_transcripts(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get both YouTube and Whisper transcripts for comparison."""
        start_time = time.time()

        try:
            # Progress tracking
            if progress_callback:
                await progress_callback("Starting dual transcript extraction...")

            # Run both extractions in parallel
            youtube_task = asyncio.create_task(
                self._get_youtube_with_progress(video_id, video_url, progress_callback)
            )
            whisper_task = asyncio.create_task(
                self._get_whisper_with_progress(video_id, video_url, progress_callback)
            )

            # Wait for both to complete
            youtube_result, whisper_result = await asyncio.gather(
                youtube_task, whisper_task, return_exceptions=True
            )

            # Handle any exceptions
            youtube_segments, youtube_metadata = None, None
            whisper_segments, whisper_metadata = None, None
            errors = []

            if isinstance(youtube_result, Exception):
                logger.warning(f"YouTube extraction failed: {youtube_result}")
                errors.append(f"YouTube: {youtube_result}")
            else:
                youtube_segments, youtube_metadata = youtube_result

            if isinstance(whisper_result, Exception):
                logger.warning(f"Whisper extraction failed: {whisper_result}")
                errors.append(f"Whisper: {whisper_result}")
            else:
                whisper_segments, whisper_metadata = whisper_result

            # Generate comparison if we have both transcripts
            comparison = None
            if youtube_segments and whisper_segments:
                if progress_callback:
                    await progress_callback("Generating transcript comparison...")

                comparison = self._compare_transcripts(
                    youtube_segments, youtube_metadata,
                    whisper_segments, whisper_metadata
                )

            processing_time = time.time() - start_time
            if whisper_metadata:
                whisper_metadata.processing_time_seconds = processing_time

            # Determine success status
            success = (youtube_segments is not None) or (whisper_segments is not None)
            error_message = "; ".join(errors) if errors else None

            if progress_callback:
                if success:
                    await progress_callback("Dual transcript extraction completed")
                else:
                    await progress_callback("Dual transcript extraction failed")

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.BOTH,
                youtube_transcript=youtube_segments,
                youtube_metadata=youtube_metadata,
                whisper_transcript=whisper_segments,
                whisper_metadata=whisper_metadata,
                comparison=comparison,
                processing_time_seconds=processing_time,
                success=success,
                error=error_message
            )

        except Exception as e:
            logger.error(f"Dual transcript extraction failed: {e}")
            processing_time = time.time() - start_time
            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.BOTH,
                youtube_transcript=None,
                youtube_metadata=None,
                whisper_transcript=None,
                whisper_metadata=None,
                comparison=None,
                processing_time_seconds=processing_time,
                success=False,
                error=str(e)
            )

    async def _get_youtube_with_progress(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """Get YouTube transcript with progress updates."""
        if progress_callback:
            await progress_callback("Extracting YouTube captions...")

        transcript_result = await self.transcript_service.extract_transcript(video_id)
        if not transcript_result.success:
            raise Exception(f"YouTube transcript extraction failed: {transcript_result.error}")

        # Convert to dual transcript format
        result = (
            self._convert_to_dual_segments(transcript_result),
            self._convert_to_dual_metadata(transcript_result, video_id)
        )

        if progress_callback:
            await progress_callback("YouTube captions extracted")

        return result

    async def _get_whisper_with_progress(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """Get Whisper transcript with progress updates."""
        if progress_callback:
            await progress_callback("Starting AI transcription...")

        result = await self.whisper_service.transcribe_video(
            video_id, video_url, progress_callback
        )

        if progress_callback:
            await progress_callback("AI transcription completed")

        return result

    def _compare_transcripts(
        self,
        youtube_segments: List[TranscriptSegment],
        youtube_metadata: TranscriptMetadata,
        whisper_segments: List[TranscriptSegment],
        whisper_metadata: TranscriptMetadata
    ) -> TranscriptComparison:
        """Generate comparison between YouTube and Whisper transcripts."""

        # Combine segments into full text for comparison
        youtube_text = " ".join(segment.text for segment in youtube_segments)
        whisper_text = " ".join(segment.text for segment in whisper_segments)

        # Calculate basic metrics
        youtube_words = youtube_text.split()
        whisper_words = whisper_text.split()

        # Calculate word-level differences (simplified)
        word_differences = abs(len(youtube_words) - len(whisper_words))
        word_similarity = 1.0 - (word_differences / max(len(youtube_words), len(whisper_words), 1))

        # Calculate quality metrics
        punctuation_improvement = self._calculate_punctuation_improvement(youtube_text, whisper_text)
        capitalization_improvement = self._calculate_capitalization_improvement(youtube_text, whisper_text)

        # Determine recommendation
        recommendation = self._generate_recommendation(
            youtube_metadata, whisper_metadata, word_similarity,
            punctuation_improvement, capitalization_improvement
        )

        return TranscriptComparison(
            word_count_difference=word_differences,
            similarity_score=word_similarity,
            punctuation_improvement_score=punctuation_improvement,
            capitalization_improvement_score=capitalization_improvement,
            processing_time_ratio=whisper_metadata.processing_time_seconds / max(youtube_metadata.processing_time_seconds, 0.1),
            quality_difference=whisper_metadata.quality_score - youtube_metadata.quality_score,
            confidence_difference=whisper_metadata.confidence_score - youtube_metadata.confidence_score,
            recommendation=recommendation,
            significant_differences=self._find_significant_differences(youtube_text, whisper_text),
            technical_terms_improved=self._find_technical_improvements(youtube_text, whisper_text)
        )

    def _calculate_punctuation_improvement(self, youtube_text: str, whisper_text: str) -> float:
        """Calculate improvement in punctuation between transcripts."""
        youtube_punct = sum(1 for c in youtube_text if c in '.,!?;:')
        whisper_punct = sum(1 for c in whisper_text if c in '.,!?;:')

        # Normalize by text length
        youtube_punct_ratio = youtube_punct / max(len(youtube_text), 1)
        whisper_punct_ratio = whisper_punct / max(len(whisper_text), 1)

        # Return improvement score (0-1 scale)
        improvement = whisper_punct_ratio - youtube_punct_ratio
        return max(0.0, min(1.0, improvement * 10))  # Scale to 0-1

    def _calculate_capitalization_improvement(self, youtube_text: str, whisper_text: str) -> float:
        """Calculate improvement in capitalization between transcripts."""
        youtube_capitals = sum(1 for c in youtube_text if c.isupper())
        whisper_capitals = sum(1 for c in whisper_text if c.isupper())

        # Normalize by text length
        youtube_cap_ratio = youtube_capitals / max(len(youtube_text), 1)
        whisper_cap_ratio = whisper_capitals / max(len(whisper_text), 1)

        # Return improvement score (0-1 scale)
        improvement = whisper_cap_ratio - youtube_cap_ratio
        return max(0.0, min(1.0, improvement * 5))  # Scale to 0-1

    def _generate_recommendation(
        self,
        youtube_metadata: TranscriptMetadata,
        whisper_metadata: TranscriptMetadata,
        similarity: float,
        punct_improvement: float,
        cap_improvement: float
    ) -> str:
        """Generate recommendation based on comparison metrics."""

        # If very similar and YouTube is much faster
        if similarity > 0.95 and whisper_metadata.processing_time_seconds > youtube_metadata.processing_time_seconds * 10:
            return "youtube"

        # If significant quality improvement with Whisper
        if (whisper_metadata.quality_score - youtube_metadata.quality_score) > 0.2:
            return "whisper"

        # If significant punctuation/capitalization improvement
        if punct_improvement > 0.3 or cap_improvement > 0.3:
            return "whisper"

        # If low confidence in YouTube captions
        if youtube_metadata.confidence_score < 0.6 and whisper_metadata.confidence_score > 0.7:
            return "whisper"

        # Default to YouTube for speed if quality is similar
        return "youtube"

    def _find_significant_differences(self, youtube_text: str, whisper_text: str) -> List[str]:
        """Find significant textual differences between transcripts."""
        differences = []

        # Simple difference detection (can be enhanced with difflib)
        youtube_words = set(youtube_text.lower().split())
        whisper_words = set(whisper_text.lower().split())

        unique_to_whisper = whisper_words - youtube_words
        unique_to_youtube = youtube_words - whisper_words

        if len(unique_to_whisper) > 5:
            differences.append(f"Whisper includes {len(unique_to_whisper)} additional unique words")

        if len(unique_to_youtube) > 5:
            differences.append(f"YouTube includes {len(unique_to_youtube)} words not in Whisper")

        return differences[:5]  # Limit to 5 most significant

    def _find_technical_improvements(self, youtube_text: str, whisper_text: str) -> List[str]:
        """Find technical terms that were improved in Whisper transcript."""
        improvements = []

        # Common technical terms that might be improved
        technical_patterns = [
            ("API", "a p i"),
            ("URL", "u r l"),
            ("HTTP", "h t t p"),
            ("JSON", "jason"),
            ("SQL", "sequel"),
            ("AI", "a i"),
            ("ML", "m l"),
            ("GPU", "g p u"),
            ("CPU", "c p u")
        ]

        for correct, incorrect in technical_patterns:
            if incorrect.lower() in youtube_text.lower() and correct.lower() in whisper_text.lower():
                improvements.append(f"'{incorrect}' → '{correct}'")

        return improvements[:3]  # Limit to 3 most significant

    def estimate_processing_time(
        self,
        video_duration_seconds: float,
        source: TranscriptSource
    ) -> Dict[str, float]:
        """
        Estimate processing time for different transcript sources.

        Args:
            video_duration_seconds: Duration of the video in seconds
            source: Which transcript source(s) to estimate for

        Returns:
            Dictionary with time estimates in seconds
        """
        estimates = {}

        if source in [TranscriptSource.YOUTUBE, TranscriptSource.BOTH]:
            # YouTube API is very fast - usually 1-3 seconds regardless of video length
            estimates["youtube"] = min(3.0, max(1.0, video_duration_seconds * 0.01))

        if source in [TranscriptSource.WHISPER, TranscriptSource.BOTH]:
            # Whisper processing time depends on model size and duration
            # Rough estimates: ~0.1-0.5x real-time depending on hardware
            base_ratio = 0.3  # Conservative estimate
            device_multiplier = 0.5 if self.whisper_service.device == "cuda" else 1.5
            estimates["whisper"] = video_duration_seconds * base_ratio * device_multiplier

        if source == TranscriptSource.BOTH:
            # Parallel processing, so max of both plus comparison overhead
            estimates["total"] = max(estimates.get("youtube", 0), estimates.get("whisper", 0)) + 2.0
        else:
            estimates["total"] = sum(estimates.values())

        return estimates

    def _convert_to_dual_segments(self, transcript_result) -> List[DualTranscriptSegment]:
        """Convert TranscriptResult to DualTranscriptSegment list."""
        if not transcript_result.segments:
            # If no segments, create segments from plain text
            if transcript_result.transcript:
                # Simple conversion - split text into segments (basic implementation)
                text_segments = transcript_result.transcript.split('. ')
                segments = []
                current_time = 0.0

                for i, text in enumerate(text_segments):
                    if text.strip():
                        # Estimate duration based on word count (rough estimate)
                        word_count = len(text.split())
                        duration = word_count * 0.5  # 0.5 seconds per word (rough)

                        segments.append(DualTranscriptSegment(
                            start_time=current_time,
                            end_time=current_time + duration,
                            text=text.strip() + ('.' if not text.endswith('.') else ''),
                            confidence=0.8  # Default confidence for YouTube
                        ))
                        current_time += duration + 0.5  # Small gap between segments

                return segments
            return []

        # Convert existing segments
        dual_segments = []
        for segment in transcript_result.segments:
            dual_segments.append(DualTranscriptSegment(
                start_time=segment.start,
                end_time=segment.start + segment.duration,
                text=segment.text,
                confidence=0.8  # Default confidence for YouTube captions
            ))

        return dual_segments

    def _convert_to_dual_metadata(self, transcript_result, video_id: str) -> DualTranscriptMetadata:
        """Convert TranscriptResult to DualTranscriptMetadata."""
        word_count = len(transcript_result.transcript.split()) if transcript_result.transcript else 0

        return DualTranscriptMetadata(
            video_id=video_id,
            language=transcript_result.metadata.language if transcript_result.metadata else "en",
            word_count=word_count,
            total_segments=len(transcript_result.segments) if transcript_result.segments else 0,
            has_timestamps=transcript_result.segments is not None and len(transcript_result.segments) > 0,
            extraction_method=transcript_result.method.value,
            processing_time_seconds=transcript_result.metadata.processing_time_seconds if transcript_result.metadata else 0.0,
            quality_score=0.75,  # Default quality score for YouTube captions
            confidence_score=0.8  # Default confidence for YouTube captions
        )

    async def cleanup(self):
        """Clean up resources used by transcript services."""
        await self.whisper_service.cleanup()