# youtube-summarizer/backend/services/enhanced_transcript_service.py

"""
Enhanced Transcript Service with local video file support.
Integrates with VideoDownloadService for local file-based transcription.
"""

import asyncio
import logging
from typing import Optional, Dict, Any
from pathlib import Path
import json

from backend.models.transcript import (
    TranscriptResult,
    TranscriptMetadata,
    TranscriptSegment,
    ExtractionMethod
)
from backend.core.exceptions import (
    TranscriptExtractionError,
    ErrorCode
)
from backend.services.transcript_service import TranscriptService
from backend.services.video_download_service import VideoDownloadService, VideoDownloadError
from backend.services.mock_cache import MockCacheClient

logger = logging.getLogger(__name__)


class MockWhisperService:
    """Mock Whisper service for local audio transcription.

    Stands in for a real speech-to-text backend: it waits for a configurable
    amount of time and then returns a fixed, canned transcription payload.

    Attributes:
        model_name: Name of the (pretend) Whisper model.
        language: Language code reported in every transcription result.
        simulated_delay: Seconds to sleep to imitate processing time.
    """

    def __init__(
        self,
        model_name: str = "base",
        language: str = "en",
        simulated_delay: float = 1.0
    ):
        """
        Initialize the mock service.

        Args:
            model_name: Model name to expose on the instance
            language: Language code to report in transcription results
            simulated_delay: Seconds to wait per transcription; pass 0 to
                skip the artificial wait (useful in tests)
        """
        self.model_name = model_name
        self.language = language
        self.simulated_delay = simulated_delay

    async def transcribe_audio(self, audio_path: Path) -> Dict[str, Any]:
        """
        Mock transcription of audio file.
        In production, this would use OpenAI Whisper or similar.

        The file is never opened; only its name is embedded in the text.

        Args:
            audio_path: Path to audio file

        Returns:
            Transcription result with "text", "segments" (each holding
            "text", "start" and "end"), "language" and "duration" keys
        """
        await asyncio.sleep(self.simulated_delay)  # Simulate processing time

        return {
            "text": f"""[Transcribed from local audio: {audio_path.name}]
This is a high-quality transcription from the downloaded video.
Local transcription provides better accuracy than online methods.

The video discusses important topics including:
- Advanced machine learning techniques
- Modern software architecture patterns
- Best practices for scalable applications
- Performance optimization strategies

Using local files ensures we can process videos even if they're removed from YouTube,
and we get consistent quality across all transcriptions.

This mock transcript demonstrates the enhanced capabilities of local processing,
which would include proper timestamps and speaker detection in production.""",

            "segments": [
                {
                    "text": "This is a high-quality transcription from the downloaded video.",
                    "start": 0.0,
                    "end": 4.0
                },
                {
                    "text": "Local transcription provides better accuracy than online methods.",
                    "start": 4.0,
                    "end": 8.0
                },
                {
                    "text": "The video discusses important topics including advanced machine learning techniques.",
                    "start": 8.0,
                    "end": 13.0
                }
            ],
            "language": self.language,
            "duration": 120.0  # Mock duration
        }


class EnhancedTranscriptService(TranscriptService):
    """
    Enhanced transcript service that prioritizes local video files.

    Lookup order used by ``extract_transcript``:
    1. Transcript cache
    2. Locally downloaded audio, transcribed with Whisper
    3. Forced download + Whisper (only when ``force_download`` is set)
    4. The parent class extraction (YouTube API methods)
    5. Download + Whisper as a last resort
    """

    # Lifetime of cached transcripts, in seconds (24 hours).
    _TRANSCRIPT_CACHE_TTL_SECONDS = 86400

    def __init__(
        self,
        video_service: Optional[VideoDownloadService] = None,
        cache_client: Optional[MockCacheClient] = None,
        whisper_service: Optional[MockWhisperService] = None
    ):
        """
        Initialize enhanced transcript service.

        Args:
            video_service: Video download service for local files
            cache_client: Cache client for transcript caching
            whisper_service: Whisper service for local transcription
        """
        super().__init__(cache_client=cache_client)
        self.video_service = video_service or VideoDownloadService()
        self.whisper_service = whisper_service or MockWhisperService()

        # Update success rates to prefer local files
        self._method_success_rates = {
            "local_file": 0.95,      # 95% success with local files
            "youtube_api": 0.7,      # 70% success with YouTube API
            "auto_captions": 0.5,    # 50% success with auto-captions
            "whisper_download": 0.9  # 90% success with download + Whisper
        }

    def _extract_video_id_from_url(self, url: str) -> str:
        """
        Extract video ID from YouTube URL.

        Handles ``watch?v=`` links regardless of where ``v`` sits in the
        query string, ``youtu.be/<id>`` short links, and
        ``/embed/``, ``/shorts/``, ``/live/`` and ``/v/`` paths. URLs may be
        given without a scheme.

        Args:
            url: YouTube URL, or a bare video ID

        Returns:
            The video ID, or the input unchanged when no ID can be found
            (the input is then assumed to already be a video ID)
        """
        # Function-scope import keeps this method self-contained.
        from urllib.parse import parse_qs, urlparse

        if "youtube.com" not in url and "youtu.be" not in url:
            # Assume it's already a video ID
            return url

        try:
            # urlparse only fills in the host when a scheme is present.
            parsed = urlparse(url if "://" in url else f"https://{url}")
            host = (parsed.hostname or "").lower()
        except ValueError:
            return url

        if host == "youtu.be" or host.endswith(".youtu.be"):
            short_id = parsed.path.lstrip("/").split("/")[0]
            if short_id:
                return short_id
        elif host == "youtube.com" or host.endswith(".youtube.com"):
            query_ids = parse_qs(parsed.query).get("v")
            if query_ids:
                return query_ids[0]
            path_parts = [part for part in parsed.path.split("/") if part]
            if len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
                return path_parts[1]

        return url

    async def _remember_transcript(self, cache_key: str, result: TranscriptResult) -> None:
        """Store ``result`` as JSON in the transcript cache under ``cache_key``."""
        await self.cache_client.set(
            cache_key,
            result.model_dump_json(),
            ttl=self._TRANSCRIPT_CACHE_TTL_SECONDS
        )

    @staticmethod
    def _result_from_whisper_output(
        video_id: str,
        transcription: Dict[str, Any],
        processing_time: float
    ) -> TranscriptResult:
        """
        Convert a Whisper transcription payload into a TranscriptResult.

        Args:
            video_id: YouTube video ID the transcription belongs to
            transcription: Payload with a "text" key and optional
                "segments", "language" and "duration" keys
            processing_time: Processing time to report on the result

        Returns:
            TranscriptResult marked as produced by Whisper audio transcription

        Raises:
            KeyError: If the payload has no "text", or a segment lacks
                "text", "start" or "end"
        """
        segments = [
            TranscriptSegment(
                text=seg["text"],
                start=seg["start"],
                # Whisper reports end times; the model stores durations.
                duration=seg["end"] - seg["start"]
            )
            for seg in transcription.get("segments", [])
        ]
        language = transcription.get("language", "en")

        metadata = TranscriptMetadata(
            language=language,
            duration=transcription.get("duration", 0),
            word_count=len(transcription["text"].split()),
            has_timestamps=bool(segments)
        )

        return TranscriptResult(
            video_id=video_id,
            transcript=transcription["text"],
            segments=segments,
            metadata=metadata,
            method=ExtractionMethod.WHISPER_AUDIO,
            language=language,
            success=True,
            from_cache=False,
            processing_time=processing_time
        )

    async def extract_transcript(
        self,
        video_id_or_url: str,
        language_preference: str = "en",
        force_download: bool = False
    ) -> TranscriptResult:
        """
        Extract transcript with local file priority.

        Args:
            video_id_or_url: YouTube video ID or URL
            language_preference: Preferred language for transcript
            force_download: Try download + Whisper before the online
                methods (online methods are still used if that fails)

        Returns:
            TranscriptResult with transcript and metadata

        Raises:
            TranscriptExtractionError: If every extraction method fails
        """
        # Determine if input is URL or video ID
        if "youtube.com" in video_id_or_url or "youtu.be" in video_id_or_url:
            url = video_id_or_url
            video_id = self._extract_video_id_from_url(url)
        else:
            video_id = video_id_or_url
            url = f"https://www.youtube.com/watch?v={video_id}"

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info(f"Transcript cache hit for {video_id}")
            cached_payload = json.loads(cached_result)
            if isinstance(cached_payload, dict):
                # The payload was stored with from_cache=False; mark the hit.
                cached_payload["from_cache"] = True
            return TranscriptResult.model_validate(cached_payload)

        # Try local file first if available
        if self.video_service.is_video_downloaded(video_id):
            logger.info(f"Using local files for transcript extraction: {video_id}")
            local_result = await self._extract_from_local_video(video_id)
            if local_result:
                await self._remember_transcript(cache_key, local_result)
                return local_result

        # If force_download, download the video first
        download_attempted = False
        if force_download:
            logger.info(f"Force downloading video for transcription: {video_id}")
            download_attempted = True
            download_result = await self._download_and_transcribe(url, video_id)
            if download_result:
                await self._remember_transcript(cache_key, download_result)
                return download_result

        # Try YouTube API methods (from parent class)
        try:
            logger.info(f"Attempting YouTube API transcript extraction for {video_id}")
            api_result = await super().extract_transcript(video_id, language_preference)
        except TranscriptExtractionError as e:
            logger.warning(f"YouTube API methods failed: {e}")

            # As last resort, download video and transcribe. Skipped when the
            # forced download above already failed for this call.
            if not download_attempted:
                logger.info(f"Falling back to download and transcribe for {video_id}")
                download_result = await self._download_and_transcribe(url, video_id)
                if download_result:
                    await self._remember_transcript(cache_key, download_result)
                    return download_result

            # If all methods fail, raise error
            raise TranscriptExtractionError(
                message="Unable to extract transcript through any method",
                error_code=ErrorCode.TRANSCRIPT_UNAVAILABLE,
                details={
                    "video_id": video_id,
                    "attempted_methods": [
                        "local_file", "youtube_api", "auto_captions", "download_and_transcribe"
                    ],
                    "suggestions": [
                        "Check if video is available and public",
                        "Try again later",
                        "Enable captions on the video"
                    ]
                }
            ) from e

        # Cache the result
        await self._remember_transcript(cache_key, api_result)
        return api_result

    async def _extract_from_local_video(self, video_id: str) -> Optional[TranscriptResult]:
        """
        Extract transcript from locally stored video/audio files.

        Only an existing audio file is transcribed. A video file without
        extracted audio is logged and treated as "nothing available".

        Args:
            video_id: YouTube video ID

        Returns:
            TranscriptResult or None if extraction fails
        """
        try:
            # Get cached video info
            video_hash = self.video_service._get_video_hash(video_id)
            cached_info = self.video_service.cache.get(video_hash)

            if not cached_info:
                logger.warning(f"No cache info for downloaded video {video_id}")
                return None

            # Check for audio file
            audio_path = cached_info.get('audio_path')
            audio_file = Path(audio_path) if audio_path else None
            if audio_file is not None and audio_file.exists():
                logger.info(f"Transcribing from local audio: {audio_file}")

                # Transcribe using Whisper
                transcription = await self.whisper_service.transcribe_audio(audio_file)
                return self._result_from_whisper_output(
                    video_id,
                    transcription,
                    processing_time=1.0  # Mock processing time
                )

            # If no audio file, check for video file
            video_path = cached_info.get('video_path')
            if video_path:
                video_file = Path(video_path)
                if video_file.exists():
                    # Could extract audio here if needed
                    logger.info(f"Video found but no audio extracted yet: {video_file}")

            return None

        except Exception as e:
            # Best-effort: any failure means "no local transcript"; the
            # traceback is logged so the cause is not lost.
            logger.exception(f"Error extracting from local video {video_id}: {e}")
            return None

    async def _download_and_transcribe(self, url: str, video_id: str) -> Optional[TranscriptResult]:
        """
        Download video and transcribe the audio.

        Args:
            url: YouTube URL
            video_id: Video ID

        Returns:
            TranscriptResult or None if fails
        """
        try:
            logger.info(f"Downloading video for transcription: {video_id}")

            # Download video with audio extraction
            video_path, audio_path = await self.video_service.download_video(
                url=url,
                extract_audio=True,
                force=False
            )

            if not (audio_path and audio_path.exists()):
                logger.warning(f"Download succeeded but no audio extracted for {video_id}")
                return None

            logger.info(f"Audio extracted, transcribing: {audio_path}")

            # Transcribe using Whisper
            transcription = await self.whisper_service.transcribe_audio(audio_path)
            return self._result_from_whisper_output(
                video_id,
                transcription,
                processing_time=2.0  # Mock processing time
            )

        except VideoDownloadError as e:
            logger.error(f"Failed to download video {video_id}: {e}")
            return None
        except Exception as e:
            # Best-effort: unexpected failures are logged with a traceback
            # and reported to the caller as "no result".
            logger.exception(f"Error in download and transcribe for {video_id}: {e}")
            return None

    async def get_transcript_with_priority(
        self,
        video_id: str,
        prefer_local: bool = True,
        download_if_missing: bool = False
    ) -> TranscriptResult:
        """
        Get transcript with configurable priority.

        Unlike ``extract_transcript``, this method neither reads nor writes
        the transcript cache itself.

        Args:
            video_id: YouTube video ID
            prefer_local: Prefer local files over API
            download_if_missing: Download video if not available locally

        Returns:
            TranscriptResult

        Raises:
            TranscriptExtractionError: If the API methods fail and the
                download fallback is disabled or also fails
        """
        if prefer_local and self.video_service.is_video_downloaded(video_id):
            # Try local first
            local_result = await self._extract_from_local_video(video_id)
            if local_result:
                return local_result

        # Try API methods
        try:
            return await super().extract_transcript(video_id)
        except TranscriptExtractionError:
            if download_if_missing:
                # Download and transcribe
                url = f"https://www.youtube.com/watch?v={video_id}"
                download_result = await self._download_and_transcribe(url, video_id)
                if download_result:
                    return download_result
            raise

    def get_extraction_stats(self) -> Dict[str, Any]:
        """
        Get statistics about extraction methods and success rates.

        Returns:
            Dict with "method_success_rates" (a copy, safe to mutate),
            "cached_videos", "total_storage_mb" and "preferred_method"
        """
        video_cache = self.video_service.cache
        return {
            "method_success_rates": dict(self._method_success_rates),
            "cached_videos": len(video_cache),
            "total_storage_mb": self.video_service.get_storage_stats()['total_size_mb'],
            "preferred_method": "local_file" if video_cache else "youtube_api"
        }