# youtube-summarizer/backend/services/transcript_service.py

import asyncio
import time
import logging
from typing import Optional, List, Dict, Any, TYPE_CHECKING
import json
import random
from datetime import datetime
if TYPE_CHECKING:
    from backend.core.websocket_manager import WebSocketManager

from backend.models.transcript import (
    TranscriptResult,
    TranscriptMetadata,
    TranscriptSegment,
    ExtractionMethod
)
from backend.core.exceptions import (
    TranscriptExtractionError,
    ErrorCode
)
from backend.services.mock_cache import MockCacheClient
from backend.services.intelligent_video_downloader import IntelligentVideoDownloader
from backend.models.video_download import DownloadPreferences, VideoQuality, DownloadStatus

logger = logging.getLogger(__name__)

class MockWhisperClient:
    """Mock Whisper client for audio transcription simulation"""

    async def transcribe(self, video_id: str) -> str:
        await asyncio.sleep(0.5)  # Simulate processing time
        return f"[Whisper transcription] This is a mock audio transcription for video {video_id}."


class TranscriptNotAvailableError(Exception):
    """Raised when transcript is not available through YouTube API"""
    pass


class CaptionsNotAvailableError(Exception):
    """Raised when auto-captions are not available"""
    pass


class AudioTranscriptionError(Exception):
    """Raised when audio transcription fails"""
    pass

class TranscriptService:
    """Service for extracting video transcripts with fallback methods"""

    # Mock transcript data for demonstration
    MOCK_TRANSCRIPTS = {
        "dQw4w9WgXcQ": {
            "text": """Welcome to this comprehensive tutorial on modern web development.
            Today we'll be exploring the fundamentals of building scalable applications.
            First, let's discuss the importance of choosing the right architecture.
            When building web applications, you need to consider factors like performance,
            maintainability, and user experience.
            The key components we'll cover include:
            - Frontend frameworks and their ecosystems
            - Backend API design patterns
            - Database optimization strategies
            - Deployment and DevOps best practices
            Throughout this video, we'll build a real-world application step by step,
            explaining each decision and trade-off along the way.
            By the end of this tutorial, you'll have a solid understanding of modern
            web development practices and be ready to build your own production-ready applications.""",
            "segments": [
                {"text": "Welcome to this comprehensive tutorial on modern web development.", "start": 0.0, "duration": 3.5},
                {"text": "Today we'll be exploring the fundamentals of building scalable applications.", "start": 3.5, "duration": 4.0},
                {"text": "First, let's discuss the importance of choosing the right architecture.", "start": 7.5, "duration": 3.8},
            ]
        },
        "test123": {
            "text": """This is a test video transcript for demonstration purposes.
            It contains sample content that can be used for testing the summarization system.
            The transcript includes multiple paragraphs and various topics to ensure
            the system can handle different types of content effectively.""",
            "segments": []
        }
    }

    def __init__(self, cache_client: Optional[MockCacheClient] = None,
                 whisper_client: Optional[MockWhisperClient] = None,
                 websocket_manager: Optional['WebSocketManager'] = None):
        self.cache_client = cache_client or MockCacheClient()
        self.whisper_client = whisper_client or MockWhisperClient()
        self.websocket_manager = websocket_manager
        self._method_success_rates = {
            "youtube_api": 0.7,    # 70% success rate for primary method
            "auto_captions": 0.5,  # 50% success rate for auto-captions
            "whisper_audio": 0.9   # 90% success rate for Whisper
        }

        # Check if we should use the real YouTube API based on environment settings
        from backend.core.config import settings
        self._use_real_youtube_api = not settings.USE_MOCK_SERVICES and settings.ENABLE_REAL_TRANSCRIPT_EXTRACTION
        self._using_real_whisper = whisper_client is not None and not isinstance(whisper_client, MockWhisperClient)

        # Initialize intelligent video downloader for additional fallback methods
        self.video_downloader = None

        # Store segments temporarily for passing to _create_result
        self._last_whisper_segments = None
        self._last_transcript_segments = None

        if self._use_real_youtube_api:
            try:
                self.video_downloader = IntelligentVideoDownloader(websocket_manager=websocket_manager)
                logger.info("Initialized IntelligentVideoDownloader with multiple fallback methods and WebSocket support")
            except Exception as e:
                logger.warning(f"Could not initialize IntelligentVideoDownloader: {e}")

        logger.info(f"TranscriptService initialized: use_real_youtube_api={self._use_real_youtube_api}, using_real_whisper={self._using_real_whisper}")

    async def extract_transcript(self, video_id: str,
                                 language_preference: str = "en") -> TranscriptResult:
        """
        Extract transcript using fallback chain with caching.

        Args:
            video_id: YouTube video ID
            language_preference: Preferred language code

        Returns:
            TranscriptResult with transcript data or error
        """
        start_time = time.time()

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info(f"Cache hit for video {video_id}")
            # The cached result may be a JSON string, so parse it if needed
            result_data = json.loads(cached_result) if isinstance(cached_result, str) else cached_result
            # Create TranscriptResult from cached data with the from_cache flag set
            return TranscriptResult(**result_data, from_cache=True)

        # Primary method: YouTube Transcript API (mock or real)
        try:
            transcript = await self._extract_youtube_transcript(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.YOUTUBE_API,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except TranscriptNotAvailableError:
            logger.info(f"YouTube API transcript not available for {video_id}")

        # Fallback 1: Auto-generated captions
        try:
            transcript = await self._extract_auto_captions(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.AUTO_CAPTIONS,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except CaptionsNotAvailableError:
            logger.info(f"Auto-captions not available for {video_id}")

        # Fallback 2: Audio transcription with Whisper
        try:
            transcript = await self._transcribe_audio(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.WHISPER_AUDIO,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except AudioTranscriptionError as e:
            logger.info(f"Whisper transcription failed for {video_id}, trying advanced fallback methods")
            last_error = str(e)

            # Fallbacks 3-8: IntelligentVideoDownloader with multiple methods
            # (pytubefix, yt-dlp, playwright, external tools, web services, transcript-only)
            if self.video_downloader:
                try:
                    transcript = await self._extract_with_video_downloader(video_id, language_preference)
                    result = await self._create_result(
                        video_id, transcript, ExtractionMethod.WHISPER_AUDIO,  # Mark as audio since it most likely came from audio
                        language_preference, start_time
                    )
                    await self._cache_result(cache_key, result)
                    return result
                except Exception as downloader_error:
                    logger.error(f"Advanced fallback methods failed for {video_id}: {downloader_error}")
                    last_error = str(downloader_error)

            logger.error(f"All transcript extraction methods failed for {video_id}")
            return TranscriptResult(
                video_id=video_id,
                transcript=None,
                method=ExtractionMethod.FAILED,
                success=False,
                error={
                    "code": ErrorCode.TRANSCRIPT_NOT_AVAILABLE,
                    "message": "Unable to extract transcript from video",
                    "details": {
                        "video_id": video_id,
                        "attempted_methods": ["youtube_api", "auto_captions", "whisper_audio"],
                        "last_error": last_error,
                        "suggestions": [
                            "Try a different video with captions available",
                            "Check if the video is public and accessible",
                            "Contact support if this video should have transcripts"
                        ]
                    }
                }
            )

    async def _extract_youtube_transcript(self, video_id: str,
                                          language: str) -> str:
        """YouTube Transcript API extraction (mock or real)"""
        # Use the real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                loop = asyncio.get_event_loop()

                def _fetch_transcript():
                    # Try multiple language preferences
                    languages = [language, 'en', 'en-US', 'en-GB']
                    for lang in languages:
                        try:
                            # Use the static method get_transcript directly
                            transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
                            # Store the raw transcript data for segments
                            self._last_transcript_segments = transcript_list
                            # Convert list of transcript entries to text
                            full_text = ' '.join([entry['text'] for entry in transcript_list])
                            return full_text
                        except Exception as e:
                            logger.debug(f"Failed to fetch transcript for language {lang}: {e}")
                            continue
                    raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

                transcript_text = await loop.run_in_executor(None, _fetch_transcript)
                return transcript_text
            except Exception as e:
                logger.error(f"Real YouTube transcript extraction failed: {e}")
                raise TranscriptNotAvailableError(f"Failed to extract transcript: {e}")

        # Mock implementation
        await asyncio.sleep(0.3)  # Simulate API call

        # Simulate success/failure based on probability
        if random.random() > self._method_success_rates["youtube_api"]:
            raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

        # Return mock transcript if available
        if video_id in self.MOCK_TRANSCRIPTS:
            return self.MOCK_TRANSCRIPTS[video_id]["text"]

        # Generate generic mock transcript
        return f"""This is a mock transcript extracted via YouTube API for video {video_id}.
        The content discusses various topics related to technology and innovation.
        This demonstration text shows how the transcript extraction service works."""

    async def _extract_auto_captions(self, video_id: str,
                                     language: str) -> str:
        """Auto-generated captions extraction"""
        # Use the real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                loop = asyncio.get_event_loop()

                def _fetch_auto_captions():
                    # Try to get auto-generated captions
                    try:
                        # List available transcripts for the video
                        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

                        # Try to find auto-generated captions for the requested language
                        for transcript in transcript_list:
                            if transcript.is_generated and transcript.language_code == language:
                                caption_list = transcript.fetch()
                                full_text = ' '.join([entry['text'] for entry in caption_list])
                                return f"[Auto-generated] {full_text}"

                        # Try English auto-generated captions as fallback
                        for transcript in transcript_list:
                            if transcript.is_generated and transcript.language_code in ['en', 'en-US']:
                                caption_list = transcript.fetch()
                                full_text = ' '.join([entry['text'] for entry in caption_list])
                                return f"[Auto-generated] {full_text}"

                        raise CaptionsNotAvailableError(f"No auto-generated captions available for {video_id}")
                    except Exception as e:
                        raise CaptionsNotAvailableError(f"Failed to fetch auto-captions: {e}")

                caption_text = await loop.run_in_executor(None, _fetch_auto_captions)
                return caption_text
            except Exception as e:
                logger.error(f"Real auto-caption extraction failed: {e}")
                raise CaptionsNotAvailableError(f"Failed to extract auto-captions: {e}")

        # Mock implementation fallback
        await asyncio.sleep(0.4)  # Simulate API call

        if random.random() > self._method_success_rates["auto_captions"]:
            raise CaptionsNotAvailableError(f"No auto-captions for {video_id}")

        return f"""[Auto-generated] This is a mock auto-caption transcript for video {video_id}.
        Auto-generated captions may contain errors but provide useful content.
        The transcript has been processed and cleaned for better readability."""

    async def _transcribe_audio(self, video_id: str,
                                language: str) -> str:
        """Audio transcription using Whisper (mock or real)"""
        # Use the real implementation if available
        if self._using_real_whisper and self.whisper_client and not isinstance(self.whisper_client, MockWhisperClient):
            try:
                # Use the real Whisper service
                logger.info(f"Using real Whisper service for video {video_id}")
                video_url = f"https://www.youtube.com/watch?v={video_id}"

                segments, metadata = await self.whisper_client.transcribe_video(
                    video_id, video_url
                )

                # Convert DualTranscriptSegment to TranscriptSegment for compatibility
                from backend.models.transcript import TranscriptSegment
                converted_segments = []
                for segment in segments:
                    converted_segments.append(TranscriptSegment(
                        text=segment.text,
                        start=segment.start_time,
                        duration=segment.end_time - segment.start_time
                    ))

                # Store converted segments for use in _create_result
                self._last_whisper_segments = converted_segments

                # Convert segments to text
                transcript_text = ' '.join([segment.text for segment in segments])
                logger.info(f"Successfully transcribed audio for {video_id} - {metadata.word_count} words")
                return transcript_text
            except Exception as e:
                logger.error(f"Real audio transcription failed: {e}")
                raise AudioTranscriptionError(f"Failed to transcribe audio: {e}")

        # Mock implementation
        await asyncio.sleep(0.8)  # Simulate longer processing time

        if random.random() > self._method_success_rates["whisper_audio"]:
            raise AudioTranscriptionError(f"Failed to transcribe audio for {video_id}")

        return await self.whisper_client.transcribe(video_id)

    async def _extract_with_video_downloader(self, video_id: str, language: str) -> str:
        """Use IntelligentVideoDownloader with multiple fallback methods"""
        if not self.video_downloader:
            raise Exception("Video downloader not available")

        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # Configure preferences for transcript extraction
        preferences = DownloadPreferences(
            quality=VideoQuality.AUDIO_ONLY,  # We only need audio for transcription
            prefer_audio_only=True,
            fallback_to_transcript=True
        )

        logger.info(f"Attempting advanced download methods for {video_id}")

        # The IntelligentVideoDownloader will try:
        # 1. pytubefix
        # 2. yt-dlp
        # 3. playwright (browser automation)
        # 4. external tools
        # 5. web services
        # 6. transcript-only fallback
        result = await self.video_downloader.download_video(video_url, preferences)

        if result.status in [DownloadStatus.COMPLETED, DownloadStatus.PARTIAL]:
            # If we got audio, transcribe it
            if result.audio_file and result.audio_file.exists():
                # Use Whisper to transcribe the downloaded audio
                if self._using_real_whisper and self.whisper_client:
                    segments, metadata = await self.whisper_client.transcribe_video(
                        video_id, video_url
                    )
                    return ' '.join([segment.text for segment in segments])
                else:
                    # Fall back to basic extraction
                    return f"[Advanced Download] Successfully downloaded audio for {video_id} using {result.method_used}"

            # If we only got transcript data
            if result.transcript:
                return result.transcript

            raise Exception("Download completed but no transcript available")
        else:
            raise Exception(f"All advanced download methods failed: {result.error_message}")

    async def _create_result(self, video_id: str, transcript: str,
                             method: ExtractionMethod, language: str,
                             start_time: float) -> TranscriptResult:
        """Create TranscriptResult with metadata"""
        processing_time = time.time() - start_time
        word_count = len(transcript.split())

        metadata = TranscriptMetadata(
            word_count=word_count,
            estimated_reading_time=int(word_count / 200 * 60),  # 200 WPM reading speed
            language=language,
            has_timestamps=method == ExtractionMethod.YOUTUBE_API,
            extraction_method=method,
            processing_time_seconds=processing_time
        )

        # Get segments if available
        segments = None
        # Check for real Whisper segments first
        if self._last_whisper_segments and method == ExtractionMethod.WHISPER_AUDIO:
            segments = self._last_whisper_segments
            self._last_whisper_segments = None  # Clear after use
        # Fall back to mock data segments
        elif video_id in self.MOCK_TRANSCRIPTS and self.MOCK_TRANSCRIPTS[video_id].get("segments"):
            segments = [TranscriptSegment(**seg) for seg in self.MOCK_TRANSCRIPTS[video_id]["segments"]]

        return TranscriptResult(
            video_id=video_id,
            transcript=transcript,
            segments=segments,
            metadata=metadata,
            method=method,
            success=True,
            from_cache=False
        )

    async def _cache_result(self, cache_key: str, result: TranscriptResult):
        """Cache the transcript result"""
        try:
            # Convert to dict for caching
            cache_data = result.model_dump(exclude={'from_cache'})
            await self.cache_client.set(cache_key, cache_data, ttl=86400)  # 24 hours
            logger.info(f"Cached transcript for key {cache_key}")
        except Exception as e:
            logger.error(f"Failed to cache transcript: {e}")

    def extract_metadata(self, transcript: str) -> Dict[str, Any]:
        """Extract metadata from transcript text"""
        word_count = len(transcript.split())
        char_count = len(transcript)
        line_count = len(transcript.split('\n'))

        return {
            "word_count": word_count,
            "character_count": char_count,
            "line_count": line_count,
            "estimated_reading_time_seconds": int(word_count / 200 * 60),
            "average_words_per_line": word_count / max(line_count, 1)
        }
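

# ---------------------------------------------------------------------------
# Illustrative usage (not part of the original service): a minimal sketch of
# exercising the fallback chain end to end, assuming the default configuration
# (mock cache, mock Whisper, USE_MOCK_SERVICES enabled). "dQw4w9WgXcQ" is one
# of the bundled MOCK_TRANSCRIPTS keys; other IDs fall through to generic mock
# text or a simulated failure result.
if __name__ == "__main__":
    async def _demo() -> None:
        service = TranscriptService()  # mock cache + mock Whisper by default
        result = await service.extract_transcript("dQw4w9WgXcQ", language_preference="en")
        print(f"success={result.success} method={result.method} from_cache={result.from_cache}")
        if result.transcript:
            # extract_metadata works on any plain transcript string
            print(service.extract_metadata(result.transcript))

    asyncio.run(_demo())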