import asyncio
import time
import logging
from typing import Optional, List, Dict, Any, TYPE_CHECKING
import json
import random
from datetime import datetime

if TYPE_CHECKING:
    from backend.core.websocket_manager import WebSocketManager

from backend.models.transcript import (
    TranscriptResult,
    TranscriptMetadata,
    TranscriptSegment,
    ExtractionMethod
)
from backend.core.exceptions import (
    TranscriptExtractionError,
    ErrorCode
)
from backend.services.mock_cache import MockCacheClient
from backend.services.intelligent_video_downloader import IntelligentVideoDownloader
from backend.models.video_download import DownloadPreferences, VideoQuality, DownloadStatus

logger = logging.getLogger(__name__)


class MockWhisperClient:
    """Mock Whisper client for audio transcription simulation."""

    async def transcribe(self, video_id: str) -> str:
        """Return a canned transcription string for ``video_id`` after a short delay."""
        await asyncio.sleep(0.5)  # Simulate processing time
        return f"[Whisper transcription] This is a mock audio transcription for video {video_id}."


class TranscriptNotAvailableError(Exception):
    """Raised when transcript is not available through YouTube API."""


class CaptionsNotAvailableError(Exception):
    """Raised when auto-captions are not available."""


class AudioTranscriptionError(Exception):
    """Raised when audio transcription fails."""


class TranscriptService:
    """Service for extracting video transcripts with fallback methods.

    ``extract_transcript`` tries, in order: the YouTube transcript API,
    auto-generated captions, Whisper audio transcription and, when one was
    initialized, the ``IntelligentVideoDownloader``. Successful results are
    cached through ``cache_client``.
    """

    # Mock transcript data for demonstration
    MOCK_TRANSCRIPTS = {
        "dQw4w9WgXcQ": {
            "text": (
                "Welcome to this comprehensive tutorial on modern web development. "
                "Today we'll be exploring the fundamentals of building scalable applications. "
                "First, let's discuss the importance of choosing the right architecture. "
                "When building web applications, you need to consider factors like performance, "
                "maintainability, and user experience. "
                "The key components we'll cover include: "
                "- Frontend frameworks and their ecosystems "
                "- Backend API design patterns "
                "- Database optimization strategies "
                "- Deployment and DevOps best practices "
                "Throughout this video, we'll build a real-world application step by step, "
                "explaining each decision and trade-off along the way. "
                "By the end of this tutorial, you'll have a solid understanding of modern web "
                "development practices and be ready to build your own production-ready applications."
            ),
            "segments": [
                {"text": "Welcome to this comprehensive tutorial on modern web development.", "start": 0.0, "duration": 3.5},
                {"text": "Today we'll be exploring the fundamentals of building scalable applications.", "start": 3.5, "duration": 4.0},
                {"text": "First, let's discuss the importance of choosing the right architecture.", "start": 7.5, "duration": 3.8},
            ]
        },
        "test123": {
            "text": (
                "This is a test video transcript for demonstration purposes. "
                "It contains sample content that can be used for testing the summarization system. "
                "The transcript includes multiple paragraphs and various topics to ensure the "
                "system can handle different types of content effectively."
            ),
            "segments": []
        }
    }

    def __init__(self, cache_client: Optional[MockCacheClient] = None,
                 whisper_client: Optional[MockWhisperClient] = None,
                 websocket_manager: Optional['WebSocketManager'] = None):
        """Set up clients, mock success rates and the optional video downloader.

        Args:
            cache_client: Cache used for transcript results; a
                ``MockCacheClient`` is created when omitted.
            whisper_client: Audio transcription client; a
                ``MockWhisperClient`` is created when omitted.
            websocket_manager: Stored on the instance and forwarded to
                ``IntelligentVideoDownloader`` when that is created.
        """
        self.cache_client = cache_client or MockCacheClient()
        self.whisper_client = whisper_client or MockWhisperClient()
        self.websocket_manager = websocket_manager
        self._method_success_rates = {
            "youtube_api": 0.7,    # 70% success rate for primary method
            "auto_captions": 0.5,  # 50% success rate for auto-captions
            "whisper_audio": 0.9   # 90% success rate for Whisper
        }

        # Check if we should use real YouTube API based on environment settings
        from backend.core.config import settings
        self._use_real_youtube_api = not settings.USE_MOCK_SERVICES and settings.ENABLE_REAL_TRANSCRIPT_EXTRACTION
        self._using_real_whisper = whisper_client is not None and not isinstance(whisper_client, MockWhisperClient)

        # Initialize intelligent video downloader for additional fallback methods
        self.video_downloader = None

        # Store segments temporarily for passing to _create_result
        self._last_whisper_segments = None
        self._last_transcript_segments = None

        if self._use_real_youtube_api:
            try:
                self.video_downloader = IntelligentVideoDownloader(websocket_manager=websocket_manager)
                logger.info("Initialized IntelligentVideoDownloader with multiple fallback methods and WebSocket support")
            except Exception as e:
                # Best effort: the service still works without the extra fallbacks.
                logger.warning(f"Could not initialize IntelligentVideoDownloader: {e}")

        logger.info(f"TranscriptService initialized: use_real_youtube_api={self._use_real_youtube_api}, using_real_whisper={self._using_real_whisper}")

    async def extract_transcript(self, video_id: str, language_preference: str = "en") -> TranscriptResult:
        """Extract transcript using fallback chain with caching.

        Args:
            video_id: YouTube video ID
            language_preference: Preferred language code

        Returns:
            TranscriptResult with transcript data, or a result with
            ``success=False`` and an ``error`` payload when every method fails.
        """
        start_time = time.time()

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info(f"Cache hit for video {video_id}")
            # The cached value may be a JSON string or an already-parsed dict
            result_data = json.loads(cached_result) if isinstance(cached_result, str) else cached_result
            # Merge the flag into the dict so a cached "from_cache" key cannot
            # collide with the keyword argument.
            return TranscriptResult(**{**result_data, "from_cache": True})

        # Try primary method: YouTube Transcript API
        try:
            transcript = await self._extract_youtube_transcript(video_id, language_preference)
            return await self._finalize_result(
                cache_key, video_id, transcript,
                ExtractionMethod.YOUTUBE_API, language_preference, start_time
            )
        except TranscriptNotAvailableError:
            logger.info(f"YouTube API transcript not available for {video_id}")

        # Fallback 1: Auto-generated captions
        try:
            transcript = await self._extract_auto_captions(video_id, language_preference)
            return await self._finalize_result(
                cache_key, video_id, transcript,
                ExtractionMethod.AUTO_CAPTIONS, language_preference, start_time
            )
        except CaptionsNotAvailableError:
            logger.info(f"Auto-captions not available for {video_id}")

        attempted_methods = ["youtube_api", "auto_captions", "whisper_audio"]
        # The message is stored as a string because the name bound by
        # ``except ... as`` is unbound again when its handler exits.
        last_error: Optional[str] = None

        # Fallback 2: Audio transcription with Whisper
        try:
            transcript = await self._transcribe_audio(video_id, language_preference)
            return await self._finalize_result(
                cache_key, video_id, transcript,
                ExtractionMethod.WHISPER_AUDIO, language_preference, start_time
            )
        except AudioTranscriptionError as whisper_error:
            last_error = str(whisper_error)
            logger.info(f"Whisper transcription failed for {video_id}, trying advanced fallback methods")

        # Fallback 3-8: Use IntelligentVideoDownloader with multiple methods
        # This includes: pytubefix, yt-dlp, playwright, external tools, web services
        if self.video_downloader:
            attempted_methods.append("video_downloader")
            try:
                transcript = await self._extract_with_video_downloader(video_id, language_preference)
                return await self._finalize_result(
                    cache_key, video_id, transcript,
                    ExtractionMethod.WHISPER_AUDIO,  # Mark as audio since it's likely from audio
                    language_preference, start_time
                )
            except Exception as downloader_error:
                last_error = str(downloader_error)
                logger.error(f"Advanced fallback methods failed for {video_id}: {downloader_error}")

        logger.error(f"All transcript extraction methods failed for {video_id}")
        return TranscriptResult(
            video_id=video_id,
            transcript=None,
            method=ExtractionMethod.FAILED,
            success=False,
            error={
                "code": ErrorCode.TRANSCRIPT_NOT_AVAILABLE,
                "message": "Unable to extract transcript from video",
                "details": {
                    "video_id": video_id,
                    "attempted_methods": attempted_methods,
                    "last_error": last_error,
                    "suggestions": [
                        "Try a different video with captions available",
                        "Check if video is public and accessible",
                        "Contact support if this video should have transcripts"
                    ]
                }
            }
        )

    async def _finalize_result(self, cache_key: str, video_id: str, transcript: str,
                               method: ExtractionMethod, language: str,
                               start_time: float) -> TranscriptResult:
        """Build the TranscriptResult for a successful extraction and cache it."""
        result = await self._create_result(video_id, transcript, method, language, start_time)
        await self._cache_result(cache_key, result)
        return result

    async def _extract_youtube_transcript(self, video_id: str, language: str) -> str:
        """YouTube Transcript API extraction (mock or real).

        Raises:
            TranscriptNotAvailableError: when no transcript could be obtained.
        """
        # Use real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                # Requested language first, then English variants, without repeats
                languages = list(dict.fromkeys([language, 'en', 'en-US', 'en-GB']))

                def _fetch_transcript():
                    for lang in languages:
                        try:
                            entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
                        except Exception as e:
                            logger.debug(f"Failed to fetch transcript for language {lang}: {e}")
                            continue
                        # Convert list of transcript entries to text
                        return ' '.join(entry['text'] for entry in entries), entries
                    raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

                # The library call is blocking, so run it in the default executor
                loop = asyncio.get_running_loop()
                transcript_text, entries = await loop.run_in_executor(None, _fetch_transcript)
                # Assign on the event-loop side rather than from the worker thread
                self._last_transcript_segments = entries
                return transcript_text
            except TranscriptNotAvailableError:
                # Already the right type; no need to wrap or log as an error
                raise
            except Exception as e:
                logger.error(f"Real YouTube transcript extraction failed: {e}")
                raise TranscriptNotAvailableError(f"Failed to extract transcript: {e}") from e

        # Mock implementation
        await asyncio.sleep(0.3)  # Simulate API call

        # Simulate success/failure based on probability
        if random.random() > self._method_success_rates["youtube_api"]:
            raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

        # Return mock transcript if available
        if video_id in self.MOCK_TRANSCRIPTS:
            return self.MOCK_TRANSCRIPTS[video_id]["text"]

        # Generate generic mock transcript
        return (
            f"This is a mock transcript extracted via YouTube API for video {video_id}. "
            "The content discusses various topics related to technology and innovation. "
            "This demonstration text shows how the transcript extraction service works."
        )

    async def _extract_auto_captions(self, video_id: str, language: str) -> str:
        """Auto-generated captions extraction (mock or real).

        Raises:
            CaptionsNotAvailableError: when no auto-generated captions could
                be obtained.
        """
        # Use real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                def _fetch_auto_captions():
                    # List available transcripts and keep only generated ones
                    generated = [
                        transcript
                        for transcript in YouTubeTranscriptApi.list_transcripts(video_id)
                        if transcript.is_generated
                    ]
                    # Requested language first, then English as fallback
                    for accepted_codes in ((language,), ('en', 'en-US')):
                        for transcript in generated:
                            if transcript.language_code in accepted_codes:
                                caption_list = transcript.fetch()
                                full_text = ' '.join(entry['text'] for entry in caption_list)
                                return f"[Auto-generated] {full_text}"
                    raise CaptionsNotAvailableError(f"No auto-generated captions available for {video_id}")

                # The library call is blocking, so run it in the default executor
                loop = asyncio.get_running_loop()
                return await loop.run_in_executor(None, _fetch_auto_captions)
            except CaptionsNotAvailableError:
                # Already the right type; no need to wrap or log as an error
                raise
            except Exception as e:
                logger.error(f"Real auto-caption extraction failed: {e}")
                raise CaptionsNotAvailableError(f"Failed to extract auto-captions: {e}") from e

        # Mock implementation fallback
        await asyncio.sleep(0.4)  # Simulate API call

        if random.random() > self._method_success_rates["auto_captions"]:
            raise CaptionsNotAvailableError(f"No auto-captions for {video_id}")

        return (
            f"[Auto-generated] This is a mock auto-caption transcript for video {video_id}. "
            "Auto-generated captions may contain errors but provide useful content. "
            "The transcript has been processed and cleaned for better readability."
        )

    async def _transcribe_audio(self, video_id: str, language: str) -> str:
        """Audio transcription using Whisper (mock or real).

        Raises:
            AudioTranscriptionError: when transcription fails.
        """
        # Use real implementation if available
        if self._using_real_whisper and self.whisper_client and not isinstance(self.whisper_client, MockWhisperClient):
            try:
                # Use the real Whisper service
                logger.info(f"Using real Whisper service for video {video_id}")
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                segments, metadata = await self.whisper_client.transcribe_video(
                    video_id, video_url
                )

                # Convert the client's segments (start_time/end_time) to
                # TranscriptSegment (start/duration) for use in _create_result
                self._last_whisper_segments = [
                    TranscriptSegment(
                        text=segment.text,
                        start=segment.start_time,
                        duration=segment.end_time - segment.start_time
                    )
                    for segment in segments
                ]

                # Convert segments to text
                transcript_text = ' '.join(segment.text for segment in segments)
                logger.info(f"Successfully transcribed audio for {video_id} - {metadata.word_count} words")
                return transcript_text
            except Exception as e:
                logger.error(f"Real audio transcription failed: {e}")
                raise AudioTranscriptionError(f"Failed to transcribe audio: {e}") from e

        # Mock implementation
        await asyncio.sleep(0.8)  # Simulate longer processing time

        if random.random() > self._method_success_rates["whisper_audio"]:
            raise AudioTranscriptionError(f"Failed to transcribe audio for {video_id}")

        return await self.whisper_client.transcribe(video_id)

    async def _extract_with_video_downloader(self, video_id: str, language: str) -> str:
        """Use IntelligentVideoDownloader with multiple fallback methods.

        Raises:
            RuntimeError: when the downloader is missing, the download fails,
                or the download yields neither audio nor transcript.
        """
        if not self.video_downloader:
            raise RuntimeError("Video downloader not available")

        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # Configure preferences for transcript extraction
        preferences = DownloadPreferences(
            quality=VideoQuality.AUDIO_ONLY,  # We only need audio for transcription
            prefer_audio_only=True,
            fallback_to_transcript=True
        )

        logger.info(f"Attempting advanced download methods for {video_id}")

        # The IntelligentVideoDownloader will try:
        # 1. pytubefix
        # 2. yt-dlp
        # 3. playwright (browser automation)
        # 4. external tools
        # 5. web services
        # 6. transcript only fallback
        result = await self.video_downloader.download_video(video_url, preferences)

        if result.status not in [DownloadStatus.COMPLETED, DownloadStatus.PARTIAL]:
            raise RuntimeError(f"All advanced download methods failed: {result.error_message}")

        # If we got audio, transcribe it
        if result.audio_file and result.audio_file.exists():
            if self._using_real_whisper and self.whisper_client:
                segments, metadata = await self.whisper_client.transcribe_video(
                    video_id, video_url
                )
                return ' '.join(segment.text for segment in segments)
            # Fall back to basic extraction
            return f"[Advanced Download] Successfully downloaded audio for {video_id} using {result.method_used}"

        # If we only got transcript data
        if result.transcript:
            return result.transcript

        raise RuntimeError("Download completed but no transcript available")

    async def _create_result(self, video_id: str, transcript: str, method: ExtractionMethod,
                             language: str, start_time: float) -> TranscriptResult:
        """Create TranscriptResult with metadata.

        Args:
            video_id: YouTube video ID.
            transcript: Extracted transcript text.
            method: Extraction method that produced ``transcript``.
            language: Language code recorded in the metadata.
            start_time: ``time.time()`` value taken when extraction started.
        """
        processing_time = time.time() - start_time
        word_count = len(transcript.split())

        metadata = TranscriptMetadata(
            word_count=word_count,
            estimated_reading_time=int(word_count / 200 * 60),  # 200 WPM reading speed
            language=language,
            has_timestamps=method == ExtractionMethod.YOUTUBE_API,
            extraction_method=method,
            processing_time_seconds=processing_time
        )

        # Get segments if available
        segments = None

        if self._last_whisper_segments and method == ExtractionMethod.WHISPER_AUDIO:
            # Real Whisper segments stored by _transcribe_audio
            segments = self._last_whisper_segments
            self._last_whisper_segments = None  # Clear after use
        elif self._last_transcript_segments and method == ExtractionMethod.YOUTUBE_API:
            # Raw entries stored by _extract_youtube_transcript
            segments = [
                TranscriptSegment(
                    text=entry['text'],
                    start=entry['start'],
                    duration=entry['duration']
                )
                for entry in self._last_transcript_segments
            ]
            self._last_transcript_segments = None  # Clear after use
        elif video_id in self.MOCK_TRANSCRIPTS and self.MOCK_TRANSCRIPTS[video_id].get("segments"):
            # Fall back to mock data segments
            segments = [TranscriptSegment(**seg) for seg in self.MOCK_TRANSCRIPTS[video_id]["segments"]]

        return TranscriptResult(
            video_id=video_id,
            transcript=transcript,
            segments=segments,
            metadata=metadata,
            method=method,
            success=True,
            from_cache=False
        )

    async def _cache_result(self, cache_key: str, result: TranscriptResult):
        """Cache the transcript result; failures are logged, not raised."""
        try:
            # Convert to dict for caching; from_cache is set again on read
            cache_data = result.model_dump(exclude={'from_cache'})
            await self.cache_client.set(cache_key, cache_data, ttl=86400)  # 24 hours
            logger.info(f"Cached transcript for key {cache_key}")
        except Exception as e:
            # Caching is best effort and must not fail the extraction
            logger.error(f"Failed to cache transcript: {e}")

    def extract_metadata(self, transcript: str) -> Dict[str, Any]:
        """Extract metadata from transcript text.

        Returns:
            Dict with word, character and line counts, the estimated reading
            time in seconds at 200 words per minute, and average words per line.
        """
        word_count = len(transcript.split())
        char_count = len(transcript)
        line_count = len(transcript.split('\n'))

        return {
            "word_count": word_count,
            "character_count": char_count,
            "line_count": line_count,
            "estimated_reading_time_seconds": int(word_count / 200 * 60),
            "average_words_per_line": word_count / max(line_count, 1)
        }