"""
Dual transcript service that provides YouTube captions, Whisper AI transcription, or both.
Coordinates between different transcript sources and provides comparison functionality.
"""

import asyncio
import logging
import time
from typing import List, Dict, Optional, Tuple, Union
from enum import Enum

from .transcript_service import TranscriptService
from .faster_whisper_transcript_service import FasterWhisperTranscriptService
from ..config.video_download_config import VideoDownloadConfig
from ..models.transcript import (
    DualTranscriptSegment,
    DualTranscriptMetadata,
    TranscriptSource,
    DualTranscriptResult,
    TranscriptComparison,
    TranscriptSegment,
    TranscriptMetadata
)
from ..core.config import settings

logger = logging.getLogger(__name__)


class TranscriptQuality(Enum):
    """Transcript quality levels"""
    STANDARD = "standard"  # YouTube captions
    HIGH = "high"  # Whisper small/base
    PREMIUM = "premium"  # Whisper medium/large


class DualTranscriptService:
    """Service for managing dual transcript extraction and comparison.

    Wraps a ``TranscriptService`` (YouTube captions) and a
    ``FasterWhisperTranscriptService`` (Whisper transcription) and exposes a
    single entry point, :meth:`get_transcript`, that can run either source or
    both and, when both succeed, produce a ``TranscriptComparison``.
    """

    # (correct term, typical mis-transcription) pairs checked by
    # _find_technical_improvements.
    _TECHNICAL_PATTERNS = (
        ("API", "a p i"),
        ("URL", "u r l"),
        ("HTTP", "h t t p"),
        ("JSON", "jason"),
        ("SQL", "sequel"),
        ("AI", "a i"),
        ("ML", "m l"),
        ("GPU", "g p u"),
        ("CPU", "c p u"),
    )

    # Sentence-ending characters; a reconstructed segment that already ends
    # with one of these does not get a "." appended.
    _SENTENCE_TERMINATORS = ('.', '!', '?')

    def __init__(self):
        self.transcript_service = TranscriptService()

        # Load configuration for faster-whisper
        config = VideoDownloadConfig()
        self.whisper_service = FasterWhisperTranscriptService(
            model_size=config.whisper_model,
            device=config.whisper_device,
            compute_type=config.whisper_compute_type,
            beam_size=config.whisper_beam_size,
            vad_filter=config.whisper_vad_filter,
            word_timestamps=config.whisper_word_timestamps,
            temperature=config.whisper_temperature,
            best_of=config.whisper_best_of
        )

    async def get_transcript(
        self,
        video_id: str,
        video_url: str,
        source: TranscriptSource,
        progress_callback=None
    ) -> DualTranscriptResult:
        """
        Get transcript from specified source(s).

        Never raises for extraction failures: any ``Exception`` from the
        source-specific helpers is logged and converted into a result with
        ``success=False`` and ``error`` set to the exception text.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            source: Which transcript source(s) to use
            progress_callback: Optional awaitable callback taking a status
                message string, used for progress updates

        Returns:
            DualTranscriptResult with requested transcript data
        """
        start_time = time.time()

        try:
            if source == TranscriptSource.YOUTUBE:
                return await self._get_youtube_only(
                    video_id, video_url, progress_callback
                )
            elif source == TranscriptSource.WHISPER:
                return await self._get_whisper_only(
                    video_id, video_url, progress_callback
                )
            elif source == TranscriptSource.BOTH:
                return await self._get_both_transcripts(
                    video_id, video_url, progress_callback
                )
            else:
                raise ValueError(f"Invalid transcript source: {source}")

        except Exception as e:
            logger.error(
                "Failed to get transcript for video %s from %s: %s",
                video_id, source, e
            )
            return self._failure_result(video_id, source, start_time, e)

    def _failure_result(
        self,
        video_id: str,
        source: TranscriptSource,
        start_time: float,
        error: BaseException
    ) -> DualTranscriptResult:
        """Build an unsuccessful result carrying ``str(error)`` and elapsed time.

        Args:
            video_id: YouTube video ID the request was for
            source: Transcript source that was requested
            start_time: ``time.time()`` value captured when processing began
            error: The exception that caused the failure

        Returns:
            DualTranscriptResult with no transcripts and ``success=False``
        """
        return DualTranscriptResult(
            video_id=video_id,
            source=source,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=time.time() - start_time,
            success=False,
            error=str(error)
        )

    async def _fetch_youtube_transcript(
        self,
        video_id: str
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """Extract YouTube captions and convert them to the dual format.

        Shared by the YouTube-only and the dual code paths so both apply the
        same definition of failure.

        Args:
            video_id: YouTube video ID

        Returns:
            Tuple of (segments, metadata) in dual transcript format

        Raises:
            RuntimeError: If the underlying service reports failure or
                returns an empty transcript.
        """
        transcript_result = await self.transcript_service.extract_transcript(video_id)

        if not (transcript_result.success and transcript_result.transcript):
            raise RuntimeError(
                f"YouTube transcript extraction failed: {transcript_result.error}"
            )

        return (
            self._convert_to_dual_segments(transcript_result),
            self._convert_to_dual_metadata(transcript_result, video_id)
        )

    async def _get_youtube_only(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get YouTube captions only.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL (not used by this path)
            progress_callback: Optional awaitable callback for status messages

        Returns:
            Successful DualTranscriptResult with only the YouTube fields set

        Raises:
            Exception: Any extraction error is logged and re-raised; the
                public entry point converts it into a failure result.
        """
        start_time = time.time()

        try:
            if progress_callback:
                await progress_callback("Extracting YouTube captions...")

            # Get YouTube transcript via existing transcript service
            youtube_segments, youtube_metadata = await self._fetch_youtube_transcript(video_id)

            if progress_callback:
                await progress_callback("YouTube captions extracted successfully")

            processing_time = time.time() - start_time

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.YOUTUBE,
                youtube_transcript=youtube_segments,
                youtube_metadata=youtube_metadata,
                whisper_transcript=None,
                whisper_metadata=None,
                comparison=None,
                processing_time_seconds=processing_time,
                success=True,
                error=None
            )

        except Exception as e:
            logger.error("YouTube transcript extraction failed: %s", e)
            raise

    async def _get_whisper_only(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get Whisper AI transcription only.

        The metadata returned by the Whisper service has its
        ``processing_time_seconds`` overwritten with the wall-clock time
        measured here.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional awaitable callback for status
                messages; also forwarded to the Whisper service

        Returns:
            Successful DualTranscriptResult with only the Whisper fields set

        Raises:
            Exception: Any transcription error is logged and re-raised.
        """
        start_time = time.time()

        try:
            if progress_callback:
                await progress_callback("Starting AI transcription with Whisper...")

            # Get Whisper transcript
            whisper_segments, whisper_metadata = await self.whisper_service.transcribe_video(
                video_id, video_url, progress_callback
            )

            processing_time = time.time() - start_time
            whisper_metadata.processing_time_seconds = processing_time

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.WHISPER,
                youtube_transcript=None,
                youtube_metadata=None,
                whisper_transcript=whisper_segments,
                whisper_metadata=whisper_metadata,
                comparison=None,
                processing_time_seconds=processing_time,
                success=True,
                error=None
            )

        except Exception as e:
            logger.error("Whisper transcript extraction failed: %s", e)
            raise

    async def _get_both_transcripts(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> DualTranscriptResult:
        """Get both YouTube and Whisper transcripts for comparison.

        Both extractions run concurrently. A failure of one source is recorded
        in ``error`` but does not fail the whole request: the result is
        successful as long as at least one source produced segments. A
        comparison is generated only when both sources returned non-empty
        segment lists.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional awaitable callback for status messages

        Returns:
            DualTranscriptResult with whichever transcripts were obtained
        """
        start_time = time.time()

        try:
            # Progress tracking
            if progress_callback:
                await progress_callback("Starting dual transcript extraction...")

            # Run both extractions in parallel
            youtube_task = asyncio.create_task(
                self._get_youtube_with_progress(video_id, video_url, progress_callback)
            )
            whisper_task = asyncio.create_task(
                self._get_whisper_with_progress(video_id, video_url, progress_callback)
            )

            # Wait for both to complete
            youtube_result, whisper_result = await asyncio.gather(
                youtube_task, whisper_task, return_exceptions=True
            )

            # Handle any exceptions
            youtube_segments, youtube_metadata = None, None
            whisper_segments, whisper_metadata = None, None
            errors = []

            # BaseException, not Exception: with return_exceptions=True a
            # cancelled child task is returned as asyncio.CancelledError,
            # which does not derive from Exception.
            if isinstance(youtube_result, BaseException):
                logger.warning("YouTube extraction failed: %s", youtube_result)
                errors.append(f"YouTube: {youtube_result}")
            else:
                youtube_segments, youtube_metadata = youtube_result

            if isinstance(whisper_result, BaseException):
                logger.warning("Whisper extraction failed: %s", whisper_result)
                errors.append(f"Whisper: {whisper_result}")
            else:
                whisper_segments, whisper_metadata = whisper_result

            # Generate comparison if we have both transcripts
            comparison = None
            if youtube_segments and whisper_segments:
                if progress_callback:
                    await progress_callback("Generating transcript comparison...")

                comparison = self._compare_transcripts(
                    youtube_segments, youtube_metadata,
                    whisper_segments, whisper_metadata
                )

            processing_time = time.time() - start_time

            # Done after the comparison, so the comparison sees the processing
            # time reported by the Whisper service itself.
            if whisper_metadata:
                whisper_metadata.processing_time_seconds = processing_time

            # Determine success status
            success = (youtube_segments is not None) or (whisper_segments is not None)
            error_message = "; ".join(errors) if errors else None

            if progress_callback:
                if success:
                    await progress_callback("Dual transcript extraction completed")
                else:
                    await progress_callback("Dual transcript extraction failed")

            return DualTranscriptResult(
                video_id=video_id,
                source=TranscriptSource.BOTH,
                youtube_transcript=youtube_segments,
                youtube_metadata=youtube_metadata,
                whisper_transcript=whisper_segments,
                whisper_metadata=whisper_metadata,
                comparison=comparison,
                processing_time_seconds=processing_time,
                success=success,
                error=error_message
            )

        except Exception as e:
            logger.error("Dual transcript extraction failed: %s", e)
            return self._failure_result(video_id, TranscriptSource.BOTH, start_time, e)

    async def _get_youtube_with_progress(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """Get YouTube transcript with progress updates.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL (not used by this path)
            progress_callback: Optional awaitable callback for status messages

        Returns:
            Tuple of (segments, metadata) in dual transcript format

        Raises:
            RuntimeError: If extraction fails or yields an empty transcript.
        """
        if progress_callback:
            await progress_callback("Extracting YouTube captions...")

        result = await self._fetch_youtube_transcript(video_id)

        if progress_callback:
            await progress_callback("YouTube captions extracted")

        return result

    async def _get_whisper_with_progress(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """Get Whisper transcript with progress updates.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional awaitable callback for status
                messages; also forwarded to the Whisper service

        Returns:
            Whatever ``whisper_service.transcribe_video`` returns; callers
            unpack it as a (segments, metadata) pair.
        """
        if progress_callback:
            await progress_callback("Starting AI transcription...")

        result = await self.whisper_service.transcribe_video(
            video_id, video_url, progress_callback
        )

        if progress_callback:
            await progress_callback("AI transcription completed")

        return result

    def _compare_transcripts(
        self,
        youtube_segments: List[TranscriptSegment],
        youtube_metadata: TranscriptMetadata,
        whisper_segments: List[TranscriptSegment],
        whisper_metadata: TranscriptMetadata
    ) -> TranscriptComparison:
        """Generate comparison between YouTube and Whisper transcripts.

        The similarity score is a simplified metric derived only from the
        difference in word counts, not from the words themselves.

        Args:
            youtube_segments: Segments from YouTube captions
            youtube_metadata: Metadata for the YouTube transcript
            whisper_segments: Segments from Whisper
            whisper_metadata: Metadata for the Whisper transcript

        Returns:
            TranscriptComparison populated with the computed metrics
        """
        # Combine segments into full text for comparison
        youtube_text = " ".join(segment.text for segment in youtube_segments)
        whisper_text = " ".join(segment.text for segment in whisper_segments)

        # Calculate basic metrics
        youtube_words = youtube_text.split()
        whisper_words = whisper_text.split()

        # Calculate word-level differences (simplified)
        word_differences = abs(len(youtube_words) - len(whisper_words))
        # The trailing 1 guards against division by zero for two empty texts.
        word_similarity = 1.0 - (word_differences / max(len(youtube_words), len(whisper_words), 1))

        # Calculate quality metrics
        punctuation_improvement = self._calculate_punctuation_improvement(youtube_text, whisper_text)
        capitalization_improvement = self._calculate_capitalization_improvement(youtube_text, whisper_text)

        # Determine recommendation
        recommendation = self._generate_recommendation(
            youtube_metadata, whisper_metadata, word_similarity,
            punctuation_improvement, capitalization_improvement
        )

        return TranscriptComparison(
            word_count_difference=word_differences,
            similarity_score=word_similarity,
            punctuation_improvement_score=punctuation_improvement,
            capitalization_improvement_score=capitalization_improvement,
            # Floor the denominator at 0.1s so a near-instant YouTube fetch
            # cannot cause a division by zero.
            processing_time_ratio=whisper_metadata.processing_time_seconds / max(youtube_metadata.processing_time_seconds, 0.1),
            quality_difference=whisper_metadata.quality_score - youtube_metadata.quality_score,
            confidence_difference=whisper_metadata.confidence_score - youtube_metadata.confidence_score,
            recommendation=recommendation,
            significant_differences=self._find_significant_differences(youtube_text, whisper_text),
            technical_terms_improved=self._find_technical_improvements(youtube_text, whisper_text)
        )

    def _calculate_punctuation_improvement(self, youtube_text: str, whisper_text: str) -> float:
        """Calculate improvement in punctuation between transcripts.

        Args:
            youtube_text: Full YouTube transcript text
            whisper_text: Full Whisper transcript text

        Returns:
            Score clamped to [0.0, 1.0]; 0.0 when Whisper has the same or a
            lower punctuation density than YouTube.
        """
        youtube_punct = sum(1 for c in youtube_text if c in '.,!?;:')
        whisper_punct = sum(1 for c in whisper_text if c in '.,!?;:')

        # Normalize by text length
        youtube_punct_ratio = youtube_punct / max(len(youtube_text), 1)
        whisper_punct_ratio = whisper_punct / max(len(whisper_text), 1)

        # Return improvement score (0-1 scale)
        improvement = whisper_punct_ratio - youtube_punct_ratio
        return max(0.0, min(1.0, improvement * 10))  # Scale to 0-1

    def _calculate_capitalization_improvement(self, youtube_text: str, whisper_text: str) -> float:
        """Calculate improvement in capitalization between transcripts.

        Args:
            youtube_text: Full YouTube transcript text
            whisper_text: Full Whisper transcript text

        Returns:
            Score clamped to [0.0, 1.0]; 0.0 when Whisper has the same or a
            lower uppercase-letter density than YouTube.
        """
        youtube_capitals = sum(1 for c in youtube_text if c.isupper())
        whisper_capitals = sum(1 for c in whisper_text if c.isupper())

        # Normalize by text length
        youtube_cap_ratio = youtube_capitals / max(len(youtube_text), 1)
        whisper_cap_ratio = whisper_capitals / max(len(whisper_text), 1)

        # Return improvement score (0-1 scale)
        improvement = whisper_cap_ratio - youtube_cap_ratio
        return max(0.0, min(1.0, improvement * 5))  # Scale to 0-1

    def _generate_recommendation(
        self,
        youtube_metadata: TranscriptMetadata,
        whisper_metadata: TranscriptMetadata,
        similarity: float,
        punct_improvement: float,
        cap_improvement: float
    ) -> str:
        """Generate recommendation based on comparison metrics.

        Rules are evaluated in order and the first match wins.

        Args:
            youtube_metadata: Metadata for the YouTube transcript
            whisper_metadata: Metadata for the Whisper transcript
            similarity: Word-count similarity score
            punct_improvement: Punctuation improvement score
            cap_improvement: Capitalization improvement score

        Returns:
            Either "youtube" or "whisper"
        """
        # If very similar and YouTube is much faster
        if similarity > 0.95 and whisper_metadata.processing_time_seconds > youtube_metadata.processing_time_seconds * 10:
            return "youtube"

        # If significant quality improvement with Whisper
        if (whisper_metadata.quality_score - youtube_metadata.quality_score) > 0.2:
            return "whisper"

        # If significant punctuation/capitalization improvement
        if punct_improvement > 0.3 or cap_improvement > 0.3:
            return "whisper"

        # If low confidence in YouTube captions
        if youtube_metadata.confidence_score < 0.6 and whisper_metadata.confidence_score > 0.7:
            return "whisper"

        # Default to YouTube for speed if quality is similar
        return "youtube"

    def _find_significant_differences(self, youtube_text: str, whisper_text: str) -> List[str]:
        """Find significant textual differences between transcripts.

        Compares the sets of lowercased, whitespace-separated words and
        reports a side only when it has more than five words the other lacks.

        Args:
            youtube_text: Full YouTube transcript text
            whisper_text: Full Whisper transcript text

        Returns:
            List of human-readable difference descriptions (possibly empty)
        """
        differences = []

        # Simple difference detection (can be enhanced with difflib)
        youtube_words = set(youtube_text.lower().split())
        whisper_words = set(whisper_text.lower().split())

        unique_to_whisper = whisper_words - youtube_words
        unique_to_youtube = youtube_words - whisper_words

        if len(unique_to_whisper) > 5:
            differences.append(f"Whisper includes {len(unique_to_whisper)} additional unique words")

        if len(unique_to_youtube) > 5:
            differences.append(f"YouTube includes {len(unique_to_youtube)} words not in Whisper")

        return differences[:5]  # Limit to 5 most significant

    def _find_technical_improvements(self, youtube_text: str, whisper_text: str) -> List[str]:
        """Find technical terms that were improved in Whisper transcript.

        A term counts as improved when the mis-transcribed form appears in the
        YouTube text and the correct form appears in the Whisper text. Both
        are matched case-insensitively as whole words, so "a i" is not found
        inside "data is" and "ai" is not found inside "said".

        Args:
            youtube_text: Full YouTube transcript text
            whisper_text: Full Whisper transcript text

        Returns:
            Up to three strings of the form "'<incorrect>' → '<correct>'"
        """
        import re  # local import keeps this block self-contained

        def contains_term(term: str, text: str) -> bool:
            # \b anchors restrict the match to whole words/phrases.
            return re.search(rf"\b{re.escape(term)}\b", text, re.IGNORECASE) is not None

        improvements = [
            f"'{incorrect}' → '{correct}'"
            for correct, incorrect in self._TECHNICAL_PATTERNS
            if contains_term(incorrect, youtube_text) and contains_term(correct, whisper_text)
        ]

        return improvements[:3]  # Limit to 3 most significant

    def estimate_processing_time(
        self,
        video_duration_seconds: float,
        source: TranscriptSource
    ) -> Dict[str, float]:
        """
        Estimate processing time for different transcript sources.

        Args:
            video_duration_seconds: Duration of the video in seconds
            source: Which transcript source(s) to estimate for

        Returns:
            Dictionary with time estimates in seconds. Contains "youtube"
            and/or "whisper" depending on ``source``, plus "total".
        """
        estimates = {}

        if source in (TranscriptSource.YOUTUBE, TranscriptSource.BOTH):
            # YouTube API is very fast - usually 1-3 seconds regardless of video length
            estimates["youtube"] = min(3.0, max(1.0, video_duration_seconds * 0.01))

        if source in (TranscriptSource.WHISPER, TranscriptSource.BOTH):
            # Whisper processing time depends on model size and duration
            # Rough estimates: ~0.1-0.5x real-time depending on hardware
            base_ratio = 0.3  # Conservative estimate
            device_multiplier = 0.5 if self.whisper_service.device == "cuda" else 1.5
            estimates["whisper"] = video_duration_seconds * base_ratio * device_multiplier

        if source == TranscriptSource.BOTH:
            # Parallel processing, so max of both plus comparison overhead
            estimates["total"] = max(estimates.get("youtube", 0), estimates.get("whisper", 0)) + 2.0
        else:
            estimates["total"] = sum(estimates.values())

        return estimates

    def _convert_to_dual_segments(self, transcript_result) -> List[DualTranscriptSegment]:
        """Convert TranscriptResult to DualTranscriptSegment list.

        When the result carries timed segments they are converted one-to-one.
        Otherwise, if only plain text is available, the text is split on
        ". " and each sentence is given a synthetic timing estimated from
        its word count.

        Args:
            transcript_result: Result object from the transcript service;
                this method reads its ``segments`` and ``transcript``
                attributes.

        Returns:
            List of DualTranscriptSegment (empty when there is neither
            segment data nor text)
        """
        if not transcript_result.segments:
            # If no segments, create segments from plain text
            if not transcript_result.transcript:
                return []

            # Simple conversion - split text into segments (basic implementation)
            segments = []
            current_time = 0.0

            for text in transcript_result.transcript.split('. '):
                sentence = text.strip()
                if not sentence:
                    continue

                # Estimate duration based on word count (rough estimate)
                duration = len(sentence.split()) * 0.5  # 0.5 seconds per word (rough)

                # Restore the period removed by the split, but only when the
                # stripped sentence has no terminal punctuation of its own.
                if not sentence.endswith(self._SENTENCE_TERMINATORS):
                    sentence += '.'

                segments.append(DualTranscriptSegment(
                    start_time=current_time,
                    end_time=current_time + duration,
                    text=sentence,
                    confidence=0.8  # Default confidence for YouTube
                ))
                current_time += duration + 0.5  # Small gap between segments

            return segments

        # Convert existing segments
        return [
            DualTranscriptSegment(
                start_time=segment.start,
                end_time=segment.start + segment.duration,
                text=segment.text,
                confidence=0.8  # Default confidence for YouTube captions
            )
            for segment in transcript_result.segments
        ]

    def _convert_to_dual_metadata(self, transcript_result, video_id: str) -> DualTranscriptMetadata:
        """Convert TranscriptResult to DualTranscriptMetadata.

        Args:
            transcript_result: Result object from the transcript service;
                this method reads its ``transcript``, ``segments``,
                ``metadata`` and ``method`` attributes.
            video_id: YouTube video ID

        Returns:
            DualTranscriptMetadata with fixed default quality (0.75) and
            confidence (0.8) scores for YouTube captions. Language falls
            back to "en" and processing time to 0.0 when the result has no
            metadata.
        """
        word_count = len(transcript_result.transcript.split()) if transcript_result.transcript else 0

        return DualTranscriptMetadata(
            video_id=video_id,
            language=transcript_result.metadata.language if transcript_result.metadata else "en",
            word_count=word_count,
            total_segments=len(transcript_result.segments) if transcript_result.segments else 0,
            has_timestamps=transcript_result.segments is not None and len(transcript_result.segments) > 0,
            extraction_method=transcript_result.method.value,
            processing_time_seconds=transcript_result.metadata.processing_time_seconds if transcript_result.metadata else 0.0,
            quality_score=0.75,  # Default quality score for YouTube captions
            confidence_score=0.8  # Default confidence for YouTube captions
        )

    async def cleanup(self):
        """Clean up resources used by transcript services."""
        await self.whisper_service.cleanup()