""" Whisper transcription service for YouTube videos. Adapted from archived personal-ai-assistant transcription service for YouTube video context. """ import os import logging import tempfile import asyncio from datetime import datetime from typing import List, Dict, Optional, Tuple from pathlib import Path import torch import whisper from pydub import AudioSegment import yt_dlp import aiofiles import aiohttp from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata from ..core.config import settings from ..config.video_download_config import VideoDownloadConfig logger = logging.getLogger(__name__) class WhisperTranscriptService: """Service for transcribing YouTube videos using OpenAI Whisper.""" def __init__(self, model_size: str = "small", device: str = "auto"): """ Initialize the Whisper transcription service. Args: model_size: Whisper model size ("tiny", "base", "small", "medium", "large") device: Device to run on ("cpu", "cuda", "auto") """ self.model_size = model_size self.device = self._get_device(device) self.model = None # Configuration self.chunk_duration = 30 * 60 # 30 minutes per chunk self.overlap_duration = 30 # 30 seconds overlap between chunks self.max_segment_length = 1000 # Maximum characters per segment # Use video storage configuration self.config = VideoDownloadConfig() self.config.ensure_directories() self.storage_dirs = self.config.get_storage_dirs() self.temp_dir = self.storage_dirs["temp"] def _get_device(self, device: str) -> str: """Determine the appropriate device for processing.""" if device == "auto": if torch.cuda.is_available(): return "cuda" else: return "cpu" return device async def _load_model(self) -> whisper.Whisper: """Load the Whisper model on-demand.""" if self.model is None: logger.info(f"Loading Whisper model '{self.model_size}' on device '{self.device}'") try: # Run model loading in executor to avoid blocking async loop loop = asyncio.get_event_loop() self.model = await loop.run_in_executor( None, lambda: whisper.load_model(self.model_size, device=self.device) ) logger.info(f"Successfully loaded Whisper model '{self.model_size}'") except Exception as e: logger.error(f"Failed to load Whisper model: {e}") raise return self.model async def transcribe_video( self, video_id: str, video_url: str, progress_callback=None ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]: """ Transcribe a YouTube video and return segments with metadata. 

    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional async callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        audio_path = None
        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from the YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting transcription...")

            logger.info(f"Starting Whisper transcription for video {video_id}")

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback
            )

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # Whisper can auto-detect, but assume English for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="whisper_ai",
                processing_time_seconds=0,  # Calculated by the caller
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments)
            )

            logger.info(
                f"Completed Whisper transcription for video {video_id}. "
                f"Generated {len(segments)} segments."
            )

            # Save transcript to file
            await self._save_transcript(video_id, segments)

            return segments, metadata

        except Exception as e:
            logger.error(f"Whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up the temporary WAV file, but keep the MP3 for future re-transcription
            if audio_path and audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')
                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
                else:
                    logger.info(f"Keeping audio file: {audio_path}")

    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from a YouTube video using yt-dlp."""
        try:
            # MP3 is kept for storage; WAV is generated for Whisper
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            wav_path = self.storage_dirs["audio"] / f"{video_id}.wav"

            # If the MP3 already exists, just convert it to WAV
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in an executor to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts)
            )

            # Convert the downloaded MP3 to WAV for Whisper processing
            if mp3_path.exists():
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            raise RuntimeError(f"Failed to download audio for {video_id}")

        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}") from e

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])
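
    # Note: the 'FFmpegExtractAudio' postprocessor above requires ffmpeg to be
    # installed and on PATH. The helper below is an illustrative sketch (an
    # addition, not called anywhere in this service): yt-dlp can report a
    # video's duration without downloading it, which could be used to decide
    # up front whether chunked transcription will be needed.
    def _probe_duration_seconds(self, video_url: str):
        """Return the video duration in seconds reported by yt-dlp, or None."""
        with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True}) as ydl:
            info = ydl.extract_info(video_url, download=False)
        return info.get('duration')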

    async def _convert_audio(self, input_path: Path, output_path: Path):
        """Convert audio between formats using pydub."""
        try:
            loop = asyncio.get_running_loop()

            def convert():
                audio = AudioSegment.from_file(str(input_path))
                audio.export(str(output_path), format=output_path.suffix[1:])

            await loop.run_in_executor(None, convert)
            logger.info(f"Converted {input_path} to {output_path}")
        except Exception as e:
            logger.error(f"Audio conversion failed: {e}")
            raise

    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file, chunking long videos.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get the audio duration
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        if duration <= self.chunk_duration:
            # Process the entire file at once for shorter videos
            return await self._transcribe_chunk(
                model, audio_path, 0, duration, progress_callback
            )
        # Process longer videos in overlapping chunks
        return await self._transcribe_in_chunks(
            model, audio_path, duration, progress_callback
        )

    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get the audio duration using pydub."""
        loop = asyncio.get_running_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # Convert milliseconds to seconds

    async def _transcribe_chunk(
        self,
        model: whisper.Whisper,
        audio_path: str,
        start_time: float,
        end_time: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a specific chunk of audio.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            start_time: Start time in seconds
            end_time: End time in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments for this chunk
        """
        try:
            # Extract the chunk only when it is a strict sub-range of the file
            if start_time > 0 or end_time < await self._get_audio_duration(audio_path):
                chunk_path = await self._extract_audio_chunk(audio_path, start_time, end_time)
                time_offset = start_time
            else:
                chunk_path = audio_path
                time_offset = 0

            # Transcribe the chunk
            logger.info(f"Transcribing chunk {start_time:.1f}s - {end_time:.1f}s")
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: model.transcribe(
                    chunk_path,
                    word_timestamps=True,
                    language="en",  # Could be made configurable
                    task="transcribe"
                )
            )

            # Convert Whisper's segment dicts into DualTranscriptSegment objects
            segments = []
            for whisper_segment in result["segments"]:
                # Shift timestamps back into the full file's timeline
                adj_start = whisper_segment["start"] + time_offset
                adj_end = whisper_segment["end"] + time_offset

                # Split overly long segments
                text = whisper_segment["text"].strip()
                if len(text) > self.max_segment_length:
                    segments.extend(self._split_long_segment(text, adj_start, adj_end))
                else:
                    segments.append(DualTranscriptSegment(
                        start_time=adj_start,
                        end_time=adj_end,
                        text=text,
                        confidence=whisper_segment.get("avg_logprob", 0.0)
                    ))

            # Clean up the temporary chunk file
            if chunk_path != audio_path and os.path.exists(chunk_path):
                os.unlink(chunk_path)

            if progress_callback:
                await progress_callback(f"Transcribed chunk {start_time:.1f}s - {end_time:.1f}s")

            return segments

        except Exception as e:
            logger.error(f"Failed to transcribe chunk {start_time}-{end_time}: {e}")
            raise
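
    # For reference, each entry of result["segments"] returned by openai-whisper
    # is a dict shaped roughly like the (abridged) example below; avg_logprob is
    # the log probability that _transcribe_chunk() maps onto the confidence
    # field:
    #
    #   {"id": 0, "start": 0.0, "end": 4.2, "text": " Hello and welcome...",
    #    "avg_logprob": -0.21, "no_speech_prob": 0.01, ...}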

    async def _extract_audio_chunk(
        self,
        audio_path: str,
        start_time: float,
        end_time: float
    ) -> str:
        """Extract a chunk of audio to a temporary file."""
        chunk_path = self.temp_dir / f"chunk_{start_time}_{end_time}.wav"
        loop = asyncio.get_running_loop()

        def extract_chunk():
            audio = AudioSegment.from_file(audio_path)
            # pydub slices in milliseconds
            chunk = audio[int(start_time * 1000):int(end_time * 1000)]
            chunk.export(str(chunk_path), format="wav")

        await loop.run_in_executor(None, extract_chunk)
        return str(chunk_path)

    async def _transcribe_in_chunks(
        self,
        model: whisper.Whisper,
        audio_path: str,
        total_duration: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a long audio file in chunks with overlap.

        Each chunk starts overlap_duration seconds early so Whisper has context
        at the boundary; segments that begin inside the overlap are filtered
        out to avoid duplicates.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            total_duration: Total duration in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        all_segments = []
        current_time = 0
        chunk_number = 1

        while current_time < total_duration:
            # Calculate chunk boundaries
            chunk_start = max(0, current_time - self.overlap_duration)
            chunk_end = min(total_duration, current_time + self.chunk_duration)

            logger.info(f"Processing chunk {chunk_number}: {chunk_start:.1f}s - {chunk_end:.1f}s")

            # Transcribe the chunk
            chunk_segments = await self._transcribe_chunk(
                model, audio_path, chunk_start, chunk_end, progress_callback
            )

            # Drop segments that start inside the overlap (already transcribed)
            if current_time > 0:
                chunk_segments = [s for s in chunk_segments if s.start_time >= current_time]

            all_segments.extend(chunk_segments)

            # Move to the next chunk
            current_time += self.chunk_duration
            chunk_number += 1

        return all_segments

    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Timestamps for the sub-segments are interpolated linearly, assuming
        every word takes an equal share of the original segment's duration.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split at word boundaries
        words = text.split()
        current_text = ""
        current_words = 0
        time_per_word = duration / len(words) if words else 0

        for word in words:
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Close out the current sub-segment
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word
                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip()
                ))
                current_text = word
            else:
                current_text += " " + word if current_text else word
            current_words += 1

        # Add the final segment
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip()
            ))

        return segments

    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate an overall quality score from segment confidences."""
        if not segments:
            return 0.0

        # Simple quality heuristic: average the available confidences
        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0
        avg_confidence = sum(confidences) / len(confidences)

        # Normalize from log probability to a 0-1 scale
        # (Whisper's avg_logprob typically falls between -5 and 0)
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))
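
    # Worked example of the normalization used by both scoring methods: an
    # average avg_logprob of -0.5 maps to (-0.5 + 5.0) / 5.0 = 0.9; a value of
    # -5.0 or lower clamps to 0.0, and 0.0 (the best case) maps to 1.0.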

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate the average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0

        avg_confidence = sum(confidences) / len(confidences)
        # Normalize from log probability to a 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

    async def _save_transcript(self, video_id: str, segments: List[DualTranscriptSegment]):
        """Save the transcript and audio metadata to files for future use."""
        try:
            # Save audio metadata
            await self._save_audio_metadata(video_id)

            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}.txt"

            # Create a human-readable transcript file
            transcript_lines = []
            for segment in segments:
                if segment.start_time is not None and segment.end_time is not None:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write the transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))

            logger.info(f"Saved transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}.json"
            segments_data = [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "text": seg.text,
                    "confidence": seg.confidence
                }
                for seg in segments
            ]

            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(segments_data, indent=2))

            logger.info(f"Saved transcript JSON to {json_path}")

        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str):
        """Save audio metadata for tracking and management."""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Collect basic file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",
                "model_used": self.model_size,
                "device": self.device
            }

            # Try to determine the audio duration
            try:
                loop = asyncio.get_running_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                duration = len(audio) / 1000.0
                audio_info["duration_seconds"] = duration
                audio_info["duration_formatted"] = f"{int(duration // 60)}:{int(duration % 60):02d}"
            except Exception as e:
                logger.debug(f"Could not determine audio duration for {video_id}: {e}")

            # Save metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(audio_info, indent=2))

            logger.info(f"Saved audio metadata to {metadata_path}")

        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def cleanup(self):
        """Clean up temporary files and resources."""
        try:
            # The temp directory is shared, so it is not deleted here;
            # old files are cleaned up periodically instead.

            # Unload the model to free (GPU) memory
            if self.model is not None:
                self.model = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")
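
# ---------------------------------------------------------------------------
# Example usage (illustrative sketch; the video ID and URL are placeholders).
# Kept as a comment because this module uses relative imports and is meant to
# be used as part of the package, not run directly:
#
#   async def main():
#       service = WhisperTranscriptService(model_size="small", device="auto")
#
#       async def report(message: str):
#           print(message)
#
#       try:
#           segments, metadata = await service.transcribe_video(
#               video_id="VIDEO_ID",
#               video_url="https://www.youtube.com/watch?v=VIDEO_ID",
#               progress_callback=report,
#           )
#           print(f"{metadata.total_segments} segments, "
#                 f"confidence {metadata.confidence_score:.2f}")
#       finally:
#           await service.cleanup()
#
#   asyncio.run(main())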