youtube-summarizer/backend/services/whisper_transcript_service.py

"""
Whisper transcription service for YouTube videos.
Adapted from archived personal-ai-assistant transcription service for YouTube video context.
"""

import os
import logging
import tempfile
import asyncio
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from pathlib import Path
import torch
import whisper
from pydub import AudioSegment
import yt_dlp
import aiofiles
import aiohttp

from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
from ..core.config import settings
from ..config.video_download_config import VideoDownloadConfig
logger = logging.getLogger(__name__)


class WhisperTranscriptService:
    """Service for transcribing YouTube videos using OpenAI Whisper."""

    def __init__(self, model_size: str = "small", device: str = "auto"):
        """
        Initialize the Whisper transcription service.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
            device: Device to run on ("cpu", "cuda", "auto")
        """
        self.model_size = model_size
        self.device = self._get_device(device)
        self.model = None

        # Configuration
        self.chunk_duration = 30 * 60  # 30 minutes per chunk
        self.overlap_duration = 30     # 30 seconds overlap between chunks
        self.max_segment_length = 1000 # Maximum characters per segment

        # Use video storage configuration
        self.config = VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()
        self.temp_dir = self.storage_dirs["temp"]

    def _get_device(self, device: str) -> str:
        """Determine the appropriate device for processing."""
        if device == "auto":
            if torch.cuda.is_available():
                return "cuda"
            else:
                return "cpu"
        return device

    async def _load_model(self) -> whisper.Whisper:
        """Load the Whisper model on-demand."""
        if self.model is None:
            logger.info(f"Loading Whisper model '{self.model_size}' on device '{self.device}'")
            try:
                # Run model loading in executor to avoid blocking async loop
                loop = asyncio.get_event_loop()
                self.model = await loop.run_in_executor(
                    None,
                    lambda: whisper.load_model(self.model_size, device=self.device)
                )
                logger.info(f"Successfully loaded Whisper model '{self.model_size}'")
            except Exception as e:
                logger.error(f"Failed to load Whisper model: {e}")
                raise
        return self.model

    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting transcription...")

            logger.info(f"Starting Whisper transcription for video {video_id}")

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback
            )

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # Whisper auto-detects, but assume English for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="whisper_ai",
                processing_time_seconds=0,  # Will be calculated by caller
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments)
            )

            logger.info(f"Completed Whisper transcription for video {video_id}. Generated {len(segments)} segments.")

            # Save transcript to file
            await self._save_transcript(video_id, segments)

            return segments, metadata

        except Exception as e:
            logger.error(f"Whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up temporary WAV file, but keep MP3 for future re-transcription
            if 'audio_path' in locals() and audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')

                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
                else:
                    logger.info(f"Keeping audio file: {audio_path}")

    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from YouTube video using yt-dlp."""
        try:
            # Check if audio already exists (MP3 for storage)
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            wav_path = self.storage_dirs["audio"] / f"{video_id}.wav"

            # If MP3 exists, convert to WAV for Whisper
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                # Convert MP3 to WAV for Whisper processing
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in executor to avoid blocking
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts)
            )

            # Convert MP3 to WAV for Whisper processing
            if mp3_path.exists():
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            raise RuntimeError(f"Failed to download audio for {video_id}")

        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}")

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    async def _convert_audio(self, input_path: Path, output_path: Path):
        """Convert audio between formats using pydub."""
        try:
            loop = asyncio.get_event_loop()

            def convert():
                audio = AudioSegment.from_file(str(input_path))
                audio.export(str(output_path), format=output_path.suffix[1:])

            await loop.run_in_executor(None, convert)
            logger.info(f"Converted {input_path} to {output_path}")
        except Exception as e:
            logger.error(f"Audio conversion failed: {e}")
            raise

    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file with chunking for long videos.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get audio duration
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        if duration <= self.chunk_duration:
            # Process entire file at once for shorter videos
            return await self._transcribe_chunk(
                model, audio_path, 0, duration, progress_callback
            )
        else:
            # Process in chunks for longer videos
            return await self._transcribe_in_chunks(
                model, audio_path, duration, progress_callback
            )

    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get audio duration using pydub."""
        loop = asyncio.get_event_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # Convert milliseconds to seconds

    async def _transcribe_chunk(
        self,
        model: whisper.Whisper,
        audio_path: str,
        start_time: float,
        end_time: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a specific chunk of audio.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            start_time: Start time in seconds
            end_time: End time in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments for this chunk
        """
        try:
            # Extract audio chunk if needed
            if start_time > 0 or end_time < await self._get_audio_duration(audio_path):
                chunk_path = await self._extract_audio_chunk(
                    audio_path, start_time, end_time
                )
                time_offset = start_time
            else:
                chunk_path = audio_path
                time_offset = 0

            # Transcribe the chunk
            logger.info(f"Transcribing chunk {start_time:.1f}s - {end_time:.1f}s")

            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(
                None,
                lambda: model.transcribe(
                    chunk_path,
                    word_timestamps=True,
                    language="en",  # Can be made configurable
                    task="transcribe"
                )
            )

            # Convert to TranscriptSegment objects
            segments = []

            for whisper_segment in result["segments"]:
                # Adjust timestamps if this is a chunk
                adj_start = whisper_segment["start"] + time_offset
                adj_end = whisper_segment["end"] + time_offset

                # Split long segments
                text = whisper_segment["text"].strip()
                if len(text) > self.max_segment_length:
                    split_segments = self._split_long_segment(
                        text, adj_start, adj_end
                    )
                    segments.extend(split_segments)
                else:
                    segments.append(DualTranscriptSegment(
                        start_time=adj_start,
                        end_time=adj_end,
                        text=text,
                        confidence=whisper_segment.get("avg_logprob", 0.0)
                    ))

            # Clean up temporary chunk file
            if chunk_path != audio_path and os.path.exists(chunk_path):
                os.unlink(chunk_path)

            if progress_callback:
                await progress_callback(f"Transcribed chunk {start_time:.1f}s - {end_time:.1f}s")

            return segments

        except Exception as e:
            logger.error(f"Failed to transcribe chunk {start_time}-{end_time}: {e}")
            raise

    async def _extract_audio_chunk(
        self,
        audio_path: str,
        start_time: float,
        end_time: float
    ) -> str:
        """Extract a chunk of audio to a temporary file."""
        chunk_path = self.temp_dir / f"chunk_{start_time}_{end_time}.wav"

        loop = asyncio.get_event_loop()

        def extract_chunk():
            audio = AudioSegment.from_file(audio_path)
            chunk = audio[start_time*1000:end_time*1000]  # pydub uses milliseconds
            chunk.export(str(chunk_path), format="wav")

        await loop.run_in_executor(None, extract_chunk)
        return str(chunk_path)

    async def _transcribe_in_chunks(
        self,
        model: whisper.Whisper,
        audio_path: str,
        total_duration: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a long audio file in chunks with overlap.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            total_duration: Total duration in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        all_segments = []
        current_time = 0
        chunk_number = 1

        while current_time < total_duration:
            # Calculate chunk boundaries
            chunk_start = max(0, current_time - self.overlap_duration)
            chunk_end = min(total_duration, current_time + self.chunk_duration)

            logger.info(f"Processing chunk {chunk_number}: {chunk_start:.1f}s - {chunk_end:.1f}s")

            # Transcribe chunk
            chunk_segments = await self._transcribe_chunk(
                model, audio_path, chunk_start, chunk_end, progress_callback
            )

            # Filter overlapping segments (keep only new content)
            if current_time > 0:
                chunk_segments = [s for s in chunk_segments if s.start_time >= current_time]

            all_segments.extend(chunk_segments)

            # Move to next chunk
            current_time += self.chunk_duration
            chunk_number += 1

        return all_segments

    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split text by sentences or at word boundaries
        words = text.split()
        current_text = ""
        current_words = 0

        time_per_word = duration / len(words) if len(words) > 0 else 0

        for i, word in enumerate(words):
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Create segment
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word

                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip()
                ))

                current_text = word
            else:
                current_text += " " + word if current_text else word

            current_words += 1

        # Add final segment
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip()
            ))

        return segments

    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate overall quality score based on segment characteristics."""
        if not segments:
            return 0.0

        # Simple quality heuristics
        total_confidence = sum(s.confidence for s in segments if s.confidence is not None)
        avg_confidence = total_confidence / len(segments)

        # Normalize confidence from log probability to 0-1 scale
        # Whisper typically gives log probabilities from -5 to 0
        normalized_confidence = max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

        return normalized_confidence

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0

        avg_confidence = sum(confidences) / len(confidences)
        # Normalize from log probability to 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

    async def _save_transcript(self, video_id: str, segments: List[DualTranscriptSegment]):
        """Save transcript and audio metadata to files for future use"""
        try:
            # Save audio metadata
            await self._save_audio_metadata(video_id)
            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}.txt"

            # Create human-readable transcript file
            transcript_lines = []
            for segment in segments:
                if segment.start_time and segment.end_time:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))

            logger.info(f"Saved transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}.json"
            segments_data = [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "text": seg.text,
                    "confidence": seg.confidence
                }
                for seg in segments
            ]

            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                import json
                await f.write(json.dumps(segments_data, indent=2))

            logger.info(f"Saved transcript JSON to {json_path}")

        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str):
        """Save audio metadata for tracking and management"""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Get audio file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",
                "model_used": self.model_size,
                "device": self.device
            }

            # Try to get audio duration
            try:
                loop = asyncio.get_event_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                audio_info["duration_seconds"] = len(audio) / 1000.0
                audio_info["duration_formatted"] = f"{int(audio_info['duration_seconds'] // 60)}:{int(audio_info['duration_seconds'] % 60):02d}"
            except:
                pass

            # Save metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                import json
                await f.write(json.dumps(audio_info, indent=2))

            logger.info(f"Saved audio metadata to {metadata_path}")

        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def cleanup(self):
        """Clean up temporary files and resources."""
        try:
            # Don't delete the whole temp directory as it's shared
            # Just clean up old files periodically

            # Unload model to free GPU memory
            if self.model is not None:
                del self.model
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")