"""
Transcript streaming service for real-time transcript delivery (Task 14.3).
Provides live transcript chunks during video processing.
"""
import asyncio
import logging
from typing import Dict, List, Optional, AsyncGenerator, Any
from datetime import datetime
from dataclasses import dataclass

from ..core.websocket_manager import websocket_manager
from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
logger = logging.getLogger(__name__)


@dataclass
class TranscriptChunk:
    """Represents a chunk of transcript data for streaming."""

    job_id: str
    chunk_index: int
    total_chunks: Optional[int]
    timestamp_start: float
    timestamp_end: float
    text: str
    confidence: Optional[float] = None
    words: Optional[List[Dict[str, Any]]] = None
    source: str = "unknown"  # "youtube", "whisper", "hybrid"
    processing_stage: str = "processing"  # "processing", "complete", "error"


class TranscriptStreamingService:
    """
    Service for streaming transcript data in real-time during processing.
    Integrates with WebSocket manager to deliver live transcript chunks.
    """

    def __init__(self):
        self.active_streams: Dict[str, Dict[str, Any]] = {}
        self.chunk_buffers: Dict[str, List[TranscriptChunk]] = {}
        self.stream_metadata: Dict[str, Dict[str, Any]] = {}

    async def start_transcript_stream(
        self,
        job_id: str,
        video_id: str,
        source: str = "hybrid",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Start a transcript streaming session for a job.

        Args:
            job_id: Processing job identifier
            video_id: YouTube video ID
            source: Transcript source ("youtube", "whisper", "hybrid")
            chunk_duration: Duration of each transcript chunk in seconds
        """
        self.active_streams[job_id] = {
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            "started_at": datetime.utcnow(),
            "chunks_sent": 0,
            "total_text_length": 0,
            "status": "active"
        }
        self.chunk_buffers[job_id] = []
        self.stream_metadata[job_id] = {
            "estimated_total_chunks": None,
            "processing_method": source,
            "language": "auto-detect"
        }

        logger.info(f"Started transcript stream for job {job_id} (source: {source})")

        # Send initial stream notification
        await websocket_manager.send_transcript_chunk(job_id, {
            "type": "stream_started",
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            "message": f"Transcript streaming started using {source} method"
        })

    async def send_transcript_chunk(
        self,
        job_id: str,
        chunk: TranscriptChunk
    ) -> None:
        """
        Send a transcript chunk via WebSocket to connected clients.

        Args:
            job_id: Processing job identifier
            chunk: Transcript chunk data to send
        """
        if job_id not in self.active_streams:
            logger.warning(f"No active stream for job {job_id}")
            return

        # Update stream statistics
        stream_info = self.active_streams[job_id]
        stream_info["chunks_sent"] += 1
        stream_info["total_text_length"] += len(chunk.text)

        # Buffer the chunk for potential replay/reconnection
        self.chunk_buffers[job_id].append(chunk)

        # Limit buffer size to prevent memory issues
        if len(self.chunk_buffers[job_id]) > 100:
            self.chunk_buffers[job_id] = self.chunk_buffers[job_id][-50:]

        # Prepare chunk data for WebSocket transmission
        chunk_data = {
            "chunk_index": chunk.chunk_index,
            "total_chunks": chunk.total_chunks,
            "timestamp_start": chunk.timestamp_start,
            "timestamp_end": chunk.timestamp_end,
            "text": chunk.text,
            "confidence": chunk.confidence,
            "words": chunk.words,
            "source": chunk.source,
            "processing_stage": chunk.processing_stage,
            "stream_info": {
                "chunks_sent": stream_info["chunks_sent"],
                "total_text_length": stream_info["total_text_length"],
                "elapsed_time": (
                    datetime.utcnow() - stream_info["started_at"]
                ).total_seconds()
            }
        }

        # Send via WebSocket manager
        await websocket_manager.send_transcript_chunk(job_id, chunk_data)
        logger.debug(f"Sent transcript chunk {chunk.chunk_index} for job {job_id}")
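
    # Illustrative only: the shape of one chunk payload as a connected client
    # would receive it from the method above (all values here are made up):
    #   {
    #       "chunk_index": 3, "total_chunks": 12,
    #       "timestamp_start": 90.0, "timestamp_end": 120.0,
    #       "text": "...caption text for this window...",
    #       "confidence": 0.94,
    #       "words": [{"word": "...", "start_time": 90.2, "end_time": 90.5, "confidence": 0.97}],
    #       "source": "whisper", "processing_stage": "processing",
    #       "stream_info": {"chunks_sent": 4, "total_text_length": 2875, "elapsed_time": 41.3}
    #   }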

    async def stream_from_segments(
        self,
        job_id: str,
        segments: List[DualTranscriptSegment],
        source: str = "processed",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Stream transcript from a list of segments, grouping by duration.

        Args:
            job_id: Processing job identifier
            segments: List of transcript segments to stream
            source: Source of the segments
            chunk_duration: Target duration for each chunk
        """
        if not segments:
            logger.warning(f"No segments to stream for job {job_id}")
            return

        current_chunk_start = 0.0
        current_chunk_text = []
        current_chunk_words = []
        chunk_index = 0
        estimated_total_chunks = max(1, int(segments[-1].end_time / chunk_duration))

        # Update stream metadata
        if job_id in self.stream_metadata:
            self.stream_metadata[job_id]["estimated_total_chunks"] = estimated_total_chunks

        for segment in segments:
            # Check if we should start a new chunk
            if (segment.start_time - current_chunk_start >= chunk_duration and
                    current_chunk_text):
                # Send current chunk
                chunk = TranscriptChunk(
                    job_id=job_id,
                    chunk_index=chunk_index,
                    total_chunks=estimated_total_chunks,
                    timestamp_start=current_chunk_start,
                    timestamp_end=segment.start_time,
                    text=" ".join(current_chunk_text),
                    confidence=(
                        sum(w.get("confidence", 0.9) for w in current_chunk_words)
                        / len(current_chunk_words)
                        if current_chunk_words else None
                    ),
                    words=current_chunk_words,
                    source=source,
                    processing_stage="processing"
                )
                await self.send_transcript_chunk(job_id, chunk)

                # Start new chunk
                current_chunk_start = segment.start_time
                current_chunk_text = []
                current_chunk_words = []
                chunk_index += 1

                # Add small delay to simulate real-time processing
                await asyncio.sleep(0.1)

            # Add segment to current chunk
            current_chunk_text.append(segment.text)

            # Add word-level data if available
            if hasattr(segment, 'words') and segment.words:
                for word_info in segment.words:
                    current_chunk_words.append({
                        "word": word_info.get("word", ""),
                        "start_time": word_info.get("start_time", segment.start_time),
                        "end_time": word_info.get("end_time", segment.end_time),
                        "confidence": word_info.get("confidence", 0.9)
                    })

        # Send final chunk if there's remaining content
        if current_chunk_text:
            chunk = TranscriptChunk(
                job_id=job_id,
                chunk_index=chunk_index,
                total_chunks=chunk_index + 1,  # Final count
                timestamp_start=current_chunk_start,
                timestamp_end=segments[-1].end_time,
                text=" ".join(current_chunk_text),
                confidence=(
                    sum(w.get("confidence", 0.9) for w in current_chunk_words)
                    / len(current_chunk_words)
                    if current_chunk_words else None
                ),
                words=current_chunk_words,
                source=source,
                processing_stage="complete"
            )
            await self.send_transcript_chunk(job_id, chunk)
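
    # Example of the grouping above (illustrative numbers): with
    # chunk_duration=30.0 and segments spanning roughly 0-75s, chunks are
    # emitted for about [0, 30), [30, 60) and a final [60, 75] window; only
    # the last chunk carries processing_stage="complete" and an exact
    # total_chunks, while earlier ones report the duration-based estimate.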

    async def complete_transcript_stream(
        self,
        job_id: str,
        final_transcript: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Complete a transcript stream and send final summary.

        Args:
            job_id: Processing job identifier
            final_transcript: Complete final transcript text
            metadata: Optional metadata about the completed transcript
        """
        if job_id not in self.active_streams:
            logger.warning(f"No active stream to complete for job {job_id}")
            return

        stream_info = self.active_streams[job_id]
        stream_info["status"] = "completed"
        stream_info["completed_at"] = datetime.utcnow()

        # Prepare final transcript data
        transcript_data = {
            "type": "stream_complete",
            "final_transcript": final_transcript,
            "stream_statistics": {
                "total_chunks": stream_info["chunks_sent"],
                "total_text_length": len(final_transcript),
                "processing_duration": (
                    stream_info["completed_at"] - stream_info["started_at"]
                ).total_seconds(),
                "source": stream_info["source"]
            },
            "metadata": metadata or {},
            "message": "Transcript streaming completed successfully"
        }

        # Send completion notification
        await websocket_manager.send_transcript_complete(job_id, transcript_data)
        logger.info(f"Completed transcript stream for job {job_id}")

        # Cleanup (keep buffer for a short time for potential reconnections)
        asyncio.create_task(self._cleanup_stream(job_id, delay=300))  # 5 minutes

    async def handle_stream_error(
        self,
        job_id: str,
        error: Exception,
        partial_transcript: Optional[str] = None
    ) -> None:
        """
        Handle errors during transcript streaming.

        Args:
            job_id: Processing job identifier
            error: Error that occurred
            partial_transcript: Any partial transcript data available
        """
        if job_id in self.active_streams:
            self.active_streams[job_id]["status"] = "error"
            self.active_streams[job_id]["error"] = str(error)

        error_data = {
            "type": "stream_error",
            "error_message": str(error),
            "error_type": type(error).__name__,
            "partial_transcript": partial_transcript,
            "message": "An error occurred during transcript streaming"
        }

        await websocket_manager.send_transcript_complete(job_id, error_data)
        logger.error(f"Transcript streaming error for job {job_id}: {error}")

        # Clean up sooner on error (short 60-second grace period)
        asyncio.create_task(self._cleanup_stream(job_id, delay=60))

    async def _cleanup_stream(self, job_id: str, delay: int = 300) -> None:
        """Clean up stream data after a delay."""
        await asyncio.sleep(delay)

        if job_id in self.active_streams:
            del self.active_streams[job_id]
        if job_id in self.chunk_buffers:
            del self.chunk_buffers[job_id]
        if job_id in self.stream_metadata:
            del self.stream_metadata[job_id]

        logger.debug(f"Cleaned up streaming data for job {job_id}")

    def get_stream_status(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get current status of a transcript stream."""
        if job_id not in self.active_streams:
            return None

        return {
            **self.active_streams[job_id],
            "metadata": self.stream_metadata.get(job_id, {}),
            "buffer_size": len(self.chunk_buffers.get(job_id, []))
        }


# Global transcript streaming service instance
transcript_streaming_service = TranscriptStreamingService()
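

# --- Illustrative usage sketch ---------------------------------------------
# A minimal sketch (not called anywhere in this module) of how a processing
# pipeline step might drive the service end to end. Argument names and the
# example metadata are assumptions for illustration only; `segments` is
# expected to be the DualTranscriptSegment list produced by the transcription
# stage and `full_text` the assembled transcript.
async def _example_pipeline_step(
    job_id: str,
    video_id: str,
    segments: List[DualTranscriptSegment],
    full_text: str,
) -> None:
    await transcript_streaming_service.start_transcript_stream(
        job_id, video_id, source="whisper", chunk_duration=30.0
    )
    try:
        await transcript_streaming_service.stream_from_segments(
            job_id, segments, source="whisper", chunk_duration=30.0
        )
        await transcript_streaming_service.complete_transcript_stream(
            job_id, full_text, metadata={"language": "en"}
        )
    except Exception as exc:
        # Surface the failure (and any partial text) to connected clients.
        await transcript_streaming_service.handle_stream_error(
            job_id, exc, partial_transcript=full_text or None
        )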