""" Transcript streaming service for real-time transcript delivery (Task 14.3). Provides live transcript chunks during video processing. """ import asyncio import logging from typing import Dict, List, Optional, AsyncGenerator, Any from datetime import datetime from dataclasses import dataclass from ..core.websocket_manager import websocket_manager from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata logger = logging.getLogger(__name__) @dataclass class TranscriptChunk: """Represents a chunk of transcript data for streaming.""" job_id: str chunk_index: int total_chunks: Optional[int] timestamp_start: float timestamp_end: float text: str confidence: Optional[float] = None words: Optional[List[Dict[str, Any]]] = None source: str = "unknown" # "youtube", "whisper", "hybrid" processing_stage: str = "processing" # "processing", "complete", "error" class TranscriptStreamingService: """ Service for streaming transcript data in real-time during processing. Integrates with WebSocket manager to deliver live transcript chunks. """ def __init__(self): self.active_streams: Dict[str, Dict[str, Any]] = {} self.chunk_buffers: Dict[str, List[TranscriptChunk]] = {} self.stream_metadata: Dict[str, Dict[str, Any]] = {} async def start_transcript_stream( self, job_id: str, video_id: str, source: str = "hybrid", chunk_duration: float = 30.0 ) -> None: """ Start a transcript streaming session for a job. Args: job_id: Processing job identifier video_id: YouTube video ID source: Transcript source ("youtube", "whisper", "hybrid") chunk_duration: Duration of each transcript chunk in seconds """ self.active_streams[job_id] = { "video_id": video_id, "source": source, "chunk_duration": chunk_duration, "started_at": datetime.utcnow(), "chunks_sent": 0, "total_text_length": 0, "status": "active" } self.chunk_buffers[job_id] = [] self.stream_metadata[job_id] = { "estimated_total_chunks": None, "processing_method": source, "language": "auto-detect" } logger.info(f"Started transcript stream for job {job_id} (source: {source})") # Send initial stream notification await websocket_manager.send_transcript_chunk(job_id, { "type": "stream_started", "video_id": video_id, "source": source, "chunk_duration": chunk_duration, "message": f"Transcript streaming started using {source} method" }) async def send_transcript_chunk( self, job_id: str, chunk: TranscriptChunk ) -> None: """ Send a transcript chunk via WebSocket to connected clients. Args: job_id: Processing job identifier chunk: Transcript chunk data to send """ if job_id not in self.active_streams: logger.warning(f"No active stream for job {job_id}") return # Update stream statistics stream_info = self.active_streams[job_id] stream_info["chunks_sent"] += 1 stream_info["total_text_length"] += len(chunk.text) # Buffer the chunk for potential replay/reconnection self.chunk_buffers[job_id].append(chunk) # Limit buffer size to prevent memory issues if len(self.chunk_buffers[job_id]) > 100: self.chunk_buffers[job_id] = self.chunk_buffers[job_id][-50:] # Prepare chunk data for WebSocket transmission chunk_data = { "chunk_index": chunk.chunk_index, "total_chunks": chunk.total_chunks, "timestamp_start": chunk.timestamp_start, "timestamp_end": chunk.timestamp_end, "text": chunk.text, "confidence": chunk.confidence, "words": chunk.words, "source": chunk.source, "processing_stage": chunk.processing_stage, "stream_info": { "chunks_sent": stream_info["chunks_sent"], "total_text_length": stream_info["total_text_length"], "elapsed_time": ( datetime.utcnow() - stream_info["started_at"] ).total_seconds() } } # Send via WebSocket manager await websocket_manager.send_transcript_chunk(job_id, chunk_data) logger.debug(f"Sent transcript chunk {chunk.chunk_index} for job {job_id}") async def stream_from_segments( self, job_id: str, segments: List[DualTranscriptSegment], source: str = "processed", chunk_duration: float = 30.0 ) -> None: """ Stream transcript from a list of segments, grouping by duration. Args: job_id: Processing job identifier segments: List of transcript segments to stream source: Source of the segments chunk_duration: Target duration for each chunk """ if not segments: logger.warning(f"No segments to stream for job {job_id}") return current_chunk_start = 0.0 current_chunk_text = [] current_chunk_words = [] chunk_index = 0 estimated_total_chunks = max(1, int(segments[-1].end_time / chunk_duration)) # Update stream metadata if job_id in self.stream_metadata: self.stream_metadata[job_id]["estimated_total_chunks"] = estimated_total_chunks for segment in segments: # Check if we should start a new chunk if (segment.start_time - current_chunk_start >= chunk_duration and current_chunk_text): # Send current chunk chunk = TranscriptChunk( job_id=job_id, chunk_index=chunk_index, total_chunks=estimated_total_chunks, timestamp_start=current_chunk_start, timestamp_end=segment.start_time, text=" ".join(current_chunk_text), confidence=sum(w.get("confidence", 0.9) for w in current_chunk_words) / len(current_chunk_words) if current_chunk_words else None, words=current_chunk_words, source=source, processing_stage="processing" ) await self.send_transcript_chunk(job_id, chunk) # Start new chunk current_chunk_start = segment.start_time current_chunk_text = [] current_chunk_words = [] chunk_index += 1 # Add small delay to simulate real-time processing await asyncio.sleep(0.1) # Add segment to current chunk current_chunk_text.append(segment.text) # Add word-level data if available if hasattr(segment, 'words') and segment.words: for word_info in segment.words: current_chunk_words.append({ "word": word_info.get("word", ""), "start_time": word_info.get("start_time", segment.start_time), "end_time": word_info.get("end_time", segment.end_time), "confidence": word_info.get("confidence", 0.9) }) # Send final chunk if there's remaining content if current_chunk_text: chunk = TranscriptChunk( job_id=job_id, chunk_index=chunk_index, total_chunks=chunk_index + 1, # Final count timestamp_start=current_chunk_start, timestamp_end=segments[-1].end_time, text=" ".join(current_chunk_text), confidence=sum(w.get("confidence", 0.9) for w in current_chunk_words) / len(current_chunk_words) if current_chunk_words else None, words=current_chunk_words, source=source, processing_stage="complete" ) await self.send_transcript_chunk(job_id, chunk) async def complete_transcript_stream( self, job_id: str, final_transcript: str, metadata: Optional[Dict[str, Any]] = None ) -> None: """ Complete a transcript stream and send final summary. Args: job_id: Processing job identifier final_transcript: Complete final transcript text metadata: Optional metadata about the completed transcript """ if job_id not in self.active_streams: logger.warning(f"No active stream to complete for job {job_id}") return stream_info = self.active_streams[job_id] stream_info["status"] = "completed" stream_info["completed_at"] = datetime.utcnow() # Prepare final transcript data transcript_data = { "type": "stream_complete", "final_transcript": final_transcript, "stream_statistics": { "total_chunks": stream_info["chunks_sent"], "total_text_length": len(final_transcript), "processing_duration": ( stream_info["completed_at"] - stream_info["started_at"] ).total_seconds(), "source": stream_info["source"] }, "metadata": metadata or {}, "message": "Transcript streaming completed successfully" } # Send completion notification await websocket_manager.send_transcript_complete(job_id, transcript_data) logger.info(f"Completed transcript stream for job {job_id}") # Cleanup (keep buffer for a short time for potential reconnections) asyncio.create_task(self._cleanup_stream(job_id, delay=300)) # 5 minutes async def handle_stream_error( self, job_id: str, error: Exception, partial_transcript: Optional[str] = None ) -> None: """ Handle errors during transcript streaming. Args: job_id: Processing job identifier error: Error that occurred partial_transcript: Any partial transcript data available """ if job_id in self.active_streams: self.active_streams[job_id]["status"] = "error" self.active_streams[job_id]["error"] = str(error) error_data = { "type": "stream_error", "error_message": str(error), "error_type": type(error).__name__, "partial_transcript": partial_transcript, "message": "An error occurred during transcript streaming" } await websocket_manager.send_transcript_complete(job_id, error_data) logger.error(f"Transcript streaming error for job {job_id}: {error}") # Cleanup immediately on error asyncio.create_task(self._cleanup_stream(job_id, delay=60)) async def _cleanup_stream(self, job_id: str, delay: int = 300) -> None: """Clean up stream data after a delay.""" await asyncio.sleep(delay) if job_id in self.active_streams: del self.active_streams[job_id] if job_id in self.chunk_buffers: del self.chunk_buffers[job_id] if job_id in self.stream_metadata: del self.stream_metadata[job_id] logger.debug(f"Cleaned up streaming data for job {job_id}") def get_stream_status(self, job_id: str) -> Optional[Dict[str, Any]]: """Get current status of a transcript stream.""" if job_id not in self.active_streams: return None return { **self.active_streams[job_id], "metadata": self.stream_metadata.get(job_id, {}), "buffer_size": len(self.chunk_buffers.get(job_id, [])) } # Global transcript streaming service instance transcript_streaming_service = TranscriptStreamingService()