"""
Transcript streaming service for real-time transcript delivery (Task 14.3).
Provides live transcript chunks during video processing.
"""
import asyncio
import logging
from typing import Dict, List, Optional, AsyncGenerator, Any
from datetime import datetime
from dataclasses import dataclass

from ..core.websocket_manager import websocket_manager
from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
logger = logging.getLogger(__name__)


@dataclass
class TranscriptChunk:
    """Represents a chunk of transcript data for streaming."""

    job_id: str
    chunk_index: int
    total_chunks: Optional[int]
    timestamp_start: float
    timestamp_end: float
    text: str
    confidence: Optional[float] = None
    words: Optional[List[Dict[str, Any]]] = None
    source: str = "unknown"  # "youtube", "whisper", "hybrid"
    processing_stage: str = "processing"  # "processing", "complete", "error"


class TranscriptStreamingService:
    """
    Service for streaming transcript data in real-time during processing.
    Integrates with WebSocket manager to deliver live transcript chunks.
    """

    def __init__(self):
        self.active_streams: Dict[str, Dict[str, Any]] = {}
        self.chunk_buffers: Dict[str, List[TranscriptChunk]] = {}
        self.stream_metadata: Dict[str, Dict[str, Any]] = {}

    async def start_transcript_stream(
        self,
        job_id: str,
        video_id: str,
        source: str = "hybrid",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Start a transcript streaming session for a job.

        Args:
            job_id: Processing job identifier
            video_id: YouTube video ID
            source: Transcript source ("youtube", "whisper", "hybrid")
            chunk_duration: Duration of each transcript chunk in seconds
        """
        self.active_streams[job_id] = {
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            "started_at": datetime.utcnow(),
            "chunks_sent": 0,
            "total_text_length": 0,
            "status": "active"
        }
        self.chunk_buffers[job_id] = []
        self.stream_metadata[job_id] = {
            "estimated_total_chunks": None,
            "processing_method": source,
            "language": "auto-detect"
        }

        logger.info(f"Started transcript stream for job {job_id} (source: {source})")

        # Send initial stream notification
        await websocket_manager.send_transcript_chunk(job_id, {
            "type": "stream_started",
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            "message": f"Transcript streaming started using {source} method"
        })

    async def send_transcript_chunk(
        self,
        job_id: str,
        chunk: TranscriptChunk
    ) -> None:
        """
        Send a transcript chunk via WebSocket to connected clients.

        Args:
            job_id: Processing job identifier
            chunk: Transcript chunk data to send
        """
        if job_id not in self.active_streams:
            logger.warning(f"No active stream for job {job_id}")
            return

        # Update stream statistics
        stream_info = self.active_streams[job_id]
        stream_info["chunks_sent"] += 1
        stream_info["total_text_length"] += len(chunk.text)

        # Buffer the chunk for potential replay/reconnection
        self.chunk_buffers[job_id].append(chunk)

        # Limit buffer size to prevent memory issues
        if len(self.chunk_buffers[job_id]) > 100:
            self.chunk_buffers[job_id] = self.chunk_buffers[job_id][-50:]

        # Prepare chunk data for WebSocket transmission
        chunk_data = {
            "chunk_index": chunk.chunk_index,
            "total_chunks": chunk.total_chunks,
            "timestamp_start": chunk.timestamp_start,
            "timestamp_end": chunk.timestamp_end,
            "text": chunk.text,
            "confidence": chunk.confidence,
            "words": chunk.words,
            "source": chunk.source,
            "processing_stage": chunk.processing_stage,
            "stream_info": {
                "chunks_sent": stream_info["chunks_sent"],
                "total_text_length": stream_info["total_text_length"],
                "elapsed_time": (
                    datetime.utcnow() - stream_info["started_at"]
                ).total_seconds()
            }
        }

        # Send via WebSocket manager
        await websocket_manager.send_transcript_chunk(job_id, chunk_data)
        logger.debug(f"Sent transcript chunk {chunk.chunk_index} for job {job_id}")
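
    # Illustrative only: the shape of one chunk payload as a connected client
    # would receive it from the method above (all values here are made up):
    #   {
    #       "chunk_index": 3, "total_chunks": 12,
    #       "timestamp_start": 90.0, "timestamp_end": 120.0,
    #       "text": "...caption text for this window...",
    #       "confidence": 0.94,
    #       "words": [{"word": "...", "start_time": 90.2, "end_time": 90.5, "confidence": 0.97}],
    #       "source": "whisper", "processing_stage": "processing",
    #       "stream_info": {"chunks_sent": 4, "total_text_length": 2875, "elapsed_time": 41.3}
    #   }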

    async def stream_from_segments(
        self,
        job_id: str,
        segments: List[DualTranscriptSegment],
        source: str = "processed",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Stream transcript from a list of segments, grouping by duration.

        Args:
            job_id: Processing job identifier
            segments: List of transcript segments to stream
            source: Source of the segments
            chunk_duration: Target duration for each chunk
        """
        if not segments:
            logger.warning(f"No segments to stream for job {job_id}")
            return

        current_chunk_start = 0.0
        current_chunk_text = []
        current_chunk_words = []
        chunk_index = 0
        estimated_total_chunks = max(1, int(segments[-1].end_time / chunk_duration))

        # Update stream metadata
        if job_id in self.stream_metadata:
            self.stream_metadata[job_id]["estimated_total_chunks"] = estimated_total_chunks

        for segment in segments:
            # Check if we should start a new chunk
            if (segment.start_time - current_chunk_start >= chunk_duration and
                    current_chunk_text):
                # Send current chunk
                chunk = TranscriptChunk(
                    job_id=job_id,
                    chunk_index=chunk_index,
                    total_chunks=estimated_total_chunks,
                    timestamp_start=current_chunk_start,
                    timestamp_end=segment.start_time,
                    text=" ".join(current_chunk_text),
                    confidence=(
                        sum(w.get("confidence", 0.9) for w in current_chunk_words)
                        / len(current_chunk_words)
                        if current_chunk_words else None
                    ),
                    words=current_chunk_words,
                    source=source,
                    processing_stage="processing"
                )
                await self.send_transcript_chunk(job_id, chunk)

                # Start new chunk
                current_chunk_start = segment.start_time
                current_chunk_text = []
                current_chunk_words = []
                chunk_index += 1

                # Add small delay to simulate real-time processing
                await asyncio.sleep(0.1)

            # Add segment to current chunk
            current_chunk_text.append(segment.text)

            # Add word-level data if available
            if hasattr(segment, 'words') and segment.words:
                for word_info in segment.words:
                    current_chunk_words.append({
                        "word": word_info.get("word", ""),
                        "start_time": word_info.get("start_time", segment.start_time),
                        "end_time": word_info.get("end_time", segment.end_time),
                        "confidence": word_info.get("confidence", 0.9)
                    })

        # Send final chunk if there's remaining content
        if current_chunk_text:
            chunk = TranscriptChunk(
                job_id=job_id,
                chunk_index=chunk_index,
                total_chunks=chunk_index + 1,  # Final count
                timestamp_start=current_chunk_start,
                timestamp_end=segments[-1].end_time,
                text=" ".join(current_chunk_text),
                confidence=(
                    sum(w.get("confidence", 0.9) for w in current_chunk_words)
                    / len(current_chunk_words)
                    if current_chunk_words else None
                ),
                words=current_chunk_words,
                source=source,
                processing_stage="complete"
            )
            await self.send_transcript_chunk(job_id, chunk)
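
    # Example of the grouping above (illustrative numbers): with
    # chunk_duration=30.0 and segments spanning roughly 0-75s, chunks are
    # emitted for about [0, 30), [30, 60) and a final [60, 75] window; only
    # the last chunk carries processing_stage="complete" and an exact
    # total_chunks, while earlier ones report the duration-based estimate.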

    async def complete_transcript_stream(
        self,
        job_id: str,
        final_transcript: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Complete a transcript stream and send final summary.

        Args:
            job_id: Processing job identifier
            final_transcript: Complete final transcript text
            metadata: Optional metadata about the completed transcript
        """
        if job_id not in self.active_streams:
            logger.warning(f"No active stream to complete for job {job_id}")
            return

        stream_info = self.active_streams[job_id]
        stream_info["status"] = "completed"
        stream_info["completed_at"] = datetime.utcnow()

        # Prepare final transcript data
        transcript_data = {
            "type": "stream_complete",
            "final_transcript": final_transcript,
            "stream_statistics": {
                "total_chunks": stream_info["chunks_sent"],
                "total_text_length": len(final_transcript),
                "processing_duration": (
                    stream_info["completed_at"] - stream_info["started_at"]
                ).total_seconds(),
                "source": stream_info["source"]
            },
            "metadata": metadata or {},
            "message": "Transcript streaming completed successfully"
        }

        # Send completion notification
        await websocket_manager.send_transcript_complete(job_id, transcript_data)
        logger.info(f"Completed transcript stream for job {job_id}")

        # Cleanup (keep buffer for a short time for potential reconnections)
        asyncio.create_task(self._cleanup_stream(job_id, delay=300))  # 5 minutes

    async def handle_stream_error(
        self,
        job_id: str,
        error: Exception,
        partial_transcript: Optional[str] = None
    ) -> None:
        """
        Handle errors during transcript streaming.

        Args:
            job_id: Processing job identifier
            error: Error that occurred
            partial_transcript: Any partial transcript data available
        """
        if job_id in self.active_streams:
            self.active_streams[job_id]["status"] = "error"
            self.active_streams[job_id]["error"] = str(error)

        error_data = {
            "type": "stream_error",
            "error_message": str(error),
            "error_type": type(error).__name__,
            "partial_transcript": partial_transcript,
            "message": "An error occurred during transcript streaming"
        }

        await websocket_manager.send_transcript_complete(job_id, error_data)
        logger.error(f"Transcript streaming error for job {job_id}: {error}")

        # Clean up sooner on error (short 60-second grace period)
        asyncio.create_task(self._cleanup_stream(job_id, delay=60))

    async def _cleanup_stream(self, job_id: str, delay: int = 300) -> None:
        """Clean up stream data after a delay."""
        await asyncio.sleep(delay)

        if job_id in self.active_streams:
            del self.active_streams[job_id]
        if job_id in self.chunk_buffers:
            del self.chunk_buffers[job_id]
        if job_id in self.stream_metadata:
            del self.stream_metadata[job_id]

        logger.debug(f"Cleaned up streaming data for job {job_id}")

    def get_stream_status(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get current status of a transcript stream."""
        if job_id not in self.active_streams:
            return None

        return {
            **self.active_streams[job_id],
            "metadata": self.stream_metadata.get(job_id, {}),
            "buffer_size": len(self.chunk_buffers.get(job_id, []))
        }


# Global transcript streaming service instance
transcript_streaming_service = TranscriptStreamingService()
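

# --- Illustrative usage sketch ---------------------------------------------
# A minimal sketch (not called anywhere in this module) of how a processing
# pipeline step might drive the service end to end. Argument names and the
# example metadata are assumptions for illustration only; `segments` is
# expected to be the DualTranscriptSegment list produced by the transcription
# stage and `full_text` the assembled transcript.
async def _example_pipeline_step(
    job_id: str,
    video_id: str,
    segments: List[DualTranscriptSegment],
    full_text: str,
) -> None:
    await transcript_streaming_service.start_transcript_stream(
        job_id, video_id, source="whisper", chunk_duration=30.0
    )
    try:
        await transcript_streaming_service.stream_from_segments(
            job_id, segments, source="whisper", chunk_duration=30.0
        )
        await transcript_streaming_service.complete_transcript_stream(
            job_id, full_text, metadata={"language": "en"}
        )
    except Exception as exc:
        # Surface the failure (and any partial text) to connected clients.
        await transcript_streaming_service.handle_stream_error(
            job_id, exc, partial_transcript=full_text or None
        )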