336 lines
12 KiB
Python
336 lines
12 KiB
Python
"""
|
|
Transcript streaming service for real-time transcript delivery (Task 14.3).
|
|
Provides live transcript chunks during video processing.
|
|
"""
|
|
|
|
import asyncio
import logging
import math
from typing import Dict, List, Optional, AsyncGenerator, Any
from datetime import datetime
from dataclasses import dataclass

from ..core.websocket_manager import websocket_manager
from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
|
|
|
|
# Module-level logger, named after this module so log records can be filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class TranscriptChunk:
    """Represents a chunk of transcript data for streaming.

    Attributes:
        job_id: Processing job identifier the chunk belongs to.
        chunk_index: Position of the chunk within its stream.
        total_chunks: Number of chunks expected for the stream, or ``None``
            when that is not known.
        timestamp_start: Start of the span covered by the chunk (presumably
            seconds into the video; confirm against the producer).
        timestamp_end: End of the span covered by the chunk, same unit as
            ``timestamp_start``.
        text: Transcript text of the chunk.
        confidence: Optional confidence score for the chunk.
        words: Optional word-level entries, one dict per word.
        source: Origin of the transcript data.
        processing_stage: Stage the stream was in when the chunk was produced.
    """

    job_id: str
    chunk_index: int
    total_chunks: Optional[int]
    timestamp_start: float
    timestamp_end: float
    text: str
    confidence: Optional[float] = None
    words: Optional[List[Dict[str, Any]]] = None
    source: str = "unknown"  # "youtube", "whisper", "hybrid"
    processing_stage: str = "processing"  # "processing", "complete", "error"
|
|
|
|
|
|
class TranscriptStreamingService:
    """
    Service for streaming transcript data in real-time during processing.

    Integrates with WebSocket manager to deliver live transcript chunks.

    Per-job state is kept in three dicts keyed by job id:

    * ``active_streams``  - stream settings and running statistics
    * ``chunk_buffers``   - recently sent chunks, kept for replay/reconnection
    * ``stream_metadata`` - estimated chunk count, processing method, language

    State of a completed or failed stream is removed by a delayed background
    task (see ``_schedule_cleanup``).
    """

    # Replay-buffer bounds: once a buffer grows past MAX_BUFFERED_CHUNKS it is
    # cut back to its newest TRIMMED_BUFFER_CHUNKS entries.
    MAX_BUFFERED_CHUNKS = 100
    TRIMMED_BUFFER_CHUNKS = 50

    # Confidence assumed for a word entry that carries no confidence value.
    DEFAULT_WORD_CONFIDENCE = 0.9

    def __init__(self):
        self.active_streams: Dict[str, Dict[str, Any]] = {}
        self.chunk_buffers: Dict[str, List[TranscriptChunk]] = {}
        self.stream_metadata: Dict[str, Dict[str, Any]] = {}
        # Pending delayed-cleanup task per job. Holding the reference keeps the
        # task from being garbage-collected before it runs and lets a newer
        # stream/cleanup for the same job cancel a stale timer.
        self._cleanup_tasks: Dict[str, "asyncio.Task[None]"] = {}

    async def start_transcript_stream(
        self,
        job_id: str,
        video_id: str,
        source: str = "hybrid",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Start a transcript streaming session for a job.

        Any state already registered under ``job_id`` is replaced, and a
        cleanup timer left over from an earlier stream with the same id is
        cancelled so it cannot delete the new stream.

        Args:
            job_id: Processing job identifier
            video_id: YouTube video ID
            source: Transcript source ("youtube", "whisper", "hybrid")
            chunk_duration: Duration of each transcript chunk in seconds
        """
        self._cancel_pending_cleanup(job_id)

        self.active_streams[job_id] = {
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            # Naive UTC timestamp; kept naive because it is exposed as-is
            # through get_stream_status().
            "started_at": datetime.utcnow(),
            "chunks_sent": 0,
            "total_text_length": 0,
            "status": "active"
        }
        self.chunk_buffers[job_id] = []
        self.stream_metadata[job_id] = {
            "estimated_total_chunks": None,
            "processing_method": source,
            "language": "auto-detect"
        }

        logger.info("Started transcript stream for job %s (source: %s)", job_id, source)

        # Send initial stream notification
        await websocket_manager.send_transcript_chunk(job_id, {
            "type": "stream_started",
            "video_id": video_id,
            "source": source,
            "chunk_duration": chunk_duration,
            "message": f"Transcript streaming started using {source} method"
        })

    async def send_transcript_chunk(
        self,
        job_id: str,
        chunk: "TranscriptChunk"
    ) -> None:
        """
        Send a transcript chunk via WebSocket to connected clients.

        Updates the stream statistics, stores the chunk in the job's replay
        buffer and forwards it to the WebSocket manager. Does nothing (apart
        from logging a warning) when no stream is registered for ``job_id``.

        Args:
            job_id: Processing job identifier
            chunk: Transcript chunk data to send
        """
        stream_info = self.active_streams.get(job_id)
        if stream_info is None:
            logger.warning("No active stream for job %s", job_id)
            return

        # Update stream statistics
        stream_info["chunks_sent"] += 1
        stream_info["total_text_length"] += len(chunk.text)

        # Buffer the chunk for potential replay/reconnection
        buffer = self.chunk_buffers.setdefault(job_id, [])
        buffer.append(chunk)

        # Limit buffer size to prevent memory issues
        if len(buffer) > self.MAX_BUFFERED_CHUNKS:
            self.chunk_buffers[job_id] = buffer[-self.TRIMMED_BUFFER_CHUNKS:]

        # Prepare chunk data for WebSocket transmission
        elapsed = datetime.utcnow() - stream_info["started_at"]
        chunk_data = {
            "chunk_index": chunk.chunk_index,
            "total_chunks": chunk.total_chunks,
            "timestamp_start": chunk.timestamp_start,
            "timestamp_end": chunk.timestamp_end,
            "text": chunk.text,
            "confidence": chunk.confidence,
            "words": chunk.words,
            "source": chunk.source,
            "processing_stage": chunk.processing_stage,
            "stream_info": {
                "chunks_sent": stream_info["chunks_sent"],
                "total_text_length": stream_info["total_text_length"],
                "elapsed_time": elapsed.total_seconds()
            }
        }

        # Send via WebSocket manager
        await websocket_manager.send_transcript_chunk(job_id, chunk_data)

        logger.debug("Sent transcript chunk %s for job %s", chunk.chunk_index, job_id)

    async def stream_from_segments(
        self,
        job_id: str,
        segments: "List[DualTranscriptSegment]",
        source: str = "processed",
        chunk_duration: float = 30.0
    ) -> None:
        """
        Stream transcript from a list of segments, grouping by duration.

        Segments are accumulated into a chunk until a segment starts at least
        ``chunk_duration`` after the current chunk's start; the accumulated
        chunk is then sent and a new one begins at that segment. Whatever is
        left after the last segment is sent as the final chunk with
        ``processing_stage="complete"`` and the exact chunk count.

        Args:
            job_id: Processing job identifier
            segments: List of transcript segments to stream
            source: Source of the segments
            chunk_duration: Target duration for each chunk

        Raises:
            ValueError: If ``segments`` is non-empty and ``chunk_duration``
                is not positive.
        """
        if not segments:
            logger.warning("No segments to stream for job %s", job_id)
            return

        if chunk_duration <= 0:
            raise ValueError(
                f"chunk_duration must be positive, got {chunk_duration!r}"
            )

        estimated_total_chunks = self._estimate_total_chunks(
            segments[-1].end_time, chunk_duration
        )

        # Update stream metadata
        if job_id in self.stream_metadata:
            self.stream_metadata[job_id]["estimated_total_chunks"] = estimated_total_chunks

        chunk_start = 0.0
        chunk_texts: List[str] = []
        chunk_words: List[Dict[str, Any]] = []
        chunk_index = 0

        for segment in segments:
            # Check if we should start a new chunk
            if chunk_texts and segment.start_time - chunk_start >= chunk_duration:
                # Send current chunk
                await self.send_transcript_chunk(job_id, self._build_chunk(
                    job_id=job_id,
                    chunk_index=chunk_index,
                    total_chunks=estimated_total_chunks,
                    timestamp_start=chunk_start,
                    timestamp_end=segment.start_time,
                    texts=chunk_texts,
                    words=chunk_words,
                    source=source,
                    processing_stage="processing",
                ))

                # Start new chunk (fresh lists: the sent chunk keeps the old ones)
                chunk_start = segment.start_time
                chunk_texts = []
                chunk_words = []
                chunk_index += 1

                # Add small delay to simulate real-time processing
                await asyncio.sleep(0.1)

            # Add segment to current chunk
            chunk_texts.append(segment.text)

            # Add word-level data if available
            for word_info in getattr(segment, "words", None) or ():
                chunk_words.append({
                    "word": word_info.get("word", ""),
                    "start_time": word_info.get("start_time", segment.start_time),
                    "end_time": word_info.get("end_time", segment.end_time),
                    "confidence": word_info.get(
                        "confidence", self.DEFAULT_WORD_CONFIDENCE
                    )
                })

        # Send final chunk if there's remaining content
        if chunk_texts:
            await self.send_transcript_chunk(job_id, self._build_chunk(
                job_id=job_id,
                chunk_index=chunk_index,
                total_chunks=chunk_index + 1,  # Final count
                timestamp_start=chunk_start,
                timestamp_end=segments[-1].end_time,
                texts=chunk_texts,
                words=chunk_words,
                source=source,
                processing_stage="complete",
            ))

    @staticmethod
    def _estimate_total_chunks(total_duration: float, chunk_duration: float) -> int:
        """Return an upper-bound estimate of the chunk count (at least 1).

        Rounds up: every chunk but the last spans at least ``chunk_duration``
        between its start and the next chunk's start, so truncating the
        quotient could report fewer chunks than are actually sent.
        """
        return max(1, math.ceil(total_duration / chunk_duration))

    @classmethod
    def _mean_confidence(cls, words: List[Dict[str, Any]]) -> Optional[float]:
        """Return the average word confidence, or ``None`` when there are no words."""
        if not words:
            return None
        total = sum(w.get("confidence", cls.DEFAULT_WORD_CONFIDENCE) for w in words)
        return total / len(words)

    def _build_chunk(
        self,
        *,
        job_id: str,
        chunk_index: int,
        total_chunks: Optional[int],
        timestamp_start: float,
        timestamp_end: float,
        texts: List[str],
        words: List[Dict[str, Any]],
        source: str,
        processing_stage: str,
    ) -> "TranscriptChunk":
        """Assemble a TranscriptChunk from the accumulated segment texts and words."""
        return TranscriptChunk(
            job_id=job_id,
            chunk_index=chunk_index,
            total_chunks=total_chunks,
            timestamp_start=timestamp_start,
            timestamp_end=timestamp_end,
            text=" ".join(texts),
            confidence=self._mean_confidence(words),
            words=words,
            source=source,
            processing_stage=processing_stage,
        )

    async def complete_transcript_stream(
        self,
        job_id: str,
        final_transcript: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """
        Complete a transcript stream and send final summary.

        Marks the stream as completed, sends the final transcript with stream
        statistics, and schedules removal of the stream state after five
        minutes. Does nothing (apart from logging a warning) when no stream is
        registered for ``job_id``.

        Args:
            job_id: Processing job identifier
            final_transcript: Complete final transcript text
            metadata: Optional metadata about the completed transcript
        """
        stream_info = self.active_streams.get(job_id)
        if stream_info is None:
            logger.warning("No active stream to complete for job %s", job_id)
            return

        stream_info["status"] = "completed"
        stream_info["completed_at"] = datetime.utcnow()
        processing_time = stream_info["completed_at"] - stream_info["started_at"]

        # Prepare final transcript data
        transcript_data = {
            "type": "stream_complete",
            "final_transcript": final_transcript,
            "stream_statistics": {
                "total_chunks": stream_info["chunks_sent"],
                "total_text_length": len(final_transcript),
                "processing_duration": processing_time.total_seconds(),
                "source": stream_info["source"]
            },
            "metadata": metadata or {},
            "message": "Transcript streaming completed successfully"
        }

        # Send completion notification
        await websocket_manager.send_transcript_complete(job_id, transcript_data)

        logger.info("Completed transcript stream for job %s", job_id)

        # Cleanup (keep buffer for a short time for potential reconnections)
        self._schedule_cleanup(job_id, delay=300)  # 5 minutes

    async def handle_stream_error(
        self,
        job_id: str,
        error: Exception,
        partial_transcript: Optional[str] = None
    ) -> None:
        """
        Handle errors during transcript streaming.

        Records the error on the stream (when one is registered), notifies
        clients, and schedules removal of the stream state after one minute.

        Args:
            job_id: Processing job identifier
            error: Error that occurred
            partial_transcript: Any partial transcript data available
        """
        stream_info = self.active_streams.get(job_id)
        if stream_info is not None:
            stream_info["status"] = "error"
            stream_info["error"] = str(error)

        error_data = {
            "type": "stream_error",
            "error_message": str(error),
            "error_type": type(error).__name__,
            "partial_transcript": partial_transcript,
            "message": "An error occurred during transcript streaming"
        }

        await websocket_manager.send_transcript_complete(job_id, error_data)

        logger.error("Transcript streaming error for job %s: %s", job_id, error)

        # Failed streams are cleaned up sooner than completed ones.
        self._schedule_cleanup(job_id, delay=60)

    def _schedule_cleanup(self, job_id: str, delay: int) -> None:
        """Schedule delayed cleanup for a job, replacing any pending cleanup for it."""
        self._cancel_pending_cleanup(job_id)
        task = asyncio.create_task(self._cleanup_stream(job_id, delay=delay))
        # Keep a strong reference: the event loop only holds weak references
        # to tasks, so an unreferenced task may be collected before it runs.
        self._cleanup_tasks[job_id] = task
        task.add_done_callback(
            lambda finished: self._forget_cleanup_task(job_id, finished)
        )

    def _cancel_pending_cleanup(self, job_id: str) -> None:
        """Cancel and forget the pending cleanup task for a job, if there is one."""
        pending = self._cleanup_tasks.pop(job_id, None)
        if pending is not None and not pending.done():
            pending.cancel()

    def _forget_cleanup_task(self, job_id: str, task: "asyncio.Task[None]") -> None:
        """Drop a finished cleanup task unless a newer one has replaced it."""
        if self._cleanup_tasks.get(job_id) is task:
            del self._cleanup_tasks[job_id]

    async def _cleanup_stream(self, job_id: str, delay: int = 300) -> None:
        """Clean up stream data after a delay."""
        await asyncio.sleep(delay)

        self.active_streams.pop(job_id, None)
        self.chunk_buffers.pop(job_id, None)
        self.stream_metadata.pop(job_id, None)

        logger.debug("Cleaned up streaming data for job %s", job_id)

    def get_stream_status(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get current status of a transcript stream.

        Returns:
            A new dict with the stream's settings and statistics plus
            ``"metadata"`` and ``"buffer_size"`` entries, or ``None`` when no
            stream is registered for ``job_id``.
        """
        stream_info = self.active_streams.get(job_id)
        if stream_info is None:
            return None

        return {
            **stream_info,
            "metadata": self.stream_metadata.get(job_id, {}),
            "buffer_size": len(self.chunk_buffers.get(job_id, []))
        }
|
|
|
|
|
|
# Global transcript streaming service instance, created once at import time
# so that importers of this module share the same stream state.
transcript_streaming_service = TranscriptStreamingService()