# youtube-summarizer/backend/services/summary_pipeline.py
"""Summary Pipeline orchestrates the complete YouTube summarization workflow."""
import asyncio
import logging
import uuid
from datetime import datetime, timedelta
from typing import Dict, Optional, List, Any, Callable, Set
from dataclasses import asdict

from ..services.video_service import VideoService
from ..services.transcript_service import TranscriptService
from ..services.deepseek_summarizer import DeepSeekSummarizer
from ..services.cache_manager import CacheManager
from ..services.notification_service import NotificationService
from ..services.transcript_streaming_service import transcript_streaming_service
from ..services.browser_notification_service import browser_notification_service
from ..models.pipeline import (
    PipelineStage, PipelineConfig, PipelineProgress, PipelineResult,
    ContentAnalysis, QualityMetrics
)
from ..core.websocket_manager import websocket_manager
from ..core.exceptions import PipelineError

logger = logging.getLogger(__name__)


class SummaryPipeline:
"""Orchestrates the complete YouTube summarization workflow."""
def __init__(
self,
video_service: VideoService,
transcript_service: TranscriptService,
ai_service: DeepSeekSummarizer,
cache_manager: CacheManager,
        notification_service: Optional[NotificationService] = None
):
self.video_service = video_service
self.transcript_service = transcript_service
self.ai_service = ai_service
self.cache_manager = cache_manager
self.notification_service = notification_service or NotificationService()
# Active jobs tracking
self.active_jobs: Dict[str, PipelineResult] = {}
self.progress_callbacks: Dict[str, List[Callable]] = {}
self._cleanup_tasks: Dict[str, asyncio.Task] = {}
self._cancelled_jobs: Set[str] = set() # Track cancelled jobs
async def process_video(
self,
video_url: str,
        config: Optional[PipelineConfig] = None,
        progress_callback: Optional[Callable] = None
) -> str:
"""Start video processing pipeline and return job ID.
Args:
video_url: YouTube video URL to process
config: Pipeline configuration options
progress_callback: Optional callback for progress updates
Returns:
Job ID for tracking pipeline progress
"""
if config is None:
config = PipelineConfig()
job_id = str(uuid.uuid4())
# Initialize pipeline result
result = PipelineResult(
job_id=job_id,
video_url=video_url,
video_id="", # Will be populated during validation
status=PipelineStage.INITIALIZED,
started_at=datetime.utcnow(),
retry_count=0
)
self.active_jobs[job_id] = result
if progress_callback:
self.progress_callbacks[job_id] = [progress_callback]
# Start processing in background
task = asyncio.create_task(self._execute_pipeline(job_id, config))
self._cleanup_tasks[job_id] = task
await self._update_progress(
job_id,
PipelineStage.INITIALIZED,
0,
"Pipeline initialized, starting processing..."
)
return job_id
async def cancel_job(self, job_id: str) -> bool:
"""Cancel an active pipeline job.
Args:
job_id: Job ID to cancel
Returns:
True if cancellation was successful
"""
if job_id not in self.active_jobs:
return False
# Mark job as cancelled
self._cancelled_jobs.add(job_id)
result = self.active_jobs[job_id]
result.status = PipelineStage.CANCELLED
# Cancel the background task
if job_id in self._cleanup_tasks:
task = self._cleanup_tasks[job_id]
if not task.done():
task.cancel()
# Update progress
await self._update_progress(
job_id,
PipelineStage.CANCELLED,
result.progress_percentage if hasattr(result, 'progress_percentage') else 0,
f"🛑 Cancelled: {result.display_name}",
{"cancelled_at": datetime.utcnow().isoformat()}
)
# Send cancellation notification
await websocket_manager.send_error_notification(job_id, {
"error": "cancelled",
"message": "Processing was cancelled by user",
"timestamp": datetime.utcnow().isoformat()
})
# Clean up after a delay
asyncio.create_task(self._cleanup_cancelled_job(job_id))
return True
async def _cleanup_cancelled_job(self, job_id: str):
"""Clean up a cancelled job after a delay."""
await asyncio.sleep(5) # Keep for 5 seconds for client to receive updates
# Clean up tracking
self._cancelled_jobs.discard(job_id)
if job_id in self.active_jobs:
del self.active_jobs[job_id]
if job_id in self.progress_callbacks:
del self.progress_callbacks[job_id]
if job_id in self._cleanup_tasks:
del self._cleanup_tasks[job_id]
def _is_job_cancelled(self, job_id: str) -> bool:
"""Check if a job has been cancelled."""
return job_id in self._cancelled_jobs
async def _execute_pipeline(self, job_id: str, config: PipelineConfig):
"""Execute the complete processing pipeline.
Args:
job_id: Pipeline job identifier
config: Pipeline configuration
"""
result = self.active_jobs[job_id]
try:
# Stage 1: URL Validation
if self._is_job_cancelled(job_id):
return # Early exit if cancelled
await self._update_progress(
job_id,
PipelineStage.VALIDATING_URL,
5,
"Validating YouTube URL..."
)
video_id = await self.video_service.extract_video_id(result.video_url)
result.video_id = video_id
# Check cache for existing result
cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
if cached_result and cached_result.get("status") == PipelineStage.COMPLETED.value:
await self._restore_from_cache(job_id, cached_result)
return
# Check cancellation before next stage
if self._is_job_cancelled(job_id):
return
# Stage 2: Extract Video Metadata
await self._update_progress(
job_id,
PipelineStage.EXTRACTING_METADATA,
15,
f"Extracting video information..."
)
metadata = await self._extract_enhanced_metadata(video_id)
result.video_metadata = metadata
# Now that we have metadata, update progress with video title
await self._update_progress(
job_id,
PipelineStage.EXTRACTING_METADATA,
18,
f"Processing: {result.display_name}"
)
# Check cancellation before transcript extraction
if self._is_job_cancelled(job_id):
return
# Stage 3: Extract Transcript
await self._update_progress(
job_id,
PipelineStage.EXTRACTING_TRANSCRIPT,
35,
f"Extracting transcript for: {result.display_name}"
)
            # Start the live transcript stream for this job so clients can follow along
await transcript_streaming_service.start_transcript_stream(
job_id=job_id,
video_id=video_id,
source="hybrid",
chunk_duration=30.0
)
try:
transcript_result = await self.transcript_service.extract_transcript(video_id)
result.transcript = transcript_result.transcript
# Stream the transcript segments if available
if hasattr(transcript_result, 'segments') and transcript_result.segments:
await transcript_streaming_service.stream_from_segments(
job_id=job_id,
segments=transcript_result.segments,
source=getattr(transcript_result, 'source', 'processed'),
chunk_duration=30.0
)
# Complete the transcript stream
await transcript_streaming_service.complete_transcript_stream(
job_id=job_id,
final_transcript=result.transcript,
metadata={
"video_id": video_id,
"source": getattr(transcript_result, 'source', 'processed'),
"language": getattr(transcript_result, 'language', 'auto'),
"segment_count": len(getattr(transcript_result, 'segments', []))
}
)
# Send browser notification for transcript completion
video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
transcript_url = f"/transcript/{job_id}" if job_id else None
await browser_notification_service.send_transcript_ready_notification(
job_id=job_id,
video_title=video_title,
user_id=getattr(result, 'user_id', None),
transcript_url=transcript_url
)
except Exception as transcript_error:
# Handle streaming error
await transcript_streaming_service.handle_stream_error(
job_id=job_id,
error=transcript_error,
partial_transcript=getattr(result, 'transcript', None)
)
raise transcript_error
# Check cancellation before content analysis
if self._is_job_cancelled(job_id):
return
# Stage 4: Analyze Content for Optimization
await self._update_progress(
job_id,
PipelineStage.ANALYZING_CONTENT,
50,
f"Analyzing content: {result.display_name}"
)
content_analysis = await self._analyze_content_characteristics(
result.transcript, metadata
)
optimized_config = self._optimize_config_for_content(config, content_analysis)
# Check cancellation before expensive AI operation
if self._is_job_cancelled(job_id):
return
# Stage 5: Generate Summary
await self._update_progress(
job_id,
PipelineStage.GENERATING_SUMMARY,
75,
f"Generating AI summary for: {result.display_name}"
)
summary_result = await self._generate_optimized_summary(
result.transcript, optimized_config, content_analysis
)
# Populate summary results
result.summary = summary_result.summary
result.key_points = summary_result.key_points
result.main_themes = summary_result.main_themes
result.actionable_insights = summary_result.actionable_insights
result.confidence_score = summary_result.confidence_score
result.processing_metadata = summary_result.processing_metadata
result.cost_data = summary_result.cost_data
# Stage 6: Quality Validation
await self._update_progress(
job_id,
PipelineStage.VALIDATING_QUALITY,
90,
f"Validating quality: {result.display_name}"
)
quality_score = await self._validate_summary_quality(result, content_analysis)
result.quality_score = quality_score
# Check if quality meets threshold
if quality_score < config.quality_threshold and result.retry_count < config.max_retries:
await self._retry_with_improvements(job_id, config, "Low quality score")
return
# Stage 7: Complete
result.completed_at = datetime.utcnow()
result.processing_time_seconds = (
result.completed_at - result.started_at
).total_seconds()
result.status = PipelineStage.COMPLETED
await self._update_progress(
job_id,
PipelineStage.COMPLETED,
100,
f"✅ Completed: {result.display_name}"
)
# Cache the result
await self.cache_manager.cache_pipeline_result(job_id, result)
# Save to database for unified storage across frontend and CLI
try:
from ..services.database_storage_service import database_storage_service
saved_summary = database_storage_service.save_summary_from_pipeline(result)
result.summary_id = saved_summary.id
logger.info(f"Saved summary {saved_summary.id} to database for '{result.display_name}' (job {job_id})")
except Exception as db_error:
logger.error(f"Failed to save summary for '{result.display_name}' to database: {db_error}")
# Don't fail the pipeline if database save fails
# Send completion notifications
if config.enable_notifications:
await self._send_completion_notifications(result)
except Exception as e:
await self._handle_pipeline_error(job_id, e, config)
finally:
# Cleanup task reference
if job_id in self._cleanup_tasks:
del self._cleanup_tasks[job_id]
async def _extract_enhanced_metadata(self, video_id: str) -> Dict[str, Any]:
"""Extract rich video metadata using YouTube Data API.
Args:
video_id: YouTube video identifier
Returns:
Enhanced video metadata dictionary
"""
try:
# Check cache first
cached_metadata = await self.cache_manager.get_cached_video_metadata(video_id)
if cached_metadata:
return cached_metadata
# Use video service to get metadata
metadata = await self.video_service.get_video_metadata(video_id)
# Enhance with additional processing
enhanced_metadata = {
**metadata,
"content_warnings": await self._detect_content_warnings(metadata),
"estimated_complexity": self._estimate_content_complexity(metadata),
"processing_hints": self._generate_processing_hints(metadata)
}
# Cache the enhanced metadata
await self.cache_manager.cache_video_metadata(video_id, enhanced_metadata)
return enhanced_metadata
except Exception as e:
# Return basic metadata if enhanced extraction fails
return {
"video_id": video_id,
"title": f"Video {video_id}",
"error": f"Enhanced metadata extraction failed: {str(e)}"
}
async def _detect_content_warnings(self, metadata: Dict[str, Any]) -> List[str]:
"""Detect potential content warnings from metadata."""
warnings = []
title = metadata.get("title", "").lower()
description = metadata.get("description", "").lower()
tags = [tag.lower() for tag in metadata.get("tags", [])]
# Check for mature content indicators
mature_indicators = [
"explicit", "mature", "adult", "nsfw", "warning",
"violence", "profanity", "disturbing"
]
for indicator in mature_indicators:
if (indicator in title or
indicator in description or
any(indicator in tag for tag in tags)):
warnings.append(f"Contains {indicator} content")
return warnings
def _estimate_content_complexity(self, metadata: Dict[str, Any]) -> str:
"""Estimate content complexity from metadata."""
# Simple heuristic based on category and tags
category = metadata.get("category", "").lower()
tags = [tag.lower() for tag in metadata.get("tags", [])]
technical_categories = ["science & technology", "education", "howto & style"]
technical_tags = ["tutorial", "programming", "science", "technical", "academic"]
if (category in technical_categories or
any(tag in technical_tags for tag in tags)):
return "high"
elif category in ["entertainment", "gaming", "sports"]:
return "low"
else:
return "medium"
def _generate_processing_hints(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
"""Generate hints for optimal processing based on metadata."""
hints = {
"recommended_summary_length": "standard",
"focus_areas": [],
"special_handling": []
}
# Duration-based hints
duration_str = metadata.get("duration", "PT0S")
duration_seconds = self._parse_iso_duration(duration_str)
if duration_seconds > 3600: # > 1 hour
hints["recommended_summary_length"] = "detailed"
hints["special_handling"].append("long_form_content")
elif duration_seconds < 300: # < 5 minutes
hints["recommended_summary_length"] = "brief"
hints["special_handling"].append("short_form_content")
# Category-based hints
category = metadata.get("category", "").lower()
if "education" in category:
hints["focus_areas"].extend(["learning_objectives", "key_concepts"])
elif "tutorial" in category or "howto" in category:
hints["focus_areas"].extend(["step_by_step_instructions", "practical_tips"])
return hints
def _parse_iso_duration(self, duration: str) -> int:
"""Parse ISO 8601 duration to seconds."""
try:
# Simple parser for PT#H#M#S format
if not duration.startswith("PT"):
return 0
duration = duration[2:] # Remove "PT"
seconds = 0
if "H" in duration:
hours, duration = duration.split("H", 1)
seconds += int(hours) * 3600
if "M" in duration:
minutes, duration = duration.split("M", 1)
seconds += int(minutes) * 60
if "S" in duration:
secs = duration.split("S")[0]
if secs:
seconds += int(secs)
return seconds
except (ValueError, AttributeError):
return 0
async def _analyze_content_characteristics(
self,
transcript: str,
metadata: Dict[str, Any]
) -> ContentAnalysis:
"""Analyze transcript and metadata to determine optimal processing strategy.
Args:
transcript: Video transcript text
metadata: Video metadata dictionary
Returns:
ContentAnalysis object with processing insights
"""
word_count = len(transcript.split())
transcript_lower = transcript.lower()
# Basic content type detection
content_type = "general"
technical_indicators = []
educational_indicators = []
entertainment_indicators = []
# Technical content indicators
technical_terms = [
"algorithm", "function", "variable", "database", "api", "code",
"programming", "software", "development", "technical", "implementation"
]
for term in technical_terms:
if term in transcript_lower:
technical_indicators.append(term)
# Educational content indicators
educational_terms = [
"learn", "tutorial", "explain", "understand", "concept", "example",
"lesson", "course", "study", "knowledge", "teach", "instruction"
]
for term in educational_terms:
if term in transcript_lower:
educational_indicators.append(term)
# Entertainment content indicators
entertainment_terms = [
"funny", "story", "experience", "adventure", "review", "reaction",
"entertainment", "fun", "hilarious", "amazing", "incredible"
]
for term in entertainment_terms:
if term in transcript_lower:
entertainment_indicators.append(term)
# Determine primary content type
if len(technical_indicators) >= 3:
content_type = "technical"
elif len(educational_indicators) >= 3:
content_type = "educational"
elif len(entertainment_indicators) >= 2:
content_type = "entertainment"
# Complexity scoring based on sentence length and vocabulary
sentences = [s.strip() for s in transcript.split('.') if s.strip()]
avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
complexity_score = 0.5 # Base complexity
if avg_sentence_length > 20:
complexity_score = min(1.0, complexity_score + 0.3)
elif avg_sentence_length < 10:
complexity_score = max(0.1, complexity_score - 0.2)
# Adjust based on technical indicators
if len(technical_indicators) > 5:
complexity_score = min(1.0, complexity_score + 0.2)
return ContentAnalysis(
transcript_length=len(transcript),
word_count=word_count,
            estimated_reading_time=word_count / 250,  # minutes, assuming ~250 words per minute
complexity_score=complexity_score,
content_type=content_type,
language=metadata.get("language", "en"),
technical_indicators=technical_indicators,
educational_indicators=educational_indicators,
entertainment_indicators=entertainment_indicators
)
def _optimize_config_for_content(
self,
base_config: PipelineConfig,
analysis: ContentAnalysis
) -> PipelineConfig:
"""Optimize processing configuration based on content analysis.
Args:
base_config: Original pipeline configuration
analysis: Content analysis results
Returns:
Optimized pipeline configuration
"""
optimized_config = PipelineConfig(**asdict(base_config))
# Adjust summary length based on content
if analysis.word_count > 5000 and optimized_config.summary_length == "standard":
optimized_config.summary_length = "detailed"
elif analysis.word_count < 500 and optimized_config.summary_length == "standard":
optimized_config.summary_length = "brief"
# Add focus areas based on content type
if not optimized_config.focus_areas:
optimized_config.focus_areas = []
if analysis.content_type == "technical":
optimized_config.focus_areas.extend([
"technical concepts",
"implementation details"
])
elif analysis.content_type == "educational":
optimized_config.focus_areas.extend([
"learning objectives",
"key concepts",
"practical applications"
])
elif analysis.content_type == "entertainment":
optimized_config.focus_areas.extend([
"main highlights",
"key moments",
"overall message"
])
# Adjust quality threshold based on complexity
if analysis.complexity_score > 0.7:
optimized_config.quality_threshold = max(
0.6,
optimized_config.quality_threshold - 0.1
)
return optimized_config
async def _generate_optimized_summary(
self,
transcript: str,
config: PipelineConfig,
analysis: ContentAnalysis
) -> Any:
"""Generate summary with content-aware optimizations.
Args:
transcript: Video transcript
config: Optimized pipeline configuration
analysis: Content analysis results
Returns:
SummaryResult from AI service
"""
from ..services.ai_service import SummaryRequest, SummaryLength
# Map config to AI service parameters
length_mapping = {
"brief": SummaryLength.BRIEF,
"standard": SummaryLength.STANDARD,
"detailed": SummaryLength.DETAILED
}
summary_request = SummaryRequest(
transcript=transcript,
            length=length_mapping.get(config.summary_length, SummaryLength.STANDARD),
focus_areas=config.focus_areas or [],
language=analysis.language,
include_timestamps=config.include_timestamps
)
# Add content-specific prompt enhancements
if analysis.content_type == "technical":
summary_request.focus_areas.append("explain technical concepts clearly")
elif analysis.content_type == "educational":
summary_request.focus_areas.append("highlight learning outcomes")
elif analysis.content_type == "entertainment":
summary_request.focus_areas.append("capture entertainment value")
return await self.ai_service.generate_summary(summary_request)
async def _validate_summary_quality(
self,
result: PipelineResult,
analysis: ContentAnalysis
) -> float:
"""Validate and score summary quality.
Args:
result: Pipeline result with summary data
analysis: Content analysis for validation context
Returns:
Quality score between 0.0 and 1.0
"""
quality_score = 0.0
if not result.summary:
return quality_score
# Check summary length appropriateness
summary_word_count = len(result.summary.split())
transcript_word_count = analysis.word_count
# Good summary should be 5-15% of original length
compression_ratio = (
summary_word_count / transcript_word_count
if transcript_word_count > 0 else 0
)
if 0.05 <= compression_ratio <= 0.15:
quality_score += 0.3
elif 0.03 <= compression_ratio <= 0.20:
quality_score += 0.2
# Check key points availability and quality
if result.key_points and len(result.key_points) >= 3:
quality_score += 0.2
# Quality check: ensure key points have substance
avg_point_length = sum(len(point.split()) for point in result.key_points) / len(result.key_points)
if avg_point_length >= 5:
quality_score += 0.1
# Check main themes availability
if result.main_themes and len(result.main_themes) >= 2:
quality_score += 0.15
# Check actionable insights
if result.actionable_insights and len(result.actionable_insights) >= 1:
quality_score += 0.15
# Use AI confidence score
if result.confidence_score:
if result.confidence_score > 0.8:
quality_score += 0.2
elif result.confidence_score > 0.6:
quality_score += 0.1
return min(1.0, quality_score)
async def _retry_with_improvements(
self,
job_id: str,
config: PipelineConfig,
reason: str
):
"""Retry pipeline with improved configuration.
Args:
job_id: Pipeline job identifier
config: Current pipeline configuration
reason: Reason for retry
"""
result = self.active_jobs[job_id]
result.retry_count += 1
await self._update_progress(
job_id,
PipelineStage.ANALYZING_CONTENT,
40,
f"Retrying '{result.display_name}' (attempt {result.retry_count + 1}/{config.max_retries + 1}): {reason}"
)
# Improve configuration for retry
improved_config = PipelineConfig(**asdict(config))
improved_config.summary_length = "detailed" # Try more detailed summary
improved_config.quality_threshold = max(
0.5,
config.quality_threshold - 0.1
) # Lower threshold slightly
# Continue pipeline with improved config
await self._execute_pipeline(job_id, improved_config)
async def _handle_pipeline_error(
self,
job_id: str,
error: Exception,
config: PipelineConfig
):
"""Handle pipeline errors with retry logic.
Args:
job_id: Pipeline job identifier
error: Exception that occurred
config: Pipeline configuration
"""
        result = self.active_jobs[job_id]
        failed_stage = result.status  # capture the stage in progress before overwriting it
        result.status = PipelineStage.FAILED
        result.error = {
            "message": str(error),
            "type": type(error).__name__,
            "stage": failed_stage.value,
            "retry_count": result.retry_count
        }
# Attempt retry if within limits
if result.retry_count < config.max_retries:
await asyncio.sleep(2 ** result.retry_count) # Exponential backoff
await self._retry_with_improvements(
job_id,
config,
f"Error: {str(error)}"
)
else:
result.completed_at = datetime.utcnow()
await self._update_progress(
job_id,
PipelineStage.FAILED,
0,
f"❌ Failed '{result.display_name}' after {result.retry_count + 1} attempts: {str(error)}"
)
# Send error notification
if config.enable_notifications:
await self._send_error_notifications(result)
async def _update_progress(
self,
job_id: str,
stage: PipelineStage,
percentage: float,
message: str,
details: Dict[str, Any] = None,
sub_progress: Dict[str, Any] = None
):
"""Update pipeline progress with enhanced tracking.
Args:
job_id: Pipeline job identifier
stage: Current pipeline stage
percentage: Progress percentage (0-100)
message: Progress message
details: Optional additional details
sub_progress: Optional sub-task progress (e.g., chunk processing)
"""
from ..core.websocket_manager import ProcessingStage, ProgressData
result = self.active_jobs.get(job_id)
if result:
result.status = stage
# Calculate time elapsed
time_elapsed = (datetime.utcnow() - result.started_at).total_seconds()
# Map PipelineStage to ProcessingStage
stage_mapping = {
PipelineStage.INITIALIZED: ProcessingStage.INITIALIZED,
PipelineStage.VALIDATING_URL: ProcessingStage.VALIDATING_URL,
PipelineStage.EXTRACTING_METADATA: ProcessingStage.EXTRACTING_METADATA,
PipelineStage.EXTRACTING_TRANSCRIPT: ProcessingStage.EXTRACTING_TRANSCRIPT,
PipelineStage.ANALYZING_CONTENT: ProcessingStage.ANALYZING_CONTENT,
PipelineStage.GENERATING_SUMMARY: ProcessingStage.GENERATING_SUMMARY,
PipelineStage.VALIDATING_QUALITY: ProcessingStage.VALIDATING_QUALITY,
PipelineStage.COMPLETED: ProcessingStage.COMPLETED,
PipelineStage.FAILED: ProcessingStage.FAILED,
PipelineStage.CANCELLED: ProcessingStage.CANCELLED
}
processing_stage = stage_mapping.get(stage, ProcessingStage.INITIALIZED)
# Estimate remaining time
estimated_remaining = websocket_manager.estimate_remaining_time(job_id, percentage)
# Create enhanced progress data
progress_data = ProgressData(
job_id=job_id,
stage=processing_stage,
percentage=percentage,
message=message,
time_elapsed=time_elapsed,
estimated_remaining=estimated_remaining,
sub_progress=sub_progress,
details=details
)
# Update WebSocket manager with enhanced tracking
websocket_manager.update_job_progress(job_id, progress_data)
progress = PipelineProgress(
stage=stage,
percentage=percentage,
message=message,
current_step_details=details
)
# Send enhanced WebSocket update
await websocket_manager.send_progress_update(job_id, {
"stage": stage.value,
"percentage": percentage,
"message": message,
"details": details,
"sub_progress": sub_progress,
"time_elapsed": time_elapsed if result else 0,
"estimated_remaining": estimated_remaining if result else None
})
# Notify registered callbacks
callbacks = self.progress_callbacks.get(job_id, [])
for callback in callbacks:
try:
await callback(job_id, progress)
except Exception as e:
print(f"Progress callback error: {e}")
# Send progress notification for major milestones
await self.notification_service.send_progress_notification(
job_id,
asdict(progress)
)
async def _send_completion_notifications(self, result: PipelineResult):
"""Send completion notifications via multiple channels.
Args:
result: Completed pipeline result
"""
# Send via notification service
notification_result = {
"video_metadata": result.video_metadata,
"processing_time_seconds": result.processing_time_seconds,
"quality_score": result.quality_score,
"summary": result.summary
}
await self.notification_service.send_completion_notification(
result.job_id,
notification_result
)
# Send via WebSocket
await websocket_manager.send_completion_notification(result.job_id, {
"status": "completed",
"summary": result.summary,
"key_points": result.key_points,
"main_themes": result.main_themes,
"actionable_insights": result.actionable_insights,
"quality_score": result.quality_score,
"processing_time": result.processing_time_seconds
})
# Send browser notification
video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
summary_url = f"/summary/{result.summary_id}" if result.summary_id else None
await browser_notification_service.send_processing_complete_notification(
job_id=result.job_id,
video_title=video_title,
user_id=getattr(result, 'user_id', None),
summary_url=summary_url
)
async def _send_error_notifications(self, result: PipelineResult):
"""Send error notifications via multiple channels.
Args:
result: Failed pipeline result
"""
await self.notification_service.send_error_notification(
result.job_id,
result.error
)
await websocket_manager.send_error_notification(result.job_id, {
"status": "failed",
"error": result.error
})
# Send browser notification for errors
video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
retry_url = f"/retry/{result.job_id}" if result.job_id else None
await browser_notification_service.send_processing_failed_notification(
job_id=result.job_id,
video_title=video_title,
            error_message=(result.error or {}).get("message", "Unknown error occurred"),
user_id=getattr(result, 'user_id', None),
retry_url=retry_url
)
async def _restore_from_cache(self, job_id: str, cached_result: Dict[str, Any]):
"""Restore pipeline result from cache.
Args:
job_id: Pipeline job identifier
cached_result: Cached pipeline result data
"""
# Convert cached data back to PipelineResult
result = PipelineResult(**cached_result)
self.active_jobs[job_id] = result
await self._update_progress(
job_id,
PipelineStage.COMPLETED,
100,
"Retrieved from cache"
)
# Send completion notification
await self._send_completion_notifications(result)
async def get_pipeline_result(self, job_id: str) -> Optional[PipelineResult]:
"""Get pipeline result by job ID.
Args:
job_id: Pipeline job identifier
Returns:
PipelineResult if found, None otherwise
"""
# Check active jobs first
if job_id in self.active_jobs:
return self.active_jobs[job_id]
# Check cache for completed jobs
cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
if cached_result:
return PipelineResult(**cached_result)
return None
async def cancel_pipeline(self, job_id: str) -> bool:
"""Cancel running pipeline.
Args:
job_id: Pipeline job identifier
Returns:
True if cancelled successfully, False otherwise
"""
if job_id in self.active_jobs:
result = self.active_jobs[job_id]
result.status = PipelineStage.CANCELLED
result.completed_at = datetime.utcnow()
await self._update_progress(
job_id,
PipelineStage.CANCELLED,
0,
"Pipeline cancelled by user"
)
# Cancel the background task
if job_id in self._cleanup_tasks:
task = self._cleanup_tasks[job_id]
if not task.done():
task.cancel()
del self._cleanup_tasks[job_id]
return True
return False
def get_active_jobs(self) -> List[str]:
"""Get list of active job IDs.
Returns:
List of active job IDs
"""
return list(self.active_jobs.keys())
async def cleanup_completed_jobs(self, max_age_hours: int = 24):
"""Clean up old completed jobs from memory.
Args:
max_age_hours: Maximum age in hours before cleanup
"""
cutoff_time = datetime.utcnow() - timedelta(hours=max_age_hours)
jobs_to_remove = []
for job_id, result in self.active_jobs.items():
if (result.completed_at and
result.completed_at < cutoff_time and
result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED, PipelineStage.CANCELLED]):
jobs_to_remove.append(job_id)
for job_id in jobs_to_remove:
del self.active_jobs[job_id]
if job_id in self.progress_callbacks:
del self.progress_callbacks[job_id]