"""Summary Pipeline orchestrates the complete YouTube summarization workflow."""
|
|
import asyncio
|
|
import logging
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Optional, List, Any, Callable, Set
|
|
from dataclasses import asdict
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
from ..services.video_service import VideoService
|
|
from ..services.transcript_service import TranscriptService
|
|
from ..services.deepseek_summarizer import DeepSeekSummarizer
|
|
from ..services.cache_manager import CacheManager
|
|
from ..services.notification_service import NotificationService
|
|
from ..services.transcript_streaming_service import transcript_streaming_service
|
|
from ..services.browser_notification_service import browser_notification_service
|
|
from ..models.pipeline import (
|
|
PipelineStage, PipelineConfig, PipelineProgress, PipelineResult,
|
|
ContentAnalysis, QualityMetrics
|
|
)
|
|
from ..core.websocket_manager import websocket_manager
|
|
from ..core.exceptions import PipelineError
|
|
|
|
|
|


class SummaryPipeline:
    """Orchestrates the complete YouTube summarization workflow."""

    def __init__(
        self,
        video_service: VideoService,
        transcript_service: TranscriptService,
        ai_service: DeepSeekSummarizer,
        cache_manager: CacheManager,
        notification_service: Optional[NotificationService] = None
    ):
        self.video_service = video_service
        self.transcript_service = transcript_service
        self.ai_service = ai_service
        self.cache_manager = cache_manager
        self.notification_service = notification_service or NotificationService()

        # Active jobs tracking
        self.active_jobs: Dict[str, PipelineResult] = {}
        self.progress_callbacks: Dict[str, List[Callable]] = {}
        self._cleanup_tasks: Dict[str, asyncio.Task] = {}
        self._cancelled_jobs: Set[str] = set()  # Track cancelled jobs

    async def process_video(
        self,
        video_url: str,
        config: Optional[PipelineConfig] = None,
        progress_callback: Optional[Callable] = None
    ) -> str:
        """Start video processing pipeline and return job ID.

        Args:
            video_url: YouTube video URL to process
            config: Pipeline configuration options
            progress_callback: Optional callback for progress updates

        Returns:
            Job ID for tracking pipeline progress
        """
        if config is None:
            config = PipelineConfig()

        job_id = str(uuid.uuid4())

        # Initialize pipeline result
        result = PipelineResult(
            job_id=job_id,
            video_url=video_url,
            video_id="",  # Will be populated during validation
            status=PipelineStage.INITIALIZED,
            started_at=datetime.utcnow(),
            retry_count=0
        )

        self.active_jobs[job_id] = result

        if progress_callback:
            self.progress_callbacks[job_id] = [progress_callback]

        # Start processing in background
        task = asyncio.create_task(self._execute_pipeline(job_id, config))
        self._cleanup_tasks[job_id] = task

        await self._update_progress(
            job_id,
            PipelineStage.INITIALIZED,
            0,
            "Pipeline initialized, starting processing..."
        )

        return job_id
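
    # Illustrative usage sketch (not part of the pipeline itself). It assumes the
    # four services have already been constructed elsewhere; the variable names
    # and URL below are hypothetical.
    #
    #     pipeline = SummaryPipeline(video_service, transcript_service,
    #                                ai_service, cache_manager)
    #
    #     async def on_progress(job_id: str, progress: PipelineProgress) -> None:
    #         print(progress.stage, progress.percentage, progress.message)
    #
    #     job_id = await pipeline.process_video(
    #         "https://www.youtube.com/watch?v=<video-id>",
    #         config=PipelineConfig(),
    #         progress_callback=on_progress,
    #     )
    #     result = await pipeline.get_pipeline_result(job_id)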

    async def cancel_job(self, job_id: str) -> bool:
        """Cancel an active pipeline job.

        Args:
            job_id: Job ID to cancel

        Returns:
            True if cancellation was successful
        """
        if job_id not in self.active_jobs:
            return False

        # Mark job as cancelled
        self._cancelled_jobs.add(job_id)
        result = self.active_jobs[job_id]
        result.status = PipelineStage.CANCELLED

        # Cancel the background task
        if job_id in self._cleanup_tasks:
            task = self._cleanup_tasks[job_id]
            if not task.done():
                task.cancel()

        # Update progress
        await self._update_progress(
            job_id,
            PipelineStage.CANCELLED,
            result.progress_percentage if hasattr(result, 'progress_percentage') else 0,
            f"🛑 Cancelled: {result.display_name}",
            {"cancelled_at": datetime.utcnow().isoformat()}
        )

        # Send cancellation notification
        await websocket_manager.send_error_notification(job_id, {
            "error": "cancelled",
            "message": "Processing was cancelled by user",
            "timestamp": datetime.utcnow().isoformat()
        })

        # Clean up after a delay
        asyncio.create_task(self._cleanup_cancelled_job(job_id))
        return True

    async def _cleanup_cancelled_job(self, job_id: str):
        """Clean up a cancelled job after a delay."""
        await asyncio.sleep(5)  # Keep for 5 seconds for client to receive updates

        # Clean up tracking
        self._cancelled_jobs.discard(job_id)
        if job_id in self.active_jobs:
            del self.active_jobs[job_id]
        if job_id in self.progress_callbacks:
            del self.progress_callbacks[job_id]
        if job_id in self._cleanup_tasks:
            del self._cleanup_tasks[job_id]

    def _is_job_cancelled(self, job_id: str) -> bool:
        """Check if a job has been cancelled."""
        return job_id in self._cancelled_jobs

    async def _execute_pipeline(self, job_id: str, config: PipelineConfig):
        """Execute the complete processing pipeline.

        Args:
            job_id: Pipeline job identifier
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]

        try:
            # Stage 1: URL Validation
            if self._is_job_cancelled(job_id):
                return  # Early exit if cancelled

            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_URL,
                5,
                "Validating YouTube URL..."
            )
            video_id = await self.video_service.extract_video_id(result.video_url)
            result.video_id = video_id

            # Check cache for existing result
            cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
            if cached_result and cached_result.get("status") == PipelineStage.COMPLETED.value:
                await self._restore_from_cache(job_id, cached_result)
                return

            # Check cancellation before next stage
            if self._is_job_cancelled(job_id):
                return

            # Stage 2: Extract Video Metadata
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                15,
                "Extracting video information..."
            )
            metadata = await self._extract_enhanced_metadata(video_id)
            result.video_metadata = metadata

            # Now that we have metadata, update progress with video title
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                18,
                f"Processing: {result.display_name}"
            )

            # Check cancellation before transcript extraction
            if self._is_job_cancelled(job_id):
                return

            # Stage 3: Extract Transcript
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_TRANSCRIPT,
                35,
                f"Extracting transcript for: {result.display_name}"
            )

            # Start transcript streaming if enabled
            await transcript_streaming_service.start_transcript_stream(
                job_id=job_id,
                video_id=video_id,
                source="hybrid",
                chunk_duration=30.0
            )

            try:
                transcript_result = await self.transcript_service.extract_transcript(video_id)
                result.transcript = transcript_result.transcript

                # Stream the transcript segments if available
                if hasattr(transcript_result, 'segments') and transcript_result.segments:
                    await transcript_streaming_service.stream_from_segments(
                        job_id=job_id,
                        segments=transcript_result.segments,
                        source=getattr(transcript_result, 'source', 'processed'),
                        chunk_duration=30.0
                    )

                # Complete the transcript stream
                await transcript_streaming_service.complete_transcript_stream(
                    job_id=job_id,
                    final_transcript=result.transcript,
                    metadata={
                        "video_id": video_id,
                        "source": getattr(transcript_result, 'source', 'processed'),
                        "language": getattr(transcript_result, 'language', 'auto'),
                        "segment_count": len(getattr(transcript_result, 'segments', []))
                    }
                )

                # Send browser notification for transcript completion
                video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
                transcript_url = f"/transcript/{job_id}" if job_id else None

                await browser_notification_service.send_transcript_ready_notification(
                    job_id=job_id,
                    video_title=video_title,
                    user_id=getattr(result, 'user_id', None),
                    transcript_url=transcript_url
                )

            except Exception as transcript_error:
                # Handle streaming error, then re-raise for the outer handler
                await transcript_streaming_service.handle_stream_error(
                    job_id=job_id,
                    error=transcript_error,
                    partial_transcript=getattr(result, 'transcript', None)
                )
                raise

            # Check cancellation before content analysis
            if self._is_job_cancelled(job_id):
                return

            # Stage 4: Analyze Content for Optimization
            await self._update_progress(
                job_id,
                PipelineStage.ANALYZING_CONTENT,
                50,
                f"Analyzing content: {result.display_name}"
            )
            content_analysis = await self._analyze_content_characteristics(
                result.transcript, metadata
            )
            optimized_config = self._optimize_config_for_content(config, content_analysis)

            # Check cancellation before expensive AI operation
            if self._is_job_cancelled(job_id):
                return

            # Stage 5: Generate Summary
            await self._update_progress(
                job_id,
                PipelineStage.GENERATING_SUMMARY,
                75,
                f"Generating AI summary for: {result.display_name}"
            )
            summary_result = await self._generate_optimized_summary(
                result.transcript, optimized_config, content_analysis
            )

            # Populate summary results
            result.summary = summary_result.summary
            result.key_points = summary_result.key_points
            result.main_themes = summary_result.main_themes
            result.actionable_insights = summary_result.actionable_insights
            result.confidence_score = summary_result.confidence_score
            result.processing_metadata = summary_result.processing_metadata
            result.cost_data = summary_result.cost_data

            # Stage 6: Quality Validation
            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_QUALITY,
                90,
                f"Validating quality: {result.display_name}"
            )
            quality_score = await self._validate_summary_quality(result, content_analysis)
            result.quality_score = quality_score

            # Check if quality meets threshold
            if quality_score < config.quality_threshold and result.retry_count < config.max_retries:
                await self._retry_with_improvements(job_id, config, "Low quality score")
                return

            # Stage 7: Complete
            result.completed_at = datetime.utcnow()
            result.processing_time_seconds = (
                result.completed_at - result.started_at
            ).total_seconds()
            result.status = PipelineStage.COMPLETED

            await self._update_progress(
                job_id,
                PipelineStage.COMPLETED,
                100,
                f"✅ Completed: {result.display_name}"
            )

            # Cache the result
            await self.cache_manager.cache_pipeline_result(job_id, result)

            # Save to database for unified storage across frontend and CLI
            try:
                from ..services.database_storage_service import database_storage_service
                saved_summary = database_storage_service.save_summary_from_pipeline(result)
                result.summary_id = saved_summary.id
                logger.info(f"Saved summary {saved_summary.id} to database for '{result.display_name}' (job {job_id})")
            except Exception as db_error:
                # Don't fail the pipeline if the database save fails
                logger.error(f"Failed to save summary for '{result.display_name}' to database: {db_error}")

            # Send completion notifications
            if config.enable_notifications:
                await self._send_completion_notifications(result)

        except Exception as e:
            await self._handle_pipeline_error(job_id, e, config)
        finally:
            # Cleanup task reference
            if job_id in self._cleanup_tasks:
                del self._cleanup_tasks[job_id]

    async def _extract_enhanced_metadata(self, video_id: str) -> Dict[str, Any]:
        """Extract rich video metadata using YouTube Data API.

        Args:
            video_id: YouTube video identifier

        Returns:
            Enhanced video metadata dictionary
        """
        try:
            # Check cache first
            cached_metadata = await self.cache_manager.get_cached_video_metadata(video_id)
            if cached_metadata:
                return cached_metadata

            # Use video service to get metadata
            metadata = await self.video_service.get_video_metadata(video_id)

            # Enhance with additional processing
            enhanced_metadata = {
                **metadata,
                "content_warnings": await self._detect_content_warnings(metadata),
                "estimated_complexity": self._estimate_content_complexity(metadata),
                "processing_hints": self._generate_processing_hints(metadata)
            }

            # Cache the enhanced metadata
            await self.cache_manager.cache_video_metadata(video_id, enhanced_metadata)

            return enhanced_metadata

        except Exception as e:
            # Return basic metadata if enhanced extraction fails
            return {
                "video_id": video_id,
                "title": f"Video {video_id}",
                "error": f"Enhanced metadata extraction failed: {str(e)}"
            }

    async def _detect_content_warnings(self, metadata: Dict[str, Any]) -> List[str]:
        """Detect potential content warnings from metadata."""
        warnings = []

        title = metadata.get("title", "").lower()
        description = metadata.get("description", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        # Check for mature content indicators
        mature_indicators = [
            "explicit", "mature", "adult", "nsfw", "warning",
            "violence", "profanity", "disturbing"
        ]

        for indicator in mature_indicators:
            if (indicator in title or
                    indicator in description or
                    any(indicator in tag for tag in tags)):
                warnings.append(f"Contains {indicator} content")

        return warnings

    def _estimate_content_complexity(self, metadata: Dict[str, Any]) -> str:
        """Estimate content complexity from metadata."""
        # Simple heuristic based on category and tags
        category = metadata.get("category", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        technical_categories = ["science & technology", "education", "howto & style"]
        technical_tags = ["tutorial", "programming", "science", "technical", "academic"]

        if (category in technical_categories or
                any(tag in technical_tags for tag in tags)):
            return "high"
        elif category in ["entertainment", "gaming", "sports"]:
            return "low"
        else:
            return "medium"

    def _generate_processing_hints(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Generate hints for optimal processing based on metadata."""
        hints = {
            "recommended_summary_length": "standard",
            "focus_areas": [],
            "special_handling": []
        }

        # Duration-based hints
        duration_str = metadata.get("duration", "PT0S")
        duration_seconds = self._parse_iso_duration(duration_str)

        if duration_seconds > 3600:  # > 1 hour
            hints["recommended_summary_length"] = "detailed"
            hints["special_handling"].append("long_form_content")
        elif duration_seconds < 300:  # < 5 minutes
            hints["recommended_summary_length"] = "brief"
            hints["special_handling"].append("short_form_content")

        # Category-based hints
        category = metadata.get("category", "").lower()
        if "education" in category:
            hints["focus_areas"].extend(["learning_objectives", "key_concepts"])
        elif "tutorial" in category or "howto" in category:
            hints["focus_areas"].extend(["step_by_step_instructions", "practical_tips"])

        return hints

    def _parse_iso_duration(self, duration: str) -> int:
        """Parse ISO 8601 duration to seconds."""
        try:
            # Simple parser for PT#H#M#S format
            if not duration.startswith("PT"):
                return 0

            duration = duration[2:]  # Remove "PT"
            seconds = 0

            if "H" in duration:
                hours, duration = duration.split("H", 1)
                seconds += int(hours) * 3600

            if "M" in duration:
                minutes, duration = duration.split("M", 1)
                seconds += int(minutes) * 60

            if "S" in duration:
                secs = duration.split("S")[0]
                if secs:
                    seconds += int(secs)

            return seconds
        except (ValueError, AttributeError):
            return 0
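
    # Worked example of the parser above (illustrative only): "PT1H2M10S" yields
    # 1*3600 + 2*60 + 10 = 3730 seconds, "PT45S" yields 45, and any value that
    # does not start with "PT" (or fails to parse) falls back to 0.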

    async def _analyze_content_characteristics(
        self,
        transcript: str,
        metadata: Dict[str, Any]
    ) -> ContentAnalysis:
        """Analyze transcript and metadata to determine optimal processing strategy.

        Args:
            transcript: Video transcript text
            metadata: Video metadata dictionary

        Returns:
            ContentAnalysis object with processing insights
        """
        word_count = len(transcript.split())
        transcript_lower = transcript.lower()

        # Basic content type detection
        content_type = "general"
        technical_indicators = []
        educational_indicators = []
        entertainment_indicators = []

        # Technical content indicators
        technical_terms = [
            "algorithm", "function", "variable", "database", "api", "code",
            "programming", "software", "development", "technical", "implementation"
        ]
        for term in technical_terms:
            if term in transcript_lower:
                technical_indicators.append(term)

        # Educational content indicators
        educational_terms = [
            "learn", "tutorial", "explain", "understand", "concept", "example",
            "lesson", "course", "study", "knowledge", "teach", "instruction"
        ]
        for term in educational_terms:
            if term in transcript_lower:
                educational_indicators.append(term)

        # Entertainment content indicators
        entertainment_terms = [
            "funny", "story", "experience", "adventure", "review", "reaction",
            "entertainment", "fun", "hilarious", "amazing", "incredible"
        ]
        for term in entertainment_terms:
            if term in transcript_lower:
                entertainment_indicators.append(term)

        # Determine primary content type
        if len(technical_indicators) >= 3:
            content_type = "technical"
        elif len(educational_indicators) >= 3:
            content_type = "educational"
        elif len(entertainment_indicators) >= 2:
            content_type = "entertainment"

        # Complexity scoring based on sentence length and vocabulary
        sentences = [s.strip() for s in transcript.split('.') if s.strip()]
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0

        complexity_score = 0.5  # Base complexity

        if avg_sentence_length > 20:
            complexity_score = min(1.0, complexity_score + 0.3)
        elif avg_sentence_length < 10:
            complexity_score = max(0.1, complexity_score - 0.2)

        # Adjust based on technical indicators
        if len(technical_indicators) > 5:
            complexity_score = min(1.0, complexity_score + 0.2)

        return ContentAnalysis(
            transcript_length=len(transcript),
            word_count=word_count,
            estimated_reading_time=word_count / 250,  # Minutes at ~250 words per minute
            complexity_score=complexity_score,
            content_type=content_type,
            language=metadata.get("language", "en"),
            technical_indicators=technical_indicators,
            educational_indicators=educational_indicators,
            entertainment_indicators=entertainment_indicators
        )
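
    # Illustrative behaviour of the heuristic above: a transcript mentioning
    # "api", "database", and "code" (three technical terms) is classified as
    # "technical"; two entertainment terms such as "funny" and "review" are
    # enough for "entertainment" when the earlier checks miss; otherwise the
    # type stays "general".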

    def _optimize_config_for_content(
        self,
        base_config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> PipelineConfig:
        """Optimize processing configuration based on content analysis.

        Args:
            base_config: Original pipeline configuration
            analysis: Content analysis results

        Returns:
            Optimized pipeline configuration
        """
        optimized_config = PipelineConfig(**asdict(base_config))

        # Adjust summary length based on content
        if analysis.word_count > 5000 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "detailed"
        elif analysis.word_count < 500 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "brief"

        # Add focus areas based on content type
        if not optimized_config.focus_areas:
            optimized_config.focus_areas = []

        if analysis.content_type == "technical":
            optimized_config.focus_areas.extend([
                "technical concepts",
                "implementation details"
            ])
        elif analysis.content_type == "educational":
            optimized_config.focus_areas.extend([
                "learning objectives",
                "key concepts",
                "practical applications"
            ])
        elif analysis.content_type == "entertainment":
            optimized_config.focus_areas.extend([
                "main highlights",
                "key moments",
                "overall message"
            ])

        # Adjust quality threshold based on complexity
        if analysis.complexity_score > 0.7:
            optimized_config.quality_threshold = max(
                0.6,
                optimized_config.quality_threshold - 0.1
            )

        return optimized_config
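
    # Example of the adjustments above (illustrative): a 6,000-word "technical"
    # transcript with the default "standard" length is bumped to "detailed",
    # gains the "technical concepts" and "implementation details" focus areas,
    # and, if its complexity score exceeds 0.7, has its quality threshold
    # relaxed by 0.1 (never below 0.6).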

    async def _generate_optimized_summary(
        self,
        transcript: str,
        config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> Any:
        """Generate summary with content-aware optimizations.

        Args:
            transcript: Video transcript
            config: Optimized pipeline configuration
            analysis: Content analysis results

        Returns:
            SummaryResult from AI service
        """
        from ..services.ai_service import SummaryRequest, SummaryLength

        # Map config to AI service parameters
        length_mapping = {
            "brief": SummaryLength.BRIEF,
            "standard": SummaryLength.STANDARD,
            "detailed": SummaryLength.DETAILED
        }

        summary_request = SummaryRequest(
            transcript=transcript,
            length=length_mapping[config.summary_length],
            focus_areas=config.focus_areas or [],
            language=analysis.language,
            include_timestamps=config.include_timestamps
        )

        # Add content-specific prompt enhancements
        if analysis.content_type == "technical":
            summary_request.focus_areas.append("explain technical concepts clearly")
        elif analysis.content_type == "educational":
            summary_request.focus_areas.append("highlight learning outcomes")
        elif analysis.content_type == "entertainment":
            summary_request.focus_areas.append("capture entertainment value")

        return await self.ai_service.generate_summary(summary_request)

    async def _validate_summary_quality(
        self,
        result: PipelineResult,
        analysis: ContentAnalysis
    ) -> float:
        """Validate and score summary quality.

        Args:
            result: Pipeline result with summary data
            analysis: Content analysis for validation context

        Returns:
            Quality score between 0.0 and 1.0
        """
        quality_score = 0.0

        if not result.summary:
            return quality_score

        # Check summary length appropriateness
        summary_word_count = len(result.summary.split())
        transcript_word_count = analysis.word_count

        # A good summary should be 5-15% of the original length
        compression_ratio = (
            summary_word_count / transcript_word_count
            if transcript_word_count > 0 else 0
        )

        if 0.05 <= compression_ratio <= 0.15:
            quality_score += 0.3
        elif 0.03 <= compression_ratio <= 0.20:
            quality_score += 0.2

        # Check key points availability and quality
        if result.key_points and len(result.key_points) >= 3:
            quality_score += 0.2

            # Quality check: ensure key points have substance
            avg_point_length = sum(len(point.split()) for point in result.key_points) / len(result.key_points)
            if avg_point_length >= 5:
                quality_score += 0.1

        # Check main themes availability
        if result.main_themes and len(result.main_themes) >= 2:
            quality_score += 0.15

        # Check actionable insights
        if result.actionable_insights and len(result.actionable_insights) >= 1:
            quality_score += 0.15

        # Use AI confidence score
        if result.confidence_score:
            if result.confidence_score > 0.8:
                quality_score += 0.2
            elif result.confidence_score > 0.6:
                quality_score += 0.1

        return min(1.0, quality_score)
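
    # Illustrative scoring walk-through: a summary at ~10% of the transcript
    # length (+0.3) with four substantive key points (+0.2, +0.1), three main
    # themes (+0.15), one actionable insight (+0.15) and an AI confidence of
    # 0.85 (+0.2) would total 1.1 and be capped at 1.0.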

    async def _retry_with_improvements(
        self,
        job_id: str,
        config: PipelineConfig,
        reason: str
    ):
        """Retry pipeline with improved configuration.

        Args:
            job_id: Pipeline job identifier
            config: Current pipeline configuration
            reason: Reason for retry
        """
        result = self.active_jobs[job_id]
        result.retry_count += 1

        await self._update_progress(
            job_id,
            PipelineStage.ANALYZING_CONTENT,
            40,
            f"Retrying '{result.display_name}' (attempt {result.retry_count + 1}/{config.max_retries + 1}): {reason}"
        )

        # Improve configuration for retry
        improved_config = PipelineConfig(**asdict(config))
        improved_config.summary_length = "detailed"  # Try more detailed summary
        improved_config.quality_threshold = max(
            0.5,
            config.quality_threshold - 0.1
        )  # Lower threshold slightly

        # Continue pipeline with improved config
        await self._execute_pipeline(job_id, improved_config)

    async def _handle_pipeline_error(
        self,
        job_id: str,
        error: Exception,
        config: PipelineConfig
    ):
        """Handle pipeline errors with retry logic.

        Args:
            job_id: Pipeline job identifier
            error: Exception that occurred
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]
        failed_stage = result.status  # Capture the stage where the error occurred
        result.status = PipelineStage.FAILED
        result.error = {
            "message": str(error),
            "type": type(error).__name__,
            "stage": failed_stage.value,
            "retry_count": result.retry_count
        }

        # Attempt retry if within limits
        if result.retry_count < config.max_retries:
            await asyncio.sleep(2 ** result.retry_count)  # Exponential backoff
            await self._retry_with_improvements(
                job_id,
                config,
                f"Error: {str(error)}"
            )
        else:
            result.completed_at = datetime.utcnow()
            await self._update_progress(
                job_id,
                PipelineStage.FAILED,
                0,
                f"❌ Failed '{result.display_name}' after {result.retry_count + 1} attempts: {str(error)}"
            )

            # Send error notification
            if config.enable_notifications:
                await self._send_error_notifications(result)
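
    # Backoff timing implied by the handler above: retry_count 0 waits 1s,
    # 1 waits 2s, 2 waits 4s (2 ** retry_count seconds) before retrying.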

    async def _update_progress(
        self,
        job_id: str,
        stage: PipelineStage,
        percentage: float,
        message: str,
        details: Optional[Dict[str, Any]] = None,
        sub_progress: Optional[Dict[str, Any]] = None
    ):
        """Update pipeline progress with enhanced tracking.

        Args:
            job_id: Pipeline job identifier
            stage: Current pipeline stage
            percentage: Progress percentage (0-100)
            message: Progress message
            details: Optional additional details
            sub_progress: Optional sub-task progress (e.g., chunk processing)
        """
        from ..core.websocket_manager import ProcessingStage, ProgressData

        result = self.active_jobs.get(job_id)
        if result:
            result.status = stage

            # Calculate time elapsed
            time_elapsed = (datetime.utcnow() - result.started_at).total_seconds()

            # Map PipelineStage to ProcessingStage
            stage_mapping = {
                PipelineStage.INITIALIZED: ProcessingStage.INITIALIZED,
                PipelineStage.VALIDATING_URL: ProcessingStage.VALIDATING_URL,
                PipelineStage.EXTRACTING_METADATA: ProcessingStage.EXTRACTING_METADATA,
                PipelineStage.EXTRACTING_TRANSCRIPT: ProcessingStage.EXTRACTING_TRANSCRIPT,
                PipelineStage.ANALYZING_CONTENT: ProcessingStage.ANALYZING_CONTENT,
                PipelineStage.GENERATING_SUMMARY: ProcessingStage.GENERATING_SUMMARY,
                PipelineStage.VALIDATING_QUALITY: ProcessingStage.VALIDATING_QUALITY,
                PipelineStage.COMPLETED: ProcessingStage.COMPLETED,
                PipelineStage.FAILED: ProcessingStage.FAILED,
                PipelineStage.CANCELLED: ProcessingStage.CANCELLED
            }

            processing_stage = stage_mapping.get(stage, ProcessingStage.INITIALIZED)

            # Estimate remaining time
            estimated_remaining = websocket_manager.estimate_remaining_time(job_id, percentage)

            # Create enhanced progress data
            progress_data = ProgressData(
                job_id=job_id,
                stage=processing_stage,
                percentage=percentage,
                message=message,
                time_elapsed=time_elapsed,
                estimated_remaining=estimated_remaining,
                sub_progress=sub_progress,
                details=details
            )

            # Update WebSocket manager with enhanced tracking
            websocket_manager.update_job_progress(job_id, progress_data)

        progress = PipelineProgress(
            stage=stage,
            percentage=percentage,
            message=message,
            current_step_details=details
        )

        # Send enhanced WebSocket update
        await websocket_manager.send_progress_update(job_id, {
            "stage": stage.value,
            "percentage": percentage,
            "message": message,
            "details": details,
            "sub_progress": sub_progress,
            "time_elapsed": time_elapsed if result else 0,
            "estimated_remaining": estimated_remaining if result else None
        })

        # Notify registered callbacks
        callbacks = self.progress_callbacks.get(job_id, [])
        for callback in callbacks:
            try:
                await callback(job_id, progress)
            except Exception as e:
                logger.error(f"Progress callback error: {e}")

        # Send progress notification for major milestones
        await self.notification_service.send_progress_notification(
            job_id,
            asdict(progress)
        )

    async def _send_completion_notifications(self, result: PipelineResult):
        """Send completion notifications via multiple channels.

        Args:
            result: Completed pipeline result
        """
        # Send via notification service
        notification_result = {
            "video_metadata": result.video_metadata,
            "processing_time_seconds": result.processing_time_seconds,
            "quality_score": result.quality_score,
            "summary": result.summary
        }

        await self.notification_service.send_completion_notification(
            result.job_id,
            notification_result
        )

        # Send via WebSocket
        await websocket_manager.send_completion_notification(result.job_id, {
            "status": "completed",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "quality_score": result.quality_score,
            "processing_time": result.processing_time_seconds
        })

        # Send browser notification
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        summary_url = f"/summary/{result.summary_id}" if result.summary_id else None

        await browser_notification_service.send_processing_complete_notification(
            job_id=result.job_id,
            video_title=video_title,
            user_id=getattr(result, 'user_id', None),
            summary_url=summary_url
        )

    async def _send_error_notifications(self, result: PipelineResult):
        """Send error notifications via multiple channels.

        Args:
            result: Failed pipeline result
        """
        await self.notification_service.send_error_notification(
            result.job_id,
            result.error
        )

        await websocket_manager.send_error_notification(result.job_id, {
            "status": "failed",
            "error": result.error
        })

        # Send browser notification for errors
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        retry_url = f"/retry/{result.job_id}" if result.job_id else None
        error_message = result.error.get("message", "Unknown error occurred") if result.error else "Unknown error occurred"

        await browser_notification_service.send_processing_failed_notification(
            job_id=result.job_id,
            video_title=video_title,
            error_message=error_message,
            user_id=getattr(result, 'user_id', None),
            retry_url=retry_url
        )

    async def _restore_from_cache(self, job_id: str, cached_result: Dict[str, Any]):
        """Restore pipeline result from cache.

        Args:
            job_id: Pipeline job identifier
            cached_result: Cached pipeline result data
        """
        # Convert cached data back to PipelineResult
        result = PipelineResult(**cached_result)
        self.active_jobs[job_id] = result

        await self._update_progress(
            job_id,
            PipelineStage.COMPLETED,
            100,
            "Retrieved from cache"
        )

        # Send completion notification
        await self._send_completion_notifications(result)

    async def get_pipeline_result(self, job_id: str) -> Optional[PipelineResult]:
        """Get pipeline result by job ID.

        Args:
            job_id: Pipeline job identifier

        Returns:
            PipelineResult if found, None otherwise
        """
        # Check active jobs first
        if job_id in self.active_jobs:
            return self.active_jobs[job_id]

        # Check cache for completed jobs
        cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
        if cached_result:
            return PipelineResult(**cached_result)

        return None

    async def cancel_pipeline(self, job_id: str) -> bool:
        """Cancel running pipeline.

        Args:
            job_id: Pipeline job identifier

        Returns:
            True if cancelled successfully, False otherwise
        """
        if job_id in self.active_jobs:
            result = self.active_jobs[job_id]
            result.status = PipelineStage.CANCELLED
            result.completed_at = datetime.utcnow()

            await self._update_progress(
                job_id,
                PipelineStage.CANCELLED,
                0,
                "Pipeline cancelled by user"
            )

            # Cancel the background task
            if job_id in self._cleanup_tasks:
                task = self._cleanup_tasks[job_id]
                if not task.done():
                    task.cancel()
                del self._cleanup_tasks[job_id]

            return True

        return False

    def get_active_jobs(self) -> List[str]:
        """Get list of active job IDs.

        Returns:
            List of active job IDs
        """
        return list(self.active_jobs.keys())

    async def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs from memory.

        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=max_age_hours)

        jobs_to_remove = []
        for job_id, result in self.active_jobs.items():
            if (result.completed_at and
                    result.completed_at < cutoff_time and
                    result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED, PipelineStage.CANCELLED]):
                jobs_to_remove.append(job_id)

        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            if job_id in self.progress_callbacks:
                del self.progress_callbacks[job_id]
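

# Illustrative maintenance wiring (a sketch, not part of this module's API): a
# long-running application could drive cleanup_completed_jobs() from a simple
# background task. The function name and interval below are hypothetical.
#
#     async def periodic_cleanup(pipeline: SummaryPipeline) -> None:
#         while True:
#             await pipeline.cleanup_completed_jobs(max_age_hours=24)
#             await asyncio.sleep(3600)  # sweep once an hour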