"""Summary Pipeline orchestrates the complete YouTube summarization workflow.""" import asyncio import logging import uuid from datetime import datetime, timedelta from typing import Dict, Optional, List, Any, Callable, Set from dataclasses import asdict logger = logging.getLogger(__name__) from ..services.video_service import VideoService from ..services.transcript_service import TranscriptService from ..services.deepseek_summarizer import DeepSeekSummarizer from ..services.cache_manager import CacheManager from ..services.notification_service import NotificationService from ..services.transcript_streaming_service import transcript_streaming_service from ..services.browser_notification_service import browser_notification_service from ..models.pipeline import ( PipelineStage, PipelineConfig, PipelineProgress, PipelineResult, ContentAnalysis, QualityMetrics ) from ..core.websocket_manager import websocket_manager from ..core.exceptions import PipelineError class SummaryPipeline: """Orchestrates the complete YouTube summarization workflow.""" def __init__( self, video_service: VideoService, transcript_service: TranscriptService, ai_service: DeepSeekSummarizer, cache_manager: CacheManager, notification_service: NotificationService = None ): self.video_service = video_service self.transcript_service = transcript_service self.ai_service = ai_service self.cache_manager = cache_manager self.notification_service = notification_service or NotificationService() # Active jobs tracking self.active_jobs: Dict[str, PipelineResult] = {} self.progress_callbacks: Dict[str, List[Callable]] = {} self._cleanup_tasks: Dict[str, asyncio.Task] = {} self._cancelled_jobs: Set[str] = set() # Track cancelled jobs async def process_video( self, video_url: str, config: PipelineConfig = None, progress_callback: Callable = None ) -> str: """Start video processing pipeline and return job ID. Args: video_url: YouTube video URL to process config: Pipeline configuration options progress_callback: Optional callback for progress updates Returns: Job ID for tracking pipeline progress """ if config is None: config = PipelineConfig() job_id = str(uuid.uuid4()) # Initialize pipeline result result = PipelineResult( job_id=job_id, video_url=video_url, video_id="", # Will be populated during validation status=PipelineStage.INITIALIZED, started_at=datetime.utcnow(), retry_count=0 ) self.active_jobs[job_id] = result if progress_callback: self.progress_callbacks[job_id] = [progress_callback] # Start processing in background task = asyncio.create_task(self._execute_pipeline(job_id, config)) self._cleanup_tasks[job_id] = task await self._update_progress( job_id, PipelineStage.INITIALIZED, 0, "Pipeline initialized, starting processing..." ) return job_id async def cancel_job(self, job_id: str) -> bool: """Cancel an active pipeline job. 

        Args:
            job_id: Job ID to cancel

        Returns:
            True if cancellation was successful
        """
        if job_id not in self.active_jobs:
            return False

        # Mark the job as cancelled
        self._cancelled_jobs.add(job_id)
        result = self.active_jobs[job_id]
        result.status = PipelineStage.CANCELLED

        # Cancel the background task
        if job_id in self._cleanup_tasks:
            task = self._cleanup_tasks[job_id]
            if not task.done():
                task.cancel()

        # Update progress
        await self._update_progress(
            job_id,
            PipelineStage.CANCELLED,
            result.progress_percentage if hasattr(result, 'progress_percentage') else 0,
            f"🛑 Cancelled: {result.display_name}",
            {"cancelled_at": datetime.utcnow().isoformat()}
        )

        # Send a cancellation notification
        await websocket_manager.send_error_notification(job_id, {
            "error": "cancelled",
            "message": "Processing was cancelled by user",
            "timestamp": datetime.utcnow().isoformat()
        })

        # Clean up after a delay
        asyncio.create_task(self._cleanup_cancelled_job(job_id))

        return True

    async def _cleanup_cancelled_job(self, job_id: str):
        """Clean up a cancelled job after a delay."""
        await asyncio.sleep(5)  # Keep for 5 seconds so clients can receive final updates

        # Clean up tracking
        self._cancelled_jobs.discard(job_id)
        if job_id in self.active_jobs:
            del self.active_jobs[job_id]
        if job_id in self.progress_callbacks:
            del self.progress_callbacks[job_id]
        if job_id in self._cleanup_tasks:
            del self._cleanup_tasks[job_id]

    def _is_job_cancelled(self, job_id: str) -> bool:
        """Check if a job has been cancelled."""
        return job_id in self._cancelled_jobs

    async def _execute_pipeline(self, job_id: str, config: PipelineConfig):
        """Execute the complete processing pipeline.

        Args:
            job_id: Pipeline job identifier
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]

        try:
            # Stage 1: URL validation
            if self._is_job_cancelled(job_id):
                return  # Early exit if cancelled

            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_URL,
                5,
                "Validating YouTube URL..."
            )

            video_id = await self.video_service.extract_video_id(result.video_url)
            result.video_id = video_id

            # Check the cache for an existing result
            cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
            if cached_result and cached_result.get("status") == PipelineStage.COMPLETED.value:
                await self._restore_from_cache(job_id, cached_result)
                return

            # Check cancellation before the next stage
            if self._is_job_cancelled(job_id):
                return

            # Stage 2: Extract video metadata
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                15,
                "Extracting video information..."
            )

            metadata = await self._extract_enhanced_metadata(video_id)
            result.video_metadata = metadata

            # Now that we have metadata, update progress with the video title
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                18,
                f"Processing: {result.display_name}"
            )

            # Check cancellation before transcript extraction
            if self._is_job_cancelled(job_id):
                return

            # Stage 3: Extract transcript
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_TRANSCRIPT,
                35,
                f"Extracting transcript for: {result.display_name}"
            )

            # Start transcript streaming if enabled
            await transcript_streaming_service.start_transcript_stream(
                job_id=job_id,
                video_id=video_id,
                source="hybrid",
                chunk_duration=30.0
            )

            try:
                transcript_result = await self.transcript_service.extract_transcript(video_id)
                result.transcript = transcript_result.transcript

                # Stream the transcript segments if available
                if hasattr(transcript_result, 'segments') and transcript_result.segments:
                    await transcript_streaming_service.stream_from_segments(
                        job_id=job_id,
                        segments=transcript_result.segments,
                        source=getattr(transcript_result, 'source', 'processed'),
                        chunk_duration=30.0
                    )

                # Complete the transcript stream
                await transcript_streaming_service.complete_transcript_stream(
                    job_id=job_id,
                    final_transcript=result.transcript,
                    metadata={
                        "video_id": video_id,
                        "source": getattr(transcript_result, 'source', 'processed'),
                        "language": getattr(transcript_result, 'language', 'auto'),
                        "segment_count": len(getattr(transcript_result, 'segments', []))
                    }
                )

                # Send a browser notification for transcript completion
                video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
                transcript_url = f"/transcript/{job_id}" if job_id else None
                await browser_notification_service.send_transcript_ready_notification(
                    job_id=job_id,
                    video_title=video_title,
                    user_id=getattr(result, 'user_id', None),
                    transcript_url=transcript_url
                )

            except Exception as transcript_error:
                # Handle the streaming error, then re-raise for pipeline-level handling
                await transcript_streaming_service.handle_stream_error(
                    job_id=job_id,
                    error=transcript_error,
                    partial_transcript=getattr(result, 'transcript', None)
                )
                raise

            # Check cancellation before content analysis
            if self._is_job_cancelled(job_id):
                return

            # Stage 4: Analyze content for optimization
            await self._update_progress(
                job_id,
                PipelineStage.ANALYZING_CONTENT,
                50,
                f"Analyzing content: {result.display_name}"
            )

            content_analysis = await self._analyze_content_characteristics(
                result.transcript, metadata
            )
            optimized_config = self._optimize_config_for_content(config, content_analysis)

            # Check cancellation before the expensive AI operation
            if self._is_job_cancelled(job_id):
                return

            # Stage 5: Generate summary
            await self._update_progress(
                job_id,
                PipelineStage.GENERATING_SUMMARY,
                75,
                f"Generating AI summary for: {result.display_name}"
            )

            summary_result = await self._generate_optimized_summary(
                result.transcript, optimized_config, content_analysis
            )

            # Populate summary results
            result.summary = summary_result.summary
            result.key_points = summary_result.key_points
            result.main_themes = summary_result.main_themes
            result.actionable_insights = summary_result.actionable_insights
            result.confidence_score = summary_result.confidence_score
            result.processing_metadata = summary_result.processing_metadata
            result.cost_data = summary_result.cost_data

            # Stage 6: Quality validation
            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_QUALITY,
                90,
                f"Validating quality: {result.display_name}"
            )

            quality_score = await self._validate_summary_quality(result, content_analysis)
            result.quality_score = quality_score

            # Check whether quality meets the threshold
            if quality_score < config.quality_threshold and result.retry_count < config.max_retries:
                await self._retry_with_improvements(job_id, config, "Low quality score")
                return

            # Stage 7: Complete
            result.completed_at = datetime.utcnow()
            result.processing_time_seconds = (
                result.completed_at - result.started_at
            ).total_seconds()
            result.status = PipelineStage.COMPLETED

            await self._update_progress(
                job_id,
                PipelineStage.COMPLETED,
                100,
                f"✅ Completed: {result.display_name}"
            )

            # Cache the result
            await self.cache_manager.cache_pipeline_result(job_id, result)

            # Save to the database for unified storage across frontend and CLI
            try:
                from ..services.database_storage_service import database_storage_service

                saved_summary = database_storage_service.save_summary_from_pipeline(result)
                result.summary_id = saved_summary.id
                logger.info(
                    f"Saved summary {saved_summary.id} to database for "
                    f"'{result.display_name}' (job {job_id})"
                )
            except Exception as db_error:
                # Don't fail the pipeline if the database save fails
                logger.error(
                    f"Failed to save summary for '{result.display_name}' to database: {db_error}"
                )

            # Send completion notifications
            if config.enable_notifications:
                await self._send_completion_notifications(result)

        except Exception as e:
            await self._handle_pipeline_error(job_id, e, config)
        finally:
            # Clean up the task reference
            if job_id in self._cleanup_tasks:
                del self._cleanup_tasks[job_id]

    async def _extract_enhanced_metadata(self, video_id: str) -> Dict[str, Any]:
        """Extract rich video metadata using the YouTube Data API.

        Args:
            video_id: YouTube video identifier

        Returns:
            Enhanced video metadata dictionary
        """
        try:
            # Check the cache first
            cached_metadata = await self.cache_manager.get_cached_video_metadata(video_id)
            if cached_metadata:
                return cached_metadata

            # Use the video service to get metadata
            metadata = await self.video_service.get_video_metadata(video_id)

            # Enhance with additional processing
            enhanced_metadata = {
                **metadata,
                "content_warnings": await self._detect_content_warnings(metadata),
                "estimated_complexity": self._estimate_content_complexity(metadata),
                "processing_hints": self._generate_processing_hints(metadata)
            }

            # Cache the enhanced metadata
            await self.cache_manager.cache_video_metadata(video_id, enhanced_metadata)

            return enhanced_metadata

        except Exception as e:
            # Return basic metadata if enhanced extraction fails
            return {
                "video_id": video_id,
                "title": f"Video {video_id}",
                "error": f"Enhanced metadata extraction failed: {str(e)}"
            }

    async def _detect_content_warnings(self, metadata: Dict[str, Any]) -> List[str]:
        """Detect potential content warnings from metadata."""
        warnings = []

        title = metadata.get("title", "").lower()
        description = metadata.get("description", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        # Check for mature content indicators
        mature_indicators = [
            "explicit", "mature", "adult", "nsfw", "warning",
            "violence", "profanity", "disturbing"
        ]

        for indicator in mature_indicators:
            if (indicator in title or
                    indicator in description or
                    any(indicator in tag for tag in tags)):
                warnings.append(f"Contains {indicator} content")

        return warnings

    def _estimate_content_complexity(self, metadata: Dict[str, Any]) -> str:
        """Estimate content complexity from metadata."""
        # Simple heuristic based on category and tags
        category = metadata.get("category", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        technical_categories = ["science & technology", "education", "howto & style"]
        technical_tags = ["tutorial", "programming",
"science", "technical", "academic"] if (category in technical_categories or any(tag in technical_tags for tag in tags)): return "high" elif category in ["entertainment", "gaming", "sports"]: return "low" else: return "medium" def _generate_processing_hints(self, metadata: Dict[str, Any]) -> Dict[str, Any]: """Generate hints for optimal processing based on metadata.""" hints = { "recommended_summary_length": "standard", "focus_areas": [], "special_handling": [] } # Duration-based hints duration_str = metadata.get("duration", "PT0S") duration_seconds = self._parse_iso_duration(duration_str) if duration_seconds > 3600: # > 1 hour hints["recommended_summary_length"] = "detailed" hints["special_handling"].append("long_form_content") elif duration_seconds < 300: # < 5 minutes hints["recommended_summary_length"] = "brief" hints["special_handling"].append("short_form_content") # Category-based hints category = metadata.get("category", "").lower() if "education" in category: hints["focus_areas"].extend(["learning_objectives", "key_concepts"]) elif "tutorial" in category or "howto" in category: hints["focus_areas"].extend(["step_by_step_instructions", "practical_tips"]) return hints def _parse_iso_duration(self, duration: str) -> int: """Parse ISO 8601 duration to seconds.""" try: # Simple parser for PT#H#M#S format if not duration.startswith("PT"): return 0 duration = duration[2:] # Remove "PT" seconds = 0 if "H" in duration: hours, duration = duration.split("H", 1) seconds += int(hours) * 3600 if "M" in duration: minutes, duration = duration.split("M", 1) seconds += int(minutes) * 60 if "S" in duration: secs = duration.split("S")[0] if secs: seconds += int(secs) return seconds except (ValueError, AttributeError): return 0 async def _analyze_content_characteristics( self, transcript: str, metadata: Dict[str, Any] ) -> ContentAnalysis: """Analyze transcript and metadata to determine optimal processing strategy. 

        Args:
            transcript: Video transcript text
            metadata: Video metadata dictionary

        Returns:
            ContentAnalysis object with processing insights
        """
        word_count = len(transcript.split())
        transcript_lower = transcript.lower()

        # Basic content type detection
        content_type = "general"
        technical_indicators = []
        educational_indicators = []
        entertainment_indicators = []

        # Technical content indicators
        technical_terms = [
            "algorithm", "function", "variable", "database", "api", "code",
            "programming", "software", "development", "technical", "implementation"
        ]
        for term in technical_terms:
            if term in transcript_lower:
                technical_indicators.append(term)

        # Educational content indicators
        educational_terms = [
            "learn", "tutorial", "explain", "understand", "concept", "example",
            "lesson", "course", "study", "knowledge", "teach", "instruction"
        ]
        for term in educational_terms:
            if term in transcript_lower:
                educational_indicators.append(term)

        # Entertainment content indicators
        entertainment_terms = [
            "funny", "story", "experience", "adventure", "review", "reaction",
            "entertainment", "fun", "hilarious", "amazing", "incredible"
        ]
        for term in entertainment_terms:
            if term in transcript_lower:
                entertainment_indicators.append(term)

        # Determine the primary content type
        if len(technical_indicators) >= 3:
            content_type = "technical"
        elif len(educational_indicators) >= 3:
            content_type = "educational"
        elif len(entertainment_indicators) >= 2:
            content_type = "entertainment"

        # Complexity scoring based on sentence length and vocabulary
        sentences = [s.strip() for s in transcript.split('.') if s.strip()]
        avg_sentence_length = (
            sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
        )

        complexity_score = 0.5  # Base complexity
        if avg_sentence_length > 20:
            complexity_score = min(1.0, complexity_score + 0.3)
        elif avg_sentence_length < 10:
            complexity_score = max(0.1, complexity_score - 0.2)

        # Adjust based on technical indicators
        if len(technical_indicators) > 5:
            complexity_score = min(1.0, complexity_score + 0.2)

        return ContentAnalysis(
            transcript_length=len(transcript),
            word_count=word_count,
            estimated_reading_time=word_count / 250,  # Assumes ~250 words per minute
            complexity_score=complexity_score,
            content_type=content_type,
            language=metadata.get("language", "en"),
            technical_indicators=technical_indicators,
            educational_indicators=educational_indicators,
            entertainment_indicators=entertainment_indicators
        )

    def _optimize_config_for_content(
        self,
        base_config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> PipelineConfig:
        """Optimize the processing configuration based on content analysis.

        Args:
            base_config: Original pipeline configuration
            analysis: Content analysis results

        Returns:
            Optimized pipeline configuration
        """
        optimized_config = PipelineConfig(**asdict(base_config))

        # Adjust summary length based on content
        if analysis.word_count > 5000 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "detailed"
        elif analysis.word_count < 500 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "brief"

        # Add focus areas based on content type
        if not optimized_config.focus_areas:
            optimized_config.focus_areas = []

        if analysis.content_type == "technical":
            optimized_config.focus_areas.extend([
                "technical concepts", "implementation details"
            ])
        elif analysis.content_type == "educational":
            optimized_config.focus_areas.extend([
                "learning objectives", "key concepts", "practical applications"
            ])
        elif analysis.content_type == "entertainment":
            optimized_config.focus_areas.extend([
                "main highlights", "key moments", "overall message"
            ])

        # Adjust the quality threshold based on complexity
        if analysis.complexity_score > 0.7:
            optimized_config.quality_threshold = max(
                0.6, optimized_config.quality_threshold - 0.1
            )

        return optimized_config

    async def _generate_optimized_summary(
        self,
        transcript: str,
        config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> Any:
        """Generate a summary with content-aware optimizations.

        Args:
            transcript: Video transcript
            config: Optimized pipeline configuration
            analysis: Content analysis results

        Returns:
            SummaryResult from the AI service
        """
        from ..services.ai_service import SummaryRequest, SummaryLength

        # Map config to AI service parameters
        length_mapping = {
            "brief": SummaryLength.BRIEF,
            "standard": SummaryLength.STANDARD,
            "detailed": SummaryLength.DETAILED
        }

        summary_request = SummaryRequest(
            transcript=transcript,
            length=length_mapping[config.summary_length],
            focus_areas=config.focus_areas or [],
            language=analysis.language,
            include_timestamps=config.include_timestamps
        )

        # Add content-specific prompt enhancements
        if analysis.content_type == "technical":
            summary_request.focus_areas.append("explain technical concepts clearly")
        elif analysis.content_type == "educational":
            summary_request.focus_areas.append("highlight learning outcomes")
        elif analysis.content_type == "entertainment":
            summary_request.focus_areas.append("capture entertainment value")

        return await self.ai_service.generate_summary(summary_request)

    async def _validate_summary_quality(
        self,
        result: PipelineResult,
        analysis: ContentAnalysis
    ) -> float:
        """Validate and score summary quality.

        Args:
            result: Pipeline result with summary data
            analysis: Content analysis for validation context

        Returns:
            Quality score between 0.0 and 1.0
        """
        quality_score = 0.0

        if not result.summary:
            return quality_score

        # Check summary length appropriateness
        summary_word_count = len(result.summary.split())
        transcript_word_count = analysis.word_count

        # A good summary should be 5-15% of the original length
        compression_ratio = (
            summary_word_count / transcript_word_count
            if transcript_word_count > 0 else 0
        )
        if 0.05 <= compression_ratio <= 0.15:
            quality_score += 0.3
        elif 0.03 <= compression_ratio <= 0.20:
            quality_score += 0.2

        # Check key points availability and quality
        if result.key_points and len(result.key_points) >= 3:
            quality_score += 0.2
            # Quality check: ensure key points have substance
            avg_point_length = (
                sum(len(point.split()) for point in result.key_points) / len(result.key_points)
            )
            if avg_point_length >= 5:
                quality_score += 0.1

        # Check main themes availability
        if result.main_themes and len(result.main_themes) >= 2:
            quality_score += 0.15

        # Check actionable insights
        if result.actionable_insights and len(result.actionable_insights) >= 1:
            quality_score += 0.15

        # Use the AI confidence score
        if result.confidence_score:
            if result.confidence_score > 0.8:
                quality_score += 0.2
            elif result.confidence_score > 0.6:
                quality_score += 0.1

        return min(1.0, quality_score)

    async def _retry_with_improvements(
        self,
        job_id: str,
        config: PipelineConfig,
        reason: str
    ):
        """Retry the pipeline with an improved configuration.

        Args:
            job_id: Pipeline job identifier
            config: Current pipeline configuration
            reason: Reason for the retry
        """
        result = self.active_jobs[job_id]
        result.retry_count += 1

        await self._update_progress(
            job_id,
            PipelineStage.ANALYZING_CONTENT,
            40,
            f"Retrying '{result.display_name}' "
            f"(attempt {result.retry_count + 1}/{config.max_retries + 1}): {reason}"
        )

        # Improve the configuration for the retry
        improved_config = PipelineConfig(**asdict(config))
        improved_config.summary_length = "detailed"  # Try a more detailed summary
        improved_config.quality_threshold = max(
            0.5, config.quality_threshold - 0.1
        )  # Lower the threshold slightly

        # Continue the pipeline with the improved config
        await self._execute_pipeline(job_id, improved_config)

    async def _handle_pipeline_error(
        self,
        job_id: str,
        error: Exception,
        config: PipelineConfig
    ):
        """Handle pipeline errors with retry logic.

        Args:
            job_id: Pipeline job identifier
            error: Exception that occurred
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]
        failed_stage = result.status.value  # Capture the stage at which the error occurred
        result.status = PipelineStage.FAILED
        result.error = {
            "message": str(error),
            "type": type(error).__name__,
            "stage": failed_stage,
            "retry_count": result.retry_count
        }

        # Attempt a retry if within limits
        if result.retry_count < config.max_retries:
            await asyncio.sleep(2 ** result.retry_count)  # Exponential backoff
            await self._retry_with_improvements(
                job_id, config, f"Error: {str(error)}"
            )
        else:
            result.completed_at = datetime.utcnow()
            await self._update_progress(
                job_id,
                PipelineStage.FAILED,
                0,
                f"❌ Failed '{result.display_name}' after "
                f"{result.retry_count + 1} attempts: {str(error)}"
            )

            # Send error notifications
            if config.enable_notifications:
                await self._send_error_notifications(result)

    async def _update_progress(
        self,
        job_id: str,
        stage: PipelineStage,
        percentage: float,
        message: str,
        details: Optional[Dict[str, Any]] = None,
        sub_progress: Optional[Dict[str, Any]] = None
    ):
        """Update pipeline progress with enhanced tracking.

        Args:
            job_id: Pipeline job identifier
            stage: Current pipeline stage
            percentage: Progress percentage (0-100)
            message: Progress message
            details: Optional additional details
            sub_progress: Optional sub-task progress (e.g., chunk processing)
        """
        from ..core.websocket_manager import ProcessingStage, ProgressData

        result = self.active_jobs.get(job_id)
        if result:
            result.status = stage

            # Calculate elapsed time
            time_elapsed = (datetime.utcnow() - result.started_at).total_seconds()

            # Map PipelineStage to ProcessingStage
            stage_mapping = {
                PipelineStage.INITIALIZED: ProcessingStage.INITIALIZED,
                PipelineStage.VALIDATING_URL: ProcessingStage.VALIDATING_URL,
                PipelineStage.EXTRACTING_METADATA: ProcessingStage.EXTRACTING_METADATA,
                PipelineStage.EXTRACTING_TRANSCRIPT: ProcessingStage.EXTRACTING_TRANSCRIPT,
                PipelineStage.ANALYZING_CONTENT: ProcessingStage.ANALYZING_CONTENT,
                PipelineStage.GENERATING_SUMMARY: ProcessingStage.GENERATING_SUMMARY,
                PipelineStage.VALIDATING_QUALITY: ProcessingStage.VALIDATING_QUALITY,
                PipelineStage.COMPLETED: ProcessingStage.COMPLETED,
                PipelineStage.FAILED: ProcessingStage.FAILED,
                PipelineStage.CANCELLED: ProcessingStage.CANCELLED
            }
            processing_stage = stage_mapping.get(stage, ProcessingStage.INITIALIZED)

            # Estimate remaining time
            estimated_remaining = websocket_manager.estimate_remaining_time(job_id, percentage)

            # Create enhanced progress data
            progress_data = ProgressData(
                job_id=job_id,
                stage=processing_stage,
                percentage=percentage,
                message=message,
                time_elapsed=time_elapsed,
                estimated_remaining=estimated_remaining,
                sub_progress=sub_progress,
                details=details
            )

            # Update the WebSocket manager with enhanced tracking
            websocket_manager.update_job_progress(job_id, progress_data)

        progress = PipelineProgress(
            stage=stage,
            percentage=percentage,
            message=message,
            current_step_details=details
        )

        # Send the enhanced WebSocket update
        await websocket_manager.send_progress_update(job_id, {
            "stage": stage.value,
            "percentage": percentage,
            "message": message,
            "details": details,
            "sub_progress": sub_progress,
            "time_elapsed": time_elapsed if result else 0,
            "estimated_remaining": estimated_remaining if result else None
        })

        # Notify registered callbacks
        callbacks = self.progress_callbacks.get(job_id, [])
        for callback in callbacks:
            try:
                await callback(job_id, progress)
            except Exception as e:
                logger.warning(f"Progress callback error: {e}")

        # Send a progress notification for major milestones
        await self.notification_service.send_progress_notification(
            job_id, asdict(progress)
        )

    async def _send_completion_notifications(self, result: PipelineResult):
        """Send completion notifications via multiple channels.

        Args:
            result: Completed pipeline result
        """
        # Send via the notification service
        notification_result = {
            "video_metadata": result.video_metadata,
            "processing_time_seconds": result.processing_time_seconds,
            "quality_score": result.quality_score,
            "summary": result.summary
        }
        await self.notification_service.send_completion_notification(
            result.job_id, notification_result
        )

        # Send via WebSocket
        await websocket_manager.send_completion_notification(result.job_id, {
            "status": "completed",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "quality_score": result.quality_score,
            "processing_time": result.processing_time_seconds
        })

        # Send a browser notification
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        summary_url = f"/summary/{result.summary_id}" if result.summary_id else None
        await browser_notification_service.send_processing_complete_notification(
            job_id=result.job_id,
            video_title=video_title,
            user_id=getattr(result, 'user_id', None),
            summary_url=summary_url
        )

    async def _send_error_notifications(self, result: PipelineResult):
        """Send error notifications via multiple channels.

        Args:
            result: Failed pipeline result
        """
        await self.notification_service.send_error_notification(
            result.job_id, result.error
        )

        await websocket_manager.send_error_notification(result.job_id, {
            "status": "failed",
            "error": result.error
        })

        # Send a browser notification for errors
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        retry_url = f"/retry/{result.job_id}" if result.job_id else None
        await browser_notification_service.send_processing_failed_notification(
            job_id=result.job_id,
            video_title=video_title,
            error_message=(result.error or {}).get("message", "Unknown error occurred"),
            user_id=getattr(result, 'user_id', None),
            retry_url=retry_url
        )

    async def _restore_from_cache(self, job_id: str, cached_result: Dict[str, Any]):
        """Restore a pipeline result from the cache.

        Args:
            job_id: Pipeline job identifier
            cached_result: Cached pipeline result data
        """
        # Convert cached data back to a PipelineResult
        result = PipelineResult(**cached_result)
        self.active_jobs[job_id] = result

        await self._update_progress(
            job_id,
            PipelineStage.COMPLETED,
            100,
            "Retrieved from cache"
        )

        # Send completion notifications
        await self._send_completion_notifications(result)

    async def get_pipeline_result(self, job_id: str) -> Optional[PipelineResult]:
        """Get a pipeline result by job ID.

        Args:
            job_id: Pipeline job identifier

        Returns:
            PipelineResult if found, None otherwise
        """
        # Check active jobs first
        if job_id in self.active_jobs:
            return self.active_jobs[job_id]

        # Check the cache for completed jobs
        cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
        if cached_result:
            return PipelineResult(**cached_result)

        return None

    async def cancel_pipeline(self, job_id: str) -> bool:
        """Cancel a running pipeline.

        Args:
            job_id: Pipeline job identifier

        Returns:
            True if cancelled successfully, False otherwise
        """
        if job_id in self.active_jobs:
            result = self.active_jobs[job_id]
            result.status = PipelineStage.CANCELLED
            result.completed_at = datetime.utcnow()

            await self._update_progress(
                job_id,
                PipelineStage.CANCELLED,
                0,
                "Pipeline cancelled by user"
            )

            # Cancel the background task
            if job_id in self._cleanup_tasks:
                task = self._cleanup_tasks[job_id]
                if not task.done():
                    task.cancel()
                del self._cleanup_tasks[job_id]

            return True

        return False

    def get_active_jobs(self) -> List[str]:
        """Get the list of active job IDs.

        Returns:
            List of active job IDs
        """
        return list(self.active_jobs.keys())

    async def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs from memory.

        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=max_age_hours)
        jobs_to_remove = []

        for job_id, result in self.active_jobs.items():
            if (result.completed_at and
                    result.completed_at < cutoff_time and
                    result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED, PipelineStage.CANCELLED]):
                jobs_to_remove.append(job_id)

        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            if job_id in self.progress_callbacks:
                del self.progress_callbacks[job_id]
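

# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only, not part of the pipeline itself).
# It assumes the concrete service instances are constructed by the
# application's own dependency wiring; only names imported by this module
# are referenced, and the default constructors shown here may not match the
# real service signatures.
#
#   pipeline = SummaryPipeline(
#       video_service=VideoService(),
#       transcript_service=TranscriptService(),
#       ai_service=DeepSeekSummarizer(),
#       cache_manager=CacheManager(),
#   )
#
#   async def summarize(url: str) -> Optional[PipelineResult]:
#       job_id = await pipeline.process_video(url, config=PipelineConfig())
#       # ...wait for WebSocket progress updates to report completion, then:
#       return await pipeline.get_pipeline_result(job_id)
# ---------------------------------------------------------------------------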