"""Summary Pipeline orchestrates the complete YouTube summarization workflow."""
|
|
import asyncio
|
|
import logging
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Optional, List, Any, Callable, Set
|
|
from dataclasses import asdict
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
from ..services.video_service import VideoService
|
|
from ..services.transcript_service import TranscriptService
|
|
from ..services.deepseek_summarizer import DeepSeekSummarizer
|
|
from ..services.cache_manager import CacheManager
|
|
from ..services.notification_service import NotificationService
|
|
from ..services.transcript_streaming_service import transcript_streaming_service
|
|
from ..services.browser_notification_service import browser_notification_service
|
|
from ..models.pipeline import (
|
|
PipelineStage, PipelineConfig, PipelineProgress, PipelineResult,
|
|
ContentAnalysis, QualityMetrics
|
|
)
|
|
from ..core.websocket_manager import websocket_manager
|
|
from ..core.exceptions import PipelineError
|
|
|
|
|
|


class SummaryPipeline:
    """Orchestrates the complete YouTube summarization workflow."""

    def __init__(
        self,
        video_service: VideoService,
        transcript_service: TranscriptService,
        ai_service: DeepSeekSummarizer,
        cache_manager: CacheManager,
        notification_service: Optional[NotificationService] = None
    ):
        self.video_service = video_service
        self.transcript_service = transcript_service
        self.ai_service = ai_service
        self.cache_manager = cache_manager
        self.notification_service = notification_service or NotificationService()

        # Active jobs tracking
        self.active_jobs: Dict[str, PipelineResult] = {}
        self.progress_callbacks: Dict[str, List[Callable]] = {}
        self._cleanup_tasks: Dict[str, asyncio.Task] = {}
        self._cancelled_jobs: Set[str] = set()  # Track cancelled jobs

    async def process_video(
        self,
        video_url: str,
        config: Optional[PipelineConfig] = None,
        progress_callback: Optional[Callable] = None
    ) -> str:
        """Start video processing pipeline and return job ID.

        Args:
            video_url: YouTube video URL to process
            config: Pipeline configuration options
            progress_callback: Optional callback for progress updates

        Returns:
            Job ID for tracking pipeline progress
        """
        if config is None:
            config = PipelineConfig()

        job_id = str(uuid.uuid4())

        # Initialize pipeline result
        result = PipelineResult(
            job_id=job_id,
            video_url=video_url,
            video_id="",  # Will be populated during validation
            status=PipelineStage.INITIALIZED,
            started_at=datetime.utcnow(),
            retry_count=0
        )

        self.active_jobs[job_id] = result

        if progress_callback:
            self.progress_callbacks[job_id] = [progress_callback]

        # Start processing in background
        task = asyncio.create_task(self._execute_pipeline(job_id, config))
        self._cleanup_tasks[job_id] = task

        await self._update_progress(
            job_id,
            PipelineStage.INITIALIZED,
            0,
            "Pipeline initialized, starting processing..."
        )

        return job_id
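
    # Illustrative usage sketch (not part of the pipeline itself). It assumes the
    # four services have already been constructed elsewhere; the variable names
    # and URL below are hypothetical.
    #
    #     pipeline = SummaryPipeline(video_service, transcript_service,
    #                                ai_service, cache_manager)
    #
    #     async def on_progress(job_id: str, progress: PipelineProgress) -> None:
    #         print(progress.stage, progress.percentage, progress.message)
    #
    #     job_id = await pipeline.process_video(
    #         "https://www.youtube.com/watch?v=<video-id>",
    #         config=PipelineConfig(),
    #         progress_callback=on_progress,
    #     )
    #     result = await pipeline.get_pipeline_result(job_id)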

    async def cancel_job(self, job_id: str) -> bool:
        """Cancel an active pipeline job.

        Args:
            job_id: Job ID to cancel

        Returns:
            True if cancellation was successful
        """
        if job_id not in self.active_jobs:
            return False

        # Mark job as cancelled
        self._cancelled_jobs.add(job_id)
        result = self.active_jobs[job_id]
        result.status = PipelineStage.CANCELLED

        # Cancel the background task
        if job_id in self._cleanup_tasks:
            task = self._cleanup_tasks[job_id]
            if not task.done():
                task.cancel()

        # Update progress
        await self._update_progress(
            job_id,
            PipelineStage.CANCELLED,
            result.progress_percentage if hasattr(result, 'progress_percentage') else 0,
            f"🛑 Cancelled: {result.display_name}",
            {"cancelled_at": datetime.utcnow().isoformat()}
        )

        # Send cancellation notification
        await websocket_manager.send_error_notification(job_id, {
            "error": "cancelled",
            "message": "Processing was cancelled by user",
            "timestamp": datetime.utcnow().isoformat()
        })

        # Clean up after a delay
        asyncio.create_task(self._cleanup_cancelled_job(job_id))
        return True

    async def _cleanup_cancelled_job(self, job_id: str):
        """Clean up a cancelled job after a delay."""
        await asyncio.sleep(5)  # Keep for 5 seconds for client to receive updates

        # Clean up tracking
        self._cancelled_jobs.discard(job_id)
        if job_id in self.active_jobs:
            del self.active_jobs[job_id]
        if job_id in self.progress_callbacks:
            del self.progress_callbacks[job_id]
        if job_id in self._cleanup_tasks:
            del self._cleanup_tasks[job_id]

    def _is_job_cancelled(self, job_id: str) -> bool:
        """Check if a job has been cancelled."""
        return job_id in self._cancelled_jobs

    async def _execute_pipeline(self, job_id: str, config: PipelineConfig):
        """Execute the complete processing pipeline.

        Args:
            job_id: Pipeline job identifier
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]

        try:
            # Stage 1: URL Validation
            if self._is_job_cancelled(job_id):
                return  # Early exit if cancelled

            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_URL,
                5,
                "Validating YouTube URL..."
            )
            video_id = await self.video_service.extract_video_id(result.video_url)
            result.video_id = video_id

            # Check cache for existing result
            cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
            if cached_result and cached_result.get("status") == PipelineStage.COMPLETED.value:
                await self._restore_from_cache(job_id, cached_result)
                return

            # Check cancellation before next stage
            if self._is_job_cancelled(job_id):
                return

            # Stage 2: Extract Video Metadata
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                15,
                "Extracting video information..."
            )
            metadata = await self._extract_enhanced_metadata(video_id)
            result.video_metadata = metadata

            # Now that we have metadata, update progress with video title
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_METADATA,
                18,
                f"Processing: {result.display_name}"
            )

            # Check cancellation before transcript extraction
            if self._is_job_cancelled(job_id):
                return

            # Stage 3: Extract Transcript
            await self._update_progress(
                job_id,
                PipelineStage.EXTRACTING_TRANSCRIPT,
                35,
                f"Extracting transcript for: {result.display_name}"
            )

            # Start transcript streaming if enabled
            await transcript_streaming_service.start_transcript_stream(
                job_id=job_id,
                video_id=video_id,
                source="hybrid",
                chunk_duration=30.0
            )

            try:
                transcript_result = await self.transcript_service.extract_transcript(video_id)
                result.transcript = transcript_result.transcript

                # Stream the transcript segments if available
                if hasattr(transcript_result, 'segments') and transcript_result.segments:
                    await transcript_streaming_service.stream_from_segments(
                        job_id=job_id,
                        segments=transcript_result.segments,
                        source=getattr(transcript_result, 'source', 'processed'),
                        chunk_duration=30.0
                    )

                # Complete the transcript stream
                await transcript_streaming_service.complete_transcript_stream(
                    job_id=job_id,
                    final_transcript=result.transcript,
                    metadata={
                        "video_id": video_id,
                        "source": getattr(transcript_result, 'source', 'processed'),
                        "language": getattr(transcript_result, 'language', 'auto'),
                        "segment_count": len(getattr(transcript_result, 'segments', []))
                    }
                )

                # Send browser notification for transcript completion
                video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
                transcript_url = f"/transcript/{job_id}" if job_id else None

                await browser_notification_service.send_transcript_ready_notification(
                    job_id=job_id,
                    video_title=video_title,
                    user_id=getattr(result, 'user_id', None),
                    transcript_url=transcript_url
                )

            except Exception as transcript_error:
                # Handle streaming error, then re-raise for the outer handler
                await transcript_streaming_service.handle_stream_error(
                    job_id=job_id,
                    error=transcript_error,
                    partial_transcript=getattr(result, 'transcript', None)
                )
                raise

            # Check cancellation before content analysis
            if self._is_job_cancelled(job_id):
                return

            # Stage 4: Analyze Content for Optimization
            await self._update_progress(
                job_id,
                PipelineStage.ANALYZING_CONTENT,
                50,
                f"Analyzing content: {result.display_name}"
            )
            content_analysis = await self._analyze_content_characteristics(
                result.transcript, metadata
            )
            optimized_config = self._optimize_config_for_content(config, content_analysis)

            # Check cancellation before expensive AI operation
            if self._is_job_cancelled(job_id):
                return

            # Stage 5: Generate Summary
            await self._update_progress(
                job_id,
                PipelineStage.GENERATING_SUMMARY,
                75,
                f"Generating AI summary for: {result.display_name}"
            )
            summary_result = await self._generate_optimized_summary(
                result.transcript, optimized_config, content_analysis
            )

            # Populate summary results
            result.summary = summary_result.summary
            result.key_points = summary_result.key_points
            result.main_themes = summary_result.main_themes
            result.actionable_insights = summary_result.actionable_insights
            result.confidence_score = summary_result.confidence_score
            result.processing_metadata = summary_result.processing_metadata
            result.cost_data = summary_result.cost_data

            # Stage 6: Quality Validation
            await self._update_progress(
                job_id,
                PipelineStage.VALIDATING_QUALITY,
                90,
                f"Validating quality: {result.display_name}"
            )
            quality_score = await self._validate_summary_quality(result, content_analysis)
            result.quality_score = quality_score

            # Check if quality meets threshold
            if quality_score < config.quality_threshold and result.retry_count < config.max_retries:
                await self._retry_with_improvements(job_id, config, "Low quality score")
                return

            # Stage 7: Complete
            result.completed_at = datetime.utcnow()
            result.processing_time_seconds = (
                result.completed_at - result.started_at
            ).total_seconds()
            result.status = PipelineStage.COMPLETED

            await self._update_progress(
                job_id,
                PipelineStage.COMPLETED,
                100,
                f"✅ Completed: {result.display_name}"
            )

            # Cache the result
            await self.cache_manager.cache_pipeline_result(job_id, result)

            # Save to database for unified storage across frontend and CLI
            try:
                from ..services.database_storage_service import database_storage_service
                saved_summary = database_storage_service.save_summary_from_pipeline(result)
                result.summary_id = saved_summary.id
                logger.info(f"Saved summary {saved_summary.id} to database for '{result.display_name}' (job {job_id})")
            except Exception as db_error:
                # Don't fail the pipeline if the database save fails
                logger.error(f"Failed to save summary for '{result.display_name}' to database: {db_error}")

            # Send completion notifications
            if config.enable_notifications:
                await self._send_completion_notifications(result)

        except Exception as e:
            await self._handle_pipeline_error(job_id, e, config)
        finally:
            # Cleanup task reference
            if job_id in self._cleanup_tasks:
                del self._cleanup_tasks[job_id]

    async def _extract_enhanced_metadata(self, video_id: str) -> Dict[str, Any]:
        """Extract rich video metadata using YouTube Data API.

        Args:
            video_id: YouTube video identifier

        Returns:
            Enhanced video metadata dictionary
        """
        try:
            # Check cache first
            cached_metadata = await self.cache_manager.get_cached_video_metadata(video_id)
            if cached_metadata:
                return cached_metadata

            # Use video service to get metadata
            metadata = await self.video_service.get_video_metadata(video_id)

            # Enhance with additional processing
            enhanced_metadata = {
                **metadata,
                "content_warnings": await self._detect_content_warnings(metadata),
                "estimated_complexity": self._estimate_content_complexity(metadata),
                "processing_hints": self._generate_processing_hints(metadata)
            }

            # Cache the enhanced metadata
            await self.cache_manager.cache_video_metadata(video_id, enhanced_metadata)

            return enhanced_metadata

        except Exception as e:
            # Return basic metadata if enhanced extraction fails
            return {
                "video_id": video_id,
                "title": f"Video {video_id}",
                "error": f"Enhanced metadata extraction failed: {str(e)}"
            }

    async def _detect_content_warnings(self, metadata: Dict[str, Any]) -> List[str]:
        """Detect potential content warnings from metadata."""
        warnings = []

        title = metadata.get("title", "").lower()
        description = metadata.get("description", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        # Check for mature content indicators
        mature_indicators = [
            "explicit", "mature", "adult", "nsfw", "warning",
            "violence", "profanity", "disturbing"
        ]

        for indicator in mature_indicators:
            if (indicator in title or
                    indicator in description or
                    any(indicator in tag for tag in tags)):
                warnings.append(f"Contains {indicator} content")

        return warnings

    def _estimate_content_complexity(self, metadata: Dict[str, Any]) -> str:
        """Estimate content complexity from metadata."""
        # Simple heuristic based on category and tags
        category = metadata.get("category", "").lower()
        tags = [tag.lower() for tag in metadata.get("tags", [])]

        technical_categories = ["science & technology", "education", "howto & style"]
        technical_tags = ["tutorial", "programming", "science", "technical", "academic"]

        if (category in technical_categories or
                any(tag in technical_tags for tag in tags)):
            return "high"
        elif category in ["entertainment", "gaming", "sports"]:
            return "low"
        else:
            return "medium"

    def _generate_processing_hints(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Generate hints for optimal processing based on metadata."""
        hints = {
            "recommended_summary_length": "standard",
            "focus_areas": [],
            "special_handling": []
        }

        # Duration-based hints
        duration_str = metadata.get("duration", "PT0S")
        duration_seconds = self._parse_iso_duration(duration_str)

        if duration_seconds > 3600:  # > 1 hour
            hints["recommended_summary_length"] = "detailed"
            hints["special_handling"].append("long_form_content")
        elif duration_seconds < 300:  # < 5 minutes
            hints["recommended_summary_length"] = "brief"
            hints["special_handling"].append("short_form_content")

        # Category-based hints
        category = metadata.get("category", "").lower()
        if "education" in category:
            hints["focus_areas"].extend(["learning_objectives", "key_concepts"])
        elif "tutorial" in category or "howto" in category:
            hints["focus_areas"].extend(["step_by_step_instructions", "practical_tips"])

        return hints

    def _parse_iso_duration(self, duration: str) -> int:
        """Parse ISO 8601 duration to seconds."""
        try:
            # Simple parser for PT#H#M#S format
            if not duration.startswith("PT"):
                return 0

            duration = duration[2:]  # Remove "PT"
            seconds = 0

            if "H" in duration:
                hours, duration = duration.split("H", 1)
                seconds += int(hours) * 3600

            if "M" in duration:
                minutes, duration = duration.split("M", 1)
                seconds += int(minutes) * 60

            if "S" in duration:
                secs = duration.split("S")[0]
                if secs:
                    seconds += int(secs)

            return seconds
        except (ValueError, AttributeError):
            return 0
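
    # Worked example of the parser above (illustrative only): "PT1H2M10S" yields
    # 1*3600 + 2*60 + 10 = 3730 seconds, "PT45S" yields 45, and any value that
    # does not start with "PT" (or fails to parse) falls back to 0.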

    async def _analyze_content_characteristics(
        self,
        transcript: str,
        metadata: Dict[str, Any]
    ) -> ContentAnalysis:
        """Analyze transcript and metadata to determine optimal processing strategy.

        Args:
            transcript: Video transcript text
            metadata: Video metadata dictionary

        Returns:
            ContentAnalysis object with processing insights
        """
        word_count = len(transcript.split())
        transcript_lower = transcript.lower()

        # Basic content type detection
        content_type = "general"
        technical_indicators = []
        educational_indicators = []
        entertainment_indicators = []

        # Technical content indicators
        technical_terms = [
            "algorithm", "function", "variable", "database", "api", "code",
            "programming", "software", "development", "technical", "implementation"
        ]
        for term in technical_terms:
            if term in transcript_lower:
                technical_indicators.append(term)

        # Educational content indicators
        educational_terms = [
            "learn", "tutorial", "explain", "understand", "concept", "example",
            "lesson", "course", "study", "knowledge", "teach", "instruction"
        ]
        for term in educational_terms:
            if term in transcript_lower:
                educational_indicators.append(term)

        # Entertainment content indicators
        entertainment_terms = [
            "funny", "story", "experience", "adventure", "review", "reaction",
            "entertainment", "fun", "hilarious", "amazing", "incredible"
        ]
        for term in entertainment_terms:
            if term in transcript_lower:
                entertainment_indicators.append(term)

        # Determine primary content type
        if len(technical_indicators) >= 3:
            content_type = "technical"
        elif len(educational_indicators) >= 3:
            content_type = "educational"
        elif len(entertainment_indicators) >= 2:
            content_type = "entertainment"

        # Complexity scoring based on sentence length and vocabulary
        sentences = [s.strip() for s in transcript.split('.') if s.strip()]
        avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0

        complexity_score = 0.5  # Base complexity

        if avg_sentence_length > 20:
            complexity_score = min(1.0, complexity_score + 0.3)
        elif avg_sentence_length < 10:
            complexity_score = max(0.1, complexity_score - 0.2)

        # Adjust based on technical indicators
        if len(technical_indicators) > 5:
            complexity_score = min(1.0, complexity_score + 0.2)

        return ContentAnalysis(
            transcript_length=len(transcript),
            word_count=word_count,
            estimated_reading_time=word_count / 250,  # Minutes at ~250 words per minute
            complexity_score=complexity_score,
            content_type=content_type,
            language=metadata.get("language", "en"),
            technical_indicators=technical_indicators,
            educational_indicators=educational_indicators,
            entertainment_indicators=entertainment_indicators
        )
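
    # Illustrative behaviour of the heuristic above: a transcript mentioning
    # "api", "database", and "code" (three technical terms) is classified as
    # "technical"; two entertainment terms such as "funny" and "review" are
    # enough for "entertainment" when the earlier checks miss; otherwise the
    # type stays "general".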

    def _optimize_config_for_content(
        self,
        base_config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> PipelineConfig:
        """Optimize processing configuration based on content analysis.

        Args:
            base_config: Original pipeline configuration
            analysis: Content analysis results

        Returns:
            Optimized pipeline configuration
        """
        optimized_config = PipelineConfig(**asdict(base_config))

        # Adjust summary length based on content
        if analysis.word_count > 5000 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "detailed"
        elif analysis.word_count < 500 and optimized_config.summary_length == "standard":
            optimized_config.summary_length = "brief"

        # Add focus areas based on content type
        if not optimized_config.focus_areas:
            optimized_config.focus_areas = []

        if analysis.content_type == "technical":
            optimized_config.focus_areas.extend([
                "technical concepts",
                "implementation details"
            ])
        elif analysis.content_type == "educational":
            optimized_config.focus_areas.extend([
                "learning objectives",
                "key concepts",
                "practical applications"
            ])
        elif analysis.content_type == "entertainment":
            optimized_config.focus_areas.extend([
                "main highlights",
                "key moments",
                "overall message"
            ])

        # Adjust quality threshold based on complexity
        if analysis.complexity_score > 0.7:
            optimized_config.quality_threshold = max(
                0.6,
                optimized_config.quality_threshold - 0.1
            )

        return optimized_config
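
    # Example of the adjustments above (illustrative): a 6,000-word "technical"
    # transcript with the default "standard" length is bumped to "detailed",
    # gains the "technical concepts" and "implementation details" focus areas,
    # and, if its complexity score exceeds 0.7, has its quality threshold
    # relaxed by 0.1 (never below 0.6).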

    async def _generate_optimized_summary(
        self,
        transcript: str,
        config: PipelineConfig,
        analysis: ContentAnalysis
    ) -> Any:
        """Generate summary with content-aware optimizations.

        Args:
            transcript: Video transcript
            config: Optimized pipeline configuration
            analysis: Content analysis results

        Returns:
            SummaryResult from AI service
        """
        from ..services.ai_service import SummaryRequest, SummaryLength

        # Map config to AI service parameters
        length_mapping = {
            "brief": SummaryLength.BRIEF,
            "standard": SummaryLength.STANDARD,
            "detailed": SummaryLength.DETAILED
        }

        summary_request = SummaryRequest(
            transcript=transcript,
            length=length_mapping[config.summary_length],
            focus_areas=config.focus_areas or [],
            language=analysis.language,
            include_timestamps=config.include_timestamps
        )

        # Add content-specific prompt enhancements
        if analysis.content_type == "technical":
            summary_request.focus_areas.append("explain technical concepts clearly")
        elif analysis.content_type == "educational":
            summary_request.focus_areas.append("highlight learning outcomes")
        elif analysis.content_type == "entertainment":
            summary_request.focus_areas.append("capture entertainment value")

        return await self.ai_service.generate_summary(summary_request)

    async def _validate_summary_quality(
        self,
        result: PipelineResult,
        analysis: ContentAnalysis
    ) -> float:
        """Validate and score summary quality.

        Args:
            result: Pipeline result with summary data
            analysis: Content analysis for validation context

        Returns:
            Quality score between 0.0 and 1.0
        """
        quality_score = 0.0

        if not result.summary:
            return quality_score

        # Check summary length appropriateness
        summary_word_count = len(result.summary.split())
        transcript_word_count = analysis.word_count

        # A good summary should be 5-15% of the original length
        compression_ratio = (
            summary_word_count / transcript_word_count
            if transcript_word_count > 0 else 0
        )

        if 0.05 <= compression_ratio <= 0.15:
            quality_score += 0.3
        elif 0.03 <= compression_ratio <= 0.20:
            quality_score += 0.2

        # Check key points availability and quality
        if result.key_points and len(result.key_points) >= 3:
            quality_score += 0.2

            # Quality check: ensure key points have substance
            avg_point_length = sum(len(point.split()) for point in result.key_points) / len(result.key_points)
            if avg_point_length >= 5:
                quality_score += 0.1

        # Check main themes availability
        if result.main_themes and len(result.main_themes) >= 2:
            quality_score += 0.15

        # Check actionable insights
        if result.actionable_insights and len(result.actionable_insights) >= 1:
            quality_score += 0.15

        # Use AI confidence score
        if result.confidence_score:
            if result.confidence_score > 0.8:
                quality_score += 0.2
            elif result.confidence_score > 0.6:
                quality_score += 0.1

        return min(1.0, quality_score)
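
    # Illustrative scoring walk-through: a summary at ~10% of the transcript
    # length (+0.3) with four substantive key points (+0.2, +0.1), three main
    # themes (+0.15), one actionable insight (+0.15) and an AI confidence of
    # 0.85 (+0.2) would total 1.1 and be capped at 1.0.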

    async def _retry_with_improvements(
        self,
        job_id: str,
        config: PipelineConfig,
        reason: str
    ):
        """Retry pipeline with improved configuration.

        Args:
            job_id: Pipeline job identifier
            config: Current pipeline configuration
            reason: Reason for retry
        """
        result = self.active_jobs[job_id]
        result.retry_count += 1

        await self._update_progress(
            job_id,
            PipelineStage.ANALYZING_CONTENT,
            40,
            f"Retrying '{result.display_name}' (attempt {result.retry_count + 1}/{config.max_retries + 1}): {reason}"
        )

        # Improve configuration for retry
        improved_config = PipelineConfig(**asdict(config))
        improved_config.summary_length = "detailed"  # Try more detailed summary
        improved_config.quality_threshold = max(
            0.5,
            config.quality_threshold - 0.1
        )  # Lower threshold slightly

        # Continue pipeline with improved config
        await self._execute_pipeline(job_id, improved_config)

    async def _handle_pipeline_error(
        self,
        job_id: str,
        error: Exception,
        config: PipelineConfig
    ):
        """Handle pipeline errors with retry logic.

        Args:
            job_id: Pipeline job identifier
            error: Exception that occurred
            config: Pipeline configuration
        """
        result = self.active_jobs[job_id]
        failed_stage = result.status  # Capture the stage where the error occurred
        result.status = PipelineStage.FAILED
        result.error = {
            "message": str(error),
            "type": type(error).__name__,
            "stage": failed_stage.value,
            "retry_count": result.retry_count
        }

        # Attempt retry if within limits
        if result.retry_count < config.max_retries:
            await asyncio.sleep(2 ** result.retry_count)  # Exponential backoff
            await self._retry_with_improvements(
                job_id,
                config,
                f"Error: {str(error)}"
            )
        else:
            result.completed_at = datetime.utcnow()
            await self._update_progress(
                job_id,
                PipelineStage.FAILED,
                0,
                f"❌ Failed '{result.display_name}' after {result.retry_count + 1} attempts: {str(error)}"
            )

            # Send error notification
            if config.enable_notifications:
                await self._send_error_notifications(result)
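
    # Backoff timing implied by the handler above: retry_count 0 waits 1s,
    # 1 waits 2s, 2 waits 4s (2 ** retry_count seconds) before retrying.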

    async def _update_progress(
        self,
        job_id: str,
        stage: PipelineStage,
        percentage: float,
        message: str,
        details: Optional[Dict[str, Any]] = None,
        sub_progress: Optional[Dict[str, Any]] = None
    ):
        """Update pipeline progress with enhanced tracking.

        Args:
            job_id: Pipeline job identifier
            stage: Current pipeline stage
            percentage: Progress percentage (0-100)
            message: Progress message
            details: Optional additional details
            sub_progress: Optional sub-task progress (e.g., chunk processing)
        """
        from ..core.websocket_manager import ProcessingStage, ProgressData

        result = self.active_jobs.get(job_id)
        if result:
            result.status = stage

            # Calculate time elapsed
            time_elapsed = (datetime.utcnow() - result.started_at).total_seconds()

            # Map PipelineStage to ProcessingStage
            stage_mapping = {
                PipelineStage.INITIALIZED: ProcessingStage.INITIALIZED,
                PipelineStage.VALIDATING_URL: ProcessingStage.VALIDATING_URL,
                PipelineStage.EXTRACTING_METADATA: ProcessingStage.EXTRACTING_METADATA,
                PipelineStage.EXTRACTING_TRANSCRIPT: ProcessingStage.EXTRACTING_TRANSCRIPT,
                PipelineStage.ANALYZING_CONTENT: ProcessingStage.ANALYZING_CONTENT,
                PipelineStage.GENERATING_SUMMARY: ProcessingStage.GENERATING_SUMMARY,
                PipelineStage.VALIDATING_QUALITY: ProcessingStage.VALIDATING_QUALITY,
                PipelineStage.COMPLETED: ProcessingStage.COMPLETED,
                PipelineStage.FAILED: ProcessingStage.FAILED,
                PipelineStage.CANCELLED: ProcessingStage.CANCELLED
            }

            processing_stage = stage_mapping.get(stage, ProcessingStage.INITIALIZED)

            # Estimate remaining time
            estimated_remaining = websocket_manager.estimate_remaining_time(job_id, percentage)

            # Create enhanced progress data
            progress_data = ProgressData(
                job_id=job_id,
                stage=processing_stage,
                percentage=percentage,
                message=message,
                time_elapsed=time_elapsed,
                estimated_remaining=estimated_remaining,
                sub_progress=sub_progress,
                details=details
            )

            # Update WebSocket manager with enhanced tracking
            websocket_manager.update_job_progress(job_id, progress_data)

        progress = PipelineProgress(
            stage=stage,
            percentage=percentage,
            message=message,
            current_step_details=details
        )

        # Send enhanced WebSocket update
        await websocket_manager.send_progress_update(job_id, {
            "stage": stage.value,
            "percentage": percentage,
            "message": message,
            "details": details,
            "sub_progress": sub_progress,
            "time_elapsed": time_elapsed if result else 0,
            "estimated_remaining": estimated_remaining if result else None
        })

        # Notify registered callbacks
        callbacks = self.progress_callbacks.get(job_id, [])
        for callback in callbacks:
            try:
                await callback(job_id, progress)
            except Exception as e:
                logger.error(f"Progress callback error: {e}")

        # Send progress notification for major milestones
        await self.notification_service.send_progress_notification(
            job_id,
            asdict(progress)
        )

    async def _send_completion_notifications(self, result: PipelineResult):
        """Send completion notifications via multiple channels.

        Args:
            result: Completed pipeline result
        """
        # Send via notification service
        notification_result = {
            "video_metadata": result.video_metadata,
            "processing_time_seconds": result.processing_time_seconds,
            "quality_score": result.quality_score,
            "summary": result.summary
        }

        await self.notification_service.send_completion_notification(
            result.job_id,
            notification_result
        )

        # Send via WebSocket
        await websocket_manager.send_completion_notification(result.job_id, {
            "status": "completed",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "quality_score": result.quality_score,
            "processing_time": result.processing_time_seconds
        })

        # Send browser notification
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        summary_url = f"/summary/{result.summary_id}" if result.summary_id else None

        await browser_notification_service.send_processing_complete_notification(
            job_id=result.job_id,
            video_title=video_title,
            user_id=getattr(result, 'user_id', None),
            summary_url=summary_url
        )

    async def _send_error_notifications(self, result: PipelineResult):
        """Send error notifications via multiple channels.

        Args:
            result: Failed pipeline result
        """
        await self.notification_service.send_error_notification(
            result.job_id,
            result.error
        )

        await websocket_manager.send_error_notification(result.job_id, {
            "status": "failed",
            "error": result.error
        })

        # Send browser notification for errors
        video_title = result.display_name or result.video_metadata.get("title", "Unknown Video")
        retry_url = f"/retry/{result.job_id}" if result.job_id else None
        error_message = result.error.get("message", "Unknown error occurred") if result.error else "Unknown error occurred"

        await browser_notification_service.send_processing_failed_notification(
            job_id=result.job_id,
            video_title=video_title,
            error_message=error_message,
            user_id=getattr(result, 'user_id', None),
            retry_url=retry_url
        )

    async def _restore_from_cache(self, job_id: str, cached_result: Dict[str, Any]):
        """Restore pipeline result from cache.

        Args:
            job_id: Pipeline job identifier
            cached_result: Cached pipeline result data
        """
        # Convert cached data back to PipelineResult
        result = PipelineResult(**cached_result)
        self.active_jobs[job_id] = result

        await self._update_progress(
            job_id,
            PipelineStage.COMPLETED,
            100,
            "Retrieved from cache"
        )

        # Send completion notification
        await self._send_completion_notifications(result)

    async def get_pipeline_result(self, job_id: str) -> Optional[PipelineResult]:
        """Get pipeline result by job ID.

        Args:
            job_id: Pipeline job identifier

        Returns:
            PipelineResult if found, None otherwise
        """
        # Check active jobs first
        if job_id in self.active_jobs:
            return self.active_jobs[job_id]

        # Check cache for completed jobs
        cached_result = await self.cache_manager.get_cached_pipeline_result(job_id)
        if cached_result:
            return PipelineResult(**cached_result)

        return None

    async def cancel_pipeline(self, job_id: str) -> bool:
        """Cancel running pipeline.

        Args:
            job_id: Pipeline job identifier

        Returns:
            True if cancelled successfully, False otherwise
        """
        if job_id in self.active_jobs:
            result = self.active_jobs[job_id]
            result.status = PipelineStage.CANCELLED
            result.completed_at = datetime.utcnow()

            await self._update_progress(
                job_id,
                PipelineStage.CANCELLED,
                0,
                "Pipeline cancelled by user"
            )

            # Cancel the background task
            if job_id in self._cleanup_tasks:
                task = self._cleanup_tasks[job_id]
                if not task.done():
                    task.cancel()
                del self._cleanup_tasks[job_id]

            return True

        return False

    def get_active_jobs(self) -> List[str]:
        """Get list of active job IDs.

        Returns:
            List of active job IDs
        """
        return list(self.active_jobs.keys())

    async def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs from memory.

        Args:
            max_age_hours: Maximum age in hours before cleanup
        """
        cutoff_time = datetime.utcnow() - timedelta(hours=max_age_hours)

        jobs_to_remove = []
        for job_id, result in self.active_jobs.items():
            if (result.completed_at and
                    result.completed_at < cutoff_time and
                    result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED, PipelineStage.CANCELLED]):
                jobs_to_remove.append(job_id)

        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            if job_id in self.progress_callbacks:
                del self.progress_callbacks[job_id]
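

# Illustrative maintenance wiring (a sketch, not part of this module's API): a
# long-running application could drive cleanup_completed_jobs() from a simple
# background task. The function name and interval below are hypothetical.
#
#     async def periodic_cleanup(pipeline: SummaryPipeline) -> None:
#         while True:
#             await pipeline.cleanup_completed_jobs(max_age_hours=24)
#             await asyncio.sleep(3600)  # sweep once an hour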