youtube-summarizer/backend/services/playlist_analyzer.py

"""Playlist analysis service for multi-video analysis with multi-agent system."""
import asyncio
import logging
import re
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime
from urllib.parse import urlparse, parse_qs
from ..core.exceptions import ServiceError
from .multi_agent_orchestrator import MultiAgentVideoOrchestrator
from .transcript_service import TranscriptService
from .video_service import VideoService
logger = logging.getLogger(__name__)


class PlaylistAnalyzer:
    """Service for analyzing YouTube playlists with multi-agent system."""

    def __init__(
        self,
        orchestrator: Optional[MultiAgentVideoOrchestrator] = None,
        transcript_service: Optional[TranscriptService] = None,
        video_service: Optional[VideoService] = None
    ):
        """Initialize the playlist analyzer.

        Args:
            orchestrator: Multi-agent orchestrator for video analysis
            transcript_service: Service for extracting video transcripts
            video_service: Service for video metadata and operations
        """
        self.orchestrator = orchestrator or MultiAgentVideoOrchestrator()
        self.transcript_service = transcript_service or TranscriptService()
        self.video_service = video_service or VideoService()
        self._is_initialized = False

    async def initialize(self) -> None:
        """Initialize the playlist analyzer."""
        if self._is_initialized:
            return
        logger.info("Initializing playlist analyzer")
        # Initialize the multi-agent orchestrator
        await self.orchestrator.initialize()
        self._is_initialized = True
        logger.info("Playlist analyzer initialized")

    async def shutdown(self) -> None:
        """Shutdown the playlist analyzer."""
        if self.orchestrator:
            await self.orchestrator.shutdown()
        self._is_initialized = False
        logger.info("Playlist analyzer shutdown complete")

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract the playlist ID from various YouTube playlist URL formats.

        Args:
            playlist_url: YouTube playlist URL

        Returns:
            Playlist ID if valid, None otherwise
        """
        try:
            # Parse the URL
            parsed_url = urlparse(playlist_url)
            # Check that it's a YouTube domain
            if parsed_url.netloc not in (
                'youtube.com', 'www.youtube.com', 'm.youtube.com',
                'music.youtube.com', 'youtu.be'
            ):
                return None
            # Extract the playlist ID from the 'list' query parameter
            query_params = parse_qs(parsed_url.query)
            if 'list' in query_params:
                playlist_id = query_params['list'][0]
                # Validate the ID format: playlist IDs use URL-safe characters
                # and vary in length (e.g. 'PL...' IDs are 18 or 34 characters)
                if re.match(r'^[A-Za-z0-9_-]{13,42}$', playlist_id):
                    return playlist_id
            return None
        except Exception as e:
            logger.error(f"Error extracting playlist ID from {playlist_url}: {e}")
            return None
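
    # Example (illustrative IDs, not real playlists):
    #   analyzer.extract_playlist_id("https://www.youtube.com/playlist?list=PL" + "a" * 32)
    #     -> "PLaaaa..." (a 34-character ID is accepted)
    #   analyzer.extract_playlist_id("https://example.com/watch?list=PLxyz")
    #     -> None (not a YouTube domain)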

    def extract_video_ids_from_playlist(self, playlist_url: str) -> List[str]:
        """Extract video IDs from a YouTube playlist.

        Note: This is a simplified implementation. In production, you would use
        the YouTube Data API to get the actual video list for a playlist.

        Args:
            playlist_url: YouTube playlist URL

        Returns:
            List of video IDs (mock implementation)
        """
        # Mock implementation - in reality this would use the YouTube Data API
        playlist_id = self.extract_playlist_id(playlist_url)
        if not playlist_id:
            logger.error(f"Invalid playlist URL: {playlist_url}")
            return []
        # Mock video IDs for demonstration
        # In production, use: youtube.playlistItems().list(playlistId=playlist_id, part='snippet')
        mock_video_ids = [
            "dQw4w9WgXcQ",  # Rick Astley - Never Gonna Give You Up
            "9bZkp7q19f0",  # PSY - GANGNAM STYLE
            "kffacxfA7G4",  # Baby Shark Dance
        ]
        logger.info(f"Extracted {len(mock_video_ids)} videos from playlist {playlist_id}")
        return mock_video_ids
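
    # Illustrative sketch only (not wired into the service): what the mock
    # above could look like against the real YouTube Data API. Assumes the
    # google-api-python-client package and an API key; the method name and
    # api_key parameter are hypothetical, not part of this codebase.
    def _fetch_playlist_video_ids_via_api(self, playlist_id: str, api_key: str) -> List[str]:
        """Sketch: page through playlistItems to collect every video ID."""
        from googleapiclient.discovery import build  # deferred; optional dependency

        youtube = build("youtube", "v3", developerKey=api_key)
        video_ids: List[str] = []
        page_token = None
        while True:
            response = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=playlist_id,
                maxResults=50,  # the API maximum per page
                pageToken=page_token
            ).execute()
            video_ids.extend(
                item["contentDetails"]["videoId"]
                for item in response.get("items", [])
            )
            page_token = response.get("nextPageToken")
            if not page_token:
                return video_ids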

    async def analyze_playlist(
        self,
        playlist_url: str,
        perspectives: Optional[List[str]] = None,
        max_videos: Optional[int] = None,
        include_cross_video_analysis: bool = True
    ) -> Dict[str, Any]:
        """Analyze all videos in a YouTube playlist using the multi-agent system.

        Args:
            playlist_url: YouTube playlist URL
            perspectives: List of perspectives to analyze (defaults to all)
            max_videos: Maximum number of videos to analyze (None = all)
            include_cross_video_analysis: Whether to perform cross-video analysis

        Returns:
            Complete playlist analysis result
        """
        if not self._is_initialized:
            await self.initialize()
        logger.info(f"Starting playlist analysis for: {playlist_url}")
        try:
            # Extract playlist and video information
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError(f"Invalid playlist URL: {playlist_url}")
            # Get video IDs from the playlist
            video_ids = self.extract_video_ids_from_playlist(playlist_url)
            if not video_ids:
                raise ServiceError(f"No videos found in playlist: {playlist_id}")
            # Limit the number of videos if a cap was given
            if max_videos is not None:
                video_ids = video_ids[:max_videos]
            logger.info(f"Analyzing {len(video_ids)} videos from playlist {playlist_id}")
            # Process each video with multi-agent analysis
            video_analyses = []
            total_processing_time = 0.0
            for i, video_id in enumerate(video_ids):
                logger.info(f"Processing video {i + 1}/{len(video_ids)}: {video_id}")
                try:
                    # Analyze a single video
                    video_result = await self.analyze_single_video(
                        video_id=video_id,
                        perspectives=perspectives
                    )
                    if video_result:
                        video_analyses.append(video_result)
                        total_processing_time += video_result.get("processing_time_seconds", 0)
                except Exception as e:
                    logger.error(f"Error analyzing video {video_id}: {e}")
                    # Continue with the other videos even if one fails
                    video_analyses.append({
                        "video_id": video_id,
                        "status": "error",
                        "error": str(e),
                        "processing_time_seconds": 0
                    })
            # Perform cross-video analysis if requested
            cross_video_insights = {}
            if include_cross_video_analysis and len(video_analyses) > 1:
                cross_video_insights = await self.perform_cross_video_analysis(video_analyses)
            # Calculate the overall playlist quality score
            playlist_quality = self._calculate_playlist_quality(video_analyses)
            # Extract key themes across all videos
            playlist_themes = self._extract_playlist_themes(video_analyses)
            # Build the final result
            result = {
                "playlist_id": playlist_id,
                "playlist_url": playlist_url,
                "video_count": len(video_ids),
                "successfully_analyzed": len([v for v in video_analyses if v.get("status") != "error"]),
                "video_analyses": video_analyses,
                "cross_video_insights": cross_video_insights,
                "playlist_themes": playlist_themes,
                "overall_quality_score": playlist_quality,
                "total_processing_time_seconds": total_processing_time,
                "analyzed_at": datetime.now().isoformat()
            }
            logger.info(f"Playlist analysis completed for {playlist_id} in {total_processing_time:.2f}s")
            return result
        except ServiceError:
            # Re-raise our own errors without double-wrapping them
            raise
        except Exception as e:
            logger.error(f"Error in playlist analysis: {e}")
            raise ServiceError(f"Playlist analysis failed: {e}") from e

    async def analyze_single_video(
        self,
        video_id: str,
        perspectives: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """Analyze a single video using the multi-agent system.

        Args:
            video_id: YouTube video ID
            perspectives: List of perspectives to analyze

        Returns:
            Video analysis result or None if failed
        """
        try:
            # Get video metadata
            try:
                video_metadata = await self.video_service.get_video_metadata(video_id)
                video_title = video_metadata.get("title", f"Video {video_id}")
            except Exception as e:
                logger.warning(f"Could not get metadata for video {video_id}: {e}")
                video_title = f"Video {video_id}"
                video_metadata = {"title": video_title}
            # Extract the transcript
            try:
                transcript = await self.transcript_service.extract_transcript(video_id)
                if not transcript or len(transcript.strip()) < 50:
                    logger.warning(f"Transcript too short for video {video_id}")
                    return {
                        "video_id": video_id,
                        "video_title": video_title,
                        "status": "skipped",
                        "reason": "transcript_too_short",
                        "processing_time_seconds": 0
                    }
            except Exception as e:
                logger.warning(f"Could not extract transcript for video {video_id}: {e}")
                return {
                    "video_id": video_id,
                    "video_title": video_title,
                    "status": "error",
                    "error": f"transcript_extraction_failed: {e}",
                    "processing_time_seconds": 0
                }
            # Perform the multi-agent analysis
            analysis_result = await self.orchestrator.analyze_video_with_multiple_perspectives(
                transcript=transcript,
                video_id=video_id,
                video_title=video_title,
                perspectives=perspectives
            )
            # Attach video metadata to the result
            analysis_result["video_metadata"] = video_metadata
            analysis_result["transcript_length"] = len(transcript)
            analysis_result["status"] = "completed"
            return analysis_result
        except Exception as e:
            logger.error(f"Error analyzing single video {video_id}: {e}")
            return {
                "video_id": video_id,
                "status": "error",
                "error": str(e),
                "processing_time_seconds": 0
            }

    async def perform_cross_video_analysis(
        self,
        video_analyses: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Perform cross-video analysis to identify patterns and themes.

        Args:
            video_analyses: List of individual video analysis results

        Returns:
            Cross-video analysis insights
        """
        logger.info(f"Performing cross-video analysis on {len(video_analyses)} videos")
        try:
            # Keep only the successful analyses
            successful_analyses = [
                analysis for analysis in video_analyses
                if analysis.get("status") == "completed"
            ]
            if len(successful_analyses) < 2:
                return {
                    "status": "skipped",
                    "reason": "insufficient_successful_analyses",
                    "minimum_required": 2,
                    "successful_count": len(successful_analyses)
                }
            # Extract common themes across videos
            common_themes = self._identify_common_themes(successful_analyses)
            # Analyze content progression
            content_progression = self._analyze_content_progression(successful_analyses)
            # Identify patterns in the key insights
            insight_patterns = self._analyze_insight_patterns(successful_analyses)
            # Calculate cross-video quality consistency
            quality_consistency = self._calculate_quality_consistency(successful_analyses)
            return {
                "status": "completed",
                "analyzed_videos": len(successful_analyses),
                "common_themes": common_themes,
                "content_progression": content_progression,
                "insight_patterns": insight_patterns,
                "quality_consistency": quality_consistency,
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error in cross-video analysis: {e}")
            return {
                "status": "error",
                "error": str(e)
            }

    def _identify_common_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Identify common themes across multiple video analyses."""
        theme_frequency = {}
        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})
            # Collect themes from all perspectives
            for perspective_data in perspectives.values():
                focus_areas = perspective_data.get("focus_areas", [])
                key_insights = perspective_data.get("key_insights", [])
                # Count focus areas
                for area in focus_areas:
                    theme_frequency[area] = theme_frequency.get(area, 0) + 1
                # Extract keywords from insights
                for insight in key_insights:
                    # Simple keyword extraction (in production, use NLP)
                    words = insight.lower().split()
                    for word in words:
                        if len(word) > 4:  # Filter out short words
                            theme_frequency[word] = theme_frequency.get(word, 0) + 1
        # Return the most common themes
        sorted_themes = sorted(theme_frequency.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in sorted_themes[:10] if count > 1]
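
    # Note: the manual frequency dicts in this class are equivalent to the
    # stdlib Counter idiom, e.g.:
    #
    #   from collections import Counter
    #   theme_frequency = Counter(w for w in words if len(w) > 4)
    #   top = [t for t, c in theme_frequency.most_common(10) if c > 1]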

    def _analyze_content_progression(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how content progresses across videos in the playlist."""
        progression = {
            "video_count": len(analyses),
            "average_quality": 0.0,
            "quality_trend": "stable",
            "complexity_evolution": "consistent"
        }
        # Calculate the average quality
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        progression["average_quality"] = sum(quality_scores) / len(quality_scores) if quality_scores else 0.0
        # Simple trend analysis: compare the first half's average with the second half's
        if len(quality_scores) > 2:
            midpoint = len(quality_scores) // 2
            first_half_avg = sum(quality_scores[:midpoint]) / midpoint
            second_half_avg = sum(quality_scores[midpoint:]) / (len(quality_scores) - midpoint)
            if second_half_avg > first_half_avg + 0.1:
                progression["quality_trend"] = "improving"
            elif second_half_avg < first_half_avg - 0.1:
                progression["quality_trend"] = "declining"
        return progression

    def _analyze_insight_patterns(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze patterns in insights across videos."""
        patterns = {
            "consistent_perspectives": [],
            "dominant_themes": [],
            "recurring_recommendations": []
        }
        perspective_consistency = {}
        recommendation_frequency = {}
        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})
            # Track how often each perspective appears
            for perspective_name in perspectives.keys():
                perspective_consistency[perspective_name] = perspective_consistency.get(perspective_name, 0) + 1
            # Track recommendation patterns
            for perspective_data in perspectives.values():
                recommendations = perspective_data.get("recommendations", [])
                for rec in recommendations:
                    # Simple keyword extraction from recommendations
                    key_words = [word.lower() for word in rec.split() if len(word) > 4]
                    for word in key_words[:3]:  # Take the first 3 significant words
                        recommendation_frequency[word] = recommendation_frequency.get(word, 0) + 1
        # Identify perspectives that appear consistently
        total_videos = len(analyses)
        patterns["consistent_perspectives"] = [
            perspective for perspective, count in perspective_consistency.items()
            if count >= total_videos * 0.8  # Present in at least 80% of videos
        ]
        # Identify recurring recommendations
        patterns["recurring_recommendations"] = [
            word for word, count in sorted(recommendation_frequency.items(), key=lambda x: x[1], reverse=True)[:5]
            if count > 1
        ]
        return patterns

    def _calculate_quality_consistency(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate quality consistency across videos."""
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        if not quality_scores:
            return {"consistency": "unknown", "variance": 0.0, "range": 0.0}
        avg_quality = sum(quality_scores) / len(quality_scores)
        variance = sum((score - avg_quality) ** 2 for score in quality_scores) / len(quality_scores)
        quality_range = max(quality_scores) - min(quality_scores)
        # Map the variance onto a consistency level
        if variance < 0.01:
            consistency = "very_high"
        elif variance < 0.05:
            consistency = "high"
        elif variance < 0.1:
            consistency = "moderate"
        else:
            consistency = "low"
        return {
            "consistency": consistency,
            "average_quality": avg_quality,
            "variance": variance,
            "range": quality_range,
            "min_quality": min(quality_scores),
            "max_quality": max(quality_scores)
        }
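
    # Note: the mean and variance above match the stdlib's statistics.fmean()
    # and statistics.pvariance() (population variance), which could replace
    # the manual arithmetic:
    #
    #   import statistics
    #   avg_quality = statistics.fmean(quality_scores)
    #   variance = statistics.pvariance(quality_scores, mu=avg_quality)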

    def _calculate_playlist_quality(self, analyses: List[Dict[str, Any]]) -> float:
        """Calculate the overall playlist quality score."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]
        if not successful_analyses:
            return 0.0
        # Average quality of the successful analyses
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in successful_analyses]
        avg_quality = sum(quality_scores) / len(quality_scores)
        # Factor in the success rate
        success_rate = len(successful_analyses) / len(analyses)
        # Weighted score: quality dominates, success rate contributes the rest
        playlist_quality = (avg_quality * 0.8) + (success_rate * 0.2)
        return round(playlist_quality, 2)

    def _extract_playlist_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Extract key themes from the entire playlist."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]
        if not successful_analyses:
            return []
        # Collect themes from all video analyses
        all_themes = []
        for analysis in successful_analyses:
            # Take the unified insights if available
            unified_insights = analysis.get("unified_insights", [])
            all_themes.extend(unified_insights[:3])  # Top 3 from each video
            # Also take themes from the synthesis perspective if available
            perspectives = analysis.get("perspectives", {})
            if "synthesis" in perspectives:
                synthesis_insights = perspectives["synthesis"].get("unified_insights", [])
                all_themes.extend(synthesis_insights[:2])  # Top 2 from the synthesis
        # Simple deduplication and ranking (in production, use more sophisticated NLP)
        theme_counts = {}
        for theme in all_themes:
            # Extract key terms from the theme
            key_terms = [word.lower() for word in theme.split() if len(word) > 4]
            for term in key_terms[:2]:  # Take the first 2 significant terms
                theme_counts[term] = theme_counts.get(term, 0) + 1
        # Return the most common themes
        top_themes = sorted(theme_counts.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in top_themes[:8] if count > 1]

    async def get_service_health(self) -> Dict[str, Any]:
        """Get the health status of the playlist analyzer service.

        Returns:
            Service health information
        """
        health_info = {
            "service": "playlist_analyzer",
            "initialized": self._is_initialized,
            "timestamp": datetime.now().isoformat()
        }
        if self._is_initialized and self.orchestrator:
            # Get the orchestrator's health
            try:
                orchestrator_health = await self.orchestrator.get_orchestrator_health()
                health_info["orchestrator_health"] = orchestrator_health
                if orchestrator_health.get("status") == "healthy":
                    health_info["status"] = "healthy"
                else:
                    health_info["status"] = "degraded"
            except Exception as e:
                logger.error(f"Error getting orchestrator health: {e}")
                health_info["status"] = "error"
                health_info["error"] = str(e)
        else:
            health_info["status"] = "not_initialized"
        return health_info
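

# Minimal usage sketch (assumes the backend package is importable, e.g. via
# `python -m backend.services.playlist_analyzer` from the project root, and
# that the orchestrator's backends are configured; the playlist URL below is
# illustrative, not a real playlist):
if __name__ == "__main__":
    async def _demo() -> None:
        analyzer = PlaylistAnalyzer()
        try:
            result = await analyzer.analyze_playlist(
                "https://www.youtube.com/playlist?list=PL" + "a" * 32,
                max_videos=2,
            )
            print(result["overall_quality_score"], result["playlist_themes"])
        finally:
            await analyzer.shutdown()

    asyncio.run(_demo())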