"""Playlist analysis service for multi-video analysis with multi-agent system.""" import asyncio import logging import re from typing import Dict, List, Optional, Any, Tuple from datetime import datetime from urllib.parse import urlparse, parse_qs from ..core.exceptions import ServiceError from .multi_agent_orchestrator import MultiAgentVideoOrchestrator from .transcript_service import TranscriptService from .video_service import VideoService logger = logging.getLogger(__name__) class PlaylistAnalyzer: """Service for analyzing YouTube playlists with multi-agent system.""" def __init__( self, orchestrator: Optional[MultiAgentVideoOrchestrator] = None, transcript_service: Optional[TranscriptService] = None, video_service: Optional[VideoService] = None ): """Initialize the playlist analyzer. Args: orchestrator: Multi-agent orchestrator for video analysis transcript_service: Service for extracting video transcripts video_service: Service for video metadata and operations """ self.orchestrator = orchestrator or MultiAgentVideoOrchestrator() self.transcript_service = transcript_service or TranscriptService() self.video_service = video_service or VideoService() self._is_initialized = False async def initialize(self) -> None: """Initialize the playlist analyzer.""" if self._is_initialized: return logger.info("Initializing playlist analyzer") # Initialize the multi-agent orchestrator await self.orchestrator.initialize() self._is_initialized = True logger.info("Playlist analyzer initialized") async def shutdown(self) -> None: """Shutdown the playlist analyzer.""" if self.orchestrator: await self.orchestrator.shutdown() self._is_initialized = False logger.info("Playlist analyzer shutdown complete") def extract_playlist_id(self, playlist_url: str) -> Optional[str]: """Extract playlist ID from various YouTube playlist URL formats. Args: playlist_url: YouTube playlist URL Returns: Playlist ID if valid, None otherwise """ try: # Parse the URL parsed_url = urlparse(playlist_url) # Check if it's a valid YouTube domain if parsed_url.netloc not in ['youtube.com', 'www.youtube.com', 'youtu.be']: return None # Extract playlist ID from query parameters query_params = parse_qs(parsed_url.query) # Check for playlist ID in 'list' parameter if 'list' in query_params: playlist_id = query_params['list'][0] # Validate playlist ID format (typically starts with 'PL' and is 34 characters total) if re.match(r'^[A-Za-z0-9_-]{34}$', playlist_id): return playlist_id return None except Exception as e: logger.error(f"Error extracting playlist ID from {playlist_url}: {e}") return None def extract_video_ids_from_playlist(self, playlist_url: str) -> List[str]: """Extract video IDs from a YouTube playlist. Note: This is a simplified implementation. In production, you would use the YouTube Data API to get the actual video list from a playlist. 
    async def analyze_playlist(
        self,
        playlist_url: str,
        perspectives: Optional[List[str]] = None,
        max_videos: Optional[int] = None,
        include_cross_video_analysis: bool = True
    ) -> Dict[str, Any]:
        """Analyze all videos in a YouTube playlist using the multi-agent system.

        Args:
            playlist_url: YouTube playlist URL
            perspectives: List of perspectives to analyze (defaults to all)
            max_videos: Maximum number of videos to analyze (None = all)
            include_cross_video_analysis: Whether to perform cross-video analysis

        Returns:
            Complete playlist analysis result
        """
        if not self._is_initialized:
            await self.initialize()

        logger.info(f"Starting playlist analysis for: {playlist_url}")

        try:
            # Extract playlist and video information
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError(f"Invalid playlist URL: {playlist_url}")

            video_ids = self.extract_video_ids_from_playlist(playlist_url)
            if not video_ids:
                raise ServiceError(f"No videos found in playlist: {playlist_id}")

            # Limit the number of videos if requested
            if max_videos:
                video_ids = video_ids[:max_videos]

            logger.info(f"Analyzing {len(video_ids)} videos from playlist {playlist_id}")

            # Process each video sequentially with multi-agent analysis
            video_analyses = []
            total_processing_time = 0.0

            for i, video_id in enumerate(video_ids):
                logger.info(f"Processing video {i + 1}/{len(video_ids)}: {video_id}")

                try:
                    video_result = await self.analyze_single_video(
                        video_id=video_id,
                        perspectives=perspectives
                    )
                    if video_result:
                        video_analyses.append(video_result)
                        total_processing_time += video_result.get("processing_time_seconds", 0)
                except Exception as e:
                    logger.error(f"Error analyzing video {video_id}: {e}")
                    # Continue with the remaining videos even if one fails
                    video_analyses.append({
                        "video_id": video_id,
                        "status": "error",
                        "error": str(e),
                        "processing_time_seconds": 0
                    })

            # Perform cross-video analysis if requested
            cross_video_insights = {}
            if include_cross_video_analysis and len(video_analyses) > 1:
                cross_video_insights = await self.perform_cross_video_analysis(video_analyses)

            # Overall playlist quality score and key themes across all videos
            playlist_quality = self._calculate_playlist_quality(video_analyses)
            playlist_themes = self._extract_playlist_themes(video_analyses)

            result = {
                "playlist_id": playlist_id,
                "playlist_url": playlist_url,
                "video_count": len(video_ids),
                "successfully_analyzed": len(
                    [v for v in video_analyses if v.get("status") != "error"]
                ),
                "video_analyses": video_analyses,
                "cross_video_insights": cross_video_insights,
                "playlist_themes": playlist_themes,
                "overall_quality_score": playlist_quality,
                "total_processing_time_seconds": total_processing_time,
                "analyzed_at": datetime.now().isoformat()
            }

            logger.info(
                f"Playlist analysis completed for {playlist_id} "
                f"in {total_processing_time:.2f}s"
            )
            return result

        except ServiceError:
            # Already a service-level error; don't double-wrap the message
            raise
        except Exception as e:
            logger.error(f"Error in playlist analysis: {e}")
            raise ServiceError(f"Playlist analysis failed: {str(e)}") from e
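    # The loop above processes videos one at a time. If the orchestrator
    # tolerates concurrent calls (an assumption), a bounded-concurrency variant
    # could replace it; a sketch (the limit of 3 is an arbitrary choice):
    #
    #     semaphore = asyncio.Semaphore(3)
    #
    #     async def analyze_bounded(video_id: str) -> Optional[Dict[str, Any]]:
    #         async with semaphore:
    #             return await self.analyze_single_video(
    #                 video_id=video_id, perspectives=perspectives
    #             )
    #
    #     video_analyses = list(
    #         await asyncio.gather(*(analyze_bounded(v) for v in video_ids))
    #     )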
    async def analyze_single_video(
        self,
        video_id: str,
        perspectives: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """Analyze a single video using the multi-agent system.

        Args:
            video_id: YouTube video ID
            perspectives: List of perspectives to analyze

        Returns:
            Video analysis result, or an error/skip record if analysis failed
        """
        try:
            # Get video metadata; fall back to a placeholder title on failure
            try:
                video_metadata = await self.video_service.get_video_metadata(video_id)
                video_title = video_metadata.get("title", f"Video {video_id}")
            except Exception as e:
                logger.warning(f"Could not get metadata for video {video_id}: {e}")
                video_title = f"Video {video_id}"
                video_metadata = {"title": video_title}

            # Extract the transcript; skip videos with too little content
            try:
                transcript = await self.transcript_service.extract_transcript(video_id)
                if not transcript or len(transcript.strip()) < 50:
                    logger.warning(f"Transcript too short for video {video_id}")
                    return {
                        "video_id": video_id,
                        "video_title": video_title,
                        "status": "skipped",
                        "reason": "transcript_too_short",
                        "processing_time_seconds": 0
                    }
            except Exception as e:
                logger.warning(f"Could not extract transcript for video {video_id}: {e}")
                return {
                    "video_id": video_id,
                    "video_title": video_title,
                    "status": "error",
                    "error": f"transcript_extraction_failed: {str(e)}",
                    "processing_time_seconds": 0
                }

            # Perform the multi-agent analysis
            analysis_result = await self.orchestrator.analyze_video_with_multiple_perspectives(
                transcript=transcript,
                video_id=video_id,
                video_title=video_title,
                perspectives=perspectives
            )

            # Attach video metadata to the result
            analysis_result["video_metadata"] = video_metadata
            analysis_result["transcript_length"] = len(transcript)
            analysis_result["status"] = "completed"

            return analysis_result

        except Exception as e:
            logger.error(f"Error analyzing single video {video_id}: {e}")
            return {
                "video_id": video_id,
                "status": "error",
                "error": str(e),
                "processing_time_seconds": 0
            }
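    # Transcript extraction is typically the flaky step in the method above.
    # If transient failures are common, a small retry with backoff could wrap
    # the extract_transcript call; a sketch (three attempts and exponential
    # delay are arbitrary choices):
    #
    #     for attempt in range(3):
    #         try:
    #             transcript = await self.transcript_service.extract_transcript(video_id)
    #             break
    #         except Exception:
    #             if attempt == 2:
    #                 raise
    #             await asyncio.sleep(2 ** attempt)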
    async def perform_cross_video_analysis(
        self,
        video_analyses: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Perform cross-video analysis to identify patterns and themes.

        Args:
            video_analyses: List of individual video analysis results

        Returns:
            Cross-video analysis insights
        """
        logger.info(f"Performing cross-video analysis on {len(video_analyses)} videos")

        try:
            # Only successful analyses contribute to cross-video insights
            successful_analyses = [
                analysis for analysis in video_analyses
                if analysis.get("status") == "completed"
            ]

            if len(successful_analyses) < 2:
                return {
                    "status": "skipped",
                    "reason": "insufficient_successful_analyses",
                    "minimum_required": 2,
                    "successful_count": len(successful_analyses)
                }

            common_themes = self._identify_common_themes(successful_analyses)
            content_progression = self._analyze_content_progression(successful_analyses)
            insight_patterns = self._analyze_insight_patterns(successful_analyses)
            quality_consistency = self._calculate_quality_consistency(successful_analyses)

            return {
                "status": "completed",
                "analyzed_videos": len(successful_analyses),
                "common_themes": common_themes,
                "content_progression": content_progression,
                "insight_patterns": insight_patterns,
                "quality_consistency": quality_consistency,
                "analysis_timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            logger.error(f"Error in cross-video analysis: {e}")
            return {
                "status": "error",
                "error": str(e)
            }

    def _identify_common_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Identify common themes across multiple video analyses."""
        theme_frequency: Dict[str, int] = {}

        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})

            # Collect themes from all perspectives
            for perspective_data in perspectives.values():
                focus_areas = perspective_data.get("focus_areas", [])
                key_insights = perspective_data.get("key_insights", [])

                # Count focus areas directly
                for area in focus_areas:
                    theme_frequency[area] = theme_frequency.get(area, 0) + 1

                # Simple keyword extraction from insights (in production, use NLP)
                for insight in key_insights:
                    for word in insight.lower().split():
                        if len(word) > 4:  # Filter out short words
                            theme_frequency[word] = theme_frequency.get(word, 0) + 1

        # Return the most common themes that appear more than once
        sorted_themes = sorted(theme_frequency.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in sorted_themes[:10] if count > 1]

    def _analyze_content_progression(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how content quality progresses across videos in the playlist."""
        progression = {
            "video_count": len(analyses),
            "average_quality": 0.0,
            "quality_trend": "stable",
            "complexity_evolution": "consistent"
        }

        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        progression["average_quality"] = (
            sum(quality_scores) / len(quality_scores) if quality_scores else 0.0
        )

        # Simple trend analysis: compare the first half against the second half
        if len(quality_scores) > 2:
            midpoint = len(quality_scores) // 2
            first_half_avg = sum(quality_scores[:midpoint]) / midpoint
            second_half_avg = (
                sum(quality_scores[midpoint:]) / (len(quality_scores) - midpoint)
            )

            if second_half_avg > first_half_avg + 0.1:
                progression["quality_trend"] = "improving"
            elif second_half_avg < first_half_avg - 0.1:
                progression["quality_trend"] = "declining"

        return progression
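    # Comparing halves is coarse; a least-squares slope over the score sequence
    # gives a smoother trend signal inside _analyze_content_progression above.
    # A sketch using the standard library (Python 3.10+; the +/-0.02 slope
    # threshold is an arbitrary choice):
    #
    #     from statistics import linear_regression
    #
    #     slope = linear_regression(range(len(quality_scores)), quality_scores).slope
    #     if slope > 0.02:
    #         progression["quality_trend"] = "improving"
    #     elif slope < -0.02:
    #         progression["quality_trend"] = "declining"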
    def _analyze_insight_patterns(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze patterns in insights across videos."""
        patterns = {
            "consistent_perspectives": [],
            "dominant_themes": [],
            "recurring_recommendations": []
        }

        perspective_consistency: Dict[str, int] = {}
        recommendation_frequency: Dict[str, int] = {}

        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})

            # Track how often each perspective appears
            for perspective_name in perspectives.keys():
                perspective_consistency[perspective_name] = (
                    perspective_consistency.get(perspective_name, 0) + 1
                )

            # Track recommendation keyword patterns
            for perspective_data in perspectives.values():
                for rec in perspective_data.get("recommendations", []):
                    # Simple keyword extraction from recommendations
                    key_words = [word.lower() for word in rec.split() if len(word) > 4]
                    for word in key_words[:3]:  # Take the first 3 significant words
                        recommendation_frequency[word] = (
                            recommendation_frequency.get(word, 0) + 1
                        )

        # Perspectives present in at least 80% of videos count as consistent
        total_videos = len(analyses)
        patterns["consistent_perspectives"] = [
            perspective for perspective, count in perspective_consistency.items()
            if count >= total_videos * 0.8
        ]

        # Recommendation keywords that recur across videos
        patterns["recurring_recommendations"] = [
            word for word, count in sorted(
                recommendation_frequency.items(), key=lambda x: x[1], reverse=True
            )[:5]
            if count > 1
        ]

        return patterns

    def _calculate_quality_consistency(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate quality consistency across videos."""
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]

        if not quality_scores:
            return {"consistency": "unknown", "variance": 0.0, "range": 0.0}

        avg_quality = sum(quality_scores) / len(quality_scores)
        variance = sum(
            (score - avg_quality) ** 2 for score in quality_scores
        ) / len(quality_scores)
        quality_range = max(quality_scores) - min(quality_scores)

        # Map the (population) variance to a consistency level
        if variance < 0.01:
            consistency = "very_high"
        elif variance < 0.05:
            consistency = "high"
        elif variance < 0.1:
            consistency = "moderate"
        else:
            consistency = "low"

        return {
            "consistency": consistency,
            "average_quality": avg_quality,
            "variance": variance,
            "range": quality_range,
            "min_quality": min(quality_scores),
            "max_quality": max(quality_scores)
        }

    def _calculate_playlist_quality(self, analyses: List[Dict[str, Any]]) -> float:
        """Calculate the overall playlist quality score."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]

        if not successful_analyses:
            return 0.0

        # Average quality of the successful analyses
        quality_scores = [
            analysis.get("quality_score", 0.0) for analysis in successful_analyses
        ]
        avg_quality = sum(quality_scores) / len(quality_scores)

        # Factor in the success rate so failed videos lower the overall score
        success_rate = len(successful_analyses) / len(analyses)

        # Weighted score: quality dominates, success rate contributes
        playlist_quality = (avg_quality * 0.8) + (success_rate * 0.2)
        return round(playlist_quality, 2)

    def _extract_playlist_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Extract key themes from the entire playlist."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]

        if not successful_analyses:
            return []

        # Collect themes from the individual video analyses
        all_themes = []
        for analysis in successful_analyses:
            # Unified insights, if available (top 3 from each video)
            unified_insights = analysis.get("unified_insights", [])
            all_themes.extend(unified_insights[:3])

            # Themes from the synthesis perspective, if available (top 2)
            perspectives = analysis.get("perspectives", {})
            if "synthesis" in perspectives:
                synthesis_insights = perspectives["synthesis"].get("unified_insights", [])
                all_themes.extend(synthesis_insights[:2])

        # Simple deduplication and ranking (in production, use more sophisticated NLP)
        theme_counts: Dict[str, int] = {}
        for theme in all_themes:
            # Extract key terms from each theme
            key_terms = [word.lower() for word in theme.split() if len(word) > 4]
            for term in key_terms[:2]:  # Take the first 2 significant terms
                theme_counts[term] = theme_counts.get(term, 0) + 1

        # Return the most common themes that appear more than once
        top_themes = sorted(theme_counts.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in top_themes[:8] if count > 1]
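    # The count-into-a-dict pattern above recurs in several helpers;
    # collections.Counter expresses it more compactly. A sketch (note: unlike
    # the method above, this counts every significant term rather than only
    # the first two per theme):
    #
    #     from collections import Counter
    #
    #     counts = Counter(
    #         term
    #         for theme in all_themes
    #         for term in theme.lower().split()
    #         if len(term) > 4
    #     )
    #     top_themes = [term for term, n in counts.most_common(8) if n > 1]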
    async def get_service_health(self) -> Dict[str, Any]:
        """Get the health status of the playlist analyzer service.

        Returns:
            Service health information
        """
        health_info = {
            "service": "playlist_analyzer",
            "initialized": self._is_initialized,
            "timestamp": datetime.now().isoformat()
        }

        if self._is_initialized and self.orchestrator:
            # Report the orchestrator's health alongside our own
            try:
                orchestrator_health = await self.orchestrator.get_orchestrator_health()
                health_info["orchestrator_health"] = orchestrator_health
                if orchestrator_health.get("status") == "healthy":
                    health_info["status"] = "healthy"
                else:
                    health_info["status"] = "degraded"
            except Exception as e:
                logger.error(f"Error getting orchestrator health: {e}")
                health_info["status"] = "error"
                health_info["error"] = str(e)
        else:
            health_info["status"] = "not_initialized"

        return health_info
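

# A minimal usage sketch: the playlist URL below is a placeholder ID in the
# accepted 34-character format, and the default orchestrator, transcript, and
# video services are assumed to work with their own defaults.
async def _demo() -> None:
    analyzer = PlaylistAnalyzer()
    try:
        result = await analyzer.analyze_playlist(
            "https://www.youtube.com/playlist?list=PL" + "x" * 32,
            max_videos=2,
        )
        print(result["overall_quality_score"], result["playlist_themes"])
    finally:
        await analyzer.shutdown()


if __name__ == "__main__":
    asyncio.run(_demo())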