youtube-summarizer/backend/services/playlist_analyzer.py

"""Playlist analysis service for multi-video analysis with multi-agent system."""
import asyncio
import logging
import re
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime
from urllib.parse import urlparse, parse_qs
from ..core.exceptions import ServiceError
from .multi_agent_orchestrator import MultiAgentVideoOrchestrator
from .transcript_service import TranscriptService
from .video_service import VideoService
logger = logging.getLogger(__name__)


class PlaylistAnalyzer:
    """Service for analyzing YouTube playlists with multi-agent system."""

    def __init__(
        self,
        orchestrator: Optional[MultiAgentVideoOrchestrator] = None,
        transcript_service: Optional[TranscriptService] = None,
        video_service: Optional[VideoService] = None
    ):
        """Initialize the playlist analyzer.

        Args:
            orchestrator: Multi-agent orchestrator for video analysis
            transcript_service: Service for extracting video transcripts
            video_service: Service for video metadata and operations
        """
        self.orchestrator = orchestrator or MultiAgentVideoOrchestrator()
        self.transcript_service = transcript_service or TranscriptService()
        self.video_service = video_service or VideoService()
        self._is_initialized = False

    async def initialize(self) -> None:
        """Initialize the playlist analyzer."""
        if self._is_initialized:
            return
        logger.info("Initializing playlist analyzer")
        # Initialize the multi-agent orchestrator
        await self.orchestrator.initialize()
        self._is_initialized = True
        logger.info("Playlist analyzer initialized")

    async def shutdown(self) -> None:
        """Shutdown the playlist analyzer."""
        if self.orchestrator:
            await self.orchestrator.shutdown()
        self._is_initialized = False
        logger.info("Playlist analyzer shutdown complete")

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract the playlist ID from various YouTube playlist URL formats.

        Args:
            playlist_url: YouTube playlist URL

        Returns:
            Playlist ID if valid, None otherwise
        """
        try:
            # Parse the URL
            parsed_url = urlparse(playlist_url)
            # Check that it's a YouTube domain
            if parsed_url.netloc not in (
                'youtube.com', 'www.youtube.com', 'm.youtube.com',
                'music.youtube.com', 'youtu.be'
            ):
                return None
            # Extract the playlist ID from the 'list' query parameter
            query_params = parse_qs(parsed_url.query)
            if 'list' in query_params:
                playlist_id = query_params['list'][0]
                # Validate the ID format: playlist IDs use URL-safe characters
                # and vary in length (e.g. 'PL...' IDs are 18 or 34 characters)
                if re.match(r'^[A-Za-z0-9_-]{13,42}$', playlist_id):
                    return playlist_id
            return None
        except Exception as e:
            logger.error(f"Error extracting playlist ID from {playlist_url}: {e}")
            return None
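
    # Example (illustrative IDs, not real playlists):
    #   analyzer.extract_playlist_id("https://www.youtube.com/playlist?list=PL" + "a" * 32)
    #     -> "PLaaaa..." (a 34-character ID is accepted)
    #   analyzer.extract_playlist_id("https://example.com/watch?list=PLxyz")
    #     -> None (not a YouTube domain)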

    def extract_video_ids_from_playlist(self, playlist_url: str) -> List[str]:
        """Extract video IDs from a YouTube playlist.

        Note: This is a simplified implementation. In production, you would use
        the YouTube Data API to get the actual video list for a playlist.

        Args:
            playlist_url: YouTube playlist URL

        Returns:
            List of video IDs (mock implementation)
        """
        # Mock implementation - in reality this would use the YouTube Data API
        playlist_id = self.extract_playlist_id(playlist_url)
        if not playlist_id:
            logger.error(f"Invalid playlist URL: {playlist_url}")
            return []
        # Mock video IDs for demonstration
        # In production, use: youtube.playlistItems().list(playlistId=playlist_id, part='snippet')
        mock_video_ids = [
            "dQw4w9WgXcQ",  # Rick Astley - Never Gonna Give You Up
            "9bZkp7q19f0",  # PSY - GANGNAM STYLE
            "kffacxfA7G4",  # Baby Shark Dance
        ]
        logger.info(f"Extracted {len(mock_video_ids)} videos from playlist {playlist_id}")
        return mock_video_ids
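
    # Illustrative sketch only (not wired into the service): what the mock
    # above could look like against the real YouTube Data API. Assumes the
    # google-api-python-client package and an API key; the method name and
    # api_key parameter are hypothetical, not part of this codebase.
    def _fetch_playlist_video_ids_via_api(self, playlist_id: str, api_key: str) -> List[str]:
        """Sketch: page through playlistItems to collect every video ID."""
        from googleapiclient.discovery import build  # deferred; optional dependency

        youtube = build("youtube", "v3", developerKey=api_key)
        video_ids: List[str] = []
        page_token = None
        while True:
            response = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=playlist_id,
                maxResults=50,  # the API maximum per page
                pageToken=page_token
            ).execute()
            video_ids.extend(
                item["contentDetails"]["videoId"]
                for item in response.get("items", [])
            )
            page_token = response.get("nextPageToken")
            if not page_token:
                return video_ids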

    async def analyze_playlist(
        self,
        playlist_url: str,
        perspectives: Optional[List[str]] = None,
        max_videos: Optional[int] = None,
        include_cross_video_analysis: bool = True
    ) -> Dict[str, Any]:
        """Analyze all videos in a YouTube playlist using the multi-agent system.

        Args:
            playlist_url: YouTube playlist URL
            perspectives: List of perspectives to analyze (defaults to all)
            max_videos: Maximum number of videos to analyze (None = all)
            include_cross_video_analysis: Whether to perform cross-video analysis

        Returns:
            Complete playlist analysis result
        """
        if not self._is_initialized:
            await self.initialize()
        logger.info(f"Starting playlist analysis for: {playlist_url}")
        try:
            # Extract playlist and video information
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError(f"Invalid playlist URL: {playlist_url}")
            # Get video IDs from the playlist
            video_ids = self.extract_video_ids_from_playlist(playlist_url)
            if not video_ids:
                raise ServiceError(f"No videos found in playlist: {playlist_id}")
            # Limit the number of videos if a cap was given
            if max_videos is not None:
                video_ids = video_ids[:max_videos]
            logger.info(f"Analyzing {len(video_ids)} videos from playlist {playlist_id}")
            # Process each video with multi-agent analysis
            video_analyses = []
            total_processing_time = 0.0
            for i, video_id in enumerate(video_ids):
                logger.info(f"Processing video {i + 1}/{len(video_ids)}: {video_id}")
                try:
                    # Analyze a single video
                    video_result = await self.analyze_single_video(
                        video_id=video_id,
                        perspectives=perspectives
                    )
                    if video_result:
                        video_analyses.append(video_result)
                        total_processing_time += video_result.get("processing_time_seconds", 0)
                except Exception as e:
                    logger.error(f"Error analyzing video {video_id}: {e}")
                    # Continue with the other videos even if one fails
                    video_analyses.append({
                        "video_id": video_id,
                        "status": "error",
                        "error": str(e),
                        "processing_time_seconds": 0
                    })
            # Perform cross-video analysis if requested
            cross_video_insights = {}
            if include_cross_video_analysis and len(video_analyses) > 1:
                cross_video_insights = await self.perform_cross_video_analysis(video_analyses)
            # Calculate the overall playlist quality score
            playlist_quality = self._calculate_playlist_quality(video_analyses)
            # Extract key themes across all videos
            playlist_themes = self._extract_playlist_themes(video_analyses)
            # Build the final result
            result = {
                "playlist_id": playlist_id,
                "playlist_url": playlist_url,
                "video_count": len(video_ids),
                "successfully_analyzed": len([v for v in video_analyses if v.get("status") != "error"]),
                "video_analyses": video_analyses,
                "cross_video_insights": cross_video_insights,
                "playlist_themes": playlist_themes,
                "overall_quality_score": playlist_quality,
                "total_processing_time_seconds": total_processing_time,
                "analyzed_at": datetime.now().isoformat()
            }
            logger.info(f"Playlist analysis completed for {playlist_id} in {total_processing_time:.2f}s")
            return result
        except ServiceError:
            # Re-raise our own errors without double-wrapping them
            raise
        except Exception as e:
            logger.error(f"Error in playlist analysis: {e}")
            raise ServiceError(f"Playlist analysis failed: {e}") from e

    async def analyze_single_video(
        self,
        video_id: str,
        perspectives: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """Analyze a single video using the multi-agent system.

        Args:
            video_id: YouTube video ID
            perspectives: List of perspectives to analyze

        Returns:
            Video analysis result or None if failed
        """
        try:
            # Get video metadata
            try:
                video_metadata = await self.video_service.get_video_metadata(video_id)
                video_title = video_metadata.get("title", f"Video {video_id}")
            except Exception as e:
                logger.warning(f"Could not get metadata for video {video_id}: {e}")
                video_title = f"Video {video_id}"
                video_metadata = {"title": video_title}
            # Extract the transcript
            try:
                transcript = await self.transcript_service.extract_transcript(video_id)
                if not transcript or len(transcript.strip()) < 50:
                    logger.warning(f"Transcript too short for video {video_id}")
                    return {
                        "video_id": video_id,
                        "video_title": video_title,
                        "status": "skipped",
                        "reason": "transcript_too_short",
                        "processing_time_seconds": 0
                    }
            except Exception as e:
                logger.warning(f"Could not extract transcript for video {video_id}: {e}")
                return {
                    "video_id": video_id,
                    "video_title": video_title,
                    "status": "error",
                    "error": f"transcript_extraction_failed: {e}",
                    "processing_time_seconds": 0
                }
            # Perform the multi-agent analysis
            analysis_result = await self.orchestrator.analyze_video_with_multiple_perspectives(
                transcript=transcript,
                video_id=video_id,
                video_title=video_title,
                perspectives=perspectives
            )
            # Attach video metadata to the result
            analysis_result["video_metadata"] = video_metadata
            analysis_result["transcript_length"] = len(transcript)
            analysis_result["status"] = "completed"
            return analysis_result
        except Exception as e:
            logger.error(f"Error analyzing single video {video_id}: {e}")
            return {
                "video_id": video_id,
                "status": "error",
                "error": str(e),
                "processing_time_seconds": 0
            }

    async def perform_cross_video_analysis(
        self,
        video_analyses: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Perform cross-video analysis to identify patterns and themes.

        Args:
            video_analyses: List of individual video analysis results

        Returns:
            Cross-video analysis insights
        """
        logger.info(f"Performing cross-video analysis on {len(video_analyses)} videos")
        try:
            # Keep only the successful analyses
            successful_analyses = [
                analysis for analysis in video_analyses
                if analysis.get("status") == "completed"
            ]
            if len(successful_analyses) < 2:
                return {
                    "status": "skipped",
                    "reason": "insufficient_successful_analyses",
                    "minimum_required": 2,
                    "successful_count": len(successful_analyses)
                }
            # Extract common themes across videos
            common_themes = self._identify_common_themes(successful_analyses)
            # Analyze content progression
            content_progression = self._analyze_content_progression(successful_analyses)
            # Identify patterns in the key insights
            insight_patterns = self._analyze_insight_patterns(successful_analyses)
            # Calculate cross-video quality consistency
            quality_consistency = self._calculate_quality_consistency(successful_analyses)
            return {
                "status": "completed",
                "analyzed_videos": len(successful_analyses),
                "common_themes": common_themes,
                "content_progression": content_progression,
                "insight_patterns": insight_patterns,
                "quality_consistency": quality_consistency,
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error in cross-video analysis: {e}")
            return {
                "status": "error",
                "error": str(e)
            }

    def _identify_common_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Identify common themes across multiple video analyses."""
        theme_frequency = {}
        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})
            # Collect themes from all perspectives
            for perspective_data in perspectives.values():
                focus_areas = perspective_data.get("focus_areas", [])
                key_insights = perspective_data.get("key_insights", [])
                # Count focus areas
                for area in focus_areas:
                    theme_frequency[area] = theme_frequency.get(area, 0) + 1
                # Extract keywords from insights
                for insight in key_insights:
                    # Simple keyword extraction (in production, use NLP)
                    words = insight.lower().split()
                    for word in words:
                        if len(word) > 4:  # Filter out short words
                            theme_frequency[word] = theme_frequency.get(word, 0) + 1
        # Return the most common themes
        sorted_themes = sorted(theme_frequency.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in sorted_themes[:10] if count > 1]
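
    # Note: the manual frequency dicts in this class are equivalent to the
    # stdlib Counter idiom, e.g.:
    #
    #   from collections import Counter
    #   theme_frequency = Counter(w for w in words if len(w) > 4)
    #   top = [t for t, c in theme_frequency.most_common(10) if c > 1]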

    def _analyze_content_progression(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how content progresses across videos in the playlist."""
        progression = {
            "video_count": len(analyses),
            "average_quality": 0.0,
            "quality_trend": "stable",
            "complexity_evolution": "consistent"
        }
        # Calculate the average quality
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        progression["average_quality"] = sum(quality_scores) / len(quality_scores) if quality_scores else 0.0
        # Simple trend analysis: compare the first half's average with the second half's
        if len(quality_scores) > 2:
            midpoint = len(quality_scores) // 2
            first_half_avg = sum(quality_scores[:midpoint]) / midpoint
            second_half_avg = sum(quality_scores[midpoint:]) / (len(quality_scores) - midpoint)
            if second_half_avg > first_half_avg + 0.1:
                progression["quality_trend"] = "improving"
            elif second_half_avg < first_half_avg - 0.1:
                progression["quality_trend"] = "declining"
        return progression

    def _analyze_insight_patterns(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze patterns in insights across videos."""
        patterns = {
            "consistent_perspectives": [],
            "dominant_themes": [],
            "recurring_recommendations": []
        }
        perspective_consistency = {}
        recommendation_frequency = {}
        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})
            # Track how often each perspective appears
            for perspective_name in perspectives.keys():
                perspective_consistency[perspective_name] = perspective_consistency.get(perspective_name, 0) + 1
            # Track recommendation patterns
            for perspective_data in perspectives.values():
                recommendations = perspective_data.get("recommendations", [])
                for rec in recommendations:
                    # Simple keyword extraction from recommendations
                    key_words = [word.lower() for word in rec.split() if len(word) > 4]
                    for word in key_words[:3]:  # Take the first 3 significant words
                        recommendation_frequency[word] = recommendation_frequency.get(word, 0) + 1
        # Identify perspectives that appear consistently
        total_videos = len(analyses)
        patterns["consistent_perspectives"] = [
            perspective for perspective, count in perspective_consistency.items()
            if count >= total_videos * 0.8  # Present in at least 80% of videos
        ]
        # Identify recurring recommendations
        patterns["recurring_recommendations"] = [
            word for word, count in sorted(recommendation_frequency.items(), key=lambda x: x[1], reverse=True)[:5]
            if count > 1
        ]
        return patterns

    def _calculate_quality_consistency(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate quality consistency across videos."""
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        if not quality_scores:
            return {"consistency": "unknown", "variance": 0.0, "range": 0.0}
        avg_quality = sum(quality_scores) / len(quality_scores)
        variance = sum((score - avg_quality) ** 2 for score in quality_scores) / len(quality_scores)
        quality_range = max(quality_scores) - min(quality_scores)
        # Map the variance onto a consistency level
        if variance < 0.01:
            consistency = "very_high"
        elif variance < 0.05:
            consistency = "high"
        elif variance < 0.1:
            consistency = "moderate"
        else:
            consistency = "low"
        return {
            "consistency": consistency,
            "average_quality": avg_quality,
            "variance": variance,
            "range": quality_range,
            "min_quality": min(quality_scores),
            "max_quality": max(quality_scores)
        }
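
    # Note: the mean and variance above match the stdlib's statistics.fmean()
    # and statistics.pvariance() (population variance), which could replace
    # the manual arithmetic:
    #
    #   import statistics
    #   avg_quality = statistics.fmean(quality_scores)
    #   variance = statistics.pvariance(quality_scores, mu=avg_quality)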

    def _calculate_playlist_quality(self, analyses: List[Dict[str, Any]]) -> float:
        """Calculate the overall playlist quality score."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]
        if not successful_analyses:
            return 0.0
        # Average quality of the successful analyses
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in successful_analyses]
        avg_quality = sum(quality_scores) / len(quality_scores)
        # Factor in the success rate
        success_rate = len(successful_analyses) / len(analyses)
        # Weighted score: quality dominates, success rate contributes the rest
        playlist_quality = (avg_quality * 0.8) + (success_rate * 0.2)
        return round(playlist_quality, 2)

    def _extract_playlist_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Extract key themes from the entire playlist."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]
        if not successful_analyses:
            return []
        # Collect themes from all video analyses
        all_themes = []
        for analysis in successful_analyses:
            # Take the unified insights if available
            unified_insights = analysis.get("unified_insights", [])
            all_themes.extend(unified_insights[:3])  # Top 3 from each video
            # Also take themes from the synthesis perspective if available
            perspectives = analysis.get("perspectives", {})
            if "synthesis" in perspectives:
                synthesis_insights = perspectives["synthesis"].get("unified_insights", [])
                all_themes.extend(synthesis_insights[:2])  # Top 2 from the synthesis
        # Simple deduplication and ranking (in production, use more sophisticated NLP)
        theme_counts = {}
        for theme in all_themes:
            # Extract key terms from the theme
            key_terms = [word.lower() for word in theme.split() if len(word) > 4]
            for term in key_terms[:2]:  # Take the first 2 significant terms
                theme_counts[term] = theme_counts.get(term, 0) + 1
        # Return the most common themes
        top_themes = sorted(theme_counts.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in top_themes[:8] if count > 1]

    async def get_service_health(self) -> Dict[str, Any]:
        """Get the health status of the playlist analyzer service.

        Returns:
            Service health information
        """
        health_info = {
            "service": "playlist_analyzer",
            "initialized": self._is_initialized,
            "timestamp": datetime.now().isoformat()
        }
        if self._is_initialized and self.orchestrator:
            # Get the orchestrator's health
            try:
                orchestrator_health = await self.orchestrator.get_orchestrator_health()
                health_info["orchestrator_health"] = orchestrator_health
                if orchestrator_health.get("status") == "healthy":
                    health_info["status"] = "healthy"
                else:
                    health_info["status"] = "degraded"
            except Exception as e:
                logger.error(f"Error getting orchestrator health: {e}")
                health_info["status"] = "error"
                health_info["error"] = str(e)
        else:
            health_info["status"] = "not_initialized"
        return health_info
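

# Minimal usage sketch (assumes the backend package is importable, e.g. via
# `python -m backend.services.playlist_analyzer` from the project root, and
# that the orchestrator's backends are configured; the playlist URL below is
# illustrative, not a real playlist):
if __name__ == "__main__":
    async def _demo() -> None:
        analyzer = PlaylistAnalyzer()
        try:
            result = await analyzer.analyze_playlist(
                "https://www.youtube.com/playlist?list=PL" + "a" * 32,
                max_videos=2,
            )
            print(result["overall_quality_score"], result["playlist_themes"])
        finally:
            await analyzer.shutdown()

    asyncio.run(_demo())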