"""Playlist analysis service for multi-video analysis with multi-agent system."""
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from datetime import datetime
|
|
from urllib.parse import urlparse, parse_qs
|
|
|
|
from ..core.exceptions import ServiceError
|
|
from .multi_agent_orchestrator import MultiAgentVideoOrchestrator
|
|
from .transcript_service import TranscriptService
|
|
from .video_service import VideoService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PlaylistAnalyzer:
    """Service for analyzing YouTube playlists with a multi-agent system."""

    def __init__(
        self,
        orchestrator: Optional[MultiAgentVideoOrchestrator] = None,
        transcript_service: Optional[TranscriptService] = None,
        video_service: Optional[VideoService] = None,
    ):
        """Initialize the playlist analyzer.

        Args:
            orchestrator: Multi-agent orchestrator for video analysis.
            transcript_service: Service for extracting video transcripts.
            video_service: Service for video metadata and operations.
        """
        self.orchestrator = orchestrator or MultiAgentVideoOrchestrator()
        self.transcript_service = transcript_service or TranscriptService()
        self.video_service = video_service or VideoService()

        self._is_initialized = False

    async def initialize(self) -> None:
        """Initialize the playlist analyzer."""
        if self._is_initialized:
            return

        logger.info("Initializing playlist analyzer")

        # Initialize the multi-agent orchestrator
        await self.orchestrator.initialize()

        self._is_initialized = True
        logger.info("Playlist analyzer initialized")

    async def shutdown(self) -> None:
        """Shutdown the playlist analyzer."""
        if self.orchestrator:
            await self.orchestrator.shutdown()

        self._is_initialized = False
        logger.info("Playlist analyzer shutdown complete")

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract the playlist ID from various YouTube playlist URL formats.

        Args:
            playlist_url: YouTube playlist URL.

        Returns:
            Playlist ID if valid, None otherwise.
        """
        try:
            # Parse the URL
            parsed_url = urlparse(playlist_url)

            # Check that it is a valid YouTube domain
            if parsed_url.netloc not in ('youtube.com', 'www.youtube.com', 'm.youtube.com', 'youtu.be'):
                return None

            # Extract the playlist ID from the query parameters
            query_params = parse_qs(parsed_url.query)

            # Standard playlist URLs carry the ID in the 'list' parameter
            if 'list' in query_params:
                playlist_id = query_params['list'][0]
                # Validate the ID format: standard 'PL' playlists are 34
                # URL-safe characters ('PL' plus a 32-character identifier)
                if re.match(r'^[A-Za-z0-9_-]{34}$', playlist_id):
                    return playlist_id

            return None

        except Exception as e:
            logger.error(f"Error extracting playlist ID from {playlist_url}: {e}")
            return None
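
    # Illustrative behavior (hypothetical IDs), assuming the standard
    # 34-character 'PL...' format enforced above:
    #
    #   analyzer.extract_playlist_id(
    #       "https://www.youtube.com/playlist?list=PL" + "a" * 32
    #   )  # -> "PLaaa...a"
    #   analyzer.extract_playlist_id("https://example.com/?list=PLabc")
    #   # -> None (non-YouTube domain, and the ID is too short)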

    def extract_video_ids_from_playlist(self, playlist_url: str) -> List[str]:
        """Extract video IDs from a YouTube playlist.

        Note: This is a simplified implementation. In production, you would use
        the YouTube Data API to get the actual video list for a playlist (a
        sketch of that path follows this method).

        Args:
            playlist_url: YouTube playlist URL.

        Returns:
            List of video IDs (mock implementation).
        """
        # Mock implementation; in reality this would call the YouTube Data API
        playlist_id = self.extract_playlist_id(playlist_url)

        if not playlist_id:
            logger.error(f"Invalid playlist URL: {playlist_url}")
            return []

        # Mock video IDs for demonstration.
        # In production, use: youtube.playlistItems().list(playlistId=playlist_id, part='snippet')
        mock_video_ids = [
            "dQw4w9WgXcQ",  # Rick Astley - Never Gonna Give You Up
            "9bZkp7q19f0",  # PSY - GANGNAM STYLE
            "kffacxfA7G4",  # Baby Shark Dance
        ]

        logger.info(f"Extracted {len(mock_video_ids)} videos from playlist {playlist_id}")
        return mock_video_ids
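
    # A minimal sketch of the production path noted above, assuming the
    # google-api-python-client package and an API key supplied by the caller
    # (both are assumptions; nothing here is wired into this service). It
    # pages through playlistItems until every video ID has been collected.
    def _fetch_playlist_video_ids(self, playlist_id: str, api_key: str) -> List[str]:
        from googleapiclient.discovery import build  # assumed extra dependency

        youtube = build("youtube", "v3", developerKey=api_key)
        video_ids: List[str] = []
        request = youtube.playlistItems().list(
            part="contentDetails",
            playlistId=playlist_id,
            maxResults=50,
        )
        while request is not None:
            response = request.execute()
            # Each playlist item carries its video ID under contentDetails
            video_ids.extend(
                item["contentDetails"]["videoId"]
                for item in response.get("items", [])
            )
            # list_next returns None once the last page has been fetched
            request = youtube.playlistItems().list_next(request, response)
        return video_ids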

    async def analyze_playlist(
        self,
        playlist_url: str,
        perspectives: Optional[List[str]] = None,
        max_videos: Optional[int] = None,
        include_cross_video_analysis: bool = True,
    ) -> Dict[str, Any]:
        """Analyze all videos in a YouTube playlist using the multi-agent system.

        Args:
            playlist_url: YouTube playlist URL.
            perspectives: List of perspectives to analyze (defaults to all).
            max_videos: Maximum number of videos to analyze (None = all).
            include_cross_video_analysis: Whether to perform cross-video analysis.

        Returns:
            Complete playlist analysis result.
        """
        if not self._is_initialized:
            await self.initialize()

        logger.info(f"Starting playlist analysis for: {playlist_url}")

        try:
            # Extract playlist and video information
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError(f"Invalid playlist URL: {playlist_url}")

            # Get the video IDs from the playlist
            video_ids = self.extract_video_ids_from_playlist(playlist_url)

            if not video_ids:
                raise ServiceError(f"No videos found in playlist: {playlist_id}")

            # Limit the number of videos if requested
            if max_videos:
                video_ids = video_ids[:max_videos]

            logger.info(f"Analyzing {len(video_ids)} videos from playlist {playlist_id}")

            # Process each video sequentially with multi-agent analysis
            # (a bounded-concurrency sketch follows this method)
            video_analyses = []
            total_processing_time = 0.0

            for i, video_id in enumerate(video_ids):
                logger.info(f"Processing video {i + 1}/{len(video_ids)}: {video_id}")

                try:
                    # Analyze a single video
                    video_result = await self.analyze_single_video(
                        video_id=video_id,
                        perspectives=perspectives,
                    )

                    if video_result:
                        video_analyses.append(video_result)
                        total_processing_time += video_result.get("processing_time_seconds", 0)

                except Exception as e:
                    logger.error(f"Error analyzing video {video_id}: {e}")
                    # Continue with the remaining videos even if one fails
                    video_analyses.append({
                        "video_id": video_id,
                        "status": "error",
                        "error": str(e),
                        "processing_time_seconds": 0,
                    })

            # Perform cross-video analysis if requested
            cross_video_insights = {}
            if include_cross_video_analysis and len(video_analyses) > 1:
                cross_video_insights = await self.perform_cross_video_analysis(video_analyses)

            # Calculate the overall playlist quality score
            playlist_quality = self._calculate_playlist_quality(video_analyses)

            # Extract key themes across all videos
            playlist_themes = self._extract_playlist_themes(video_analyses)

            # Build the final result
            result = {
                "playlist_id": playlist_id,
                "playlist_url": playlist_url,
                "video_count": len(video_ids),
                "successfully_analyzed": len([v for v in video_analyses if v.get("status") != "error"]),
                "video_analyses": video_analyses,
                "cross_video_insights": cross_video_insights,
                "playlist_themes": playlist_themes,
                "overall_quality_score": playlist_quality,
                "total_processing_time_seconds": total_processing_time,
                "analyzed_at": datetime.now().isoformat(),
            }

            logger.info(f"Playlist analysis completed for {playlist_id} in {total_processing_time:.2f}s")
            return result

        except Exception as e:
            logger.error(f"Error in playlist analysis: {e}")
            raise ServiceError(f"Playlist analysis failed: {str(e)}") from e
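
    # The loop in analyze_playlist processes videos strictly one at a time. A
    # minimal bounded-concurrency sketch, assuming the orchestrator and the
    # underlying services tolerate concurrent calls (an assumption this file
    # does not verify): an asyncio.Semaphore caps in-flight analyses while
    # asyncio.gather preserves playlist order in the results.
    async def _analyze_videos_concurrently(
        self,
        video_ids: List[str],
        perspectives: Optional[List[str]] = None,
        max_concurrency: int = 3,
    ) -> List[Optional[Dict[str, Any]]]:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def analyze_with_limit(video_id: str) -> Optional[Dict[str, Any]]:
            # The semaphore caps how many analyses are in flight at once
            async with semaphore:
                return await self.analyze_single_video(
                    video_id=video_id,
                    perspectives=perspectives,
                )

        # gather preserves the playlist order in the returned results
        return await asyncio.gather(*(analyze_with_limit(v) for v in video_ids))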

    async def analyze_single_video(
        self,
        video_id: str,
        perspectives: Optional[List[str]] = None,
    ) -> Optional[Dict[str, Any]]:
        """Analyze a single video using the multi-agent system.

        Args:
            video_id: YouTube video ID.
            perspectives: List of perspectives to analyze.

        Returns:
            Video analysis result or None if failed.
        """
        try:
            # Get video metadata
            try:
                video_metadata = await self.video_service.get_video_metadata(video_id)
                video_title = video_metadata.get("title", f"Video {video_id}")
            except Exception as e:
                logger.warning(f"Could not get metadata for video {video_id}: {e}")
                video_title = f"Video {video_id}"
                video_metadata = {"title": video_title}

            # Extract the transcript
            try:
                transcript = await self.transcript_service.extract_transcript(video_id)

                if not transcript or len(transcript.strip()) < 50:
                    logger.warning(f"Transcript too short for video {video_id}")
                    return {
                        "video_id": video_id,
                        "video_title": video_title,
                        "status": "skipped",
                        "reason": "transcript_too_short",
                        "processing_time_seconds": 0,
                    }

            except Exception as e:
                logger.warning(f"Could not extract transcript for video {video_id}: {e}")
                return {
                    "video_id": video_id,
                    "video_title": video_title,
                    "status": "error",
                    "error": f"transcript_extraction_failed: {str(e)}",
                    "processing_time_seconds": 0,
                }

            # Perform the multi-agent analysis
            analysis_result = await self.orchestrator.analyze_video_with_multiple_perspectives(
                transcript=transcript,
                video_id=video_id,
                video_title=video_title,
                perspectives=perspectives,
            )

            # Attach the video metadata to the result
            analysis_result["video_metadata"] = video_metadata
            analysis_result["transcript_length"] = len(transcript)
            analysis_result["status"] = "completed"

            return analysis_result

        except Exception as e:
            logger.error(f"Error analyzing single video {video_id}: {e}")
            return {
                "video_id": video_id,
                "status": "error",
                "error": str(e),
                "processing_time_seconds": 0,
            }

    async def perform_cross_video_analysis(
        self,
        video_analyses: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Perform cross-video analysis to identify patterns and themes.

        Args:
            video_analyses: List of individual video analysis results.

        Returns:
            Cross-video analysis insights.
        """
        logger.info(f"Performing cross-video analysis on {len(video_analyses)} videos")

        try:
            # Keep only the successful analyses
            successful_analyses = [
                analysis for analysis in video_analyses
                if analysis.get("status") == "completed"
            ]

            if len(successful_analyses) < 2:
                return {
                    "status": "skipped",
                    "reason": "insufficient_successful_analyses",
                    "minimum_required": 2,
                    "successful_count": len(successful_analyses),
                }

            # Extract common themes across videos
            common_themes = self._identify_common_themes(successful_analyses)

            # Analyze how content progresses through the playlist
            content_progression = self._analyze_content_progression(successful_analyses)

            # Identify patterns in the key insights
            insight_patterns = self._analyze_insight_patterns(successful_analyses)

            # Calculate quality consistency across videos
            quality_consistency = self._calculate_quality_consistency(successful_analyses)

            return {
                "status": "completed",
                "analyzed_videos": len(successful_analyses),
                "common_themes": common_themes,
                "content_progression": content_progression,
                "insight_patterns": insight_patterns,
                "quality_consistency": quality_consistency,
                "analysis_timestamp": datetime.now().isoformat(),
            }

        except Exception as e:
            logger.error(f"Error in cross-video analysis: {e}")
            return {
                "status": "error",
                "error": str(e),
            }

    def _identify_common_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Identify common themes across multiple video analyses."""
        theme_frequency: Dict[str, int] = {}

        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})

            # Collect themes from all perspectives
            for perspective_data in perspectives.values():
                focus_areas = perspective_data.get("focus_areas", [])
                key_insights = perspective_data.get("key_insights", [])

                # Count focus areas
                for area in focus_areas:
                    theme_frequency[area] = theme_frequency.get(area, 0) + 1

                # Extract keywords from insights
                for insight in key_insights:
                    # Simple keyword extraction (in production, use NLP)
                    words = insight.lower().split()
                    for word in words:
                        if len(word) > 4:  # Filter out short words
                            theme_frequency[word] = theme_frequency.get(word, 0) + 1

        # Return the most common themes
        sorted_themes = sorted(theme_frequency.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in sorted_themes[:10] if count > 1]

    def _analyze_content_progression(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how content progresses across videos in the playlist."""
        progression = {
            "video_count": len(analyses),
            "average_quality": 0.0,
            "quality_trend": "stable",
            "complexity_evolution": "consistent",
        }

        # Calculate the average quality
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]
        progression["average_quality"] = sum(quality_scores) / len(quality_scores) if quality_scores else 0.0

        # Simple trend analysis: compare the first half against the second half;
        # a difference of more than 0.1 flags an improving or declining trend
        if len(quality_scores) > 2:
            midpoint = len(quality_scores) // 2
            first_half_avg = sum(quality_scores[:midpoint]) / midpoint
            second_half_avg = sum(quality_scores[midpoint:]) / (len(quality_scores) - midpoint)

            if second_half_avg > first_half_avg + 0.1:
                progression["quality_trend"] = "improving"
            elif second_half_avg < first_half_avg - 0.1:
                progression["quality_trend"] = "declining"

        return progression

    def _analyze_insight_patterns(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze patterns in insights across videos."""
        patterns = {
            "consistent_perspectives": [],
            "dominant_themes": [],
            "recurring_recommendations": [],
        }

        perspective_consistency: Dict[str, int] = {}
        recommendation_frequency: Dict[str, int] = {}

        for analysis in analyses:
            perspectives = analysis.get("perspectives", {})

            # Track how often each perspective appears
            for perspective_name in perspectives.keys():
                if perspective_name not in perspective_consistency:
                    perspective_consistency[perspective_name] = 0
                perspective_consistency[perspective_name] += 1

            # Track recommendation patterns
            for perspective_data in perspectives.values():
                recommendations = perspective_data.get("recommendations", [])
                for rec in recommendations:
                    # Simple keyword extraction from recommendations
                    key_words = [word.lower() for word in rec.split() if len(word) > 4]
                    for word in key_words[:3]:  # Take the first 3 significant words
                        recommendation_frequency[word] = recommendation_frequency.get(word, 0) + 1

        # Identify perspectives present in at least 80% of the videos
        total_videos = len(analyses)
        patterns["consistent_perspectives"] = [
            perspective for perspective, count in perspective_consistency.items()
            if count >= total_videos * 0.8
        ]

        # Identify recurring recommendation keywords
        patterns["recurring_recommendations"] = [
            word for word, count in sorted(recommendation_frequency.items(), key=lambda x: x[1], reverse=True)[:5]
            if count > 1
        ]

        return patterns

    def _calculate_quality_consistency(self, analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate quality consistency across videos."""
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in analyses]

        if not quality_scores:
            return {"consistency": "unknown", "variance": 0.0, "range": 0.0}

        avg_quality = sum(quality_scores) / len(quality_scores)
        # Population variance of the quality scores
        variance = sum((score - avg_quality) ** 2 for score in quality_scores) / len(quality_scores)
        quality_range = max(quality_scores) - min(quality_scores)

        # Map the variance onto a coarse consistency level
        if variance < 0.01:
            consistency = "very_high"
        elif variance < 0.05:
            consistency = "high"
        elif variance < 0.1:
            consistency = "moderate"
        else:
            consistency = "low"

        return {
            "consistency": consistency,
            "average_quality": avg_quality,
            "variance": variance,
            "range": quality_range,
            "min_quality": min(quality_scores),
            "max_quality": max(quality_scores),
        }

    def _calculate_playlist_quality(self, analyses: List[Dict[str, Any]]) -> float:
        """Calculate the overall playlist quality score."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]

        if not successful_analyses:
            return 0.0

        # Average quality of the successful analyses
        quality_scores = [analysis.get("quality_score", 0.0) for analysis in successful_analyses]
        avg_quality = sum(quality_scores) / len(quality_scores)

        # Factor in the success rate
        success_rate = len(successful_analyses) / len(analyses)

        # Weighted score: e.g. avg_quality 0.75 with success_rate 0.8
        # yields 0.75 * 0.8 + 0.8 * 0.2 = 0.76
        playlist_quality = (avg_quality * 0.8) + (success_rate * 0.2)
        return round(playlist_quality, 2)

    def _extract_playlist_themes(self, analyses: List[Dict[str, Any]]) -> List[str]:
        """Extract key themes from the entire playlist."""
        successful_analyses = [
            analysis for analysis in analyses
            if analysis.get("status") == "completed"
        ]

        if not successful_analyses:
            return []

        # Collect themes from all video analyses
        all_themes = []

        for analysis in successful_analyses:
            # Use the unified insights if available
            unified_insights = analysis.get("unified_insights", [])
            all_themes.extend(unified_insights[:3])  # Top 3 from each video

            # Also pull themes from the synthesis perspective if available
            perspectives = analysis.get("perspectives", {})
            if "synthesis" in perspectives:
                synthesis_insights = perspectives["synthesis"].get("unified_insights", [])
                all_themes.extend(synthesis_insights[:2])  # Top 2 from the synthesis

        # Simple deduplication and ranking (in production, use more sophisticated NLP)
        theme_counts: Dict[str, int] = {}
        for theme in all_themes:
            # Extract key terms from the theme
            key_terms = [word.lower() for word in theme.split() if len(word) > 4]
            for term in key_terms[:2]:  # Take the first 2 significant terms
                theme_counts[term] = theme_counts.get(term, 0) + 1

        # Return the most common themes
        top_themes = sorted(theme_counts.items(), key=lambda x: x[1], reverse=True)
        return [theme for theme, count in top_themes[:8] if count > 1]

    async def get_service_health(self) -> Dict[str, Any]:
        """Get the health status of the playlist analyzer service.

        Returns:
            Service health information.
        """
        health_info = {
            "service": "playlist_analyzer",
            "initialized": self._is_initialized,
            "timestamp": datetime.now().isoformat(),
        }

        if self._is_initialized and self.orchestrator:
            # Report the orchestrator's health alongside our own
            try:
                orchestrator_health = await self.orchestrator.get_orchestrator_health()
                health_info["orchestrator_health"] = orchestrator_health

                if orchestrator_health.get("status") == "healthy":
                    health_info["status"] = "healthy"
                else:
                    health_info["status"] = "degraded"

            except Exception as e:
                logger.error(f"Error getting orchestrator health: {e}")
                health_info["status"] = "error"
                health_info["error"] = str(e)
        else:
            health_info["status"] = "not_initialized"

        return health_info
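

# A minimal usage sketch, assuming the orchestrator and supporting services
# can run with their default constructors (an assumption; real deployments
# would likely inject configured instances). The playlist URL is illustrative.
if __name__ == "__main__":
    async def _demo() -> None:
        analyzer = PlaylistAnalyzer()
        try:
            result = await analyzer.analyze_playlist(
                "https://www.youtube.com/playlist?list=PL" + "0" * 32,
                max_videos=2,
            )
            print(result["overall_quality_score"], result["playlist_themes"])
        finally:
            await analyzer.shutdown()

    asyncio.run(_demo())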