youtube-summarizer/backend/services/playlist_service.py

"""Playlist processing service for multi-video analysis."""
import asyncio
import logging
import re
from typing import Dict, List, Optional, Any
from datetime import datetime
import uuid
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from backend.core.config import settings
from backend.core.exceptions import ServiceError
from backend.services.video_service import VideoService
from backend.services.transcript_service import TranscriptService
from backend.services.multi_agent_service import MultiAgentSummarizerService, AgentPerspective
logger = logging.getLogger(__name__)


class PlaylistVideo:
    """Represents a video in a playlist."""

    def __init__(self, video_id: str, title: str, position: int,
                 duration: Optional[str] = None, upload_date: Optional[str] = None):
        self.video_id = video_id
        self.title = title
        self.position = position
        self.duration = duration
        self.upload_date = upload_date
        self.analysis_result: Optional[Dict[str, Any]] = None
        self.error: Optional[str] = None


class PlaylistMetadata:
    """Metadata for a YouTube playlist."""

    def __init__(self, playlist_id: str, title: str, channel_name: str,
                 video_count: int, total_duration: Optional[int] = None):
        self.playlist_id = playlist_id
        self.title = title
        self.channel_name = channel_name
        self.video_count = video_count
        self.total_duration = total_duration


class PlaylistProcessingResult:
    """Result of playlist processing with multi-agent analysis."""

    def __init__(self, job_id: str, playlist_url: str):
        self.job_id = job_id
        self.playlist_url = playlist_url
        self.playlist_metadata: Optional[PlaylistMetadata] = None
        self.videos: List[PlaylistVideo] = []
        self.processed_videos: int = 0
        self.failed_videos: int = 0
        self.progress_percentage: float = 0.0
        self.current_video: Optional[str] = None
        self.status: str = "initializing"  # initializing, processing, completed, failed, cancelled
        self.error: Optional[str] = None
        self.cross_video_analysis: Optional[Dict[str, Any]] = None
        self.started_at: datetime = datetime.now()
        self.completed_at: Optional[datetime] = None


class PlaylistService:
    """Service for processing YouTube playlists with multi-agent analysis."""

    def __init__(self, youtube_api_key: Optional[str] = None):
        self.youtube_api_key = youtube_api_key or settings.YOUTUBE_API_KEY
        self.youtube = None
        if self.youtube_api_key:
            self.youtube = build('youtube', 'v3', developerKey=self.youtube_api_key)
        self.video_service = VideoService()
        self.transcript_service = TranscriptService()
        self.multi_agent_service = MultiAgentSummarizerService()
        # Active job tracking
        self.active_jobs: Dict[str, PlaylistProcessingResult] = {}
        # Strong references to background tasks, so the event loop cannot
        # garbage-collect them mid-run
        self._background_tasks: set = set()

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract playlist ID from a YouTube playlist URL."""
        patterns = [
            r'list=([a-zA-Z0-9_-]+)',  # Standard playlist parameter
            r'playlist\?list=([a-zA-Z0-9_-]+)',  # Direct playlist URL
            r'youtube\.com/.*[?&]list=([a-zA-Z0-9_-]+)',  # Any YouTube URL with list param
        ]
        for pattern in patterns:
            match = re.search(pattern, playlist_url)
            if match:
                return match.group(1)
        return None

    async def get_playlist_metadata(self, playlist_id: str) -> Optional[PlaylistMetadata]:
        """Get playlist metadata from the YouTube Data API."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=f"Mock Playlist {playlist_id}",
                channel_name="Mock Channel",
                video_count=5
            )
        try:
            # Get playlist details; execute() is a blocking HTTP call, so run
            # it off the event loop.
            request = self.youtube.playlists().list(
                part='snippet,contentDetails',
                id=playlist_id,
                maxResults=1
            )
            playlist_response = await asyncio.to_thread(request.execute)
            if not playlist_response.get('items'):
                return None
            playlist_item = playlist_response['items'][0]
            snippet = playlist_item['snippet']
            content_details = playlist_item['contentDetails']
            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=snippet.get('title', ''),
                channel_name=snippet.get('channelTitle', ''),
                video_count=content_details.get('itemCount', 0)
            )
        except HttpError as e:
            logger.error(f"Error fetching playlist metadata: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error in get_playlist_metadata: {e}")
            return None

    async def discover_playlist_videos(self, playlist_id: str, max_videos: Optional[int] = None) -> List[PlaylistVideo]:
        """Discover all videos in a playlist."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return [
                PlaylistVideo(f"mock_video_{i}", f"Mock Video {i + 1}", i)
                for i in range(min(max_videos or 5, 5))
            ]
        videos = []
        next_page_token = None
        position = 0
        try:
            while True:
                # Get one page of playlist items; execute() blocks, so run it
                # off the event loop.
                request = self.youtube.playlistItems().list(
                    part='snippet,contentDetails',
                    playlistId=playlist_id,
                    maxResults=50,  # Maximum allowed by the YouTube API
                    pageToken=next_page_token
                )
                playlist_items_response = await asyncio.to_thread(request.execute)
                items = playlist_items_response.get('items', [])
                if not items:
                    break
                # Process each video
                for item in items:
                    if max_videos and len(videos) >= max_videos:
                        break
                    snippet = item['snippet']
                    content_details = item['contentDetails']
                    video_id = content_details.get('videoId')
                    if not video_id:
                        continue  # Skip deleted or private videos
                    title = snippet.get('title', f'Video {position + 1}')
                    upload_date = snippet.get('publishedAt')
                    videos.append(PlaylistVideo(
                        video_id=video_id,
                        title=title,
                        position=position,
                        upload_date=upload_date
                    ))
                    position += 1
                # Check whether more pages need to be fetched
                next_page_token = playlist_items_response.get('nextPageToken')
                if not next_page_token or (max_videos and len(videos) >= max_videos):
                    break
            logger.info(f"Discovered {len(videos)} videos in playlist {playlist_id}")
            return videos
        except HttpError as e:
            logger.error(f"Error fetching playlist videos: {e}")
            raise ServiceError(f"Failed to fetch playlist videos: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error in discover_playlist_videos: {e}")
            raise ServiceError(f"Unexpected error discovering videos: {str(e)}")

    async def start_playlist_processing(self, playlist_url: str, max_videos: Optional[int] = None,
                                        agent_types: Optional[List[str]] = None) -> str:
        """Start processing a playlist with multi-agent analysis."""
        job_id = str(uuid.uuid4())
        # Initialize job result
        result = PlaylistProcessingResult(job_id=job_id, playlist_url=playlist_url)
        self.active_jobs[job_id] = result
        # Start background processing; keep a reference to the task so it is
        # not garbage-collected before it finishes.
        task = asyncio.create_task(self._process_playlist_background(
            job_id, playlist_url, max_videos, agent_types or ["technical", "business", "user"]
        ))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return job_id

    async def _process_playlist_background(self, job_id: str, playlist_url: str,
                                           max_videos: Optional[int], agent_types: List[str]):
        """Background task to process a playlist."""
        result = self.active_jobs[job_id]
        try:
            result.status = "processing"
            # Extract playlist ID
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError("Invalid playlist URL")
            logger.info(f"Starting playlist processing for job {job_id}, playlist {playlist_id}")
            # Get playlist metadata
            result.playlist_metadata = await self.get_playlist_metadata(playlist_id)
            if not result.playlist_metadata:
                raise ServiceError("Could not fetch playlist metadata")
            # Discover videos
            result.videos = await self.discover_playlist_videos(playlist_id, max_videos)
            if not result.videos:
                raise ServiceError("No videos found in playlist")
            # Convert agent type strings to enums; unknown types are skipped
            # with a warning
            perspective_map = {
                "technical": AgentPerspective.TECHNICAL,
                "business": AgentPerspective.BUSINESS,
                "user": AgentPerspective.USER_EXPERIENCE,
            }
            perspectives = []
            for agent_type in agent_types:
                if agent_type in perspective_map:
                    perspectives.append(perspective_map[agent_type])
                else:
                    logger.warning(f"Ignoring unknown agent type '{agent_type}'")
            # Process each video
            for i, video in enumerate(result.videos):
                if result.status == "cancelled":
                    break
                result.current_video = video.title
                result.progress_percentage = (i / len(result.videos)) * 90  # Reserve 10% for cross-video analysis
                try:
                    logger.info(f"Processing video {i + 1}/{len(result.videos)}: {video.video_id}")
                    # Get transcript
                    transcript_result = await self.transcript_service.extract_transcript(video.video_id)
                    if not transcript_result or not transcript_result.get('transcript'):
                        video.error = "Could not extract transcript"
                        result.failed_videos += 1
                        continue
                    transcript = transcript_result['transcript']
                    # Perform multi-agent analysis
                    analysis_result = await self.multi_agent_service.analyze_with_multiple_perspectives(
                        transcript=transcript,
                        video_id=video.video_id,
                        video_title=video.title,
                        perspectives=perspectives
                    )
                    video.analysis_result = analysis_result.dict()
                    result.processed_videos += 1
                    logger.info(f"Completed analysis for video {video.video_id}")
                except Exception as e:
                    logger.error(f"Error processing video {video.video_id}: {e}")
                    video.error = str(e)
                    result.failed_videos += 1
                # Small delay to avoid overwhelming upstream APIs
                await asyncio.sleep(0.5)
            # Perform cross-video analysis
            result.current_video = "Cross-video analysis"
            result.progress_percentage = 95.0
            result.cross_video_analysis = await self._perform_cross_video_analysis(result.videos)
            # Mark as completed
            result.status = "completed"
            result.progress_percentage = 100.0
            result.completed_at = datetime.now()
            result.current_video = None
            logger.info(f"Playlist processing completed for job {job_id}")
        except Exception as e:
            logger.error(f"Error in playlist processing for job {job_id}: {e}")
            result.status = "failed"
            result.error = str(e)
            # Stamp a completion time so cleanup_completed_jobs can age out
            # failed jobs as well
            result.completed_at = datetime.now()
            result.current_video = None

    async def _perform_cross_video_analysis(self, videos: List[PlaylistVideo]) -> Dict[str, Any]:
        """Perform cross-video analysis to identify themes and patterns."""
        # For now this is a simple heuristic analysis; in production it could
        # use AI to identify themes across videos.
        successful_videos = [v for v in videos if v.analysis_result and not v.error]
        if not successful_videos:
            return {"error": "No successful video analyses to compare"}
        # Extract common themes from titles and summaries
        all_titles = [v.title for v in successful_videos]
        # Simple theme extraction (could be enhanced with AI)
        themes = []
        if len(all_titles) > 1:
            themes = ["Multi-part series", "Educational content", "Topic progression"]
        analysis = {
            "total_videos": len(videos),
            "successfully_analyzed": len(successful_videos),
            "failed_analyses": len(videos) - len(successful_videos),
            "identified_themes": themes,
            "content_progression": "Sequential learning path detected" if len(successful_videos) > 2 else "Standalone content",
            "key_insights": [
                f"Analyzed {len(successful_videos)} videos successfully",
                f"Common themes: {', '.join(themes) if themes else 'None identified'}",
                "Multi-agent perspectives provide comprehensive analysis"
            ],
            "agent_perspectives": {
                "technical": "Technical concepts build upon each other",
                "business": "Business value increases with series completion",
                "user": "User journey spans multiple videos for complete understanding"
            }
        }
        return analysis

    def get_playlist_status(self, job_id: str) -> Optional[PlaylistProcessingResult]:
        """Get the current status of a playlist processing job."""
        return self.active_jobs.get(job_id)

    def cancel_playlist_processing(self, job_id: str) -> bool:
        """Cancel a running playlist processing job."""
        if job_id in self.active_jobs:
            job = self.active_jobs[job_id]
            if job.status in ["initializing", "processing"]:
                job.status = "cancelled"
                job.error = "Job cancelled by user"
                job.current_video = None
                # Stamp a completion time so the job can be aged out later
                job.completed_at = datetime.now()
                logger.info(f"Cancelled playlist processing job {job_id}")
                return True
        return False

    def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs to prevent memory leaks."""
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
        jobs_to_remove = []
        for job_id, job in self.active_jobs.items():
            if job.status in ["completed", "failed", "cancelled"]:
                if job.completed_at and job.completed_at.timestamp() < cutoff_time:
                    jobs_to_remove.append(job_id)
        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            logger.info(f"Cleaned up old job {job_id}")