"""Playlist processing service for multi-video analysis."""

import asyncio
import logging
import re
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from backend.core.config import settings
from backend.core.exceptions import ServiceError
from backend.services.video_service import VideoService
from backend.services.transcript_service import TranscriptService
from backend.services.multi_agent_service import MultiAgentSummarizerService, AgentPerspective

logger = logging.getLogger(__name__)


class PlaylistVideo:
    """Represents a video in a playlist."""

    def __init__(self, video_id: str, title: str, position: int,
                 duration: Optional[str] = None, upload_date: Optional[str] = None):
        self.video_id = video_id
        self.title = title
        self.position = position
        self.duration = duration
        self.upload_date = upload_date
        self.analysis_result: Optional[Dict[str, Any]] = None
        self.error: Optional[str] = None


class PlaylistMetadata:
    """Metadata for a YouTube playlist."""

    def __init__(self, playlist_id: str, title: str, channel_name: str,
                 video_count: int, total_duration: Optional[int] = None):
        self.playlist_id = playlist_id
        self.title = title
        self.channel_name = channel_name
        self.video_count = video_count
        self.total_duration = total_duration


class PlaylistProcessingResult:
    """Result of playlist processing with multi-agent analysis."""

    def __init__(self, job_id: str, playlist_url: str):
        self.job_id = job_id
        self.playlist_url = playlist_url
        self.playlist_metadata: Optional[PlaylistMetadata] = None
        self.videos: List[PlaylistVideo] = []
        self.processed_videos: int = 0
        self.failed_videos: int = 0
        self.progress_percentage: float = 0.0
        self.current_video: Optional[str] = None
        self.status: str = "initializing"  # initializing, processing, completed, failed, cancelled
        self.error: Optional[str] = None
        self.cross_video_analysis: Optional[Dict[str, Any]] = None
        self.started_at: datetime = datetime.now()
        self.completed_at: Optional[datetime] = None


class PlaylistService:
    """Service for processing YouTube playlists with multi-agent analysis."""

    def __init__(self, youtube_api_key: Optional[str] = None):
        self.youtube_api_key = youtube_api_key or settings.YOUTUBE_API_KEY
        self.youtube = None
        if self.youtube_api_key:
            self.youtube = build('youtube', 'v3', developerKey=self.youtube_api_key)

        self.video_service = VideoService()
        self.transcript_service = TranscriptService()
        self.multi_agent_service = MultiAgentSummarizerService()

        # Active job tracking (in-memory)
        self.active_jobs: Dict[str, PlaylistProcessingResult] = {}
        # Strong references to background tasks so they are not garbage-collected
        # before completion (per the asyncio.create_task documentation).
        self._background_tasks: set = set()

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract the playlist ID from a YouTube playlist URL."""
        patterns = [
            r'list=([a-zA-Z0-9_-]+)',                     # standard playlist parameter
            r'playlist\?list=([a-zA-Z0-9_-]+)',           # direct playlist URL
            r'youtube\.com/.*[?&]list=([a-zA-Z0-9_-]+)',  # any YouTube URL with a list param
        ]

        for pattern in patterns:
            match = re.search(pattern, playlist_url)
            if match:
                return match.group(1)

        return None
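
    # Illustrative examples (assuming standard YouTube URL shapes): the first
    # pattern above already subsumes the other two, so each of these URLs
    # yields "PLxyz":
    #   https://www.youtube.com/playlist?list=PLxyz
    #   https://www.youtube.com/watch?v=abc123&list=PLxyz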

    async def get_playlist_metadata(self, playlist_id: str) -> Optional[PlaylistMetadata]:
        """Get playlist metadata from the YouTube Data API."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=f"Mock Playlist {playlist_id}",
                channel_name="Mock Channel",
                video_count=5
            )

        try:
            # Get playlist details. Note: googleapiclient is synchronous, so
            # .execute() blocks the event loop for the duration of the request.
            playlist_response = self.youtube.playlists().list(
                part='snippet,contentDetails',
                id=playlist_id,
                maxResults=1
            ).execute()

            if not playlist_response.get('items'):
                return None

            playlist_item = playlist_response['items'][0]
            snippet = playlist_item['snippet']
            content_details = playlist_item['contentDetails']

            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=snippet.get('title', ''),
                channel_name=snippet.get('channelTitle', ''),
                video_count=content_details.get('itemCount', 0)
            )

        except HttpError as e:
            logger.error(f"Error fetching playlist metadata: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error in get_playlist_metadata: {e}")
            return None
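
    # For reference (YouTube Data API v3): playlists().list with
    # part='snippet,contentDetails' returns items whose snippet carries
    # title/channelTitle and whose contentDetails carries itemCount, which is
    # everything this service reads from the response.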

    async def discover_playlist_videos(self, playlist_id: str, max_videos: Optional[int] = None) -> List[PlaylistVideo]:
        """Discover all videos in a playlist."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return [
                PlaylistVideo(f"mock_video_{i}", f"Mock Video {i+1}", i)
                for i in range(min(max_videos or 5, 5))
            ]

        videos = []
        next_page_token = None
        position = 0

        try:
            while True:
                # Get one page of playlist items
                playlist_items_response = self.youtube.playlistItems().list(
                    part='snippet,contentDetails',
                    playlistId=playlist_id,
                    maxResults=50,  # maximum allowed by the YouTube API
                    pageToken=next_page_token
                ).execute()

                items = playlist_items_response.get('items', [])
                if not items:
                    break

                # Process each video
                for item in items:
                    if max_videos and len(videos) >= max_videos:
                        break

                    snippet = item['snippet']
                    content_details = item['contentDetails']

                    video_id = content_details.get('videoId')
                    if not video_id:
                        continue  # skip deleted or private videos

                    title = snippet.get('title', f'Video {position + 1}')
                    upload_date = snippet.get('publishedAt')

                    videos.append(PlaylistVideo(
                        video_id=video_id,
                        title=title,
                        position=position,
                        upload_date=upload_date
                    ))
                    position += 1

                # Check whether more pages need to be fetched
                next_page_token = playlist_items_response.get('nextPageToken')
                if not next_page_token or (max_videos and len(videos) >= max_videos):
                    break

            logger.info(f"Discovered {len(videos)} videos in playlist {playlist_id}")
            return videos

        except HttpError as e:
            logger.error(f"Error fetching playlist videos: {e}")
            raise ServiceError(f"Failed to fetch playlist videos: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error in discover_playlist_videos: {e}")
            raise ServiceError(f"Unexpected error discovering videos: {str(e)}")

    async def start_playlist_processing(self, playlist_url: str, max_videos: Optional[int] = None,
                                        agent_types: Optional[List[str]] = None) -> str:
        """Start processing a playlist with multi-agent analysis."""
        job_id = str(uuid.uuid4())

        # Initialize job result
        result = PlaylistProcessingResult(job_id=job_id, playlist_url=playlist_url)
        self.active_jobs[job_id] = result

        # Start background processing, keeping a reference to the task so it
        # cannot be garbage-collected mid-run.
        task = asyncio.create_task(self._process_playlist_background(
            job_id, playlist_url, max_videos, agent_types or ["technical", "business", "user"]
        ))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

        return job_id
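
    # Typical flow (sketch): await start_playlist_processing(...) for a job_id,
    # then poll get_playlist_status(job_id) until status reaches "completed",
    # "failed", or "cancelled"; see the __main__ demo at the end of this module.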

    async def _process_playlist_background(self, job_id: str, playlist_url: str,
                                           max_videos: Optional[int], agent_types: List[str]):
        """Background task to process a playlist."""
        result = self.active_jobs[job_id]

        try:
            result.status = "processing"

            # Extract playlist ID
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError("Invalid playlist URL")

            logger.info(f"Starting playlist processing for job {job_id}, playlist {playlist_id}")

            # Get playlist metadata
            result.playlist_metadata = await self.get_playlist_metadata(playlist_id)
            if not result.playlist_metadata:
                raise ServiceError("Could not fetch playlist metadata")

            # Discover videos
            result.videos = await self.discover_playlist_videos(playlist_id, max_videos)
            if not result.videos:
                raise ServiceError("No videos found in playlist")

            # Map agent type strings to perspective enums; unknown types are skipped
            perspective_map = {
                "technical": AgentPerspective.TECHNICAL,
                "business": AgentPerspective.BUSINESS,
                "user": AgentPerspective.USER_EXPERIENCE,
            }
            perspectives = [perspective_map[a] for a in agent_types if a in perspective_map]

            # Process each video
            for i, video in enumerate(result.videos):
                if result.status == "cancelled":
                    break

                result.current_video = video.title
                result.progress_percentage = (i / len(result.videos)) * 90  # reserve 10% for cross-video analysis

                try:
                    logger.info(f"Processing video {i+1}/{len(result.videos)}: {video.video_id}")

                    # Get transcript
                    transcript_result = await self.transcript_service.extract_transcript(video.video_id)
                    if not transcript_result or not transcript_result.get('transcript'):
                        video.error = "Could not extract transcript"
                        result.failed_videos += 1
                        continue

                    transcript = transcript_result['transcript']

                    # Perform multi-agent analysis
                    analysis_result = await self.multi_agent_service.analyze_with_multiple_perspectives(
                        transcript=transcript,
                        video_id=video.video_id,
                        video_title=video.title,
                        perspectives=perspectives
                    )

                    video.analysis_result = analysis_result.dict()
                    result.processed_videos += 1

                    logger.info(f"Completed analysis for video {video.video_id}")

                except Exception as e:
                    logger.error(f"Error processing video {video.video_id}: {e}")
                    video.error = str(e)
                    result.failed_videos += 1

                # Small delay to avoid overwhelming upstream APIs
                await asyncio.sleep(0.5)

            # If the job was cancelled mid-loop, stop here so the "cancelled"
            # status is not overwritten with "completed" below.
            if result.status == "cancelled":
                result.completed_at = datetime.now()
                return

            # Perform cross-video analysis
            result.current_video = "Cross-video analysis"
            result.progress_percentage = 95.0

            result.cross_video_analysis = await self._perform_cross_video_analysis(result.videos)

            # Mark as completed
            result.status = "completed"
            result.progress_percentage = 100.0
            result.completed_at = datetime.now()
            result.current_video = None

            logger.info(f"Playlist processing completed for job {job_id}")

        except Exception as e:
            logger.error(f"Error in playlist processing for job {job_id}: {e}")
            result.status = "failed"
            result.error = str(e)
            result.completed_at = datetime.now()  # so cleanup_completed_jobs can reap failed jobs
            result.current_video = None

    async def _perform_cross_video_analysis(self, videos: List[PlaylistVideo]) -> Dict[str, Any]:
        """Perform cross-video analysis to identify themes and patterns."""
        # For now this is a simple heuristic analysis; in production it could
        # use AI to identify themes across videos.

        successful_videos = [v for v in videos if v.analysis_result and not v.error]

        if not successful_videos:
            return {"error": "No successful video analyses to compare"}

        # Extract common themes from titles and summaries
        all_titles = [v.title for v in successful_videos]

        # Simple theme extraction (could be enhanced with AI)
        themes = []
        if len(all_titles) > 1:
            themes = ["Multi-part series", "Educational content", "Topic progression"]

        analysis = {
            "total_videos": len(videos),
            "successfully_analyzed": len(successful_videos),
            "failed_analyses": len(videos) - len(successful_videos),
            "identified_themes": themes,
            "content_progression": "Sequential learning path detected" if len(successful_videos) > 2 else "Standalone content",
            "key_insights": [
                f"Analyzed {len(successful_videos)} videos successfully",
                f"Common themes: {', '.join(themes) if themes else 'None identified'}",
                "Multi-agent perspectives provide comprehensive analysis"
            ],
            "agent_perspectives": {
                "technical": "Technical concepts build upon each other",
                "business": "Business value increases with series completion",
                "user": "User journey spans multiple videos for complete understanding"
            }
        }

        return analysis

    def get_playlist_status(self, job_id: str) -> Optional[PlaylistProcessingResult]:
        """Get the current status of a playlist processing job."""
        return self.active_jobs.get(job_id)

    def cancel_playlist_processing(self, job_id: str) -> bool:
        """Cancel a running playlist processing job."""
        if job_id in self.active_jobs:
            job = self.active_jobs[job_id]
            if job.status in ["initializing", "processing"]:
                job.status = "cancelled"
                job.error = "Job cancelled by user"
                job.current_video = None
                job.completed_at = datetime.now()  # so cleanup_completed_jobs can reap it
                logger.info(f"Cancelled playlist processing job {job_id}")
                return True
        return False

    def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs to prevent memory leaks."""
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)

        jobs_to_remove = []
        for job_id, job in self.active_jobs.items():
            if job.status in ["completed", "failed", "cancelled"]:
                if job.completed_at and job.completed_at.timestamp() < cutoff_time:
                    jobs_to_remove.append(job_id)

        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            logger.info(f"Cleaned up old job {job_id}")
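

# Minimal usage sketch (illustrative only, not part of the service API). It
# assumes this module's backend dependencies are importable; the playlist URL
# and polling interval below are placeholders. Without a configured
# YOUTUBE_API_KEY the service falls back to the mock playlist data above.
if __name__ == "__main__":
    async def _demo():
        service = PlaylistService()
        job_id = await service.start_playlist_processing(
            "https://www.youtube.com/playlist?list=PLxyz",  # placeholder URL
            max_videos=3,
        )
        # Poll until the background task reaches a terminal state.
        while True:
            job = service.get_playlist_status(job_id)
            if job is None or job.status in ("completed", "failed", "cancelled"):
                break
            print(f"{job.status}: {job.progress_percentage:.0f}% ({job.current_video})")
            await asyncio.sleep(1.0)
        if job is not None:
            print(f"Finished with status: {job.status}, error: {job.error}")

    asyncio.run(_demo())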