youtube-summarizer/backend/services/playlist_service.py

"""Playlist processing service for multi-video analysis."""
import asyncio
import logging
import re
from typing import Dict, List, Optional, Any
from datetime import datetime
import uuid
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from backend.core.config import settings
from backend.core.exceptions import ServiceError
from backend.services.video_service import VideoService
from backend.services.transcript_service import TranscriptService
from backend.services.multi_agent_service import MultiAgentSummarizerService, AgentPerspective
logger = logging.getLogger(__name__)


class PlaylistVideo:
    """Represents a video in a playlist."""

    def __init__(self, video_id: str, title: str, position: int,
                 duration: Optional[str] = None, upload_date: Optional[str] = None):
        self.video_id = video_id
        self.title = title
        self.position = position
        self.duration = duration
        self.upload_date = upload_date
        self.analysis_result: Optional[Dict[str, Any]] = None
        self.error: Optional[str] = None


class PlaylistMetadata:
    """Metadata for a YouTube playlist."""

    def __init__(self, playlist_id: str, title: str, channel_name: str,
                 video_count: int, total_duration: Optional[int] = None):
        self.playlist_id = playlist_id
        self.title = title
        self.channel_name = channel_name
        self.video_count = video_count
        self.total_duration = total_duration


class PlaylistProcessingResult:
    """Result of playlist processing with multi-agent analysis."""

    def __init__(self, job_id: str, playlist_url: str):
        self.job_id = job_id
        self.playlist_url = playlist_url
        self.playlist_metadata: Optional[PlaylistMetadata] = None
        self.videos: List[PlaylistVideo] = []
        self.processed_videos: int = 0
        self.failed_videos: int = 0
        self.progress_percentage: float = 0.0
        self.current_video: Optional[str] = None
        self.status: str = "initializing"  # initializing, processing, completed, failed, cancelled
        self.error: Optional[str] = None
        self.cross_video_analysis: Optional[Dict[str, Any]] = None
        self.started_at: datetime = datetime.now()
        self.completed_at: Optional[datetime] = None


class PlaylistService:
    """Service for processing YouTube playlists with multi-agent analysis."""

    def __init__(self, youtube_api_key: Optional[str] = None):
        self.youtube_api_key = youtube_api_key or settings.YOUTUBE_API_KEY
        self.youtube = None
        if self.youtube_api_key:
            self.youtube = build('youtube', 'v3', developerKey=self.youtube_api_key)
        self.video_service = VideoService()
        self.transcript_service = TranscriptService()
        self.multi_agent_service = MultiAgentSummarizerService()
        # Active job tracking
        self.active_jobs: Dict[str, PlaylistProcessingResult] = {}
        # Strong references to background tasks, so the event loop cannot
        # garbage-collect them mid-run
        self._background_tasks: set = set()

    def extract_playlist_id(self, playlist_url: str) -> Optional[str]:
        """Extract playlist ID from a YouTube playlist URL."""
        patterns = [
            r'list=([a-zA-Z0-9_-]+)',  # Standard playlist parameter
            r'playlist\?list=([a-zA-Z0-9_-]+)',  # Direct playlist URL
            r'youtube\.com/.*[?&]list=([a-zA-Z0-9_-]+)',  # Any YouTube URL with list param
        ]
        for pattern in patterns:
            match = re.search(pattern, playlist_url)
            if match:
                return match.group(1)
        return None

    async def get_playlist_metadata(self, playlist_id: str) -> Optional[PlaylistMetadata]:
        """Get playlist metadata from the YouTube Data API."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=f"Mock Playlist {playlist_id}",
                channel_name="Mock Channel",
                video_count=5
            )
        try:
            # Get playlist details; execute() is a blocking HTTP call, so run
            # it off the event loop.
            request = self.youtube.playlists().list(
                part='snippet,contentDetails',
                id=playlist_id,
                maxResults=1
            )
            playlist_response = await asyncio.to_thread(request.execute)
            if not playlist_response.get('items'):
                return None
            playlist_item = playlist_response['items'][0]
            snippet = playlist_item['snippet']
            content_details = playlist_item['contentDetails']
            return PlaylistMetadata(
                playlist_id=playlist_id,
                title=snippet.get('title', ''),
                channel_name=snippet.get('channelTitle', ''),
                video_count=content_details.get('itemCount', 0)
            )
        except HttpError as e:
            logger.error(f"Error fetching playlist metadata: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error in get_playlist_metadata: {e}")
            return None

    async def discover_playlist_videos(self, playlist_id: str, max_videos: Optional[int] = None) -> List[PlaylistVideo]:
        """Discover all videos in a playlist."""
        if not self.youtube:
            logger.warning("YouTube Data API not configured, using mock data")
            return [
                PlaylistVideo(f"mock_video_{i}", f"Mock Video {i + 1}", i)
                for i in range(min(max_videos or 5, 5))
            ]
        videos = []
        next_page_token = None
        position = 0
        try:
            while True:
                # Get one page of playlist items; execute() blocks, so run it
                # off the event loop.
                request = self.youtube.playlistItems().list(
                    part='snippet,contentDetails',
                    playlistId=playlist_id,
                    maxResults=50,  # Maximum allowed by the YouTube API
                    pageToken=next_page_token
                )
                playlist_items_response = await asyncio.to_thread(request.execute)
                items = playlist_items_response.get('items', [])
                if not items:
                    break
                # Process each video
                for item in items:
                    if max_videos and len(videos) >= max_videos:
                        break
                    snippet = item['snippet']
                    content_details = item['contentDetails']
                    video_id = content_details.get('videoId')
                    if not video_id:
                        continue  # Skip deleted or private videos
                    title = snippet.get('title', f'Video {position + 1}')
                    upload_date = snippet.get('publishedAt')
                    videos.append(PlaylistVideo(
                        video_id=video_id,
                        title=title,
                        position=position,
                        upload_date=upload_date
                    ))
                    position += 1
                # Check whether more pages need to be fetched
                next_page_token = playlist_items_response.get('nextPageToken')
                if not next_page_token or (max_videos and len(videos) >= max_videos):
                    break
            logger.info(f"Discovered {len(videos)} videos in playlist {playlist_id}")
            return videos
        except HttpError as e:
            logger.error(f"Error fetching playlist videos: {e}")
            raise ServiceError(f"Failed to fetch playlist videos: {str(e)}")
        except Exception as e:
            logger.error(f"Unexpected error in discover_playlist_videos: {e}")
            raise ServiceError(f"Unexpected error discovering videos: {str(e)}")

    async def start_playlist_processing(self, playlist_url: str, max_videos: Optional[int] = None,
                                        agent_types: Optional[List[str]] = None) -> str:
        """Start processing a playlist with multi-agent analysis."""
        job_id = str(uuid.uuid4())
        # Initialize job result
        result = PlaylistProcessingResult(job_id=job_id, playlist_url=playlist_url)
        self.active_jobs[job_id] = result
        # Start background processing; keep a reference to the task so it is
        # not garbage-collected before it finishes.
        task = asyncio.create_task(self._process_playlist_background(
            job_id, playlist_url, max_videos, agent_types or ["technical", "business", "user"]
        ))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return job_id

    async def _process_playlist_background(self, job_id: str, playlist_url: str,
                                           max_videos: Optional[int], agent_types: List[str]):
        """Background task to process a playlist."""
        result = self.active_jobs[job_id]
        try:
            result.status = "processing"
            # Extract playlist ID
            playlist_id = self.extract_playlist_id(playlist_url)
            if not playlist_id:
                raise ServiceError("Invalid playlist URL")
            logger.info(f"Starting playlist processing for job {job_id}, playlist {playlist_id}")
            # Get playlist metadata
            result.playlist_metadata = await self.get_playlist_metadata(playlist_id)
            if not result.playlist_metadata:
                raise ServiceError("Could not fetch playlist metadata")
            # Discover videos
            result.videos = await self.discover_playlist_videos(playlist_id, max_videos)
            if not result.videos:
                raise ServiceError("No videos found in playlist")
            # Convert agent type strings to enums; unknown types are skipped
            # with a warning
            perspective_map = {
                "technical": AgentPerspective.TECHNICAL,
                "business": AgentPerspective.BUSINESS,
                "user": AgentPerspective.USER_EXPERIENCE,
            }
            perspectives = []
            for agent_type in agent_types:
                if agent_type in perspective_map:
                    perspectives.append(perspective_map[agent_type])
                else:
                    logger.warning(f"Ignoring unknown agent type '{agent_type}'")
            # Process each video
            for i, video in enumerate(result.videos):
                if result.status == "cancelled":
                    break
                result.current_video = video.title
                result.progress_percentage = (i / len(result.videos)) * 90  # Reserve 10% for cross-video analysis
                try:
                    logger.info(f"Processing video {i + 1}/{len(result.videos)}: {video.video_id}")
                    # Get transcript
                    transcript_result = await self.transcript_service.extract_transcript(video.video_id)
                    if not transcript_result or not transcript_result.get('transcript'):
                        video.error = "Could not extract transcript"
                        result.failed_videos += 1
                        continue
                    transcript = transcript_result['transcript']
                    # Perform multi-agent analysis
                    analysis_result = await self.multi_agent_service.analyze_with_multiple_perspectives(
                        transcript=transcript,
                        video_id=video.video_id,
                        video_title=video.title,
                        perspectives=perspectives
                    )
                    video.analysis_result = analysis_result.dict()
                    result.processed_videos += 1
                    logger.info(f"Completed analysis for video {video.video_id}")
                except Exception as e:
                    logger.error(f"Error processing video {video.video_id}: {e}")
                    video.error = str(e)
                    result.failed_videos += 1
                # Small delay to avoid overwhelming upstream APIs
                await asyncio.sleep(0.5)
            # Perform cross-video analysis
            result.current_video = "Cross-video analysis"
            result.progress_percentage = 95.0
            result.cross_video_analysis = await self._perform_cross_video_analysis(result.videos)
            # Mark as completed
            result.status = "completed"
            result.progress_percentage = 100.0
            result.completed_at = datetime.now()
            result.current_video = None
            logger.info(f"Playlist processing completed for job {job_id}")
        except Exception as e:
            logger.error(f"Error in playlist processing for job {job_id}: {e}")
            result.status = "failed"
            result.error = str(e)
            # Stamp a completion time so cleanup_completed_jobs can age out
            # failed jobs as well
            result.completed_at = datetime.now()
            result.current_video = None

    async def _perform_cross_video_analysis(self, videos: List[PlaylistVideo]) -> Dict[str, Any]:
        """Perform cross-video analysis to identify themes and patterns."""
        # For now this is a simple heuristic analysis; in production it could
        # use AI to identify themes across videos.
        successful_videos = [v for v in videos if v.analysis_result and not v.error]
        if not successful_videos:
            return {"error": "No successful video analyses to compare"}
        # Extract common themes from titles and summaries
        all_titles = [v.title for v in successful_videos]
        # Simple theme extraction (could be enhanced with AI)
        themes = []
        if len(all_titles) > 1:
            themes = ["Multi-part series", "Educational content", "Topic progression"]
        analysis = {
            "total_videos": len(videos),
            "successfully_analyzed": len(successful_videos),
            "failed_analyses": len(videos) - len(successful_videos),
            "identified_themes": themes,
            "content_progression": "Sequential learning path detected" if len(successful_videos) > 2 else "Standalone content",
            "key_insights": [
                f"Analyzed {len(successful_videos)} videos successfully",
                f"Common themes: {', '.join(themes) if themes else 'None identified'}",
                "Multi-agent perspectives provide comprehensive analysis"
            ],
            "agent_perspectives": {
                "technical": "Technical concepts build upon each other",
                "business": "Business value increases with series completion",
                "user": "User journey spans multiple videos for complete understanding"
            }
        }
        return analysis

    def get_playlist_status(self, job_id: str) -> Optional[PlaylistProcessingResult]:
        """Get the current status of a playlist processing job."""
        return self.active_jobs.get(job_id)

    def cancel_playlist_processing(self, job_id: str) -> bool:
        """Cancel a running playlist processing job."""
        if job_id in self.active_jobs:
            job = self.active_jobs[job_id]
            if job.status in ["initializing", "processing"]:
                job.status = "cancelled"
                job.error = "Job cancelled by user"
                job.current_video = None
                # Stamp a completion time so the job can be aged out later
                job.completed_at = datetime.now()
                logger.info(f"Cancelled playlist processing job {job_id}")
                return True
        return False

    def cleanup_completed_jobs(self, max_age_hours: int = 24):
        """Clean up old completed jobs to prevent memory leaks."""
        cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600)
        jobs_to_remove = []
        for job_id, job in self.active_jobs.items():
            if job.status in ["completed", "failed", "cancelled"]:
                if job.completed_at and job.completed_at.timestamp() < cutoff_time:
                    jobs_to_remove.append(job_id)
        for job_id in jobs_to_remove:
            del self.active_jobs[job_id]
            logger.info(f"Cleaned up old job {job_id}")