# youtube-summarizer/backend/services/video_downloaders/playwright_downloader.py

"""
Playwright-based video downloader using browser automation
"""
import asyncio
import time
import json
import re
from pathlib import Path
from typing import Optional, Dict, Any, List
import logging

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    DownloaderException,
    VideoNotAvailableError,
    NetworkError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader

logger = logging.getLogger(__name__)


class PlaywrightDownloader(BaseVideoDownloader):
    """Playwright-based video downloader using MCP server with persistent authentication"""

    def __init__(self, method: DownloadMethod = DownloadMethod.PLAYWRIGHT, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader from an optional configuration mapping.

        Args:
            method: Download method identifier handed to the base class.
            config: Optional settings. Recognised keys are ``output_dir``,
                ``headless``, ``timeout``, ``session_file``,
                ``use_authentication`` and ``fallback_to_guest``. A missing or
                empty mapping selects the default for every key.
        """
        super().__init__(method, config)

        # A missing or empty config behaves like a mapping with no keys set.
        settings = config if config else {}

        # Storage location for downloaded streams; created eagerly.
        self.output_dir = Path(settings.get('output_dir', './video_storage'))
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Browser-related options.
        self.headless = settings.get('headless', True)
        self.timeout = settings.get('timeout', 30000)
        self.session_file = settings.get('session_file')

        # Authentication behaviour, plus the cached outcome of the status check.
        self.use_authentication = settings.get('use_authentication', True)
        self.fallback_to_guest = settings.get('fallback_to_guest', True)
        self._auth_checked = False
        self._is_authenticated = False

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """Download a video (or only its audio) through the Playwright MCP browser.

        The method resolves the video id, optionally verifies the YouTube login
        state, opens the video page, reads metadata and stream URLs from the
        page, downloads the selected streams and optionally fetches a transcript.

        Args:
            url: URL of the video page.
            preferences: Download options; this method reads
                ``max_duration_minutes``, ``prefer_audio_only``,
                ``enable_subtitles`` and (indirectly) ``quality``.

        Returns:
            A ``VideoDownloadResult`` with status ``COMPLETED``, or the result
            built by ``self.create_result`` with status ``FAILED`` when the
            video exceeds ``max_duration_minutes``.

        Raises:
            NetworkError: The failure message mentions "blocked" or "forbidden".
            VideoNotAvailableError: The failure message mentions "private" or
                "unavailable".
            DownloaderException: Any other failure, including missing
                authentication when guest fallback is disabled.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        try:
            # Use the MCP Playwright server for browser automation
            from backend.core.mcp_client import get_mcp_client

            mcp_client = get_mcp_client("playwright")

            if self.use_authentication:
                if not self._auth_checked:
                    await self._check_authentication_status(mcp_client)

                # The status check only raises on the call that performs it.
                # Once the result is cached, enforce the "no guest fallback"
                # rule here so later downloads do not silently run as guest.
                if not self._is_authenticated and not self.fallback_to_guest:
                    raise DownloaderException("YouTube authentication required but not available")

            # Navigate to video page
            await self._navigate_to_video(mcp_client, url)

            # Extract video metadata from page
            metadata = await self._extract_metadata_from_page(mcp_client, video_id)

            # Check duration limits (a limit of 0 or less disables the check)
            if metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            # Extract video URLs from page
            video_urls = await self._extract_video_urls(mcp_client)

            if not video_urls:
                raise DownloaderException("Could not extract video URLs from page")

            # Download the video/audio streams
            video_path = None
            audio_path = None

            if preferences.prefer_audio_only:
                audio_path = await self._download_audio_stream(video_urls, video_id)
            else:
                video_path, audio_path = await self._download_video_and_audio_streams(
                    video_urls, video_id, preferences
                )

            # Get transcript
            transcript = None
            if preferences.enable_subtitles:
                transcript = await self._extract_transcript_from_browser(mcp_client, video_id)

            processing_time = time.time() - start_time

            # Total size of whatever was actually written to disk
            file_size = 0
            if audio_path and audio_path.exists():
                file_size += audio_path.stat().st_size
            if video_path and video_path.exists():
                file_size += video_path.stat().st_size

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                processing_time_seconds=processing_time,
                file_size_bytes=file_size
            )

        except Exception as e:
            self.logger.error(f"Playwright download failed for {video_id}: {e}")

            # Classify by message text; chain the cause so the original
            # traceback stays available to callers and logs.
            error_str = str(e).lower()
            if "blocked" in error_str or "forbidden" in error_str:
                raise NetworkError(f"Browser request blocked: {e}") from e
            elif "private" in error_str or "unavailable" in error_str:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            else:
                raise DownloaderException(f"Playwright error: {e}") from e

    async def _check_authentication_status(self, mcp_client):
        """Determine whether the browser session is logged in to YouTube.

        Opens the YouTube home page, evaluates a DOM probe in the page and
        stores the outcome in ``self._is_authenticated``. ``self._auth_checked``
        is set to True whether or not the probe succeeded.

        Args:
            mcp_client: Client exposing an awaitable ``call_tool(name, params)``.

        Raises:
            DownloaderException: When ``self.fallback_to_guest`` is False and
                either the probe failed or the session is not authenticated.
        """
        try:
            self.logger.info("Checking YouTube authentication status...")

            # Navigate to YouTube first
            await mcp_client.call_tool("browser_navigate", {
                "url": "https://www.youtube.com"
            })

            await asyncio.sleep(2)

            # Check for authentication indicators
            auth_check = await mcp_client.call_tool("browser_evaluate", {
                "function": """() => {
                    const loginButton = document.querySelector('a[href*="signin"], button[aria-label*="Sign in"]');
                    const accountButton = document.querySelector('[data-ved] [aria-label*="Google Account"], .gb_d .gb_e, #avatar-btn');
                    const channelButton = document.querySelector('#channel-handle, #channel-title');

                    const isAuthenticated = !loginButton && (!!accountButton || !!channelButton);

                    let userInfo = null;
                    if (isAuthenticated) {
                        try {
                            const avatar = document.querySelector('#avatar img, .gb_h img');
                            const name = document.querySelector('.gb_e .gb_f, #channel-handle');
                            userInfo = {
                                avatar: avatar ? avatar.src : null,
                                name: name ? name.textContent.trim() : 'Authenticated User'
                            };
                        } catch (e) {
                            userInfo = { name: 'Authenticated User', avatar: null };
                        }
                    }

                    return {
                        isAuthenticated: isAuthenticated,
                        hasLoginButton: !!loginButton,
                        hasAccountButton: !!accountButton,
                        hasChannelButton: !!channelButton,
                        userInfo: userInfo,
                        cookies: document.cookie.length > 0
                    };
                }"""
            })

            if isinstance(auth_check, str):
                auth_check = json.loads(auth_check)

            is_authenticated = bool(auth_check.get("isAuthenticated", False))

            # userInfo is null in the page result when not logged in.
            user_info = auth_check.get("userInfo")
            display_name = user_info.get('name', 'Unknown') if isinstance(user_info, dict) else 'Unknown'

        except Exception as e:
            # Only genuine probe failures land here (navigation, evaluation,
            # or an unparseable result).
            self.logger.warning(f"Authentication status check failed: {e}")
            self._is_authenticated = False
            self._auth_checked = True

            if not self.fallback_to_guest:
                raise DownloaderException(
                    f"Authentication check failed and guest fallback disabled: {e}"
                ) from e
            return

        self._is_authenticated = is_authenticated
        self._auth_checked = True

        if is_authenticated:
            self.logger.info(f"Browser session is authenticated to YouTube as: {display_name}")
            return

        self.logger.info("Browser session is not authenticated to YouTube")
        # Raised outside the try block so a successful "not logged in" result
        # is not reported as a failed check.
        if not self.fallback_to_guest:
            raise DownloaderException("YouTube authentication required but not available")

    async def _navigate_to_video(self, mcp_client, url: str):
        """Open the video page in the MCP-controlled browser.

        Args:
            mcp_client: Client exposing an awaitable ``call_tool(name, params)``.
            url: URL of the video page to open.

        Raises:
            DownloaderException: When the navigation call raises, or when the
                text form of its result contains "error".
        """
        try:
            # Use MCP Playwright server to navigate
            result = await mcp_client.call_tool("browser_navigate", {
                "url": url,
                "wait_until": "networkidle"
            })
        except Exception as e:
            raise DownloaderException(f"Navigation failed: {e}") from e

        # Report a failed navigation immediately instead of waiting first.
        # NOTE(review): this substring test also trips on any result text that
        # merely contains the word "error" — confirm the shape of the result.
        if "error" in str(result).lower():
            raise DownloaderException(f"Navigation failed: Failed to navigate to video: {result}")

        # Give the player time to load before the page is inspected.
        await asyncio.sleep(3)

    async def _extract_metadata_from_page(self, mcp_client, video_id: str) -> VideoMetadata:
        """Read video metadata from the currently open YouTube page.

        Evaluates a JavaScript function in the page that prefers the
        ``videoDetails`` object of the embedded player response and falls back
        to DOM elements. View counts are returned as raw text and parsed in
        Python by ``self._parse_view_count``.

        Args:
            mcp_client: Client exposing an awaitable ``call_tool(name, params)``.
            video_id: Identifier stored on the returned metadata.

        Returns:
            The populated ``VideoMetadata``; on any failure a ``VideoMetadata``
            carrying only ``video_id`` (the failure is logged as a warning).
        """
        try:
            # JavaScript to extract metadata from YouTube page
            js_code = """
            () => {
                // videoDetails is part of the player response. ytInitialData is
                // kept only as a fallback source.
                const playerResponse = window.ytInitialPlayerResponse || {};
                const initialData = window.ytInitialData || {};
                const videoDetails = playerResponse.videoDetails ||
                                     initialData.videoDetails ||
                                     initialData.contents?.videoDetails ||
                                     {};

                // Extract from DOM as fallback
                const titleElement = document.querySelector('h1.title yt-formatted-string, h1[data-id] yt-formatted-string, #above-the-fold #title h1');
                const channelElement = document.querySelector('#owner-name a, #channel-name a, .ytd-channel-name a');
                const viewsElement = document.querySelector('#info-strings yt-formatted-string, .view-count');
                const descriptionElement = document.querySelector('#description-text, #meta-contents #description');

                // Get duration from video element (NaN until media metadata is loaded)
                const videoElement = document.querySelector('video');
                const duration = videoElement && Number.isFinite(videoElement.duration)
                    ? Math.floor(videoElement.duration)
                    : null;

                return {
                    title: videoDetails.title || (titleElement ? titleElement.textContent.trim() : null),
                    description: videoDetails.shortDescription || (descriptionElement ? descriptionElement.textContent.trim().substring(0, 500) : null),
                    duration: videoDetails.lengthSeconds ? parseInt(videoDetails.lengthSeconds, 10) : duration,
                    // Raw text only: the count is parsed on the Python side.
                    viewCount: videoDetails.viewCount || (viewsElement ? viewsElement.textContent.trim() : null),
                    author: videoDetails.author || (channelElement ? channelElement.textContent.trim() : null),
                    thumbnail: videoDetails.thumbnail?.thumbnails?.[0]?.url,
                    keywords: videoDetails.keywords || []
                };
            }
            """

            # "function" is the argument name the authentication probe in this
            # class already uses for browser_evaluate.
            result = await mcp_client.call_tool("browser_evaluate", {
                "function": js_code
            })

            if isinstance(result, str):
                metadata_dict = json.loads(result)
            else:
                metadata_dict = result

            return VideoMetadata(
                video_id=video_id,
                title=metadata_dict.get('title'),
                description=metadata_dict.get('description'),
                duration_seconds=metadata_dict.get('duration'),
                view_count=self._parse_view_count(metadata_dict.get('viewCount')),
                uploader=metadata_dict.get('author'),
                thumbnail_url=metadata_dict.get('thumbnail'),
                tags=metadata_dict.get('keywords') or []
            )

        except Exception as e:
            # Metadata is best-effort: the download continues without it.
            self.logger.warning(f"Metadata extraction failed: {e}")
            return VideoMetadata(video_id=video_id)

    async def _extract_video_urls(self, mcp_client) -> List[Dict[str, Any]]:
        """Extract video stream URLs from YouTube page (enhanced for authenticated access)"""
        try:
            # Enhanced JavaScript that takes advantage of authentication
            js_code = f"""
            () => {{
                const isAuthenticated = {str(self._is_authenticated).lower()};

                // Extract streaming data from YouTube's player
                const getStreamingData = () => {{
                    const playerResponse = window.ytInitialPlayerResponse || {{}};
                    const streamingData = playerResponse.streamingData || {{}};

                    const formats = [
                        ...(streamingData.formats || []),
                        ...(streamingData.adaptiveFormats || [])
                    ];

                    return formats.map(format => ({{
                        url: format.url,
                        itag: format.itag,
                        quality: format.qualityLabel || format.quality,
                        mimeType: format.mimeType,
                        hasVideo: format.mimeType?.includes('video') || false,
                        hasAudio: format.mimeType?.includes('audio') || false,
                        filesize: format.contentLength,
                        fps: format.fps,
                        bitrate: format.bitrate,
                        authenticated: isAuthenticated,
                        qualityScore: format.bitrate || 0
                    }})).filter(f => f.url);
                }};

                let formats = getStreamingData();

                // If authenticated, we may have access to higher quality streams
                if (isAuthenticated) {{
                    // Sort by quality score (bitrate) to prioritize higher quality
                    formats = formats.sort((a, b) => (b.qualityScore || 0) - (a.qualityScore || 0));
                }}

                // If no formats found, try alternative method
                if (!formats.length) {{
                    // Look for video element source
                    const videoElement = document.querySelector('video');
                    if (videoElement && videoElement.src) {{
                        return [{{
                            url: videoElement.src,
                            quality: 'unknown',
                            mimeType: 'video/mp4',
                            hasVideo: true,
                            hasAudio: true,
                            authenticated: isAuthenticated
                        }}];
                    }}
                }}

                return formats;
            }}
            """

            result = await mcp_client.call_tool("browser_evaluate", {
                "script": js_code
            })

            if isinstance(result, str):
                video_urls = json.loads(result)
            else:
                video_urls = result

            return video_urls or []

        except Exception as e:
            self.logger.error(f"Video URL extraction failed: {e}")
            return []

    async def _download_audio_stream(self, video_urls: List[Dict[str, Any]], video_id: str) -> Optional[Path]:
        """Download best audio stream"""
        # Find best audio-only stream
        audio_streams = [
            stream for stream in video_urls
            if stream.get('hasAudio') and not stream.get('hasVideo')
        ]

        if not audio_streams:
            # Fallback to streams with both audio and video
            audio_streams = [
                stream for stream in video_urls
                if stream.get('hasAudio')
            ]

        if not audio_streams:
            return None

        # Sort by quality/bitrate
        best_audio = max(audio_streams, key=lambda x: x.get('bitrate', 0))

        return await self._download_stream(best_audio['url'], video_id, 'audio', 'mp3')

    async def _download_video_and_audio_streams(self, video_urls: List[Dict[str, Any]],
                                              video_id: str, preferences: DownloadPreferences) -> tuple[Optional[Path], Optional[Path]]:
        """Download video and audio streams separately"""
        # Find best video stream
        video_streams = [
            stream for stream in video_urls
            if stream.get('hasVideo')
        ]

        # Find best audio stream
        audio_streams = [
            stream for stream in video_urls
            if stream.get('hasAudio') and not stream.get('hasVideo')
        ]

        if not audio_streams:
            audio_streams = [
                stream for stream in video_urls
                if stream.get('hasAudio')
            ]

        video_path = None
        audio_path = None

        # Download video stream
        if video_streams:
            # Filter by quality preference
            quality_map = {'720p': 720, '1080p': 1080, '480p': 480}
            target_quality = quality_map.get(preferences.quality.value, 720)

            # Find stream closest to target quality
            best_video = min(video_streams,
                           key=lambda x: abs(self._extract_quality_number(x.get('quality', '720p')) - target_quality))

            video_path = await self._download_stream(best_video['url'], video_id, 'video', 'mp4')

        # Download audio stream
        if audio_streams:
            best_audio = max(audio_streams, key=lambda x: x.get('bitrate', 0))
            audio_path = await self._download_stream(best_audio['url'], video_id, 'audio', 'mp3')

        return video_path, audio_path

    async def _download_stream(self, url: str, video_id: str, stream_type: str, extension: str) -> Optional[Path]:
        """Download one stream to ``<output_dir>/<video_id>_<stream_type>.<extension>``.

        Data is written to a ``.part`` file next to the target and moved into
        place only after the whole body has been received, so a failed download
        never leaves a truncated file under the final name.

        Args:
            url: Direct URL of the stream.
            video_id: Used to build the output file name.
            stream_type: Label used in the file name (e.g. 'video', 'audio').
            extension: File extension without the leading dot.

        Returns:
            Path of the downloaded file, or None on any failure (the failure is
            logged, not raised).
        """
        output_path = self.output_dir / f"{video_id}_{stream_type}.{extension}"
        partial_path = output_path.with_name(output_path.name + ".part")

        try:
            import aiohttp
            import aiofiles

            # aiohttp's default total timeout (5 minutes) would cut off large
            # downloads; bound connection setup and read stalls instead.
            timeout = aiohttp.ClientTimeout(total=None, sock_connect=30, sock_read=60)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        raise DownloaderException(f"HTTP {response.status} for stream download")

                    async with aiofiles.open(partial_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(64 * 1024):
                            await f.write(chunk)

            # Publish the finished file under its final name.
            partial_path.replace(output_path)

        except Exception as e:
            self.logger.error(f"Stream download failed: {e}")
            try:
                partial_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                self.logger.debug(f"Could not remove partial download {partial_path}: {cleanup_error}")
            return None

        self.logger.info(f"Downloaded {stream_type} stream: {output_path}")
        return output_path

    async def _extract_transcript_from_browser(self, mcp_client, video_id: str) -> Optional[TranscriptData]:
        """Fetch the English transcript for ``video_id``.

        The lookup goes through ``youtube_transcript_api`` and runs in the
        default executor; ``mcp_client`` is accepted but not used.

        Returns:
            A ``TranscriptData`` on success, or None if anything fails (the
            failure is logged at debug level).
        """
        try:
            # First try the standard transcript API
            from youtube_transcript_api import YouTubeTranscriptApi

            def fetch_english_transcript():
                fetched = YouTubeTranscriptApi().fetch(video_id, languages=['en'])

                texts = []
                rows = []
                for snippet in fetched.snippets:
                    texts.append(snippet.text)
                    rows.append({
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    })

                return ' '.join(texts), rows, fetched.is_generated, fetched.language_code

            event_loop = asyncio.get_event_loop()
            joined_text, rows, auto_generated, lang_code = await event_loop.run_in_executor(
                None, fetch_english_transcript
            )

            return TranscriptData(
                text=joined_text,
                language=lang_code,
                is_auto_generated=auto_generated,
                segments=rows,
                source="youtube-transcript-api"
            )

        except Exception as exc:
            self.logger.debug(f"Transcript extraction failed: {exc}")
            return None

    def _parse_view_count(self, views_text) -> Optional[int]:
        """Parse view count from text"""
        if not views_text:
            return None

        try:
            # Remove non-numeric characters except for multipliers
            import re
            views_clean = re.sub(r'[^\d.KMB]', '', str(views_text).upper())

            if 'K' in views_clean:
                return int(float(views_clean.replace('K', '')) * 1000)
            elif 'M' in views_clean:
                return int(float(views_clean.replace('M', '')) * 1000000)
            elif 'B' in views_clean:
                return int(float(views_clean.replace('B', '')) * 1000000000)
            else:
                return int(re.sub(r'[^\d]', '', views_clean))
        except:
            return None

    def _extract_quality_number(self, quality_str: str) -> int:
        """Extract numeric quality from string like '720p'"""
        try:
            import re
            match = re.search(r'(\d+)', quality_str)
            return int(match.group(1)) if match else 720
        except:
            return 720

    async def test_connection(self) -> bool:
        """Check that the Playwright MCP server can open YouTube.

        When authentication is enabled the login state is probed as well and
        the outcome is logged.

        Returns:
            True if navigation succeeded (and the optional authentication probe
            did not raise); False if the navigation result contains "error" or
            any exception occurred.
        """
        try:
            from backend.core.mcp_client import get_mcp_client

            client = get_mcp_client("playwright")

            # Try to navigate to YouTube
            navigation = await client.call_tool("browser_navigate", {
                "url": "https://www.youtube.com",
                "wait_until": "domcontentloaded"
            })

            reachable = "error" not in str(navigation).lower()

            # Check authentication status if enabled
            if reachable and self.use_authentication:
                await self._check_authentication_status(client)
                mode = 'Authenticated' if self._is_authenticated else 'Guest mode'
                self.logger.info(f"Authentication status: {mode}")

            return reachable

        except Exception as exc:
            self.logger.error(f"Playwright connection test failed: {exc}")
            return False

    def get_authentication_status(self) -> Dict[str, Any]:
        """Get current authentication status information"""
        return {
            "useAuthentication": self.use_authentication,
            "isAuthenticated": self._is_authenticated,
            "authChecked": self._auth_checked,
            "fallbackToGuest": self.fallback_to_guest,
            "features": {
                "privateVideos": self._is_authenticated,
                "unlistedVideos": self._is_authenticated,
                "memberContent": self._is_authenticated,
                "highQualityStreams": self._is_authenticated,
                "personalPlaylists": self._is_authenticated
            } if self._is_authenticated else {}
        }

    def supports_audio_only(self) -> bool:
        return True

    def supports_quality_selection(self) -> bool:
        return True

    def get_supported_formats(self) -> list[str]:
        return ["mp4", "webm", "mp3"]


# Create a mock MCP client if not available
class MockMCPClient:
    """Stand-in MCP client whose every tool call fails.

    It exposes the same ``call_tool`` coroutine the downloader awaits, but
    always raises to signal that no Playwright MCP server is reachable.
    """

    async def call_tool(self, tool_name: str, params: dict):
        """Reject the call; ``tool_name`` and ``params`` are ignored."""
        raise DownloaderException("MCP Playwright server not available")


# Register PlaywrightDownloader with the factory under DownloadMethod.PLAYWRIGHT.
# This runs as a side effect of importing this module.
# NOTE(review): the factory import sits at the bottom of the file even though
# base_downloader is already imported at the top — presumably historical;
# confirm whether it can be moved into the top-of-file import block.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.PLAYWRIGHT, PlaywrightDownloader)