youtube-summarizer/backend/services/video_downloaders/playwright_downloader.py

575 lines
24 KiB
Python

"""
Playwright-based video downloader using browser automation
"""
import asyncio
import time
import json
import re
from pathlib import Path
from typing import Optional, Dict, Any, List
import logging
from backend.models.video_download import (
VideoDownloadResult,
DownloadPreferences,
DownloadMethod,
DownloadStatus,
VideoMetadata,
TranscriptData,
DownloaderException,
VideoNotAvailableError,
NetworkError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader
logger = logging.getLogger(__name__)
class PlaywrightDownloader(BaseVideoDownloader):
"""Playwright-based video downloader using MCP server with persistent authentication"""
def __init__(self, method: DownloadMethod = DownloadMethod.PLAYWRIGHT, config: Optional[Dict[str, Any]] = None):
    """Initialise the downloader from an optional configuration mapping.

    Args:
        method: Download method tag handed to the base class.
        config: Optional settings. Keys read here are ``output_dir``,
            ``headless``, ``timeout``, ``session_file``,
            ``use_authentication`` and ``fallback_to_guest``. A missing or
            empty mapping selects every default.

    Side effects:
        Creates the output directory (including parents) if it is absent.
    """
    super().__init__(method, config)

    # Normalise once so each lookup below is a plain ``.get``.
    settings = config if config else {}

    self.output_dir = Path(settings.get('output_dir', './video_storage'))
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # Browser / session configuration
    self.headless = settings.get('headless', True)
    # NOTE(review): 30000 is presumably milliseconds - confirm against the consumer.
    self.timeout = settings.get('timeout', 30000)
    self.session_file = settings.get('session_file')

    # Authentication settings
    self.use_authentication = settings.get('use_authentication', True)
    self.fallback_to_guest = settings.get('fallback_to_guest', True)

    # Updated by _check_authentication_status().
    self._auth_checked = False
    self._is_authenticated = False
async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
    """Download a video through the Playwright MCP browser session.

    Steps: optional one-time authentication check, navigation to the
    video page, metadata extraction, duration-limit check, stream URL
    extraction, stream download and optional transcript retrieval.

    Args:
        url: Video page URL.
        preferences: Download preferences. Fields read here are
            ``max_duration_minutes``, ``prefer_audio_only`` and
            ``enable_subtitles``.

    Returns:
        A COMPLETED ``VideoDownloadResult``, or the FAILED result built by
        ``self.create_result`` when the video exceeds the duration limit.
        The stream helpers return ``None`` on failure, so a COMPLETED
        result may carry no file paths; that case is logged as a warning.

    Raises:
        NetworkError: The failure text mentions "blocked" or "forbidden".
        VideoNotAvailableError: The failure text mentions "private" or
            "unavailable".
        DownloaderException: Any other failure.
    """
    start_time = time.time()
    video_id = await self.extract_video_id(url)

    try:
        # Use the MCP Playwright server for browser automation
        from backend.core.mcp_client import get_mcp_client
        mcp_client = get_mcp_client("playwright")

        # The authentication probe runs at most once per downloader instance.
        if self.use_authentication and not self._auth_checked:
            await self._check_authentication_status(mcp_client)

        await self._navigate_to_video(mcp_client, url)
        metadata = await self._extract_metadata_from_page(mcp_client, video_id)

        # Check duration limits (a limit of 0 or less disables the check)
        if metadata.duration_seconds and preferences.max_duration_minutes > 0:
            if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                return self.create_result(
                    video_id, url, DownloadStatus.FAILED,
                    f"Video too long: {metadata.duration_seconds//60} minutes"
                )

        video_urls = await self._extract_video_urls(mcp_client)
        if not video_urls:
            raise DownloaderException("Could not extract video URLs from page")

        # Download the video/audio streams
        video_path = None
        audio_path = None
        if preferences.prefer_audio_only:
            audio_path = await self._download_audio_stream(video_urls, video_id)
        else:
            video_path, audio_path = await self._download_video_and_audio_streams(
                video_urls, video_id, preferences
            )

        if video_path is None and audio_path is None:
            # The result is still returned as before; the log makes the
            # empty download visible.
            self.logger.warning(
                "No media stream was saved for %s; result carries no file paths",
                video_id,
            )

        transcript = None
        if preferences.enable_subtitles:
            transcript = await self._extract_transcript_from_browser(mcp_client, video_id)

        processing_time = time.time() - start_time

        # Total size of whatever files actually exist on disk.
        file_size = sum(
            path.stat().st_size
            for path in (audio_path, video_path)
            if path and path.exists()
        )

        return VideoDownloadResult(
            video_id=video_id,
            video_url=url,
            status=DownloadStatus.COMPLETED,
            method=self.method,
            video_path=video_path,
            audio_path=audio_path,
            transcript=transcript,
            metadata=metadata,
            processing_time_seconds=processing_time,
            file_size_bytes=file_size
        )
    except Exception as e:
        self.logger.error(f"Playwright download failed for {video_id}: {e}")
        # Classify by message text; chain the cause so the original
        # traceback stays attached to the translated exception.
        error_str = str(e).lower()
        if "blocked" in error_str or "forbidden" in error_str:
            raise NetworkError(f"Browser request blocked: {e}") from e
        elif "private" in error_str or "unavailable" in error_str:
            raise VideoNotAvailableError(f"Video not available: {e}") from e
        else:
            raise DownloaderException(f"Playwright error: {e}") from e
async def _check_authentication_status(self, mcp_client):
    """Probe the browser session for a signed-in YouTube account.

    Navigates to the YouTube home page, evaluates a DOM probe in the page
    and stores the outcome in ``self._is_authenticated``;
    ``self._auth_checked`` is set to True on every path.

    Args:
        mcp_client: Client exposing an awaitable ``call_tool(name, args)``.

    Raises:
        DownloaderException: When ``fallback_to_guest`` is False and either
            the probe itself failed or the session is not signed in.
    """
    probe_js = """() => {
        const loginButton = document.querySelector('a[href*="signin"], button[aria-label*="Sign in"]');
        const accountButton = document.querySelector('[data-ved] [aria-label*="Google Account"], .gb_d .gb_e, #avatar-btn');
        const channelButton = document.querySelector('#channel-handle, #channel-title');
        const isAuthenticated = !loginButton && (!!accountButton || !!channelButton);
        let userInfo = null;
        if (isAuthenticated) {
            try {
                const avatar = document.querySelector('#avatar img, .gb_h img');
                const name = document.querySelector('.gb_e .gb_f, #channel-handle');
                userInfo = {
                    avatar: avatar ? avatar.src : null,
                    name: name ? name.textContent.trim() : 'Authenticated User'
                };
            } catch (e) {
                userInfo = { name: 'Authenticated User', avatar: null };
            }
        }
        return {
            isAuthenticated: isAuthenticated,
            hasLoginButton: !!loginButton,
            hasAccountButton: !!accountButton,
            hasChannelButton: !!channelButton,
            userInfo: userInfo,
            cookies: document.cookie.length > 0
        };
    }"""

    try:
        self.logger.info("Checking YouTube authentication status...")
        # Navigate to YouTube first
        await mcp_client.call_tool("browser_navigate", {
            "url": "https://www.youtube.com"
        })
        await asyncio.sleep(2)

        auth_check = await mcp_client.call_tool("browser_evaluate", {
            "function": probe_js
        })
        if isinstance(auth_check, str):
            auth_check = json.loads(auth_check)

        self._is_authenticated = auth_check.get("isAuthenticated", False)
        self._auth_checked = True
    except Exception as e:
        # Only genuine probe failures land here. The policy error raised
        # further down is kept outside this try block so it is no longer
        # caught, logged as a failed check and re-wrapped.
        self.logger.warning(f"Authentication status check failed: {e}")
        self._is_authenticated = False
        self._auth_checked = True
        if not self.fallback_to_guest:
            raise DownloaderException(
                f"Authentication check failed and guest fallback disabled: {e}"
            ) from e
        return

    if self._is_authenticated:
        # userInfo can be null or of an unexpected shape; never let the
        # log line fail.
        user_info = auth_check.get("userInfo")
        name = user_info.get('name', 'Unknown') if isinstance(user_info, dict) else 'Unknown'
        self.logger.info(f"Browser session is authenticated to YouTube as: {name}")
        return

    self.logger.info("Browser session is not authenticated to YouTube")
    if not self.fallback_to_guest:
        raise DownloaderException("YouTube authentication required but not available")
async def _navigate_to_video(self, mcp_client, url: str):
    """Open the video page in the MCP-controlled browser.

    Args:
        mcp_client: Client exposing an awaitable ``call_tool(name, args)``.
        url: Video page URL.

    Raises:
        DownloaderException: When the tool call fails or its result text
            contains "error". The original exception is chained as the
            cause.
    """
    try:
        # Use MCP Playwright server to navigate
        result = await mcp_client.call_tool("browser_navigate", {
            "url": url,
            "wait_until": "networkidle"
        })

        # Check the outcome before waiting, so a failed navigation surfaces
        # immediately instead of after the settle delay.
        # NOTE(review): this is a plain substring test on the stringified
        # result; if the tool echoes page text, a page that merely contains
        # the word "error" would be rejected - confirm the result shape.
        if "error" in str(result).lower():
            raise DownloaderException(f"Failed to navigate to video: {result}")

        # Give the player time to load.
        await asyncio.sleep(3)
    except Exception as e:
        raise DownloaderException(f"Navigation failed: {e}") from e
async def _extract_metadata_from_page(self, mcp_client, video_id: str) -> VideoMetadata:
    """Read title, description, duration and related fields from the page.

    Values come from the page's ``videoDetails`` object when present and
    from DOM elements otherwise. Extraction is best-effort: any failure is
    logged and a ``VideoMetadata`` carrying only ``video_id`` is returned.

    Args:
        mcp_client: Client exposing an awaitable ``call_tool(name, args)``.
        video_id: Identifier stored on the returned metadata.

    Returns:
        The populated ``VideoMetadata``, or a minimal one on failure.
    """
    try:
        # JavaScript to extract metadata from YouTube page
        js_code = """
        () => {
            // Look at both globals. The previous "ytInitialData || ytInitialPlayerResponse"
            // stopped at the first truthy object, so a populated ytInitialData hid
            // the videoDetails held by the player response.
            const playerResponse = window.ytInitialPlayerResponse || {};
            const initialData = window.ytInitialData || {};
            const videoDetails = playerResponse.videoDetails
                || initialData.videoDetails
                || initialData.contents?.videoDetails
                || {};

            // Extract from DOM as fallback
            const titleElement = document.querySelector('h1.title yt-formatted-string, h1[data-id] yt-formatted-string, #above-the-fold #title h1');
            const channelElement = document.querySelector('#owner-name a, #channel-name a, .ytd-channel-name a');
            const viewsElement = document.querySelector('#info-strings yt-formatted-string, .view-count');
            const descriptionElement = document.querySelector('#description-text, #meta-contents #description');

            // Duration from the video element; it is NaN until media metadata has loaded.
            const videoElement = document.querySelector('video');
            const duration = (videoElement && Number.isFinite(videoElement.duration))
                ? Math.floor(videoElement.duration)
                : null;

            return {
                title: videoDetails.title || (titleElement ? titleElement.textContent.trim() : null),
                description: videoDetails.shortDescription || (descriptionElement ? descriptionElement.textContent.trim().substring(0, 500) : null),
                duration: videoDetails.lengthSeconds ? parseInt(videoDetails.lengthSeconds, 10) : duration,
                // Return the raw text; parsing happens on the Python side. The
                // earlier this._parseViewCount(...) call referenced a function
                // that does not exist in the page and threw on this branch.
                viewCount: videoDetails.viewCount || (viewsElement ? viewsElement.textContent.trim() : null),
                author: videoDetails.author || (channelElement ? channelElement.textContent.trim() : null),
                thumbnail: videoDetails.thumbnail?.thumbnails?.[0]?.url,
                keywords: videoDetails.keywords || []
            };
        }
        """
        # "function" is the argument key the authentication probe in this
        # class already uses for browser_evaluate.
        result = await mcp_client.call_tool("browser_evaluate", {
            "function": js_code.strip()
        })
        metadata_dict = json.loads(result) if isinstance(result, str) else result

        return VideoMetadata(
            video_id=video_id,
            title=metadata_dict.get('title'),
            description=metadata_dict.get('description'),
            duration_seconds=metadata_dict.get('duration'),
            view_count=self._parse_view_count(metadata_dict.get('viewCount')),
            uploader=metadata_dict.get('author'),
            thumbnail_url=metadata_dict.get('thumbnail'),
            tags=metadata_dict.get('keywords') or []
        )
    except Exception as e:
        self.logger.warning(f"Metadata extraction failed: {e}")
        return VideoMetadata(video_id=video_id)
async def _extract_video_urls(self, mcp_client) -> List[Dict[str, Any]]:
    """Collect stream descriptors from the current page.

    Reads ``window.ytInitialPlayerResponse.streamingData`` (``formats``
    plus ``adaptiveFormats``) and keeps entries that expose a direct
    ``url``. If none are found, falls back to the ``src`` of the page's
    ``<video>`` element.

    Args:
        mcp_client: Client exposing an awaitable ``call_tool(name, args)``.

    Returns:
        A list of dicts with keys such as ``url``, ``itag``, ``quality``,
        ``mimeType``, ``hasVideo``, ``hasAudio``, ``filesize``, ``fps``,
        ``bitrate``, ``authenticated`` and ``qualityScore``. An empty list
        is returned on any failure or unexpected result shape.
    """
    try:
        # json.dumps(bool(...)) always renders a valid JS literal
        # ("true"/"false"), even if the flag holds a non-bool value.
        is_authenticated_js = json.dumps(bool(self._is_authenticated))

        js_code = f"""
        () => {{
            const isAuthenticated = {is_authenticated_js};

            // Extract streaming data from YouTube's player
            const getStreamingData = () => {{
                const playerResponse = window.ytInitialPlayerResponse || {{}};
                const streamingData = playerResponse.streamingData || {{}};
                const formats = [
                    ...(streamingData.formats || []),
                    ...(streamingData.adaptiveFormats || [])
                ];
                return formats.map(format => ({{
                    url: format.url,
                    itag: format.itag,
                    quality: format.qualityLabel || format.quality,
                    mimeType: format.mimeType,
                    hasVideo: format.mimeType?.includes('video') || false,
                    hasAudio: format.mimeType?.includes('audio') || false,
                    filesize: format.contentLength,
                    fps: format.fps,
                    bitrate: format.bitrate,
                    authenticated: isAuthenticated,
                    qualityScore: format.bitrate || 0
                }})).filter(f => f.url);
            }};

            let formats = getStreamingData();

            // When authenticated, order by bitrate so higher quality comes first
            if (isAuthenticated) {{
                formats = formats.sort((a, b) => (b.qualityScore || 0) - (a.qualityScore || 0));
            }}

            // If no formats found, fall back to the video element source
            if (!formats.length) {{
                const videoElement = document.querySelector('video');
                if (videoElement && videoElement.src) {{
                    return [{{
                        url: videoElement.src,
                        quality: 'unknown',
                        mimeType: 'video/mp4',
                        hasVideo: true,
                        hasAudio: true,
                        authenticated: isAuthenticated
                    }}];
                }}
            }}
            return formats;
        }}
        """
        # "function" is the argument key the authentication probe in this
        # class already uses for browser_evaluate.
        result = await mcp_client.call_tool("browser_evaluate", {
            "function": js_code.strip()
        })
        video_urls = json.loads(result) if isinstance(result, str) else result

        if not video_urls:
            return []
        if not isinstance(video_urls, list):
            # Callers iterate the result and call .get() on each entry.
            self.logger.warning(
                "Unexpected stream list type from browser_evaluate: %s",
                type(video_urls).__name__,
            )
            return []
        return video_urls
    except Exception as e:
        self.logger.error(f"Video URL extraction failed: {e}")
        return []
async def _download_audio_stream(self, video_urls: List[Dict[str, Any]], video_id: str) -> Optional[Path]:
"""Download best audio stream"""
# Find best audio-only stream
audio_streams = [
stream for stream in video_urls
if stream.get('hasAudio') and not stream.get('hasVideo')
]
if not audio_streams:
# Fallback to streams with both audio and video
audio_streams = [
stream for stream in video_urls
if stream.get('hasAudio')
]
if not audio_streams:
return None
# Sort by quality/bitrate
best_audio = max(audio_streams, key=lambda x: x.get('bitrate', 0))
return await self._download_stream(best_audio['url'], video_id, 'audio', 'mp3')
async def _download_video_and_audio_streams(self, video_urls: List[Dict[str, Any]],
                                            video_id: str, preferences: DownloadPreferences) -> tuple[Optional[Path], Optional[Path]]:
    """Download one video stream and one audio stream as separate files.

    The video stream is the one whose quality label is numerically
    closest to the requested quality. The audio stream is the
    highest-bitrate audio-only entry, or any audio-bearing entry if there
    is no audio-only one.

    Args:
        video_urls: Stream descriptors as produced by ``_extract_video_urls``.
        video_id: Used to name the output files.
        preferences: ``preferences.quality.value`` is read (only when a
            video stream exists) to pick the target resolution.

    Returns:
        ``(video_path, audio_path)``; either element is ``None`` when no
        suitable stream exists or its download failed.
    """
    video_streams = [stream for stream in video_urls if stream.get('hasVideo')]

    audio_streams = [
        stream for stream in video_urls
        if stream.get('hasAudio') and not stream.get('hasVideo')
    ]
    if not audio_streams:
        audio_streams = [stream for stream in video_urls if stream.get('hasAudio')]

    video_path = None
    audio_path = None

    # Download video stream
    if video_streams:
        # Parse the number out of the preference instead of using a fixed
        # three-entry table. '480p', '720p' and '1080p' resolve exactly as
        # before, other numeric labels (e.g. '360p', '1440p') are honoured,
        # and non-numeric values still fall back to 720.
        target_quality = self._extract_quality_number(preferences.quality.value)

        # Find stream closest to target quality
        best_video = min(
            video_streams,
            key=lambda stream: abs(
                self._extract_quality_number(stream.get('quality', '720p')) - target_quality
            ),
        )
        video_path = await self._download_stream(best_video['url'], video_id, 'video', 'mp4')

    # Download audio stream
    if audio_streams:
        best_audio = max(audio_streams, key=lambda stream: stream.get('bitrate', 0))
        audio_path = await self._download_stream(best_audio['url'], video_id, 'audio', 'mp3')

    return video_path, audio_path
async def _download_stream(self, url: str, video_id: str, stream_type: str, extension: str) -> Optional[Path]:
    """Download one stream to ``<output_dir>/<video_id>_<stream_type>.<extension>``.

    The body is written to a sibling ``.part`` file and moved into place
    only after the transfer finishes. A failed download therefore never
    leaves a truncated file at the final path, nor replaces an earlier
    complete one.

    Args:
        url: Direct stream URL.
        video_id: First component of the file name.
        stream_type: Second component of the file name (e.g. 'audio').
        extension: File extension without the dot.

    Returns:
        The final path on success, ``None`` on any failure (logged).
    """
    partial_path: Optional[Path] = None
    try:
        import aiohttp
        import aiofiles

        output_path = self.output_dir / f"{video_id}_{stream_type}.{extension}"
        partial_path = output_path.with_name(output_path.name + ".part")

        # NOTE(review): no explicit timeout is passed, so the aiohttp
        # session default applies - confirm it suits long downloads.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    raise DownloaderException(f"HTTP {response.status} for stream download")
                async with aiofiles.open(partial_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)

        # Publish the finished file in one step.
        partial_path.replace(output_path)
        self.logger.info(f"Downloaded {stream_type} stream: {output_path}")
        return output_path
    except Exception as e:
        self.logger.error(f"Stream download failed: {e}")
        if partial_path is not None:
            try:
                partial_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                # Cleanup is best-effort; the download failure is already reported.
                self.logger.debug(f"Could not remove partial file {partial_path}: {cleanup_error}")
        return None
async def _extract_transcript_from_browser(self, mcp_client, video_id: str) -> Optional[TranscriptData]:
    """Fetch the English transcript for ``video_id``.

    Despite the name, this does not touch the browser: ``mcp_client`` is
    accepted but unused, and the transcript comes from
    ``youtube_transcript_api``. The blocking fetch runs in the default
    executor so the event loop stays responsive.

    Args:
        mcp_client: Unused.
        video_id: Video whose transcript is requested.

    Returns:
        A ``TranscriptData`` on success, ``None`` on any failure (logged
        at debug level).
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        def _get_transcript():
            api = YouTubeTranscriptApi()
            transcript = api.fetch(video_id, languages=['en'])
            full_text = ' '.join([snippet.text for snippet in transcript.snippets])
            segments = [
                {
                    'text': snippet.text,
                    'start': snippet.start,
                    'duration': snippet.duration
                }
                for snippet in transcript.snippets
            ]
            return full_text, segments, transcript.is_generated, transcript.language_code

        # Inside a coroutine the running loop is the right one to use;
        # get_event_loop() is discouraged in this position.
        loop = asyncio.get_running_loop()
        text, segments, is_generated, language = await loop.run_in_executor(None, _get_transcript)

        return TranscriptData(
            text=text,
            language=language,
            is_auto_generated=is_generated,
            segments=segments,
            source="youtube-transcript-api"
        )
    except Exception as e:
        self.logger.debug(f"Transcript extraction failed: {e}")
        return None
def _parse_view_count(self, views_text) -> Optional[int]:
"""Parse view count from text"""
if not views_text:
return None
try:
# Remove non-numeric characters except for multipliers
import re
views_clean = re.sub(r'[^\d.KMB]', '', str(views_text).upper())
if 'K' in views_clean:
return int(float(views_clean.replace('K', '')) * 1000)
elif 'M' in views_clean:
return int(float(views_clean.replace('M', '')) * 1000000)
elif 'B' in views_clean:
return int(float(views_clean.replace('B', '')) * 1000000000)
else:
return int(re.sub(r'[^\d]', '', views_clean))
except:
return None
def _extract_quality_number(self, quality_str: str) -> int:
"""Extract numeric quality from string like '720p'"""
try:
import re
match = re.search(r'(\d+)', quality_str)
return int(match.group(1)) if match else 720
except:
return 720
async def test_connection(self) -> bool:
    """Check that the Playwright MCP server can reach YouTube.

    When authentication is enabled, a successful navigation is followed
    by an authentication probe whose outcome is logged.

    Returns:
        True when navigation succeeded (and the optional probe did not
        raise); False when the navigation result mentions "error" or any
        exception occurred.
    """
    try:
        from backend.core.mcp_client import get_mcp_client

        client = get_mcp_client("playwright")

        # Try to navigate to YouTube
        navigation_args = {
            "url": "https://www.youtube.com",
            "wait_until": "domcontentloaded"
        }
        outcome = await client.call_tool("browser_navigate", navigation_args)
        reachable = "error" not in str(outcome).lower()

        # The authentication probe only makes sense once the site is reachable.
        if reachable and self.use_authentication:
            await self._check_authentication_status(client)
            mode = 'Authenticated' if self._is_authenticated else 'Guest mode'
            self.logger.info(f"Authentication status: {mode}")

        return reachable
    except Exception as e:
        self.logger.error(f"Playwright connection test failed: {e}")
        return False
def get_authentication_status(self) -> Dict[str, Any]:
    """Summarise the authentication settings and last known state.

    Returns:
        A dict with ``useAuthentication``, ``isAuthenticated``,
        ``authChecked``, ``fallbackToGuest`` and ``features``.
        ``features`` maps each feature name to the authentication flag
        when authenticated and is empty otherwise.
    """
    authenticated = self._is_authenticated

    feature_names = (
        "privateVideos",
        "unlistedVideos",
        "memberContent",
        "highQualityStreams",
        "personalPlaylists",
    )
    features = dict.fromkeys(feature_names, authenticated) if authenticated else {}

    return {
        "useAuthentication": self.use_authentication,
        "isAuthenticated": authenticated,
        "authChecked": self._auth_checked,
        "fallbackToGuest": self.fallback_to_guest,
        "features": features,
    }
def supports_audio_only(self) -> bool:
    """Report that this downloader can fetch audio without video."""
    return True
def supports_quality_selection(self) -> bool:
    """Report that this downloader honours a quality preference."""
    return True
def get_supported_formats(self) -> list[str]:
    """Return the container/codec extensions this downloader advertises."""
    advertised = ["mp4", "webm", "mp3"]
    return advertised
# Stand-in MCP client for when the real one is not available.
class MockMCPClient:
    """Client stub whose every tool call fails with a DownloaderException."""

    async def call_tool(self, tool_name: str, params: dict):
        """Reject the call; both arguments are ignored."""
        raise DownloaderException("MCP Playwright server not available")
# Register the downloader: importing this module makes PlaywrightDownloader
# available from DownloaderFactory under DownloadMethod.PLAYWRIGHT.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.PLAYWRIGHT, PlaywrightDownloader)