# Source listing: 529 lines, 22 KiB, Python
"""
Enhanced yt-dlp downloader with progress tracking and 403 error workarounds
"""
|
|
import asyncio
|
|
import time
|
|
import random
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List, Callable
|
|
import logging
|
|
import subprocess
|
|
|
|
from backend.models.video_download import (
|
|
VideoDownloadResult,
|
|
DownloadPreferences,
|
|
DownloadMethod,
|
|
DownloadStatus,
|
|
VideoMetadata,
|
|
TranscriptData,
|
|
VideoQuality,
|
|
DownloaderException,
|
|
VideoNotAvailableError,
|
|
NetworkError
|
|
)
|
|
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader, DownloadProgress
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class YtDlpDownloader(BaseVideoDownloader):
    """Enhanced yt-dlp downloader with progress tracking and 403 error workarounds.

    ``download_video`` tries a fixed sequence of strategies (cookies, user-agent
    rotation, optional proxy rotation, conservative format selection and an
    audio-only fallback) and returns the first successful result.

    ``self.logger``, ``self.method``, ``extract_video_id`` and
    ``report_progress`` are not defined in this class; they are expected to be
    provided by ``BaseVideoDownloader``.
    """

    # Suffixes used to classify files found in ``output_dir`` after a download.
    # ``.webm`` can hold either video or audio, so it appears in both tuples and
    # is disambiguated by the ``audio_only`` flag of the running strategy.
    _VIDEO_SUFFIXES = ('.mp4', '.mkv', '.webm')
    _AUDIO_SUFFIXES = ('.mp3', '.m4a', '.webm')

    def __init__(self, method: DownloadMethod = DownloadMethod.YT_DLP, config: Optional[Dict[str, Any]] = None):
        """Create the downloader and make sure the output directory exists.

        Args:
            method: Download method identifier forwarded to the base class.
            config: Optional settings. Recognised keys: ``output_dir``,
                ``use_cookies``, ``cookies_file``, ``user_agents``, ``proxies``.
        """
        super().__init__(method, config)
        settings = config or {}

        self.output_dir = Path(settings.get('output_dir', './video_storage'))
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Configuration
        self.use_cookies = settings.get('use_cookies', True)
        self.cookies_file = settings.get('cookies_file')
        # ``or`` (not a .get default) so that an explicitly empty/None list
        # still falls back to the defaults; random.choice([]) would raise.
        self.user_agents = settings.get('user_agents') or self._get_default_user_agents()
        self.proxies = settings.get('proxies') or []

        # Progress tracking
        self.progress_callback: Optional[Callable[[DownloadProgress], None]] = None
        self.retry_attempt = 0
        # Event loop that owns the current download; the yt-dlp progress hook
        # runs in a worker thread and needs it to schedule callbacks.
        self._event_loop: Optional[asyncio.AbstractEventLoop] = None

    def _get_default_user_agents(self) -> List[str]:
        """Get default user agents for rotation"""
        return [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:120.0) Gecko/20100101 Firefox/120.0",
        ]

    async def download_video(
        self,
        url: str,
        preferences: DownloadPreferences,
        progress_callback: Optional[Callable[[DownloadProgress], None]] = None
    ) -> VideoDownloadResult:
        """Download video using yt-dlp with progress tracking and multiple fallback strategies.

        Args:
            url: Video URL to download.
            preferences: Quality / duration preferences for the download.
            progress_callback: Optional callback passed to ``report_progress``
                for every progress update.

        Returns:
            The result of the first strategy that succeeds, with
            ``processing_time_seconds`` filled in.

        Raises:
            NetworkError: If the last failure looks like an HTTP 403 block.
            DownloaderException: If every strategy failed for another reason.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        # Store progress callback for use in strategies
        self.progress_callback = progress_callback
        self.retry_attempt = 0

        # Try multiple strategies
        strategies = [
            self._download_with_cookies,
            self._download_with_user_agent_rotation,
            self._download_with_format_selection,
            self._download_audio_only_fallback,
        ]
        if self.proxies:
            strategies.insert(2, self._download_with_proxy_rotation)

        last_error: Optional[Exception] = None

        for strategy_idx, strategy in enumerate(strategies):
            self.retry_attempt = strategy_idx
            # Strip the leading underscore so the label has no leading space.
            strategy_label = strategy.__name__.strip('_').replace('_', ' ').title()
            try:
                # Report progress for strategy attempt
                await self.report_progress(
                    self.progress_callback,
                    DownloadProgress(
                        download_percent=0.0,
                        current_method="yt-dlp",
                        retry_attempt=self.retry_attempt,
                        status_message=f"Trying yt-dlp strategy: {strategy_label}"
                    )
                )

                self.logger.info(f"Trying yt-dlp strategy: {strategy.__name__}")
                result = await strategy(url, video_id, preferences)

                if result:
                    result.processing_time_seconds = time.time() - start_time

                    # Report completion
                    await self.report_progress(
                        self.progress_callback,
                        DownloadProgress(
                            download_percent=100.0,
                            current_method="yt-dlp",
                            retry_attempt=self.retry_attempt,
                            status_message="Download completed successfully"
                        )
                    )
                    return result

                # A falsy result without an exception: fall through to the next strategy.
                self.logger.warning(f"yt-dlp strategy {strategy.__name__} returned no result")

            except Exception as e:
                self.logger.warning(f"yt-dlp strategy {strategy.__name__} failed: {e}")
                last_error = e

                # Report failure
                await self.report_progress(
                    self.progress_callback,
                    DownloadProgress(
                        download_percent=0.0,
                        current_method="yt-dlp",
                        retry_attempt=self.retry_attempt,
                        status_message=f"Strategy failed: {str(e)[:100]}"
                    )
                )

        # All strategies failed
        if last_error is None:
            raise DownloaderException("All yt-dlp strategies failed: no strategy produced a result")

        error_text = str(last_error).lower()
        # _execute_ytdlp already classifies blocks as NetworkError; the text
        # check is kept (case-insensitively) for errors raised elsewhere.
        if isinstance(last_error, NetworkError) or "403" in error_text or "forbidden" in error_text:
            raise NetworkError(f"YouTube blocked yt-dlp requests: {last_error}") from last_error
        raise DownloaderException(f"All yt-dlp strategies failed. Last error: {last_error}") from last_error

    def _base_options(
        self,
        video_id: str,
        format_selector: str,
        name_template: str = '%(title)s',
        headers: Optional[Dict[str, str]] = None,
    ) -> Dict[str, Any]:
        """Build the yt-dlp options shared by every strategy.

        Request headers are passed through ``http_headers``: the ``YoutubeDL``
        class takes custom headers via that parameter, whereas top-level
        ``user_agent`` / ``referer`` / ``headers`` keys are not part of its
        documented parameters.

        Args:
            video_id: Prefix of the output file name.
            format_selector: yt-dlp format selector string.
            name_template: Middle part of the output file name.
            headers: Extra request headers merged over the random User-Agent.
        """
        http_headers = {'User-Agent': random.choice(self.user_agents)}
        if headers:
            http_headers.update(headers)
        return {
            'outtmpl': str(self.output_dir / f'{video_id}_{name_template}.%(ext)s'),
            'format': format_selector,
            'http_headers': http_headers,
        }

    async def _download_with_cookies(self, url: str, video_id: str, preferences: DownloadPreferences) -> Optional[VideoDownloadResult]:
        """Try download with browser cookies.

        Uses ``cookies_file`` when it exists, otherwise asks yt-dlp to read the
        cookies from a local Chrome profile.

        Raises:
            DownloaderException: If cookies are disabled in the configuration.
        """
        if not self.use_cookies:
            raise DownloaderException("Cookies disabled")

        options = self._base_options(
            video_id,
            self._get_format_selector(preferences),
            headers={'Referer': 'https://www.youtube.com/'},
        )
        options['extractor_args'] = {
            'youtube': {
                'skip': ['dash', 'hls']  # Skip problematic formats
            }
        }

        if self.cookies_file and Path(self.cookies_file).exists():
            options['cookiefile'] = str(self.cookies_file)
        else:
            # Try to use browser cookies
            options['cookiesfrombrowser'] = ('chrome', None, None, None)

        return await self._execute_ytdlp(url, video_id, options, preferences)

    async def _download_with_user_agent_rotation(self, url: str, video_id: str, preferences: DownloadPreferences) -> Optional[VideoDownloadResult]:
        """Try download with a randomly chosen user agent and browser-like headers"""
        options = self._base_options(
            video_id,
            self._get_format_selector(preferences),
            headers={
                'Referer': 'https://www.youtube.com/',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-us,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            },
        )
        return await self._execute_ytdlp(url, video_id, options, preferences)

    async def _download_with_proxy_rotation(self, url: str, video_id: str, preferences: DownloadPreferences) -> Optional[VideoDownloadResult]:
        """Try download through a randomly chosen configured proxy.

        Raises:
            DownloaderException: If no proxies are configured.
        """
        if not self.proxies:
            raise DownloaderException("No proxies configured")

        options = self._base_options(video_id, self._get_format_selector(preferences))
        options['proxy'] = random.choice(self.proxies)
        options['socket_timeout'] = 30

        return await self._execute_ytdlp(url, video_id, options, preferences)

    async def _download_with_format_selection(self, url: str, video_id: str, preferences: DownloadPreferences) -> Optional[VideoDownloadResult]:
        """Try download with specific format selection to avoid problematic streams"""
        # Lower quality to avoid blocks
        options = self._base_options(video_id, 'best[height<=720]/best')
        options['extractor_args'] = {
            'youtube': {
                'player_client': ['android', 'web']  # Use different clients
            }
        }
        return await self._execute_ytdlp(url, video_id, options, preferences)

    async def _download_audio_only_fallback(self, url: str, video_id: str, preferences: DownloadPreferences) -> Optional[VideoDownloadResult]:
        """Try audio-only download as fallback.

        When ffmpeg is available the audio is converted to 192 kbps MP3 by a
        yt-dlp post-processor; otherwise the stream is kept as downloaded.
        """
        options = self._base_options(video_id, 'bestaudio/best', name_template='audio')
        options['postprocessors'] = [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }] if self._has_ffmpeg() else []

        return await self._execute_ytdlp(url, video_id, options, preferences, audio_only=True)

    def _locate_downloaded_files(
        self, video_id: str, audio_only: bool
    ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
        """Classify files named ``{video_id}_*`` in the output directory.

        Returns:
            ``(video_path, audio_path, extraction_source)``. In audio-only mode
            ``video_path`` is always ``None``; a video container found on disk
            is returned as ``extraction_source`` so audio can be pulled from it
            when no audio file exists.
        """
        video_path: Optional[Path] = None
        audio_path: Optional[Path] = None
        extraction_source: Optional[Path] = None

        for file_path in self.output_dir.glob(f"{video_id}_*"):
            suffix = file_path.suffix.lower()
            if audio_only:
                # .webm is treated as audio here: the strategy requested an audio stream.
                if suffix in self._AUDIO_SUFFIXES:
                    audio_path = file_path
                elif suffix in self._VIDEO_SUFFIXES:
                    extraction_source = file_path
            elif suffix in self._VIDEO_SUFFIXES:
                video_path = file_path
            elif suffix in self._AUDIO_SUFFIXES:
                audio_path = file_path

        return video_path, audio_path, extraction_source

    async def _execute_ytdlp(self, url: str, video_id: str, options: Dict[str, Any],
                             preferences: DownloadPreferences, audio_only: bool = False) -> Optional[VideoDownloadResult]:
        """Execute yt-dlp with given options.

        The blocking yt-dlp calls run in the default executor. Metadata is
        extracted first so over-long videos are rejected before downloading.

        Args:
            url: Video URL.
            video_id: Identifier used as output file-name prefix.
            options: yt-dlp options built by the calling strategy (not mutated).
            preferences: Supplies ``max_duration_minutes``.
            audio_only: Whether the strategy requested audio only.

        Returns:
            A completed ``VideoDownloadResult``, or ``None`` if yt-dlp returned
            no info.

        Raises:
            NetworkError: If the failure text mentions 403 / forbidden.
            VideoNotAvailableError: If it mentions private / unavailable.
            DownloaderException: For the duration limit and any other failure.
        """
        try:
            import yt_dlp

            ydl_options = dict(options)
            # Add progress hook
            ydl_options['progress_hooks'] = [self._progress_hook]
            # Add metadata extraction
            ydl_options['writeinfojson'] = True
            ydl_options['writethumbnail'] = False  # Skip thumbnail to avoid extra requests

            loop = asyncio.get_running_loop()
            # Remember the loop so the progress hook (worker thread) can reach it.
            self._event_loop = loop
            max_minutes = preferences.max_duration_minutes or 0

            def _download():
                with yt_dlp.YoutubeDL(ydl_options) as ydl:
                    # First, extract info without downloading
                    info = ydl.extract_info(url, download=False)

                    # Check duration
                    duration = (info or {}).get('duration') or 0
                    if max_minutes > 0 and duration > max_minutes * 60:
                        raise DownloaderException(f"Video too long: {int(duration // 60)} minutes")

                    # Now download
                    # NOTE(review): this repeats the extraction request; confirm
                    # whether the already-extracted info could be reused instead.
                    return ydl.extract_info(url, download=True)

            info = await loop.run_in_executor(None, _download)
            if not info:
                return None

            # Extract metadata
            metadata = self._extract_metadata_from_info(info, video_id)

            # Find downloaded files
            video_path, audio_path, extraction_source = self._locate_downloaded_files(video_id, audio_only)

            # If audio-only but we got video, extract audio
            if audio_only and extraction_source and not audio_path:
                audio_path = await self._extract_audio_from_video(extraction_source, video_id)

            # Get transcript
            transcript = await self._extract_transcript_ytdlp(video_id)

            # Calculate file size
            file_size = sum(
                path.stat().st_size
                for path in (video_path, audio_path)
                if path and path.exists()
            )

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                file_size_bytes=file_size
            )

        except DownloaderException as e:
            # Raised by this method itself (duration limit); do not re-wrap.
            self.logger.error(f"yt-dlp execution failed: {e}")
            raise

        except Exception as e:
            self.logger.error(f"yt-dlp execution failed: {e}")

            # Analyze error type
            error_str = str(e).lower()
            if "403" in error_str or "forbidden" in error_str:
                raise NetworkError(f"YouTube blocked request: {e}") from e
            if "private" in error_str or "unavailable" in error_str:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            raise DownloaderException(f"yt-dlp error: {e}") from e

    def _get_format_selector(self, preferences: DownloadPreferences) -> str:
        """Get the yt-dlp format selector matching the given preferences"""
        if preferences.prefer_audio_only:
            return 'bestaudio/best'

        quality_map = {
            VideoQuality.AUDIO_ONLY: 'bestaudio',
            VideoQuality.LOW_480P: 'best[height<=480]',
            VideoQuality.MEDIUM_720P: 'best[height<=720]',
            VideoQuality.HIGH_1080P: 'best[height<=1080]',
            VideoQuality.BEST: 'best',
        }
        return quality_map.get(preferences.quality, 'best[height<=720]/best')

    def _dispatch_progress(self, progress: DownloadProgress) -> None:
        """Schedule ``report_progress`` on the download's event loop.

        yt-dlp invokes the progress hook from the executor thread, where
        ``asyncio.create_task`` cannot be used; ``run_coroutine_threadsafe`` is
        the thread-safe way to hand a coroutine to a loop running elsewhere.
        Failures are logged and swallowed: progress reporting is best-effort
        and must never abort the download.
        """
        callback = self.progress_callback
        loop = getattr(self, '_event_loop', None)
        if callback is None or loop is None or loop.is_closed():
            return

        coro = self.report_progress(callback, progress)
        try:
            asyncio.run_coroutine_threadsafe(coro, loop)
        except Exception as e:
            # Avoid a "coroutine was never awaited" warning for the dropped update.
            close = getattr(coro, 'close', None)
            if close:
                close()
            self.logger.debug(f"Could not send progress update: {e}")

    def _progress_hook(self, d):
        """Enhanced progress hook for yt-dlp with detailed progress reporting.

        Args:
            d: Status dictionary passed by yt-dlp; the keys read here are
                ``status``, ``downloaded_bytes``, ``total_bytes``,
                ``total_bytes_estimate``, ``speed``, ``eta`` and ``filename``.
        """
        status = d.get('status')

        if status == 'downloading':
            # Extract progress information; any of these may be missing or None.
            downloaded_bytes = d.get('downloaded_bytes') or 0
            total_bytes = d.get('total_bytes') or d.get('total_bytes_estimate') or 0
            speed = d.get('speed') or 0  # bytes per second
            eta = d.get('eta') or 0  # seconds
            # Clamp: an underestimated total would otherwise report more than 100%.
            percent = min(downloaded_bytes / total_bytes * 100, 100.0) if total_bytes > 0 else 0.0

            if self.progress_callback:
                self._dispatch_progress(DownloadProgress(
                    download_percent=percent,
                    bytes_downloaded=downloaded_bytes,
                    total_bytes=total_bytes,
                    speed_bps=speed,
                    eta_seconds=eta,
                    current_method="yt-dlp",
                    retry_attempt=self.retry_attempt,
                    status_message=f"Downloading: {percent:.1f}% ({self._format_bytes(downloaded_bytes)}/{self._format_bytes(total_bytes)}) at {self._format_speed(speed)}"
                ))

            self.logger.debug(f"Downloading: {percent:.1f}%, Speed: {self._format_speed(speed)}, ETA: {eta}s")

        elif status == 'finished':
            self.logger.info(f"Download finished: {d.get('filename')}")

            # Send completion progress
            if self.progress_callback:
                self._dispatch_progress(DownloadProgress(
                    download_percent=100.0,
                    current_method="yt-dlp",
                    retry_attempt=self.retry_attempt,
                    status_message="Processing downloaded file..."
                ))

    def _format_bytes(self, bytes: int) -> str:
        """Format a byte count as a human readable string (1024-based units).

        ``None`` is treated as 0. The parameter name shadows the builtin but is
        kept for backward compatibility with keyword callers.
        """
        size = float(bytes or 0)
        for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
            if size < 1024.0:
                return f"{size:.1f}{unit}"
            size /= 1024.0
        return f"{size:.1f}PB"

    def _format_speed(self, speed: float) -> str:
        """Format a bytes-per-second speed; returns "N/A" when unknown or non-positive"""
        if not speed or speed <= 0:
            return "N/A"
        return f"{self._format_bytes(speed)}/s"

    def _extract_metadata_from_info(self, info: Dict[str, Any], video_id: str) -> VideoMetadata:
        """Extract metadata from yt-dlp info.

        Keys that are present but ``None`` are treated like missing keys for
        ``tags``, ``language`` and ``age_limit`` so the comparison and defaults
        below cannot fail on ``None``.
        """
        return VideoMetadata(
            video_id=video_id,
            title=info.get('title'),
            description=info.get('description'),
            duration_seconds=info.get('duration'),
            view_count=info.get('view_count'),
            upload_date=info.get('upload_date'),
            uploader=info.get('uploader'),
            thumbnail_url=info.get('thumbnail'),
            tags=info.get('tags') or [],
            language=info.get('language') or 'en',
            age_restricted=(info.get('age_limit') or 0) > 0
        )

    async def _extract_audio_from_video(self, video_path: Path, video_id: str) -> Optional[Path]:
        """Extract a 192 kbps MP3 track from a video file.

        Returns:
            Path of ``{video_id}_audio.mp3`` in the output directory, or
            ``None`` if ffmpeg is unavailable or the extraction fails.
        """
        if not self._has_ffmpeg():
            return None

        audio_path = self.output_dir / f"{video_id}_audio.mp3"

        try:
            import ffmpeg

            def _extract():
                (
                    ffmpeg
                    .input(str(video_path))
                    .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                    .overwrite_output()
                    .run(quiet=True)
                )

            await asyncio.get_running_loop().run_in_executor(None, _extract)
            return audio_path

        except Exception as e:
            self.logger.error(f"Audio extraction failed: {e}")
            return None

    async def _extract_transcript_ytdlp(self, video_id: str) -> Optional[TranscriptData]:
        """Extract the English transcript using youtube-transcript-api.

        Returns:
            The transcript, or ``None`` if it cannot be fetched (best-effort;
            the failure is only logged at debug level).
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            def _get_transcript():
                transcript = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
                snippets = transcript.snippets

                full_text = ' '.join(snippet.text for snippet in snippets)
                segments = [
                    {
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    }
                    for snippet in snippets
                ]
                return full_text, segments, transcript.is_generated, transcript.language_code

            text, segments, is_generated, language = await asyncio.get_running_loop().run_in_executor(
                None, _get_transcript
            )

            return TranscriptData(
                text=text,
                language=language,
                is_auto_generated=is_generated,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    def _has_ffmpeg(self) -> bool:
        """Check if ffmpeg is available by running ``ffmpeg -version``"""
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
            return True
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError as well as e.g. PermissionError.
            return False

    async def test_connection(self) -> bool:
        """Test if yt-dlp is working by extracting metadata for a fixed video.

        Returns:
            ``True`` if metadata containing a title was extracted, else ``False``.
        """
        try:
            import yt_dlp

            test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

            def _test():
                with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                    info = ydl.extract_info(test_url, download=False)
                    return info is not None and 'title' in info

            return await asyncio.get_running_loop().run_in_executor(None, _test)

        except Exception as e:
            self.logger.error(f"yt-dlp connection test failed: {e}")
            return False

    def supports_audio_only(self) -> bool:
        """Audio-only downloads are supported"""
        return True

    def supports_quality_selection(self) -> bool:
        """Quality selection is supported"""
        return True

    def get_supported_formats(self) -> list[str]:
        """Return the container formats this downloader reports as supported"""
        return ["mp4", "webm", "mp3", "m4a"]
|
|
|
|
|
|
# Register the downloader so the factory can build it for DownloadMethod.YT_DLP.
# NOTE(review): DownloaderFactory is imported here rather than with the other
# base_downloader names at the top of the file; presumably to dodge an import
# cycle. Confirm before moving it.
from backend.services.video_downloaders.base_downloader import DownloaderFactory

DownloaderFactory.register(DownloadMethod.YT_DLP, YtDlpDownloader)