"""
Pytubefix-based video downloader
"""
import asyncio
import time
from pathlib import Path
from typing import Optional, Dict, Any
import logging

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    VideoQuality,
    DownloaderException,
    VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader

logger = logging.getLogger(__name__)


class PytubefixDownloader(BaseVideoDownloader):
    """Pytubefix-based video downloader.

    Blocking pytubefix / ffmpeg / transcript calls are pushed to the event
    loop's default executor so the coroutines here do not block the loop.
    Files are written below ``self.output_dir``.
    """

    def __init__(self, method: DownloadMethod = DownloadMethod.PYTUBEFIX, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader and make sure the output directory exists.

        Args:
            method: Download method identifier forwarded to the base class.
            config: Optional settings; ``config['output_dir']`` overrides the
                default ``./video_storage`` target directory.
        """
        super().__init__(method, config)
        self.output_dir = Path(config.get('output_dir', './video_storage')) if config else Path('./video_storage')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """Download video using pytubefix.

        Args:
            url: URL of the video to download.
            preferences: Controls the duration limit, audio-only vs.
                video+audio download, and whether a transcript is fetched.

        Returns:
            A ``COMPLETED`` result carrying the produced paths, metadata and
            optional transcript, or a ``FAILED`` result when the video is
            longer than ``preferences.max_duration_minutes``.

        Raises:
            VideoNotAvailableError: The underlying error message mentions a
                private, unavailable or age-restricted video.
            DownloaderException: Any other failure, including the case where
                no audio or video file could be produced.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        try:
            # Imported inside the try so a missing pytubefix installation is
            # reported through the normal error path below.
            from pytubefix import YouTube  # noqa: F401

            # Run in thread pool to avoid blocking
            loop = asyncio.get_running_loop()
            yt = await loop.run_in_executor(None, self._create_youtube_object, url)

            # Get video metadata
            metadata = await self._extract_metadata(yt, video_id)

            # Check duration limits (a limit of 0 or less disables the check)
            if metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            # Download based on preferences
            video_path = None
            audio_path = None

            if preferences.prefer_audio_only or not preferences.save_video:
                # Download audio only
                audio_path = await self._download_audio(yt, video_id, loop)
            else:
                # Download video and audio separately, then merge
                video_path, audio_path = await self._download_video_and_audio(yt, video_id, preferences, loop)

            # The download helpers log and return None on failure; without
            # this check a download that produced nothing would be reported
            # as COMPLETED.
            if video_path is None and audio_path is None:
                raise DownloaderException("No audio or video file could be downloaded")

            # Get transcript if available
            transcript = None
            if preferences.enable_subtitles:
                transcript = await self._extract_transcript(yt, video_id)

            processing_time = time.time() - start_time

            # Calculate file sizes
            file_size = 0
            if audio_path and audio_path.exists():
                file_size += audio_path.stat().st_size
            if video_path and video_path.exists():
                file_size += video_path.stat().st_size

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                processing_time_seconds=processing_time,
                file_size_bytes=file_size
            )

        except Exception as e:
            self.logger.error(f"Pytubefix download failed for {video_id}: {e}")

            # Try to determine error type from the message text; chain the
            # original exception so its traceback is preserved.
            error_msg = str(e).lower()
            if "private" in error_msg or "unavailable" in error_msg:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            elif "age" in error_msg and "restricted" in error_msg:
                raise VideoNotAvailableError(f"Age-restricted video: {e}") from e
            else:
                raise DownloaderException(f"Pytubefix error: {e}") from e

    def _create_youtube_object(self, url: str):
        """Create YouTube object (runs in thread pool)."""
        from pytubefix import YouTube

        # Configure pytubefix with realistic settings
        return YouTube(
            url,
            use_oauth=False,  # OAuth can help but may be complex
            allow_oauth_cache=True
        )

    async def _extract_metadata(self, yt, video_id: str) -> VideoMetadata:
        """Extract video metadata.

        The attribute reads on ``yt`` are performed in the default executor.
        Attributes missing from the object fall back to ``None`` (or ``[]``
        for keywords).

        Args:
            yt: The pytubefix ``YouTube`` object.
            video_id: Identifier stored on the returned metadata.

        Returns:
            A ``VideoMetadata`` populated from the ``YouTube`` object.
        """
        loop = asyncio.get_running_loop()

        def _get_metadata():
            return {
                'title': getattr(yt, 'title', None),
                'description': getattr(yt, 'description', None),
                'length': getattr(yt, 'length', None),
                'views': getattr(yt, 'views', None),
                'publish_date': getattr(yt, 'publish_date', None),
                'author': getattr(yt, 'author', None),
                'thumbnail_url': getattr(yt, 'thumbnail_url', None),
                'keywords': getattr(yt, 'keywords', []),
            }

        meta = await loop.run_in_executor(None, _get_metadata)
        publish_date = meta.get('publish_date')

        return VideoMetadata(
            video_id=video_id,
            title=meta.get('title'),
            description=meta.get('description'),
            duration_seconds=meta.get('length'),
            view_count=meta.get('views'),
            upload_date=publish_date.isoformat() if publish_date else None,
            uploader=meta.get('author'),
            thumbnail_url=meta.get('thumbnail_url'),
            tags=meta.get('keywords', [])
        )

    async def _download_audio(self, yt, video_id: str, loop) -> Optional[Path]:
        """Download audio only.

        Picks the audio stream with the highest ``abr`` and stores it as
        ``<video_id>_audio.mp3``. Anything that is not already MP3 is
        transcoded with ffmpeg; when ffmpeg is unavailable the original data
        is kept under its real extension instead.

        Args:
            yt: The pytubefix ``YouTube`` object.
            video_id: Used to build the output file names.
            loop: Event loop whose default executor runs the blocking work.

        Returns:
            Path of the audio file, or ``None`` when no audio stream exists or
            the download failed (the error is logged).
        """

        def _download():
            try:
                # Get best audio stream
                audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()

                if not audio_stream:
                    self.logger.warning("No audio stream found")
                    return None

                # Download to temp location first
                temp_path = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio"
                ))

                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                if temp_path.suffix.lower() == '.mp3':
                    # Already MP3: just move the file into place
                    temp_path.replace(audio_path)
                    return audio_path

                # Any other container (mp4, webm, ...) must be transcoded;
                # renaming it to .mp3 would only mislabel the file.
                try:
                    import ffmpeg

                    (
                        ffmpeg
                        .input(str(temp_path))
                        .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                        .overwrite_output()
                        .run(quiet=True)
                    )
                except (ImportError, FileNotFoundError):
                    # ffmpeg-python or the ffmpeg executable is missing:
                    # keep the untouched download under its real extension.
                    suffix = temp_path.suffix or f".{audio_stream.subtype}"
                    fallback_path = self.output_dir / f"{video_id}_audio{suffix}"
                    temp_path.replace(fallback_path)
                    return fallback_path

                # Remove temp file
                temp_path.unlink()
                return audio_path

            except Exception as e:
                self.logger.error(f"Audio download failed: {e}")
                return None

        result = await loop.run_in_executor(None, _download)
        return Path(result) if result else None

    async def _download_video_and_audio(self, yt, video_id: str, preferences: DownloadPreferences, loop):
        """Download video and audio separately.

        Downloads the highest-resolution video-only MP4 stream and the
        highest-``abr`` MP4 audio stream, merges them with ffmpeg into
        ``<video_id>_video.mp4`` and writes ``<video_id>_audio.mp3``
        alongside. When ffmpeg is unavailable the two raw downloads are
        returned unmerged.

        Args:
            yt: The pytubefix ``YouTube`` object.
            video_id: Used to build the output file names.
            preferences: Accepted for interface parity; not read here.
            loop: Event loop whose default executor runs the blocking work.

        Returns:
            ``(video_path, audio_path)``; both are ``None`` when no suitable
            streams exist or the download failed (the error is logged).
        """

        def _download():
            try:
                # Get best video stream (no audio)
                video_stream = yt.streams.filter(
                    adaptive=True,
                    file_extension='mp4',
                    only_video=True
                ).order_by('resolution').desc().first()

                # Get best audio stream
                audio_stream = yt.streams.filter(
                    only_audio=True,
                    file_extension='mp4'
                ).order_by('abr').desc().first()

                if not video_stream or not audio_stream:
                    self.logger.warning("Could not find suitable video/audio streams")
                    return None, None

                # Download both
                video_temp = video_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_video"
                )

                audio_temp = audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio"
                )

                # Merge using ffmpeg if available
                video_path = self.output_dir / f"{video_id}_video.mp4"
                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                try:
                    import ffmpeg

                    # Merge video and audio
                    (
                        ffmpeg
                        .output(
                            ffmpeg.input(video_temp),
                            ffmpeg.input(audio_temp),
                            str(video_path),
                            vcodec='copy',
                            acodec='aac'
                        )
                        .overwrite_output()
                        .run(quiet=True)
                    )

                    # Create separate audio file
                    (
                        ffmpeg
                        .input(audio_temp)
                        .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                        .overwrite_output()
                        .run(quiet=True)
                    )
                except (ImportError, FileNotFoundError):
                    # ffmpeg-python or the ffmpeg executable is missing:
                    # keep the separate, unmerged downloads.
                    return Path(video_temp), Path(audio_temp)

                # Cleanup temp files
                Path(video_temp).unlink()
                Path(audio_temp).unlink()

                return video_path, audio_path

            except Exception as e:
                self.logger.error(f"Video+audio download failed: {e}")
                return None, None

        video_result, audio_result = await loop.run_in_executor(None, _download)
        return (Path(video_result) if video_result else None,
                Path(audio_result) if audio_result else None)

    async def _extract_transcript(self, yt, video_id: str) -> Optional[TranscriptData]:
        """Extract transcript using YouTube API.

        Fetches the English transcript through ``youtube_transcript_api`` in
        the default executor. This is best effort: any failure is logged at
        debug level and ``None`` is returned.

        Args:
            yt: The pytubefix ``YouTube`` object (not read here).
            video_id: Video whose transcript is requested.

        Returns:
            A ``TranscriptData`` with the joined text and per-snippet
            segments, or ``None`` when no transcript could be obtained.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            loop = asyncio.get_running_loop()

            def _get_transcript():
                api = YouTubeTranscriptApi()
                transcript = api.fetch(video_id, languages=['en'])

                # Convert to text
                full_text = ' '.join(snippet.text for snippet in transcript.snippets)

                # Convert segments
                segments = [
                    {
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    }
                    for snippet in transcript.snippets
                ]

                return full_text, segments, transcript.is_generated, transcript.language_code

            text, segments, is_generated, language = await loop.run_in_executor(None, _get_transcript)

            return TranscriptData(
                text=text,
                language=language,
                is_auto_generated=is_generated,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Test if pytubefix is working.

        Builds a ``YouTube`` object for a fixed test URL in the default
        executor and checks that a title can be read.

        Returns:
            ``True`` when the title is not ``None``; ``False`` when any
            exception occurs (the error is logged).
        """
        try:
            from pytubefix import YouTube

            # Test with a known working video
            test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

            loop = asyncio.get_running_loop()

            def _test():
                yt = YouTube(test_url)
                return yt.title is not None

            result = await loop.run_in_executor(None, _test)
            return result

        except Exception as e:
            self.logger.error(f"Pytubefix connection test failed: {e}")
            return False

    def supports_audio_only(self) -> bool:
        """Return True: audio-only downloads are supported."""
        return True

    def supports_quality_selection(self) -> bool:
        """Return True: quality selection is reported as supported."""
        return True

    def get_supported_formats(self) -> list[str]:
        """Return the file formats this downloader reports as supported."""
        return ["mp4", "mp3", "webm"]


# Register the downloader
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.PYTUBEFIX, PytubefixDownloader)