# youtube-summarizer/backend/services/video_downloaders/pytubefix_downloader.py

"""
Pytubefix-based video downloader
"""
import asyncio
import time
from pathlib import Path
from typing import Optional, Dict, Any
import logging

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    VideoQuality,
    DownloaderException,
    VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader

logger = logging.getLogger(__name__)


class PytubefixDownloader(BaseVideoDownloader):
    """Video downloader backed by the ``pytubefix`` library.

    Blocking pytubefix / ffmpeg / transcript calls are dispatched to the event
    loop's default executor so coroutines in this class do not block the loop.
    Media files are written below ``self.output_dir``.

    ``self.logger``, ``self.method``, ``self.create_result`` and
    ``self.extract_video_id`` are not defined here; they are expected to be
    provided by ``BaseVideoDownloader``.
    """

    def __init__(self, method: DownloadMethod = DownloadMethod.PYTUBEFIX, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader and make sure the output directory exists.

        Args:
            method: Download method identifier forwarded to the base class.
            config: Optional settings; only ``'output_dir'`` is read here
                (defaults to ``./video_storage``).
        """
        super().__init__(method, config)
        self.output_dir = Path((config or {}).get('output_dir', './video_storage'))
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """Download the media (and optionally the transcript) for ``url``.

        Args:
            url: YouTube video URL.
            preferences: Controls the duration limit, audio-only vs. video
                download and whether a transcript is fetched.

        Returns:
            A ``COMPLETED`` result with the paths, metadata and transcript that
            could be obtained, or a ``FAILED`` result when the video exceeds
            ``preferences.max_duration_minutes``.

        Raises:
            VideoNotAvailableError: The underlying error message indicates a
                private, unavailable or age-restricted video.
            DownloaderException: Any other failure, including the case where
                neither media nor a transcript could be retrieved.
        """
        # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
        start_time = time.monotonic()
        video_id = await self.extract_video_id(url)

        try:
            # Imported inside the try so that a missing pytubefix installation
            # is reported through the exception translation below.
            from pytubefix import YouTube  # noqa: F401

            # Run in thread pool to avoid blocking
            loop = asyncio.get_running_loop()
            yt = await loop.run_in_executor(None, self._create_youtube_object, url)

            # Get video metadata
            metadata = await self._extract_metadata(yt, video_id)

            # Check duration limits (a limit of 0 or less disables the check)
            if metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            # Download based on preferences
            video_path = None
            audio_path = None

            if preferences.prefer_audio_only or not preferences.save_video:
                # Download audio only
                audio_path = await self._download_audio(yt, video_id, loop)
            else:
                # Download video and audio separately, then merge
                video_path, audio_path = await self._download_video_and_audio(yt, video_id, preferences, loop)

            # Get transcript if available
            transcript = None
            if preferences.enable_subtitles:
                transcript = await self._extract_transcript(yt, video_id)

            # The helpers above log and return None on failure.  If every one of
            # them came back empty there is nothing to report as "completed".
            if video_path is None and audio_path is None and transcript is None:
                raise DownloaderException("No media or transcript could be retrieved")

            processing_time = time.monotonic() - start_time

            # Calculate file sizes
            file_size = 0
            if audio_path and audio_path.exists():
                file_size += audio_path.stat().st_size
            if video_path and video_path.exists():
                file_size += video_path.stat().st_size

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                processing_time_seconds=processing_time,
                file_size_bytes=file_size
            )

        except Exception as e:
            self.logger.error(f"Pytubefix download failed for {video_id}: {e}")

            # Classify by message text; `from e` keeps the original traceback
            # attached to the translated exception.
            error_msg = str(e).lower()
            if "private" in error_msg or "unavailable" in error_msg:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            if "age" in error_msg and "restricted" in error_msg:
                raise VideoNotAvailableError(f"Age-restricted video: {e}") from e
            raise DownloaderException(f"Pytubefix error: {e}") from e

    def _create_youtube_object(self, url: str):
        """Create the pytubefix ``YouTube`` object (runs in thread pool)."""
        from pytubefix import YouTube

        # Configure pytubefix with realistic settings
        return YouTube(
            url,
            use_oauth=False,  # OAuth can help but may be complex
            allow_oauth_cache=True
        )

    def _discard_temp_files(self, *paths) -> None:
        """Best-effort removal of temporary files; ``None`` entries are skipped."""
        for path in paths:
            if path is None:
                continue
            try:
                Path(path).unlink(missing_ok=True)
            except OSError as e:
                # Cleanup must never mask the result of the actual download.
                self.logger.debug(f"Could not remove temp file {path}: {e}")

    async def _extract_metadata(self, yt, video_id: str) -> VideoMetadata:
        """Read the metadata attributes of ``yt`` and build a ``VideoMetadata``.

        Attributes that do not exist on ``yt`` are reported as ``None``
        (``keywords`` as an empty list).  Other errors raised while reading an
        attribute propagate to the caller.
        """
        loop = asyncio.get_running_loop()

        def _get_metadata():
            # Read in the executor: presumably pytubefix resolves these
            # attributes lazily and may block while doing so.
            return {
                'title': getattr(yt, 'title', None),
                'description': getattr(yt, 'description', None),
                'length': getattr(yt, 'length', None),
                'views': getattr(yt, 'views', None),
                'publish_date': getattr(yt, 'publish_date', None),
                'author': getattr(yt, 'author', None),
                'thumbnail_url': getattr(yt, 'thumbnail_url', None),
                'keywords': getattr(yt, 'keywords', []),
            }

        meta = await loop.run_in_executor(None, _get_metadata)
        publish_date = meta['publish_date']

        return VideoMetadata(
            video_id=video_id,
            title=meta['title'],
            description=meta['description'],
            duration_seconds=meta['length'],
            view_count=meta['views'],
            upload_date=publish_date.isoformat() if publish_date else None,
            uploader=meta['author'],
            thumbnail_url=meta['thumbnail_url'],
            # The attribute may exist but be None; always hand over a list.
            tags=meta['keywords'] or []
        )

    async def _download_audio(self, yt, video_id: str, loop) -> Optional[Path]:
        """Download the highest-bitrate audio stream and convert it to MP3.

        Args:
            yt: pytubefix ``YouTube`` object.
            video_id: Used to build the output file names.
            loop: Event loop whose default executor runs the blocking work.

        Returns:
            Path of ``<video_id>_audio.mp3`` when the ``ffmpeg`` Python module
            is importable, otherwise the downloaded file renamed to
            ``<video_id>_audio.<container>``.  ``None`` when no audio stream
            exists or any step fails (the error is logged).
        """
        def _download() -> Optional[Path]:
            temp_path: Optional[Path] = None
            try:
                # Get best audio stream
                audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()

                if not audio_stream:
                    self.logger.warning("No audio stream found")
                    return None

                # The given filename is used as-is, so the container extension
                # has to be supplied here; without it the temp file has no
                # suffix and the format cannot be recognised afterwards.
                container = getattr(audio_stream, 'subtype', None) or 'mp4'
                temp_path = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.{container}"
                ))

                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                if temp_path.suffix.lower() == '.mp3':
                    # Already MP3: just move the file
                    temp_path.replace(audio_path)
                    return audio_path

                try:
                    import ffmpeg
                except ImportError:
                    # Without ffmpeg keep the native container and an honest
                    # extension instead of mislabelling the file as .mp3.
                    fallback_path = audio_path.with_suffix(temp_path.suffix)
                    temp_path.replace(fallback_path)
                    return fallback_path

                (
                    ffmpeg
                    .input(str(temp_path))
                    .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                    .overwrite_output()
                    .run(quiet=True)
                )
                return audio_path

            except Exception as e:
                self.logger.error(f"Audio download failed: {e}")
                return None
            finally:
                # Runs on success and on failure so the temp file never leaks;
                # a no-op when the file was already moved to its final name.
                self._discard_temp_files(temp_path)

        result = await loop.run_in_executor(None, _download)
        return Path(result) if result else None

    async def _download_video_and_audio(self, yt, video_id: str, preferences: DownloadPreferences, loop):
        """Download the best MP4 video-only and audio-only streams and merge them.

        NOTE(review): ``preferences`` is accepted but not consulted; the
        highest available resolution is always selected.

        Returns:
            ``(video_path, audio_path)``.  With the ``ffmpeg`` Python module
            available these are the merged ``<video_id>_video.mp4`` and the
            extracted ``<video_id>_audio.mp3``; without it they are the two raw
            downloads.  ``(None, None)`` when a stream is missing or any step
            fails (the error is logged).
        """
        def _download():
            video_temp = None
            audio_temp = None
            keep_temps = False
            try:
                # Get best video stream (no audio)
                video_stream = yt.streams.filter(
                    adaptive=True,
                    file_extension='mp4',
                    only_video=True
                ).order_by('resolution').desc().first()

                # Get best audio stream
                audio_stream = yt.streams.filter(
                    only_audio=True,
                    file_extension='mp4'
                ).order_by('abr').desc().first()

                if not video_stream or not audio_stream:
                    self.logger.warning("Could not find suitable video/audio streams")
                    return None, None

                # Both streams were filtered to mp4; give the temp files that
                # extension so the no-ffmpeg fallback returns playable names.
                video_temp = video_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_video.mp4"
                )

                audio_temp = audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.mp4"
                )

                try:
                    import ffmpeg
                except ImportError:
                    # If no ffmpeg, the raw downloads are the result: keep them.
                    keep_temps = True
                    return Path(video_temp), Path(audio_temp)

                video_path = self.output_dir / f"{video_id}_video.mp4"
                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                # Merge video and audio
                (
                    ffmpeg
                    .output(
                        ffmpeg.input(video_temp),
                        ffmpeg.input(audio_temp),
                        str(video_path),
                        vcodec='copy',
                        acodec='aac'
                    )
                    .overwrite_output()
                    .run(quiet=True)
                )

                # Create separate audio file
                (
                    ffmpeg
                    .input(audio_temp)
                    .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                    .overwrite_output()
                    .run(quiet=True)
                )

                return video_path, audio_path

            except Exception as e:
                self.logger.error(f"Video+audio download failed: {e}")
                return None, None
            finally:
                # Remove the temp downloads on success and on failure alike,
                # except when they are themselves the returned files.
                if not keep_temps:
                    self._discard_temp_files(video_temp, audio_temp)

        video_result, audio_result = await loop.run_in_executor(None, _download)
        return (Path(video_result) if video_result else None,
                Path(audio_result) if audio_result else None)

    async def _extract_transcript(self, yt, video_id: str) -> Optional[TranscriptData]:
        """Fetch the English transcript via ``youtube_transcript_api``.

        ``yt`` is not used; the lookup is done by ``video_id``.  This is
        best-effort: any failure (including the package not being installed)
        is logged at debug level and ``None`` is returned.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            loop = asyncio.get_running_loop()

            def _get_transcript():
                api = YouTubeTranscriptApi()
                transcript = api.fetch(video_id, languages=['en'])

                # Convert segments
                segments = [
                    {
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    }
                    for snippet in transcript.snippets
                ]

                # Convert to text
                full_text = ' '.join(segment['text'] for segment in segments)

                return full_text, segments, transcript.is_generated, transcript.language_code

            text, segments, is_generated, language = await loop.run_in_executor(None, _get_transcript)

            return TranscriptData(
                text=text,
                language=language,
                is_auto_generated=is_generated,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Return True when pytubefix can read the title of a fixed test video.

        Any exception (missing package, network or extraction failure) is
        logged and reported as ``False``.
        """
        try:
            from pytubefix import YouTube

            # Test with a known working video
            test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

            loop = asyncio.get_running_loop()

            def _test():
                yt = YouTube(test_url)
                return yt.title is not None

            return await loop.run_in_executor(None, _test)

        except Exception as e:
            self.logger.error(f"Pytubefix connection test failed: {e}")
            return False

    def supports_audio_only(self) -> bool:
        """Audio-only downloads are supported."""
        return True

    def supports_quality_selection(self) -> bool:
        """Report quality selection as supported."""
        return True

    def get_supported_formats(self) -> list[str]:
        """Return the container/audio formats this downloader reports."""
        return ["mp4", "mp3", "webm"]


# Register the downloader.
# This runs as an import-time side effect: importing this module registers
# PytubefixDownloader under DownloadMethod.PYTUBEFIX with DownloaderFactory.
# The import sits at the bottom of the file, next to its only use.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.PYTUBEFIX, PytubefixDownloader)