# youtube-summarizer/backend/services/video_downloaders/transcript_downloader.py

"""
Transcript-only downloader using YouTube Data API and transcript API
"""
import asyncio
import time
from typing import Optional, Dict, Any
import logging

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    DownloaderException,
    VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader

logger = logging.getLogger(__name__)


class TranscriptOnlyDownloader(BaseVideoDownloader):
    """Transcript-only downloader using APIs - always works as fallback"""

    def __init__(self, method: DownloadMethod = DownloadMethod.TRANSCRIPT_ONLY, config: Optional[Dict[str, Any]] = None):
        """Set up the downloader and, if an API key is configured, the YouTube Data API client.

        Args:
            method: Download method forwarded to the base class initializer.
            config: Optional settings forwarded to the base class; only the
                ``youtube_api_key`` entry is read here.
        """
        super().__init__(method, config)
        self.youtube_service = None
        self.youtube_api_key = None if not config else config.get('youtube_api_key')

        # Without a key there is nothing to build; metadata lookups stay disabled.
        if not self.youtube_api_key:
            return

        try:
            # Imported lazily so a missing google client library only disables
            # the metadata lookup instead of breaking the whole module.
            from googleapiclient.discovery import build
            self.youtube_service = build('youtube', 'v3', developerKey=self.youtube_api_key)
        except Exception as e:
            logger.warning(f"Failed to initialize YouTube API service: {e}")

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """'Download' video by extracting transcript and metadata only.

        No media files are fetched, so a successful result is always reported
        with ``DownloadStatus.PARTIAL`` and ``video_path``/``audio_path`` unset.

        Args:
            url: URL of the video; its ID is resolved via ``extract_video_id``.
            preferences: Download preferences. Only ``max_duration_minutes``
                is read here; a value <= 0 disables the duration limit.

        Returns:
            A PARTIAL ``VideoDownloadResult`` carrying whatever transcript and
            metadata could be obtained, or the FAILED result built by
            ``create_result`` when the video exceeds the maximum duration.

        Raises:
            VideoNotAvailableError: Neither transcript nor metadata could be
                obtained, or an underlying error mentions "not available" or
                "private".
            DownloaderException: Any other failure during extraction.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        try:
            # Get metadata from YouTube API if available
            metadata = None
            if self.youtube_service:
                metadata = await self._get_metadata_from_api(video_id)

            # Enforce the duration limit before fetching the transcript: the
            # outcome only depends on metadata, so an over-long video should
            # not cost an extra network round trip.
            if metadata and metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            # Always try to get transcript
            transcript = await self._get_transcript(video_id)

            if not transcript and not metadata:
                raise VideoNotAvailableError("Could not extract transcript or metadata")

            processing_time = time.time() - start_time

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.PARTIAL,  # Partial because no video/audio files
                method=self.method,
                video_path=None,
                audio_path=None,
                transcript=transcript,
                metadata=metadata or VideoMetadata(video_id=video_id),
                processing_time_seconds=processing_time,
                is_partial=True
            )

        except VideoNotAvailableError as e:
            # Already the most specific error type; re-raise untouched so the
            # generic handler below does not downgrade it to DownloaderException.
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")
            raise

        except Exception as e:
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")

            # Classify by message text; keep the original exception as the cause.
            error_str = str(e).lower()
            if "not available" in error_str or "private" in error_str:
                raise VideoNotAvailableError(f"Video/transcript not available: {e}") from e
            raise DownloaderException(f"Transcript extraction error: {e}") from e

    async def _get_metadata_from_api(self, video_id: str) -> Optional[VideoMetadata]:
        """Get metadata using YouTube Data API v3"""
        if not self.youtube_service:
            return None

        try:
            loop = asyncio.get_event_loop()

            def _fetch_metadata():
                response = self.youtube_service.videos().list(
                    part='snippet,contentDetails,statistics,status',
                    id=video_id
                ).execute()

                if not response.get('items'):
                    return None

                item = response['items'][0]
                snippet = item.get('snippet', {})
                content_details = item.get('contentDetails', {})
                statistics = item.get('statistics', {})
                status = item.get('status', {})

                # Parse duration (PT4M13S format)
                duration_seconds = self._parse_duration(content_details.get('duration'))

                return {
                    'title': snippet.get('title'),
                    'description': snippet.get('description'),
                    'duration_seconds': duration_seconds,
                    'view_count': int(statistics.get('viewCount', 0)) if statistics.get('viewCount') else None,
                    'upload_date': snippet.get('publishedAt'),
                    'uploader': snippet.get('channelTitle'),
                    'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url'),
                    'tags': snippet.get('tags', []),
                    'language': snippet.get('defaultLanguage', 'en'),
                    'availability': status.get('privacyStatus'),
                    'age_restricted': content_details.get('contentRating', {}).get('ytRating') == 'ytAgeRestricted'
                }

            metadata_dict = await loop.run_in_executor(None, _fetch_metadata)

            if not metadata_dict:
                return None

            return VideoMetadata(
                video_id=video_id,
                **metadata_dict
            )

        except Exception as e:
            self.logger.warning(f"YouTube API metadata fetch failed: {e}")
            return None

    async def _get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get an English transcript using youtube-transcript-api.

        The blocking library call runs via the event loop's
        ``run_in_executor``. The language codes 'en', 'en-US' and 'en-GB' are
        tried in that order and the first one that succeeds is used.

        Args:
            video_id: ID of the video whose transcript is requested.

        Returns:
            A ``TranscriptData`` with the joined text and per-snippet
            segments, or ``None`` when no transcript could be fetched, the
            transcript text is empty, or the library is unavailable.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            # Inside a coroutine the running loop is the one to use.
            loop = asyncio.get_running_loop()

            def _fetch_transcript():
                api = YouTubeTranscriptApi()

                # Try multiple language preferences
                languages = ['en', 'en-US', 'en-GB']

                for language in languages:
                    try:
                        transcript = api.fetch(video_id, languages=[language])

                        # Convert segments
                        segments = [
                            {
                                'text': snippet.text,
                                'start': snippet.start,
                                'duration': snippet.duration
                            }
                            for snippet in transcript.snippets
                        ]

                        # Convert to text
                        full_text = ' '.join(segment['text'] for segment in segments)

                        return full_text, segments, transcript.is_generated, transcript.language_code
                    except Exception as e:
                        # Best effort: a failure for one language must not stop
                        # the others from being tried. Catch Exception (not a
                        # bare except) so KeyboardInterrupt/SystemExit still
                        # propagate, and leave a trace of what went wrong.
                        self.logger.debug(
                            f"Transcript fetch failed for {video_id} (language {language}): {e}"
                        )
                        continue

                return None, None, None, None

            text, segments, is_generated, language = await loop.run_in_executor(None, _fetch_transcript)

            if not text:
                return None

            return TranscriptData(
                text=text,
                language=language or 'en',
                is_auto_generated=is_generated or False,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    def _parse_duration(self, duration_str: str) -> Optional[int]:
        """Parse YouTube duration format (PT4M13S) to seconds"""
        if not duration_str:
            return None

        try:
            import re

            # Parse PT4M13S format
            pattern = r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?'
            match = re.match(pattern, duration_str)

            if not match:
                return None

            hours = int(match.group(1) or 0)
            minutes = int(match.group(2) or 0)
            seconds = int(match.group(3) or 0)

            return hours * 3600 + minutes * 60 + seconds

        except Exception as e:
            self.logger.warning(f"Duration parsing failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Check that the transcript API can be reached.

        Fetches the English transcript of a fixed, well-known video and
        reports whether at least one snippet came back.

        Returns:
            ``True`` when the probe transcript contains snippets, ``False``
            when it is empty or any error occurs (logged as an error).
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            # Probe with a known working video
            probe_video_id = "dQw4w9WgXcQ"

            def _probe() -> bool:
                fetched = YouTubeTranscriptApi().fetch(probe_video_id, languages=['en'])
                return len(fetched.snippets) > 0

            # The library call blocks, so hand it to the loop's executor.
            return await asyncio.get_event_loop().run_in_executor(None, _probe)

        except Exception as exc:
            self.logger.error(f"Transcript API test failed: {exc}")
            return False

    async def get_video_metadata(self, video_id: str) -> Optional[VideoMetadata]:
        """Get video metadata.

        Public wrapper that delegates to ``_get_metadata_from_api``.

        Args:
            video_id: ID of the video to look up.

        Returns:
            Whatever ``_get_metadata_from_api`` returns: a ``VideoMetadata``,
            or ``None`` when no API service is configured or the lookup fails.
        """
        return await self._get_metadata_from_api(video_id)

    async def get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get video transcript.

        Public wrapper that delegates to ``_get_transcript``.

        Args:
            video_id: ID of the video whose transcript is requested.

        Returns:
            Whatever ``_get_transcript`` returns: a ``TranscriptData``, or
            ``None`` when no transcript could be fetched.
        """
        return await self._get_transcript(video_id)

    def supports_audio_only(self) -> bool:
        """Report whether audio-only download is supported; always ``False`` here."""
        return False  # No audio download, transcript only

    def supports_quality_selection(self) -> bool:
        """Report whether a video quality can be selected; always ``False`` here."""
        return False  # No video download

    def get_supported_formats(self) -> list[str]:
        """Return the output formats of this downloader: only ``"transcript"``."""
        return ["transcript"]  # Only text output


# Register the downloader with DownloaderFactory under DownloadMethod.TRANSCRIPT_ONLY.
# This runs at import time, so importing this module is what performs the registration.
# NOTE(review): DownloaderFactory lives in the same module as BaseVideoDownloader,
# which is already imported at the top of the file; the late import looks
# unnecessary - confirm before moving it.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.TRANSCRIPT_ONLY, TranscriptOnlyDownloader)