"""
Transcript-only downloader using YouTube Data API and transcript API
"""

import asyncio
import re
import time
from typing import Optional, Dict, Any
import logging

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    DownloaderException,
    VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader

logger = logging.getLogger(__name__)

# ISO 8601 duration as returned in ``contentDetails.duration``.  Short videos
# look like ``PT4M13S``; videos of a day or longer use ``P#DT#H#M#S``, so the
# date part (weeks/days) has to be accepted as well as the time part.
_ISO8601_DURATION = re.compile(
    r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)

# Transcript languages to try, in order of preference.
_TRANSCRIPT_LANGUAGES = ('en', 'en-US', 'en-GB')


class TranscriptOnlyDownloader(BaseVideoDownloader):
    """Transcript-only downloader using APIs - always works as fallback

    No video or audio file is ever produced.  A "download" consists of the
    transcript (via ``youtube-transcript-api``) plus, when a YouTube Data API
    key was configured and the client could be built, the video metadata.
    """

    def __init__(self, method: DownloadMethod = DownloadMethod.TRANSCRIPT_ONLY,
                 config: Optional[Dict[str, Any]] = None):
        """Set up the downloader and, if possible, the YouTube Data API client.

        Args:
            method: Download method passed through to the base class.
            config: Optional settings; only ``'youtube_api_key'`` is read here.
        """
        super().__init__(method, config)
        self.youtube_api_key = config.get('youtube_api_key') if config else None
        self.youtube_service = None

        if self.youtube_api_key:
            # The Google client is optional: a missing package or a failed
            # build only disables metadata lookups, it must not break
            # construction of the downloader.
            try:
                from googleapiclient.discovery import build
                self.youtube_service = build('youtube', 'v3', developerKey=self.youtube_api_key)
            except Exception as e:
                logger.warning(f"Failed to initialize YouTube API service: {e}")

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """'Download' video by extracting transcript and metadata only

        Args:
            url: Video URL; the id is obtained through ``extract_video_id``.
            preferences: Only ``max_duration_minutes`` is consulted here.

        Returns:
            A ``PARTIAL`` result carrying transcript and/or metadata, or the
            ``FAILED`` result built by ``create_result`` when the metadata
            reports a duration above the configured limit.

        Raises:
            VideoNotAvailableError: Neither transcript nor metadata could be
                obtained, or the underlying error text mentions
                "not available" / "private".
            DownloaderException: Any other failure during extraction.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        try:
            # Get metadata from YouTube API if available
            metadata = None
            if self.youtube_service:
                metadata = await self._get_metadata_from_api(video_id)

            # Always try to get transcript
            transcript = await self._get_transcript(video_id)

            if not transcript and not metadata:
                raise VideoNotAvailableError("Could not extract transcript or metadata")

            # If we have metadata, check duration limits
            if metadata and metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            processing_time = time.time() - start_time

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.PARTIAL,  # Partial because no video/audio files
                method=self.method,
                video_path=None,
                audio_path=None,
                transcript=transcript,
                metadata=metadata or VideoMetadata(video_id=video_id),
                processing_time_seconds=processing_time,
                is_partial=True
            )

        except VideoNotAvailableError as e:
            # Already the most specific error type: log it and let it through
            # unchanged instead of re-wrapping it as a generic
            # DownloaderException in the handler below.
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")

            error_str = str(e).lower()
            if "not available" in error_str or "private" in error_str:
                raise VideoNotAvailableError(f"Video/transcript not available: {e}") from e
            else:
                raise DownloaderException(f"Transcript extraction error: {e}") from e

    async def _get_metadata_from_api(self, video_id: str) -> Optional[VideoMetadata]:
        """Get metadata using YouTube Data API v3

        The blocking client call runs in the default executor.

        Returns:
            A ``VideoMetadata`` instance, or ``None`` when no API client is
            configured, the API returns no items, or the request fails (the
            failure is logged as a warning, never raised).
        """
        if not self.youtube_service:
            return None

        try:
            loop = asyncio.get_running_loop()

            def _fetch_metadata():
                response = self.youtube_service.videos().list(
                    part='snippet,contentDetails,statistics,status',
                    id=video_id
                ).execute()

                if not response.get('items'):
                    return None

                item = response['items'][0]
                snippet = item.get('snippet', {})
                content_details = item.get('contentDetails', {})
                statistics = item.get('statistics', {})
                status = item.get('status', {})

                # Parse duration (PT4M13S format)
                duration_seconds = self._parse_duration(content_details.get('duration'))

                # A missing or empty view count is reported as None.
                raw_view_count = statistics.get('viewCount')

                return {
                    'title': snippet.get('title'),
                    'description': snippet.get('description'),
                    'duration_seconds': duration_seconds,
                    'view_count': int(raw_view_count) if raw_view_count else None,
                    'upload_date': snippet.get('publishedAt'),
                    'uploader': snippet.get('channelTitle'),
                    'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url'),
                    'tags': snippet.get('tags', []),
                    'language': snippet.get('defaultLanguage', 'en'),
                    'availability': status.get('privacyStatus'),
                    'age_restricted': content_details.get('contentRating', {}).get('ytRating') == 'ytAgeRestricted'
                }

            metadata_dict = await loop.run_in_executor(None, _fetch_metadata)

            if not metadata_dict:
                return None

            return VideoMetadata(
                video_id=video_id,
                **metadata_dict
            )

        except Exception as e:
            self.logger.warning(f"YouTube API metadata fetch failed: {e}")
            return None

    async def _get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get transcript using youtube-transcript-api

        Tries 'en', 'en-US' and 'en-GB' in that order and uses the first
        language for which a fetch succeeds.  The blocking fetch runs in the
        default executor.

        Returns:
            A ``TranscriptData`` instance, or ``None`` when no language could
            be fetched, the transcript text is empty, or anything else fails
            (failures are logged at debug level, never raised).
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            loop = asyncio.get_running_loop()

            def _fetch_transcript():
                api = YouTubeTranscriptApi()
                last_error = None

                # Try multiple language preferences
                for language in _TRANSCRIPT_LANGUAGES:
                    try:
                        transcript = api.fetch(video_id, languages=[language])

                        # Convert to text
                        full_text = ' '.join(snippet.text for snippet in transcript.snippets)

                        # Convert segments
                        segments = [
                            {
                                'text': snippet.text,
                                'start': snippet.start,
                                'duration': snippet.duration
                            }
                            for snippet in transcript.snippets
                        ]

                        return full_text, segments, transcript.is_generated, transcript.language_code

                    except Exception as exc:
                        # Best effort: fall through to the next language.
                        # ``Exception`` rather than a bare ``except`` so that
                        # SystemExit/KeyboardInterrupt are not swallowed.
                        last_error = exc
                        continue

                if last_error is not None:
                    self.logger.debug(
                        f"No transcript found for {video_id} in {list(_TRANSCRIPT_LANGUAGES)}: {last_error}"
                    )
                return None, None, None, None

            text, segments, is_generated, language = await loop.run_in_executor(None, _fetch_transcript)

            if not text:
                return None

            return TranscriptData(
                text=text,
                language=language or 'en',
                is_auto_generated=is_generated or False,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    def _parse_duration(self, duration_str: str) -> Optional[int]:
        """Parse YouTube duration format (PT4M13S) to seconds

        Also accepts the ``P#DT#H#M#S`` form used for videos of a day or
        longer (and a weeks component).

        Args:
            duration_str: ISO 8601 duration string; may be ``None`` or empty.

        Returns:
            Total number of seconds, or ``None`` when the value is missing or
            is not a well-formed duration.
        """
        if not duration_str:
            return None

        try:
            # fullmatch so that malformed input is rejected instead of being
            # silently read as a zero-length duration.
            match = _ISO8601_DURATION.fullmatch(duration_str)
            if not match:
                return None

            weeks, days, hours, minutes, seconds = (
                int(value or 0) for value in match.groups()
            )

            return (
                weeks * 604800
                + days * 86400
                + hours * 3600
                + minutes * 60
                + seconds
            )

        except Exception as e:
            # e.g. a non-string value coming back from the API
            self.logger.warning(f"Duration parsing failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Test if transcript API is working

        Fetches the English transcript of a fixed, well-known video id.

        Returns:
            ``True`` when at least one snippet came back, ``False`` on an
            empty transcript or any error (which is logged).
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            # Test with a known working video
            test_video_id = "dQw4w9WgXcQ"

            loop = asyncio.get_running_loop()

            def _test():
                api = YouTubeTranscriptApi()
                transcript = api.fetch(test_video_id, languages=['en'])
                return len(transcript.snippets) > 0

            result = await loop.run_in_executor(None, _test)
            return result

        except Exception as e:
            self.logger.error(f"Transcript API test failed: {e}")
            return False

    async def get_video_metadata(self, video_id: str) -> Optional[VideoMetadata]:
        """Get video metadata"""
        return await self._get_metadata_from_api(video_id)

    async def get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get video transcript"""
        return await self._get_transcript(video_id)

    def supports_audio_only(self) -> bool:
        """Return False: no audio is downloaded, transcript only."""
        return False  # No audio download, transcript only

    def supports_quality_selection(self) -> bool:
        """Return False: no video is downloaded, so there is no quality to pick."""
        return False  # No video download

    def get_supported_formats(self) -> list[str]:
        """Return the single supported output format, ``"transcript"``."""
        return ["transcript"]  # Only text output


# Register the downloader
# NOTE(review): this import is kept at the bottom exactly as in the original;
# presumably it is placed late on purpose - confirm before moving it up.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.TRANSCRIPT_ONLY, TranscriptOnlyDownloader)