# youtube-summarizer/backend/services/video_downloaders/transcript_downloader.py
# (file-viewer header of the original listing: 262 lines, 10 KiB, Python)

"""
Transcript-only downloader using YouTube Data API and transcript API
"""
import asyncio
import time
from typing import Optional, Dict, Any
import logging
from backend.models.video_download import (
VideoDownloadResult,
DownloadPreferences,
DownloadMethod,
DownloadStatus,
VideoMetadata,
TranscriptData,
DownloaderException,
VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader
logger = logging.getLogger(__name__)
class TranscriptOnlyDownloader(BaseVideoDownloader):
    """Transcript-only downloader using APIs, intended as the fallback method.

    No video or audio is downloaded. A "download" consists of:

    * optional metadata from the YouTube Data API v3 (only when a
      ``youtube_api_key`` is present in ``config`` and the client could be
      built), and
    * a transcript fetched through ``youtube-transcript-api``.
    """

    # Transcript languages tried, in priority order.
    _TRANSCRIPT_LANGUAGES = ('en', 'en-US', 'en-GB')

    def __init__(self, method: DownloadMethod = DownloadMethod.TRANSCRIPT_ONLY, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader and, if possible, the YouTube API client.

        Args:
            method: Download method this instance reports in its results.
            config: Optional settings; only ``'youtube_api_key'`` is read here.
        """
        super().__init__(method, config)
        self.youtube_api_key = config.get('youtube_api_key') if config else None
        self.youtube_service = None
        if self.youtube_api_key:
            try:
                # Imported lazily so the Google client library is only
                # required when an API key is actually configured.
                from googleapiclient.discovery import build
                self.youtube_service = build('youtube', 'v3', developerKey=self.youtube_api_key)
            except Exception as e:
                # Best effort: without the service we still serve transcripts.
                logger.warning(f"Failed to initialize YouTube API service: {e}")

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """'Download' a video by extracting its transcript and metadata only.

        Args:
            url: Video URL; the video id is obtained via ``extract_video_id``.
            preferences: Only ``max_duration_minutes`` is consulted here
                (values ``<= 0`` disable the duration check).

        Returns:
            A ``PARTIAL`` result carrying transcript and metadata, or the
            ``FAILED`` result built by ``create_result`` when the video is
            longer than the configured limit.

        Raises:
            VideoNotAvailableError: Neither transcript nor metadata could be
                obtained, or an error message mentions "not available" /
                "private".
            DownloaderException: Any other failure during extraction.
        """
        start_time = time.time()
        # Outside the try block on purpose: id-extraction errors propagate as-is.
        video_id = await self.extract_video_id(url)
        try:
            # Get metadata from YouTube API if available
            metadata = None
            if self.youtube_service:
                metadata = await self._get_metadata_from_api(video_id)

            # Always try to get transcript
            transcript = await self._get_transcript(video_id)
            if not transcript and not metadata:
                raise VideoNotAvailableError("Could not extract transcript or metadata")

            # If we have metadata, check duration limits
            if metadata and metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            processing_time = time.time() - start_time
            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.PARTIAL,  # Partial because no video/audio files
                method=self.method,
                video_path=None,
                audio_path=None,
                transcript=transcript,
                metadata=metadata or VideoMetadata(video_id=video_id),
                processing_time_seconds=processing_time,
                is_partial=True
            )
        except VideoNotAvailableError as e:
            # Already the precise error type: re-raise unchanged instead of
            # letting the generic handler below re-wrap it as a plain
            # DownloaderException (its message matches neither keyword).
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")
            raise
        except Exception as e:
            self.logger.error(f"Transcript-only download failed for {video_id}: {e}")
            error_str = str(e).lower()
            if "not available" in error_str or "private" in error_str:
                raise VideoNotAvailableError(f"Video/transcript not available: {e}") from e
            raise DownloaderException(f"Transcript extraction error: {e}") from e

    async def _get_metadata_from_api(self, video_id: str) -> Optional[VideoMetadata]:
        """Get metadata using YouTube Data API v3.

        Returns ``None`` when no API service is configured, when the API
        returns no items for ``video_id``, or when the request fails (the
        failure is logged as a warning, never raised).
        """
        if not self.youtube_service:
            return None
        try:
            loop = asyncio.get_running_loop()

            def _fetch_metadata() -> Optional[Dict[str, Any]]:
                # Blocking HTTP call; executed in the default executor below.
                response = self.youtube_service.videos().list(
                    part='snippet,contentDetails,statistics,status',
                    id=video_id
                ).execute()
                if not response.get('items'):
                    return None
                item = response['items'][0]
                snippet = item.get('snippet', {})
                content_details = item.get('contentDetails', {})
                statistics = item.get('statistics', {})
                status = item.get('status', {})
                view_count = statistics.get('viewCount')
                return {
                    'title': snippet.get('title'),
                    'description': snippet.get('description'),
                    # ISO 8601 duration (e.g. PT4M13S) converted to seconds
                    'duration_seconds': self._parse_duration(content_details.get('duration')),
                    'view_count': int(view_count) if view_count else None,
                    'upload_date': snippet.get('publishedAt'),
                    'uploader': snippet.get('channelTitle'),
                    'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url'),
                    'tags': snippet.get('tags', []),
                    'language': snippet.get('defaultLanguage', 'en'),
                    'availability': status.get('privacyStatus'),
                    'age_restricted': content_details.get('contentRating', {}).get('ytRating') == 'ytAgeRestricted'
                }

            metadata_dict = await loop.run_in_executor(None, _fetch_metadata)
            if not metadata_dict:
                return None
            return VideoMetadata(
                video_id=video_id,
                **metadata_dict
            )
        except Exception as e:
            self.logger.warning(f"YouTube API metadata fetch failed: {e}")
            return None

    async def _get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get transcript using youtube-transcript-api.

        Each language in ``_TRANSCRIPT_LANGUAGES`` is tried in order and the
        first one that can be fetched wins. Returns ``None`` when no language
        yields a non-empty transcript or when anything else fails (including
        the library not being importable); failures are logged at debug level.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
            loop = asyncio.get_running_loop()

            def _fetch_transcript():
                # Blocking network calls; executed in the default executor below.
                api = YouTubeTranscriptApi()
                for language in self._TRANSCRIPT_LANGUAGES:
                    try:
                        transcript = api.fetch(video_id, languages=[language])
                    except Exception as exc:
                        # Expected when this language has no transcript; move
                        # on to the next one. Unlike a bare ``except`` this
                        # lets KeyboardInterrupt/SystemExit through.
                        self.logger.debug(
                            f"No '{language}' transcript for {video_id}: {exc}"
                        )
                        continue
                    snippets = transcript.snippets
                    # Convert to text
                    full_text = ' '.join(snippet.text for snippet in snippets)
                    # Convert segments
                    segments = [
                        {
                            'text': snippet.text,
                            'start': snippet.start,
                            'duration': snippet.duration
                        }
                        for snippet in snippets
                    ]
                    return full_text, segments, transcript.is_generated, transcript.language_code
                return None, None, None, None

            text, segments, is_generated, language = await loop.run_in_executor(None, _fetch_transcript)
            if not text:
                return None
            return TranscriptData(
                text=text,
                language=language or 'en',
                is_auto_generated=is_generated or False,
                segments=segments,
                source="youtube-transcript-api"
            )
        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    def _parse_duration(self, duration_str: Optional[str]) -> Optional[int]:
        """Parse a YouTube ISO 8601 duration to whole seconds.

        Accepts ``PT#H#M#S`` as well as the day-prefixed ``P#DT#H#M#S`` form
        (e.g. ``PT4M13S`` -> 253, ``P1DT2H`` -> 93600). The whole string must
        match, so trailing garbage is rejected.

        Returns:
            The duration in seconds, or ``None`` for empty, malformed or
            component-less input (``"P"``, ``"PT"``).
        """
        if not duration_str:
            return None
        try:
            import re
            pattern = r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
            match = re.fullmatch(pattern, duration_str)
            # Require at least one numeric component to be present.
            if not match or not any(match.groups()):
                return None
            days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
            return days * 86400 + hours * 3600 + minutes * 60 + seconds
        except Exception as e:
            # e.g. a non-string value coming back from the API payload
            self.logger.warning(f"Duration parsing failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Test if transcript API is working.

        Fetches the English transcript of a fixed, known video and returns
        ``True`` when it contains at least one snippet; any error is logged
        and reported as ``False``.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
            # Test with a known working video
            test_video_id = "dQw4w9WgXcQ"
            loop = asyncio.get_running_loop()

            def _test() -> bool:
                api = YouTubeTranscriptApi()
                transcript = api.fetch(test_video_id, languages=['en'])
                return len(transcript.snippets) > 0

            return await loop.run_in_executor(None, _test)
        except Exception as e:
            self.logger.error(f"Transcript API test failed: {e}")
            return False

    async def get_video_metadata(self, video_id: str) -> Optional[VideoMetadata]:
        """Get video metadata (delegates to ``_get_metadata_from_api``)."""
        return await self._get_metadata_from_api(video_id)

    async def get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get video transcript (delegates to ``_get_transcript``)."""
        return await self._get_transcript(video_id)

    def supports_audio_only(self) -> bool:
        """Return ``False``: no audio download, transcript only."""
        return False

    def supports_quality_selection(self) -> bool:
        """Return ``False``: no video download, so nothing to select."""
        return False

    def get_supported_formats(self) -> list[str]:
        """Return the supported output formats: only text transcripts."""
        return ["transcript"]
# Register the downloader: runs at import time and passes
# DownloadMethod.TRANSCRIPT_ONLY together with TranscriptOnlyDownloader to
# DownloaderFactory.register.
# NOTE(review): DownloaderFactory is imported here rather than with the other
# imports at the top of the file — presumably deliberate; confirm before moving.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.TRANSCRIPT_ONLY, TranscriptOnlyDownloader)