262 lines
10 KiB
Python
262 lines
10 KiB
Python
"""
Transcript-only downloader using the YouTube Data API and the transcript API.
"""
|
|
import asyncio
import logging
import re
import time
from typing import Optional, Dict, Any

from backend.models.video_download import (
    VideoDownloadResult,
    DownloadPreferences,
    DownloadMethod,
    DownloadStatus,
    VideoMetadata,
    TranscriptData,
    DownloaderException,
    VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TranscriptOnlyDownloader(BaseVideoDownloader):
    """Fallback downloader that retrieves a transcript and metadata only.

    No video or audio file is produced. Metadata comes from the YouTube Data
    API v3 when an API key is configured; the transcript comes from the
    ``youtube_transcript_api`` package. Both third-party packages are imported
    lazily, so a missing package degrades to "no metadata" / "no transcript"
    instead of breaking the import of this module.
    """

    # Language codes tried, in this order, when fetching a transcript.
    _TRANSCRIPT_LANGUAGES = ('en', 'en-US', 'en-GB')

    # Video id that test_connection() fetches as a connectivity probe.
    _PROBE_VIDEO_ID = "dQw4w9WgXcQ"

    # ISO 8601 duration as found in ``contentDetails.duration``. The YouTube
    # API uses ``PT#H#M#S`` for short videos and ``P#DT#H#M#S`` once a video
    # is at least one day long, so the day component must be accepted too.
    _ISO8601_DURATION = re.compile(
        r'P(?:(\d+)D)?'
        r'(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
    )

    def __init__(self, method: DownloadMethod = DownloadMethod.TRANSCRIPT_ONLY, config: Optional[Dict[str, Any]] = None):
        """Set up the downloader and, if a key is configured, the API client.

        Args:
            method: Download method forwarded to the base class.
            config: Optional settings; only ``'youtube_api_key'`` is read here.
        """
        super().__init__(method, config)
        self.youtube_api_key = config.get('youtube_api_key') if config else None
        self.youtube_service = None

        if self.youtube_api_key:
            try:
                # Imported lazily: the Google client is only needed when a key
                # is configured.
                from googleapiclient.discovery import build
                self.youtube_service = build('youtube', 'v3', developerKey=self.youtube_api_key)
            except Exception as exc:
                # Best effort: without the service we simply have no metadata.
                logger.warning(f"Failed to initialize YouTube API service: {exc}")

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """'Download' a video by extracting its transcript and metadata only.

        Args:
            url: Video URL; its id is resolved with ``self.extract_video_id``.
            preferences: Only ``max_duration_minutes`` is read (values <= 0
                disable the duration limit).

        Returns:
            A PARTIAL ``VideoDownloadResult`` (no video/audio paths), or the
            FAILED result built by ``self.create_result`` when the metadata
            reports a duration above the configured limit.

        Raises:
            VideoNotAvailableError: Neither transcript nor metadata could be
                obtained, or an unexpected error mentions "not available" or
                "private".
            DownloaderException: Any other unexpected error.
        """
        started = time.monotonic()  # monotonic: immune to wall-clock adjustments
        video_id = await self.extract_video_id(url)

        try:
            # Returns None by itself when no API service is configured.
            metadata = await self._get_metadata_from_api(video_id)

            # Enforce the duration limit before fetching the transcript so an
            # over-long video does not cost an extra network round trip.
            if metadata and metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            transcript = await self._get_transcript(video_id)

            if not transcript and not metadata:
                raise VideoNotAvailableError("Could not extract transcript or metadata")

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.PARTIAL,  # Partial because no video/audio files
                method=self.method,
                video_path=None,
                audio_path=None,
                transcript=transcript,
                metadata=metadata or VideoMetadata(video_id=video_id),
                processing_time_seconds=time.monotonic() - started,
                is_partial=True
            )

        except VideoNotAvailableError as exc:
            # Already the most specific error type; re-raise untouched so the
            # generic handler below cannot reclassify it.
            self.logger.error(f"Transcript-only download failed for {video_id}: {exc}")
            raise

        except Exception as exc:
            self.logger.error(f"Transcript-only download failed for {video_id}: {exc}")

            message = str(exc).lower()
            if "not available" in message or "private" in message:
                raise VideoNotAvailableError(f"Video/transcript not available: {exc}") from exc
            raise DownloaderException(f"Transcript extraction error: {exc}") from exc

    async def _get_metadata_from_api(self, video_id: str) -> Optional[VideoMetadata]:
        """Get metadata using YouTube Data API v3.

        Returns:
            A ``VideoMetadata`` instance, or None when no API service is
            configured, the API returns no item, or any error occurs (logged
            as a warning).
        """
        if not self.youtube_service:
            return None

        try:
            # The Google client call is blocking; run it in a worker thread.
            fields = await asyncio.to_thread(self._fetch_metadata_fields, video_id)
            if not fields:
                return None
            return VideoMetadata(video_id=video_id, **fields)
        except Exception as exc:
            self.logger.warning(f"YouTube API metadata fetch failed: {exc}")
            return None

    def _fetch_metadata_fields(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Blocking ``videos().list`` call mapped to VideoMetadata keyword args.

        Returns None when the response contains no items.
        """
        response = self.youtube_service.videos().list(
            part='snippet,contentDetails,statistics,status',
            id=video_id
        ).execute()

        items = response.get('items')
        if not items:
            return None

        item = items[0]
        snippet = item.get('snippet', {})
        content_details = item.get('contentDetails', {})
        statistics = item.get('statistics', {})
        status = item.get('status', {})

        raw_views = statistics.get('viewCount')

        return {
            'title': snippet.get('title'),
            'description': snippet.get('description'),
            'duration_seconds': self._parse_duration(content_details.get('duration')),
            'view_count': int(raw_views) if raw_views else None,
            'upload_date': snippet.get('publishedAt'),
            'uploader': snippet.get('channelTitle'),
            'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url'),
            'tags': snippet.get('tags', []),
            'language': snippet.get('defaultLanguage', 'en'),
            'availability': status.get('privacyStatus'),
            'age_restricted': content_details.get('contentRating', {}).get('ytRating') == 'ytAgeRestricted'
        }

    async def _get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get transcript using youtube-transcript-api.

        Returns:
            A ``TranscriptData`` instance, or None when no transcript could be
            fetched, its text is empty, or any error occurs (logged at debug
            level).
        """
        try:
            # The transcript client is blocking; run it in a worker thread.
            fetched = await asyncio.to_thread(self._fetch_first_transcript, video_id)
            if fetched is None:
                return None

            text, segments, is_generated, language = fetched
            if not text:
                return None

            return TranscriptData(
                text=text,
                language=language or 'en',
                is_auto_generated=is_generated or False,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as exc:
            self.logger.debug(f"Transcript extraction failed: {exc}")
            return None

    def _fetch_first_transcript(self, video_id: str):
        """Blocking fetch of the first transcript available in a preferred language.

        Returns:
            ``(full_text, segments, is_generated, language_code)`` for the
            first language in ``_TRANSCRIPT_LANGUAGES`` whose fetch succeeds,
            or None when every language fails.
        """
        # Outside the loop on purpose: a missing package must propagate to the
        # caller instead of being retried once per language.
        from youtube_transcript_api import YouTubeTranscriptApi

        api = YouTubeTranscriptApi()

        for language in self._TRANSCRIPT_LANGUAGES:
            try:
                transcript = api.fetch(video_id, languages=[language])
            except Exception as exc:
                # Expected when this language has no transcript; try the next.
                self.logger.debug(f"No '{language}' transcript for {video_id}: {exc}")
                continue

            snippets = transcript.snippets
            full_text = ' '.join(snippet.text for snippet in snippets)
            segments = [
                {
                    'text': snippet.text,
                    'start': snippet.start,
                    'duration': snippet.duration
                }
                for snippet in snippets
            ]
            return full_text, segments, transcript.is_generated, transcript.language_code

        return None

    def _parse_duration(self, duration_str: Optional[str]) -> Optional[int]:
        """Parse a YouTube ISO 8601 duration (PT4M13S, P1DT2H) to seconds.

        Args:
            duration_str: Value of ``contentDetails.duration``; may be None.

        Returns:
            Total seconds, or None when the value is empty, not a string, does
            not match the expected format, or carries no component at all.
        """
        if not duration_str or not isinstance(duration_str, str):
            return None

        # fullmatch: trailing garbage must not be silently read as a duration.
        match = self._ISO8601_DURATION.fullmatch(duration_str)
        if match is None or not any(match.groups()):
            return None

        days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
        return ((days * 24 + hours) * 60 + minutes) * 60 + seconds

    async def test_connection(self) -> bool:
        """Test if transcript API is working.

        Fetches the English transcript of a fixed probe video (network
        access required). Returns False, after logging the error, on any
        failure.
        """
        try:
            return await asyncio.to_thread(self._probe_transcript_api)
        except Exception as exc:
            self.logger.error(f"Transcript API test failed: {exc}")
            return False

    def _probe_transcript_api(self) -> bool:
        """Blocking probe: True when the probe video yields at least one snippet."""
        from youtube_transcript_api import YouTubeTranscriptApi

        transcript = YouTubeTranscriptApi().fetch(self._PROBE_VIDEO_ID, languages=['en'])
        return len(transcript.snippets) > 0

    async def get_video_metadata(self, video_id: str) -> Optional[VideoMetadata]:
        """Get video metadata (None when unavailable)."""
        return await self._get_metadata_from_api(video_id)

    async def get_transcript(self, video_id: str) -> Optional[TranscriptData]:
        """Get video transcript (None when unavailable)."""
        return await self._get_transcript(video_id)

    def supports_audio_only(self) -> bool:
        """Always False: no audio download, transcript only."""
        return False

    def supports_quality_selection(self) -> bool:
        """Always False: no video download."""
        return False

    def get_supported_formats(self) -> list[str]:
        """Return the only supported output format: text transcripts."""
        return ["transcript"]
|
|
|
|
|
|
# Import-time side effect: hand this class to DownloaderFactory under the
# TRANSCRIPT_ONLY method so importing the module is enough to register it.
from backend.services.video_downloaders.base_downloader import DownloaderFactory

DownloaderFactory.register(DownloadMethod.TRANSCRIPT_ONLY, TranscriptOnlyDownloader)