# Source: youtube-summarizer/backend/services/video_downloaders/pytubefix_downloader.py
# File-viewer metadata captured with this copy: 355 lines, 14 KiB, Python

"""
Pytubefix-based video downloader
"""
import asyncio
import time
from pathlib import Path
from typing import Optional, Dict, Any
import logging
from backend.models.video_download import (
VideoDownloadResult,
DownloadPreferences,
DownloadMethod,
DownloadStatus,
VideoMetadata,
TranscriptData,
VideoQuality,
DownloaderException,
VideoNotAvailableError
)
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader
# Module-level logger named after this module.
# NOTE(review): nothing in this file reads this name -- the class below logs
# through ``self.logger``, which is not assigned here (presumably provided by
# BaseVideoDownloader; confirm before removing either one).
logger = logging.getLogger(__name__)
class PytubefixDownloader(BaseVideoDownloader):
    """Video downloader backed by the ``pytubefix`` library.

    Downloads audio (and optionally video) streams into ``self.output_dir``,
    converts audio to MP3 when ffmpeg is usable, and optionally fetches an
    English transcript through ``youtube_transcript_api``.

    All blocking pytubefix / ffmpeg / transcript calls are pushed onto the
    event loop's default executor so the coroutines do not block the loop.
    """

    # Directory used when the config does not provide ``output_dir``.
    _DEFAULT_OUTPUT_DIR = './video_storage'
    # Video used by ``test_connection`` as a reachability probe.
    _TEST_URL = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    def __init__(self, method: DownloadMethod = DownloadMethod.PYTUBEFIX, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader and make sure the output directory exists.

        Args:
            method: Download method this instance is registered under.
            config: Optional settings; only ``output_dir`` is read here.
        """
        super().__init__(method, config)
        self.output_dir = Path((config or {}).get('output_dir', self._DEFAULT_OUTPUT_DIR))
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """Download the media for ``url`` according to ``preferences``.

        Args:
            url: Video URL.
            preferences: Read for ``max_duration_minutes``, ``prefer_audio_only``,
                ``save_video`` and ``enable_subtitles``.

        Returns:
            A ``COMPLETED`` result carrying whatever was obtained, or a
            ``FAILED`` result when the video exceeds the duration limit or
            when neither media nor a transcript could be retrieved.

        Raises:
            VideoNotAvailableError: The underlying error message indicates a
                private, unavailable or age-restricted video.
            DownloaderException: Any other failure while talking to pytubefix.
        """
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-download.
        started = time.perf_counter()
        video_id = await self.extract_video_id(url)
        try:
            loop = asyncio.get_running_loop()
            # Constructing the YouTube object is blocking; keep it off the loop.
            yt = await loop.run_in_executor(None, self._create_youtube_object, url)
            metadata = await self._extract_metadata(yt, video_id)

            # Enforce the duration limit (a limit <= 0 disables the check).
            if metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            video_path = None
            audio_path = None
            if preferences.prefer_audio_only or not preferences.save_video:
                audio_path = await self._download_audio(yt, video_id, loop)
            else:
                video_path, audio_path = await self._download_video_and_audio(yt, video_id, preferences, loop)

            transcript = None
            if preferences.enable_subtitles:
                transcript = await self._extract_transcript(yt, video_id)

            # The download helpers report failure by returning None instead of
            # raising. Without this check a run that produced nothing at all
            # would still be reported as COMPLETED.
            if audio_path is None and video_path is None:
                if transcript is None:
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        "No media could be downloaded"
                    )
                self.logger.warning(f"No media downloaded for {video_id}; returning transcript only")

            processing_time = time.perf_counter() - started
            file_size = sum(
                media.stat().st_size
                for media in (audio_path, video_path)
                if media and media.exists()
            )
            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                processing_time_seconds=processing_time,
                file_size_bytes=file_size
            )
        except Exception as e:
            self.logger.error(f"Pytubefix download failed for {video_id}: {e}")
            # Classify by message text; chain the cause so the original
            # traceback is preserved for debugging.
            error_msg = str(e).lower()
            if "private" in error_msg or "unavailable" in error_msg:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            if "age" in error_msg and "restricted" in error_msg:
                raise VideoNotAvailableError(f"Age-restricted video: {e}") from e
            raise DownloaderException(f"Pytubefix error: {e}") from e

    def _create_youtube_object(self, url: str):
        """Build the pytubefix ``YouTube`` object (called from the executor)."""
        # Imported lazily so the module can be imported without pytubefix.
        from pytubefix import YouTube
        return YouTube(
            url,
            use_oauth=False,  # OAuth can help but may be complex
            allow_oauth_cache=True
        )

    async def _extract_metadata(self, yt, video_id: str) -> VideoMetadata:
        """Read descriptive attributes from ``yt`` into a ``VideoMetadata``.

        Attribute reads run in the executor; the original code treats them as
        blocking. Attributes that do not exist on ``yt`` become ``None``
        (``[]`` for keywords).
        """
        def _collect() -> Dict[str, Any]:
            return {
                'title': getattr(yt, 'title', None),
                'description': getattr(yt, 'description', None),
                'length': getattr(yt, 'length', None),
                'views': getattr(yt, 'views', None),
                'publish_date': getattr(yt, 'publish_date', None),
                'author': getattr(yt, 'author', None),
                'thumbnail_url': getattr(yt, 'thumbnail_url', None),
                'keywords': getattr(yt, 'keywords', []),
            }

        meta = await asyncio.get_running_loop().run_in_executor(None, _collect)
        published = meta['publish_date']
        return VideoMetadata(
            video_id=video_id,
            title=meta['title'],
            description=meta['description'],
            duration_seconds=meta['length'],
            view_count=meta['views'],
            upload_date=published.isoformat() if published else None,
            uploader=meta['author'],
            thumbnail_url=meta['thumbnail_url'],
            # Normalise a None keyword list to an empty list.
            tags=meta['keywords'] or []
        )

    def _remove_quietly(self, path: Path) -> None:
        """Delete ``path`` if present; a cleanup failure is logged, never raised."""
        try:
            path.unlink(missing_ok=True)
        except OSError as e:
            self.logger.debug(f"Could not remove temporary file {path}: {e}")

    def _transcode_to_mp3(self, source: Path, target: Path) -> bool:
        """Transcode ``source`` to a 192k MP3 at ``target`` using ffmpeg-python.

        Returns:
            True when the MP3 was written; False when ffmpeg is not usable
            (the ``ffmpeg`` module is missing, or the ffmpeg executable
            cannot be found). Other ffmpeg failures propagate to the caller.
        """
        try:
            import ffmpeg
            (
                ffmpeg
                .input(str(source))
                .output(str(target), acodec='mp3', audio_bitrate='192k')
                .overwrite_output()
                .run(quiet=True)
            )
        except (ImportError, FileNotFoundError):
            return False
        return True

    async def _download_audio(self, yt, video_id: str, loop) -> Optional[Path]:
        """Download the highest-bitrate audio stream.

        Returns:
            ``<video_id>_audio.mp3`` when ffmpeg could transcode the stream,
            otherwise the untouched download renamed to
            ``<video_id>_audio.<native extension>``; ``None`` when no audio
            stream exists or the download fails (the failure is logged).
        """
        def _download() -> Optional[Path]:
            temp_path: Optional[Path] = None
            try:
                audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()
                if not audio_stream:
                    self.logger.warning("No audio stream found")
                    return None

                # Give the temp file its real container extension: the name
                # passed to download() is expected to be used as-is, so without
                # one the file's format cannot be told from its path.
                extension = getattr(audio_stream, 'subtype', None) or 'mp4'
                temp_path = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.{extension}"
                ))

                audio_path = self.output_dir / f"{video_id}_audio.mp3"
                if self._transcode_to_mp3(temp_path, audio_path):
                    temp_path.unlink()
                    return audio_path

                # No ffmpeg: keep the native container and label it honestly
                # rather than renaming non-MP3 data to ".mp3".
                raw_path = self.output_dir / f"{video_id}_audio{temp_path.suffix}"
                # replace() overwrites an existing target on every platform.
                temp_path.replace(raw_path)
                return raw_path
            except Exception as e:
                self.logger.error(f"Audio download failed: {e}")
                if temp_path is not None:
                    self._remove_quietly(temp_path)
                return None

        return await loop.run_in_executor(None, _download)

    async def _download_video_and_audio(
        self, yt, video_id: str, preferences: DownloadPreferences, loop
    ) -> tuple[Optional[Path], Optional[Path]]:
        """Download the best MP4 video-only and audio-only streams and merge them.

        ``preferences`` is accepted for interface compatibility but is not
        consulted: the highest-resolution MP4 stream is always chosen.

        Returns:
            ``(video_path, audio_path)``. With ffmpeg usable these are the
            merged ``<video_id>_video.mp4`` and ``<video_id>_audio.mp3``;
            without it, the two separately downloaded files. ``(None, None)``
            when a stream is missing or any step fails (the failure is logged).
        """
        def _download() -> tuple[Optional[Path], Optional[Path]]:
            temp_files: list[Path] = []
            try:
                video_stream = yt.streams.filter(
                    adaptive=True,
                    file_extension='mp4',
                    only_video=True
                ).order_by('resolution').desc().first()
                audio_stream = yt.streams.filter(
                    only_audio=True,
                    file_extension='mp4'
                ).order_by('abr').desc().first()
                if not video_stream or not audio_stream:
                    self.logger.warning("Could not find suitable video/audio streams")
                    return None, None

                # Both streams were filtered to mp4, so ".mp4" is the correct
                # extension for the temporary files.
                video_temp = Path(video_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_video.mp4"
                ))
                temp_files.append(video_temp)
                audio_temp = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.mp4"
                ))
                temp_files.append(audio_temp)

                video_path = self.output_dir / f"{video_id}_video.mp4"
                audio_path = self.output_dir / f"{video_id}_audio.mp3"
                try:
                    import ffmpeg
                    # Mux the video track (copied) with the audio track.
                    (
                        ffmpeg
                        .output(
                            ffmpeg.input(str(video_temp)),
                            ffmpeg.input(str(audio_temp)),
                            str(video_path),
                            vcodec='copy',
                            acodec='aac'
                        )
                        .overwrite_output()
                        .run(quiet=True)
                    )
                    # Also produce a standalone MP3 of the audio track.
                    if not self._transcode_to_mp3(audio_temp, audio_path):
                        return video_temp, audio_temp
                except (ImportError, FileNotFoundError):
                    # ffmpeg unavailable: hand back the separate files.
                    return video_temp, audio_temp

                video_temp.unlink()
                audio_temp.unlink()
                return video_path, audio_path
            except Exception as e:
                self.logger.error(f"Video+audio download failed: {e}")
                # Do not leave partial downloads behind on failure.
                for leftover in temp_files:
                    self._remove_quietly(leftover)
                return None, None

        return await loop.run_in_executor(None, _download)

    async def _extract_transcript(self, yt, video_id: str) -> Optional[TranscriptData]:
        """Fetch the English transcript via ``youtube_transcript_api``.

        ``yt`` is accepted for signature symmetry with the other helpers but
        is not used; the lookup is done by ``video_id``.

        Returns:
            A ``TranscriptData`` with the joined text and per-snippet segments,
            or ``None`` when the transcript cannot be fetched for any reason
            (best effort; the reason is logged at debug level).
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            def _get_transcript():
                fetched = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
                segments = [
                    {
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    }
                    for snippet in fetched.snippets
                ]
                full_text = ' '.join(segment['text'] for segment in segments)
                return full_text, segments, fetched.is_generated, fetched.language_code

            text, segments, is_generated, language = await asyncio.get_running_loop().run_in_executor(
                None, _get_transcript
            )
            return TranscriptData(
                text=text,
                language=language,
                is_auto_generated=is_generated,
                segments=segments,
                source="youtube-transcript-api"
            )
        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Return True if pytubefix can read the title of a known video.

        Any failure (import error, network error, ...) is logged and reported
        as False rather than raised.
        """
        try:
            from pytubefix import YouTube

            def _test() -> bool:
                return YouTube(self._TEST_URL).title is not None

            return await asyncio.get_running_loop().run_in_executor(None, _test)
        except Exception as e:
            self.logger.error(f"Pytubefix connection test failed: {e}")
            return False

    def supports_audio_only(self) -> bool:
        """Audio-only downloads are implemented by ``_download_audio``."""
        return True

    def supports_quality_selection(self) -> bool:
        """Report quality selection as supported.

        NOTE(review): ``_download_video_and_audio`` never reads its
        ``preferences`` argument and always picks the highest resolution, so
        this capability flag looks aspirational -- confirm the intended value.
        """
        return True

    def get_supported_formats(self) -> list[str]:
        """Return the container/format names this downloader advertises."""
        return ["mp4", "mp3", "webm"]
# Register the downloader: importing this module has the side effect of
# registering PytubefixDownloader with the factory under DownloadMethod.PYTUBEFIX.
# NOTE(review): DownloaderFactory comes from the same module as the
# BaseVideoDownloader import at the top of the file; the late import here may
# be deliberate (e.g. ordering) -- confirm before hoisting it.
from backend.services.video_downloaders.base_downloader import DownloaderFactory
DownloaderFactory.register(DownloadMethod.PYTUBEFIX, PytubefixDownloader)