"""
Pytubefix-based video downloader
"""
|
|
import asyncio
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any
|
|
import logging
|
|
|
|
from backend.models.video_download import (
|
|
VideoDownloadResult,
|
|
DownloadPreferences,
|
|
DownloadMethod,
|
|
DownloadStatus,
|
|
VideoMetadata,
|
|
TranscriptData,
|
|
VideoQuality,
|
|
DownloaderException,
|
|
VideoNotAvailableError
|
|
)
|
|
from backend.services.video_downloaders.base_downloader import BaseVideoDownloader
|
|
|
|
# Module-level logger named after this module. The downloader class in this
# file logs through ``self.logger`` instead (presumably provided by the base
# class -- confirm), so this name is not referenced elsewhere in the file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PytubefixDownloader(BaseVideoDownloader):
    """Pytubefix-based video downloader.

    Every blocking pytubefix / ffmpeg / transcript call is pushed onto the
    event loop's default executor so the coroutines here never block the loop.

    ``self.logger``, ``self.method``, ``self.create_result`` and
    ``self.extract_video_id`` are not defined in this class; they are assumed
    to come from ``BaseVideoDownloader`` (confirm against the base class).
    """

    def __init__(self, method: DownloadMethod = DownloadMethod.PYTUBEFIX, config: Optional[Dict[str, Any]] = None):
        """Initialise the downloader and make sure the output directory exists.

        Args:
            method: Download method tag forwarded to the base class.
            config: Optional settings. Only ``'output_dir'`` is read here; it
                defaults to ``./video_storage``.
        """
        super().__init__(method, config)
        self.output_dir = Path(config.get('output_dir', './video_storage')) if config else Path('./video_storage')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def download_video(self, url: str, preferences: DownloadPreferences) -> VideoDownloadResult:
        """Download video using pytubefix.

        Args:
            url: Video URL.
            preferences: Read for ``max_duration_minutes``,
                ``prefer_audio_only``, ``save_video`` and ``enable_subtitles``.

        Returns:
            A ``COMPLETED`` result carrying the produced file paths, metadata
            and optional transcript, or a ``FAILED`` result when the video
            exceeds the duration limit or no media file could be produced.

        Raises:
            VideoNotAvailableError: The underlying error message mentions a
                private, unavailable or age-restricted video.
            DownloaderException: Any other failure.
        """
        start_time = time.time()
        video_id = await self.extract_video_id(url)

        try:
            # We are inside a coroutine, so a running loop is guaranteed.
            loop = asyncio.get_running_loop()

            # Run in thread pool to avoid blocking
            yt = await loop.run_in_executor(None, self._create_youtube_object, url)

            # Get video metadata
            metadata = await self._extract_metadata(yt, video_id)

            # Check duration limits (a limit of 0 or less disables the check)
            if metadata.duration_seconds and preferences.max_duration_minutes > 0:
                if metadata.duration_seconds > (preferences.max_duration_minutes * 60):
                    return self.create_result(
                        video_id, url, DownloadStatus.FAILED,
                        f"Video too long: {metadata.duration_seconds//60} minutes"
                    )

            # Download based on preferences
            video_path = None
            audio_path = None

            if preferences.prefer_audio_only or not preferences.save_video:
                # Download audio only
                audio_path = await self._download_audio(yt, video_id, loop)
            else:
                # Download video and audio separately, then merge
                video_path, audio_path = await self._download_video_and_audio(yt, video_id, preferences, loop)

            # The helpers log their own errors and return None on failure.
            # Reporting COMPLETED with no file at all would hide that failure
            # from the caller, so surface it the same way as the duration limit.
            if video_path is None and audio_path is None:
                return self.create_result(
                    video_id, url, DownloadStatus.FAILED,
                    "No media could be downloaded"
                )

            # Get transcript if available
            transcript = None
            if preferences.enable_subtitles:
                transcript = await self._extract_transcript(yt, video_id)

            processing_time = time.time() - start_time

            # Calculate file sizes
            file_size = 0
            if audio_path and audio_path.exists():
                file_size += audio_path.stat().st_size
            if video_path and video_path.exists():
                file_size += video_path.stat().st_size

            return VideoDownloadResult(
                video_id=video_id,
                video_url=url,
                status=DownloadStatus.COMPLETED,
                method=self.method,
                video_path=video_path,
                audio_path=audio_path,
                transcript=transcript,
                metadata=metadata,
                processing_time_seconds=processing_time,
                file_size_bytes=file_size
            )

        except Exception as e:
            self.logger.error(f"Pytubefix download failed for {video_id}: {e}")

            # Try to determine error type from the message text; chain the
            # original exception so its traceback is not lost.
            error_msg = str(e).lower()
            if "private" in error_msg or "unavailable" in error_msg:
                raise VideoNotAvailableError(f"Video not available: {e}") from e
            if "age" in error_msg and "restricted" in error_msg:
                raise VideoNotAvailableError(f"Age-restricted video: {e}") from e
            raise DownloaderException(f"Pytubefix error: {e}") from e

    def _create_youtube_object(self, url: str):
        """Create YouTube object (runs in thread pool)."""
        from pytubefix import YouTube

        # Configure pytubefix with realistic settings
        return YouTube(
            url,
            use_oauth=False,  # OAuth can help but may be complex
            allow_oauth_cache=True
        )

    def _remove_temp_files(self, *paths: Optional[Path]) -> None:
        """Best-effort removal of temporary files.

        Missing files are ignored and OS errors are only logged at debug
        level, so cleanup can never mask the outcome of the download itself.
        """
        for path in paths:
            if path is None:
                continue
            try:
                path.unlink(missing_ok=True)
            except OSError as e:
                self.logger.debug(f"Could not remove temp file {path}: {e}")

    async def _extract_metadata(self, yt, video_id: str) -> VideoMetadata:
        """Extract video metadata.

        The attributes are read inside the executor because the read happens
        in a worker thread there (pytubefix is assumed to fetch them lazily).
        Attributes missing from ``yt`` become ``None`` (``[]`` for keywords).
        """
        loop = asyncio.get_running_loop()

        def _get_metadata():
            return {
                'title': getattr(yt, 'title', None),
                'description': getattr(yt, 'description', None),
                'length': getattr(yt, 'length', None),
                'views': getattr(yt, 'views', None),
                'publish_date': getattr(yt, 'publish_date', None),
                'author': getattr(yt, 'author', None),
                'thumbnail_url': getattr(yt, 'thumbnail_url', None),
                'keywords': getattr(yt, 'keywords', []),
            }

        meta = await loop.run_in_executor(None, _get_metadata)
        publish_date = meta.get('publish_date')

        return VideoMetadata(
            video_id=video_id,
            title=meta.get('title'),
            description=meta.get('description'),
            duration_seconds=meta.get('length'),
            view_count=meta.get('views'),
            upload_date=publish_date.isoformat() if publish_date else None,
            uploader=meta.get('author'),
            thumbnail_url=meta.get('thumbnail_url'),
            tags=meta.get('keywords', [])
        )

    async def _download_audio(self, yt, video_id: str, loop) -> Optional[Path]:
        """Download audio only.

        Picks the audio stream with the highest ``abr`` and transcodes it to
        ``<video_id>_audio.mp3`` with ffmpeg-python. If ffmpeg-python is not
        importable the downloaded file is kept in its original container and
        named with that container's extension.

        Args:
            yt: pytubefix ``YouTube`` object.
            video_id: Used to build the output file names.
            loop: Event loop whose default executor runs the blocking work.

        Returns:
            Path of the produced audio file, or ``None`` when no audio stream
            exists or any step fails (the error is logged).
        """
        def _download() -> Optional[Path]:
            temp_path: Optional[Path] = None
            try:
                # Get best audio stream
                audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()

                if not audio_stream:
                    self.logger.warning("No audio stream found")
                    return None

                # pytubefix uses ``filename`` as given, so the temp name must
                # carry the container extension itself. Without it the file
                # had no suffix, the '.mp4' check never matched, and raw
                # m4a/webm data was renamed to '.mp3' without transcoding.
                container = getattr(audio_stream, 'subtype', None) or 'mp4'
                temp_path = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.{container}"
                ))

                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                try:
                    import ffmpeg
                except ImportError:
                    # If ffmpeg not available, keep the data as-is under an
                    # extension that matches what the file really contains.
                    fallback_path = audio_path.with_suffix(f".{container}")
                    temp_path.rename(fallback_path)
                    return fallback_path

                # Convert to MP3 regardless of source container.
                (
                    ffmpeg
                    .input(str(temp_path))
                    .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                    .overwrite_output()
                    .run(quiet=True)
                )

                # Remove temp file
                self._remove_temp_files(temp_path)
                return audio_path

            except Exception as e:
                self.logger.error(f"Audio download failed: {e}")
                # Do not leave a half-processed temp file behind.
                self._remove_temp_files(temp_path)
                return None

        result = await loop.run_in_executor(None, _download)
        return Path(result) if result else None

    async def _download_video_and_audio(
        self, yt, video_id: str, preferences: DownloadPreferences, loop
    ) -> tuple[Optional[Path], Optional[Path]]:
        """Download video and audio separately.

        Takes the highest-resolution adaptive MP4 video-only stream and the
        highest-``abr`` MP4 audio stream. With ffmpeg-python available they
        are merged into ``<video_id>_video.mp4`` and the audio is also
        exported to ``<video_id>_audio.mp3``. Otherwise the two downloaded
        files are returned unmerged.

        NOTE(review): ``preferences`` is accepted but not consulted here, so
        no quality selection is applied even though
        ``supports_quality_selection`` returns True.

        Returns:
            ``(video_path, audio_path)``; ``(None, None)`` when a stream is
            missing or any step fails (the error is logged).
        """
        def _download() -> tuple[Optional[Path], Optional[Path]]:
            temp_files: list[Path] = []
            try:
                # Get best video stream (no audio)
                video_stream = yt.streams.filter(
                    adaptive=True,
                    file_extension='mp4',
                    only_video=True
                ).order_by('resolution').desc().first()

                # Get best audio stream
                audio_stream = yt.streams.filter(
                    only_audio=True,
                    file_extension='mp4'
                ).order_by('abr').desc().first()

                if not video_stream or not audio_stream:
                    self.logger.warning("Could not find suitable video/audio streams")
                    return None, None

                # Download both. Both filters request the mp4 container, so
                # give the temp files that extension: they are returned as-is
                # when ffmpeg is unavailable and should be recognisable.
                video_temp = Path(video_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_video.mp4"
                ))
                temp_files.append(video_temp)

                audio_temp = Path(audio_stream.download(
                    output_path=str(self.output_dir),
                    filename=f"{video_id}_temp_audio.mp4"
                ))
                temp_files.append(audio_temp)

                try:
                    import ffmpeg
                except ImportError:
                    # If no ffmpeg, just keep separate files
                    return video_temp, audio_temp

                video_path = self.output_dir / f"{video_id}_video.mp4"
                audio_path = self.output_dir / f"{video_id}_audio.mp3"

                # Merge video and audio
                (
                    ffmpeg
                    .output(
                        ffmpeg.input(str(video_temp)),
                        ffmpeg.input(str(audio_temp)),
                        str(video_path),
                        vcodec='copy',
                        acodec='aac'
                    )
                    .overwrite_output()
                    .run(quiet=True)
                )

                # Create separate audio file
                (
                    ffmpeg
                    .input(str(audio_temp))
                    .output(str(audio_path), acodec='mp3', audio_bitrate='192k')
                    .overwrite_output()
                    .run(quiet=True)
                )

                # Cleanup temp files
                self._remove_temp_files(*temp_files)
                return video_path, audio_path

            except Exception as e:
                self.logger.error(f"Video+audio download failed: {e}")
                # A failed merge must not leave the raw downloads on disk.
                self._remove_temp_files(*temp_files)
                return None, None

        video_result, audio_result = await loop.run_in_executor(None, _download)
        return (Path(video_result) if video_result else None,
                Path(audio_result) if audio_result else None)

    async def _extract_transcript(self, yt, video_id: str) -> Optional[TranscriptData]:
        """Extract transcript using YouTube API.

        Fetches the English transcript through ``youtube_transcript_api``.
        ``yt`` is accepted but not used. Any failure, including the package
        being absent, is logged at debug level and yields ``None``.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            loop = asyncio.get_running_loop()

            def _get_transcript():
                api = YouTubeTranscriptApi()
                transcript = api.fetch(video_id, languages=['en'])

                # Convert to text
                full_text = ' '.join([snippet.text for snippet in transcript.snippets])

                # Convert segments
                segments = [
                    {
                        'text': snippet.text,
                        'start': snippet.start,
                        'duration': snippet.duration
                    }
                    for snippet in transcript.snippets
                ]

                return full_text, segments, transcript.is_generated, transcript.language_code

            text, segments, is_generated, language = await loop.run_in_executor(None, _get_transcript)

            return TranscriptData(
                text=text,
                language=language,
                is_auto_generated=is_generated,
                segments=segments,
                source="youtube-transcript-api"
            )

        except Exception as e:
            self.logger.debug(f"Transcript extraction failed: {e}")
            return None

    async def test_connection(self) -> bool:
        """Test if pytubefix is working.

        Builds a ``YouTube`` object for a fixed URL and reads its title.
        Returns False (after logging) on any exception.
        """
        try:
            from pytubefix import YouTube

            # Test with a known working video
            test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

            loop = asyncio.get_running_loop()

            def _test():
                yt = YouTube(test_url)
                return yt.title is not None

            result = await loop.run_in_executor(None, _test)
            return result

        except Exception as e:
            self.logger.error(f"Pytubefix connection test failed: {e}")
            return False

    def supports_audio_only(self) -> bool:
        """Report that audio-only downloads are supported."""
        return True

    def supports_quality_selection(self) -> bool:
        """Report that quality selection is supported."""
        return True

    def get_supported_formats(self) -> list[str]:
        """Return the format names this downloader advertises."""
        return ["mp4", "mp3", "webm"]
|
|
|
|
|
|
# Register the downloader.
# This runs at import time: importing this module makes PytubefixDownloader
# available from DownloaderFactory under DownloadMethod.PYTUBEFIX.
from backend.services.video_downloaders.base_downloader import DownloaderFactory

DownloaderFactory.register(DownloadMethod.PYTUBEFIX, PytubefixDownloader)