422 lines · 16 KiB · Python
"""
|
|
Video Download Service using yt-dlp for YouTube video management.
|
|
Handles downloading, storage, caching, and cleanup of video files.
|
|
"""
|
|
|
|
import yt_dlp
|
|
import os
|
|
import json
|
|
import hashlib
|
|
import shutil
|
|
import asyncio
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Tuple
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class VideoDownloadError(Exception):
    """Raised when a video download or metadata-extraction operation fails."""
|
|
|
|
|
|
class VideoDownloadService:
    """Service that downloads YouTube videos with yt-dlp and manages them on local disk."""

    def __init__(
        self,
        storage_dir: str = "data/youtube-videos",
        max_storage_size_gb: float = 10.0,
        video_quality: str = "720p",
        keep_videos: bool = True,
        cache_file: str = "download_cache.json"
    ):
        """
        Initialize the video download service.

        Args:
            storage_dir: Base directory for storing videos
            max_storage_size_gb: Maximum storage size in GB
            video_quality: Default video quality (720p, 1080p, best)
            keep_videos: Whether to keep videos after processing
            cache_file: File name (relative to storage_dir) tracking downloads
        """
        self.base_dir = Path(storage_dir)
        self.video_quality = video_quality
        self.keep_videos = keep_videos
        # Storage cap, converted from GB to bytes.
        self.max_storage_bytes = max_storage_size_gb * 1024 * 1024 * 1024
        self.cache_file = self.base_dir / cache_file

        # On-disk layout under the base directory.
        self.videos_dir = self.base_dir / "videos"
        self.audio_dir = self.base_dir / "audio"
        self.temp_dir = self.base_dir / "temp"
        self.metadata_dir = self.base_dir / "metadata"

        self._ensure_directories()
        self._load_cache()

        # Per-video progress snapshots written by the yt-dlp progress hook.
        self.download_progress = {}
|
def _ensure_directories(self):
|
|
"""Create necessary directories if they don't exist."""
|
|
for directory in [
|
|
self.base_dir,
|
|
self.videos_dir,
|
|
self.audio_dir,
|
|
self.temp_dir,
|
|
self.metadata_dir
|
|
]:
|
|
directory.mkdir(exist_ok=True, parents=True)
|
|
|
|
def _load_cache(self):
|
|
"""Load the download cache from file."""
|
|
if self.cache_file.exists():
|
|
try:
|
|
with open(self.cache_file, 'r') as f:
|
|
self.cache = json.load(f)
|
|
except json.JSONDecodeError:
|
|
logger.warning("Cache file corrupted, starting with empty cache")
|
|
self.cache = {}
|
|
else:
|
|
self.cache = {}
|
|
self._save_cache()
|
|
|
|
def _save_cache(self):
|
|
"""Save the download cache to file."""
|
|
with open(self.cache_file, 'w') as f:
|
|
json.dump(self.cache, f, indent=2)
|
|
|
|
def _get_video_hash(self, video_id: str) -> str:
|
|
"""Generate a unique hash for a video ID."""
|
|
return hashlib.md5(video_id.encode()).hexdigest()
|
|
|
|
def _get_ydl_opts(self, video_id: str, download: bool = True) -> Dict:
|
|
"""Get yt-dlp options for download or info extraction."""
|
|
opts = {
|
|
'quiet': True,
|
|
'no_warnings': True,
|
|
'extract_flat': False,
|
|
}
|
|
|
|
if download:
|
|
# Set quality format string based on preference
|
|
if self.video_quality == "best":
|
|
format_str = 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'
|
|
elif self.video_quality == "1080p":
|
|
format_str = 'bestvideo[height<=1080][ext=mp4]+bestaudio[ext=m4a]/best[height<=1080][ext=mp4]/best'
|
|
elif self.video_quality == "720p":
|
|
format_str = 'bestvideo[height<=720][ext=mp4]+bestaudio[ext=m4a]/best[height<=720][ext=mp4]/best'
|
|
else:
|
|
format_str = 'best[ext=mp4]/best'
|
|
|
|
video_path = self.videos_dir / f"{video_id}.mp4"
|
|
|
|
opts.update({
|
|
'format': format_str,
|
|
'outtmpl': str(video_path),
|
|
'progress_hooks': [lambda d: self._progress_hook(video_id, d)],
|
|
'postprocessors': [],
|
|
'writeinfojson': True,
|
|
'writethumbnail': True,
|
|
})
|
|
else:
|
|
opts['skip_download'] = True
|
|
|
|
return opts
|
|
|
|
def _progress_hook(self, video_id: str, d: Dict):
|
|
"""Progress hook for yt-dlp downloads."""
|
|
if d['status'] == 'downloading':
|
|
self.download_progress[video_id] = {
|
|
'status': 'downloading',
|
|
'percent': d.get('_percent_str', 'N/A'),
|
|
'speed': d.get('_speed_str', 'N/A'),
|
|
'eta': d.get('_eta_str', 'N/A'),
|
|
'total_bytes': d.get('total_bytes', 0),
|
|
'downloaded_bytes': d.get('downloaded_bytes', 0),
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
elif d['status'] == 'finished':
|
|
self.download_progress[video_id] = {
|
|
'status': 'finished',
|
|
'percent': '100%',
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
|
|
async def get_video_info(self, url: str) -> Dict:
|
|
"""
|
|
Extract video information using yt-dlp.
|
|
|
|
Args:
|
|
url: YouTube URL
|
|
|
|
Returns:
|
|
Video information dictionary
|
|
"""
|
|
ydl_opts = self._get_ydl_opts('', download=False)
|
|
|
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
try:
|
|
info = await asyncio.get_event_loop().run_in_executor(
|
|
None, ydl.extract_info, url, False
|
|
)
|
|
return info
|
|
except Exception as e:
|
|
logger.error(f"Error extracting video info: {e}")
|
|
raise VideoDownloadError(f"Failed to get video info: {str(e)}")
|
|
|
|
def is_video_downloaded(self, video_id: str) -> bool:
|
|
"""Check if a video is already downloaded."""
|
|
video_hash = self._get_video_hash(video_id)
|
|
if video_hash not in self.cache:
|
|
return False
|
|
|
|
video_path = Path(self.cache[video_hash].get('video_path', ''))
|
|
return video_path.exists()
|
|
|
|
def get_current_storage_usage(self) -> float:
|
|
"""Get current storage usage in bytes."""
|
|
total_size = 0
|
|
for directory in [self.videos_dir, self.audio_dir]:
|
|
if directory.exists():
|
|
for path in directory.glob('**/*'):
|
|
if path.is_file():
|
|
total_size += path.stat().st_size
|
|
return total_size
|
|
|
|
def cleanup_old_videos(self, bytes_to_free: int) -> int:
|
|
"""
|
|
Remove oldest videos to free up space.
|
|
|
|
Args:
|
|
bytes_to_free: Number of bytes to free
|
|
|
|
Returns:
|
|
Number of bytes actually freed
|
|
"""
|
|
if not self.cache:
|
|
return 0
|
|
|
|
# Sort videos by download date (oldest first)
|
|
sorted_videos = sorted(
|
|
self.cache.items(),
|
|
key=lambda x: x[1].get('download_date', '1970-01-01')
|
|
)
|
|
|
|
bytes_freed = 0
|
|
videos_removed = []
|
|
|
|
for video_hash, info in sorted_videos:
|
|
if bytes_freed >= bytes_to_free:
|
|
break
|
|
|
|
# Skip videos marked as "keep"
|
|
if info.get('keep', False):
|
|
continue
|
|
|
|
video_path = Path(info.get('video_path', ''))
|
|
audio_path = Path(info.get('audio_path', ''))
|
|
|
|
freed_this_video = 0
|
|
|
|
if video_path.exists():
|
|
freed_this_video += video_path.stat().st_size
|
|
video_path.unlink()
|
|
logger.info(f"Removed video: {video_path}")
|
|
|
|
if audio_path.exists():
|
|
freed_this_video += audio_path.stat().st_size
|
|
audio_path.unlink()
|
|
logger.info(f"Removed audio: {audio_path}")
|
|
|
|
# Remove metadata files
|
|
video_id = info.get('video_id')
|
|
if video_id:
|
|
info_file = self.videos_dir / f"{video_id}.info.json"
|
|
thumb_file = self.videos_dir / f"{video_id}.jpg"
|
|
|
|
for file in [info_file, thumb_file]:
|
|
if file.exists():
|
|
file.unlink()
|
|
|
|
bytes_freed += freed_this_video
|
|
videos_removed.append(video_hash)
|
|
|
|
# Update cache
|
|
for video_hash in videos_removed:
|
|
del self.cache[video_hash]
|
|
|
|
self._save_cache()
|
|
logger.info(f"Cleanup freed {bytes_freed / (1024*1024):.2f} MB")
|
|
|
|
return bytes_freed
|
|
|
|
async def download_video(
|
|
self,
|
|
url: str,
|
|
extract_audio: bool = True,
|
|
force: bool = False
|
|
) -> Tuple[Optional[Path], Optional[Path]]:
|
|
"""
|
|
Download a video and optionally extract audio.
|
|
|
|
Args:
|
|
url: YouTube URL
|
|
extract_audio: Whether to extract audio
|
|
force: Force re-download even if cached
|
|
|
|
Returns:
|
|
Tuple of (video_path, audio_path)
|
|
"""
|
|
try:
|
|
# Get video info first
|
|
info = await self.get_video_info(url)
|
|
video_id = info['id']
|
|
video_hash = self._get_video_hash(video_id)
|
|
|
|
# Check if already downloaded
|
|
if not force and self.is_video_downloaded(video_id):
|
|
logger.info(f"Video {video_id} already downloaded, using cached version")
|
|
cached_info = self.cache[video_hash]
|
|
video_path = Path(cached_info['video_path'])
|
|
audio_path = Path(cached_info.get('audio_path', '')) if cached_info.get('audio_path') else None
|
|
return video_path, audio_path
|
|
|
|
# Check storage space
|
|
current_usage = self.get_current_storage_usage()
|
|
estimated_size = info.get('filesize_approx', 500 * 1024 * 1024) # Default 500MB
|
|
|
|
if current_usage + estimated_size > self.max_storage_bytes:
|
|
bytes_to_free = (current_usage + estimated_size) - self.max_storage_bytes
|
|
logger.info(f"Storage limit reached, freeing {bytes_to_free / (1024*1024):.2f} MB")
|
|
freed = self.cleanup_old_videos(bytes_to_free)
|
|
if freed < bytes_to_free:
|
|
raise VideoDownloadError(
|
|
f"Insufficient storage space. Need {bytes_to_free / (1024*1024):.2f} MB, "
|
|
f"but only freed {freed / (1024*1024):.2f} MB"
|
|
)
|
|
|
|
# Download video
|
|
logger.info(f"Downloading video {video_id} at {self.video_quality} quality")
|
|
video_path = self.videos_dir / f"{video_id}.mp4"
|
|
|
|
ydl_opts = self._get_ydl_opts(video_id, download=True)
|
|
|
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
await asyncio.get_event_loop().run_in_executor(
|
|
None, ydl.download, [url]
|
|
)
|
|
|
|
audio_path = None
|
|
|
|
# Extract audio if requested
|
|
if extract_audio and video_path.exists():
|
|
audio_path = self.audio_dir / f"{video_id}.mp3"
|
|
|
|
logger.info(f"Extracting audio to {audio_path}")
|
|
|
|
audio_opts = {
|
|
'format': 'bestaudio/best',
|
|
'outtmpl': str(audio_path.with_suffix('')), # Remove .mp3 for yt-dlp
|
|
'postprocessors': [{
|
|
'key': 'FFmpegExtractAudio',
|
|
'preferredcodec': 'mp3',
|
|
'preferredquality': '192',
|
|
}],
|
|
'quiet': True,
|
|
'no_warnings': True,
|
|
}
|
|
|
|
with yt_dlp.YoutubeDL(audio_opts) as ydl:
|
|
await asyncio.get_event_loop().run_in_executor(
|
|
None, ydl.download, [url]
|
|
)
|
|
|
|
# Update cache
|
|
self.cache[video_hash] = {
|
|
'video_id': video_id,
|
|
'title': info.get('title', 'Unknown'),
|
|
'channel': info.get('channel', 'Unknown'),
|
|
'duration': info.get('duration', 0),
|
|
'video_path': str(video_path),
|
|
'audio_path': str(audio_path) if audio_path else None,
|
|
'download_date': datetime.now().isoformat(),
|
|
'size_bytes': video_path.stat().st_size if video_path.exists() else 0,
|
|
'url': url,
|
|
'quality': self.video_quality
|
|
}
|
|
|
|
self._save_cache()
|
|
logger.info(f"Successfully downloaded video {video_id}")
|
|
|
|
return video_path, audio_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error downloading video from {url}: {e}")
|
|
# Clean up any partial downloads
|
|
if 'video_id' in locals():
|
|
self._cleanup_failed_download(video_id)
|
|
raise VideoDownloadError(f"Failed to download video: {str(e)}")
|
|
|
|
def _cleanup_failed_download(self, video_id: str):
|
|
"""Clean up any files from a failed download."""
|
|
logger.info(f"Cleaning up failed download for {video_id}")
|
|
|
|
# Remove video and audio files
|
|
video_path = self.videos_dir / f"{video_id}.mp4"
|
|
audio_path = self.audio_dir / f"{video_id}.mp3"
|
|
info_path = self.videos_dir / f"{video_id}.info.json"
|
|
thumb_path = self.videos_dir / f"{video_id}.jpg"
|
|
|
|
for path in [video_path, audio_path, info_path, thumb_path]:
|
|
if path.exists():
|
|
path.unlink()
|
|
logger.debug(f"Removed {path}")
|
|
|
|
# Remove from cache if exists
|
|
video_hash = self._get_video_hash(video_id)
|
|
if video_hash in self.cache:
|
|
del self.cache[video_hash]
|
|
self._save_cache()
|
|
|
|
def get_storage_stats(self) -> Dict:
|
|
"""Get storage statistics."""
|
|
total_videos = len(self.cache)
|
|
total_size = self.get_current_storage_usage()
|
|
available_size = self.max_storage_bytes - total_size
|
|
|
|
return {
|
|
'total_videos': total_videos,
|
|
'total_size_bytes': total_size,
|
|
'total_size_mb': total_size / (1024 * 1024),
|
|
'total_size_gb': total_size / (1024 * 1024 * 1024),
|
|
'max_size_bytes': self.max_storage_bytes,
|
|
'max_size_gb': self.max_storage_bytes / (1024 * 1024 * 1024),
|
|
'available_bytes': available_size,
|
|
'available_mb': available_size / (1024 * 1024),
|
|
'available_gb': available_size / (1024 * 1024 * 1024),
|
|
'usage_percent': (total_size / self.max_storage_bytes * 100) if self.max_storage_bytes > 0 else 0,
|
|
'video_quality': self.video_quality,
|
|
'keep_videos': self.keep_videos
|
|
}
|
|
|
|
def get_download_progress(self, video_id: str) -> Optional[Dict]:
|
|
"""Get download progress for a specific video."""
|
|
return self.download_progress.get(video_id)
|
|
|
|
def get_cached_videos(self) -> List[Dict]:
|
|
"""Get list of all cached videos with their info."""
|
|
videos = []
|
|
for video_hash, info in self.cache.items():
|
|
video_info = info.copy()
|
|
video_info['hash'] = video_hash
|
|
video_info['exists'] = Path(info['video_path']).exists()
|
|
videos.append(video_info)
|
|
|
|
# Sort by download date, newest first
|
|
videos.sort(key=lambda x: x.get('download_date', ''), reverse=True)
|
|
return videos |