youtube-summarizer/backend/config/video_download_config.py

186 lines
8.4 KiB
Python

"""
Video download configuration
"""
from pathlib import Path
from typing import List, Optional, Dict, Any
try:
from pydantic_settings import BaseSettings
from pydantic import Field
except ImportError:
# Fallback for older pydantic versions
from pydantic import BaseSettings, Field
from backend.models.video_download import VideoQuality, DownloadMethod
class VideoDownloadConfig(BaseSettings):
    """Configuration for video download system"""

    # Every field below is a pydantic settings field with a default value.
    # The inner ``Config`` class sets the ".env" file and the
    # "VIDEO_DOWNLOAD_" prefix used when loading values from the environment.

    # --- API credentials -------------------------------------------------
    youtube_api_key: Optional[str] = Field(
        None, description="YouTube Data API v3 key"
    )

    # --- Storage ---------------------------------------------------------
    storage_path: Path = Field(
        Path("./video_storage"), description="Base storage directory"
    )
    max_storage_gb: float = Field(
        10.0, description="Maximum storage size in GB"
    )
    cleanup_older_than_days: int = Field(
        30, description="Clean up files older than X days"
    )
    temp_dir: Path = Field(
        Path("./video_storage/temp"), description="Temporary files directory"
    )

    # --- Download preferences --------------------------------------------
    default_quality: VideoQuality = Field(
        VideoQuality.MEDIUM_720P, description="Default video quality"
    )
    max_video_duration_minutes: int = Field(
        180, description="Skip videos longer than X minutes"
    )
    prefer_audio_only: bool = Field(
        True, description="Prefer audio-only for transcription"
    )
    extract_audio: bool = Field(True, description="Always extract audio")
    save_video: bool = Field(
        False, description="Save video files (storage optimization)"
    )

    # --- Download methods ------------------------------------------------
    # List order is the order of preference (see get_method_priority).
    enabled_methods: List[DownloadMethod] = Field(
        default=[
            DownloadMethod.PYTUBEFIX,
            DownloadMethod.YT_DLP,
            DownloadMethod.PLAYWRIGHT,
            DownloadMethod.TRANSCRIPT_ONLY
        ],
        description="Enabled download methods in order of preference"
    )
    method_timeout_seconds: int = Field(
        120, description="Timeout per download method"
    )
    max_retries_per_method: int = Field(
        2, description="Max retries per method"
    )

    # --- yt-dlp ----------------------------------------------------------
    ytdlp_use_cookies: bool = Field(True, description="Use cookies for yt-dlp")
    ytdlp_cookies_file: Optional[Path] = Field(
        None, description="Path to cookies.txt file"
    )
    ytdlp_user_agents: List[str] = Field(
        default=[
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ],
        description="User agents for yt-dlp rotation"
    )

    # --- Playwright ------------------------------------------------------
    playwright_headless: bool = Field(
        True, description="Run Playwright in headless mode"
    )
    playwright_browser_session: Optional[Path] = Field(
        None, description="Saved browser session"
    )
    playwright_timeout: int = Field(
        30000, description="Playwright timeout in milliseconds"
    )

    # --- External tools --------------------------------------------------
    external_tools_enabled: bool = Field(
        True, description="Enable external tools"
    )
    fourk_video_downloader_path: Optional[Path] = Field(
        None, description="Path to 4K Video Downloader CLI"
    )

    # --- Web services ----------------------------------------------------
    web_services_enabled: bool = Field(
        True, description="Enable web service APIs"
    )
    web_service_timeout: int = Field(
        30, description="Web service timeout in seconds"
    )
    web_service_user_agents: List[str] = Field(
        default=[
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        ],
        description="User agents for web services"
    )

    # --- Performance -----------------------------------------------------
    max_concurrent_downloads: int = Field(
        3, description="Maximum concurrent downloads"
    )
    cache_results: bool = Field(True, description="Cache download results")
    cache_ttl_hours: int = Field(24, description="Cache TTL in hours")

    # --- Monitoring and health -------------------------------------------
    health_check_interval_minutes: int = Field(
        30, description="Health check interval"
    )
    success_rate_threshold: float = Field(
        0.7, description="Switch methods if success rate drops below"
    )
    enable_telemetry: bool = Field(
        True, description="Enable performance telemetry"
    )

    # --- Error handling --------------------------------------------------
    max_total_retries: int = Field(
        5, description="Maximum total retries across all methods"
    )
    backoff_factor: float = Field(
        1.5, description="Exponential backoff factor"
    )

    # --- Audio processing ------------------------------------------------
    audio_format: str = Field("mp3", description="Audio output format")
    audio_quality: str = Field("192k", description="Audio quality")
    keep_audio_files: bool = Field(
        True, description="Keep audio files for future re-transcription"
    )
    audio_cleanup_days: int = Field(
        30, description="Delete audio files older than X days (0 = never delete)"
    )

    # --- Video processing ------------------------------------------------
    video_format: str = Field("mp4", description="Video output format")
    merge_audio_video: bool = Field(
        True, description="Merge audio and video streams"
    )

    # --- Faster-Whisper transcription settings ---------------------------
    whisper_model: str = Field(
        "large-v3-turbo",
        description="Faster-whisper model ('large-v3-turbo', 'large-v3', 'large-v2', 'medium', 'small', 'base', 'tiny')"
    )
    whisper_device: str = Field(
        "auto", description="Processing device ('auto', 'cpu', 'cuda')"
    )
    whisper_compute_type: str = Field(
        "auto", description="Compute type ('auto', 'int8', 'float16', 'float32')"
    )
    whisper_beam_size: int = Field(
        5, description="Beam search size (1-10, higher = better quality)"
    )
    whisper_vad_filter: bool = Field(
        True, description="Voice Activity Detection for efficiency"
    )
    whisper_word_timestamps: bool = Field(
        True, description="Enable word-level timestamps"
    )
    whisper_temperature: float = Field(
        0.0, description="Sampling temperature (0 = deterministic)"
    )
    whisper_best_of: int = Field(
        5, description="Number of candidates when sampling"
    )

    class Config:
        # Settings-loading options: dotenv file name, environment variable
        # prefix, case-insensitive matching, and tolerance of unknown keys.
        env_file = ".env"
        env_prefix = "VIDEO_DOWNLOAD_"
        case_sensitive = False
        extra = "ignore"  # Allow extra environment variables

    def get_storage_dirs(self) -> Dict[str, Path]:
        """Map each storage area name to its directory.

        Returns:
            A dict whose "base" entry is ``storage_path`` and whose other
            entries ("videos", "audio", "transcripts", "summaries", "temp",
            "cache", "logs") are same-named subdirectories of it.

        NOTE(review): the "temp" entry is derived from ``storage_path``; the
        separate ``temp_dir`` field is not consulted here - confirm that this
        is intended.
        """
        root = Path(self.storage_path)
        subdir_names = (
            "videos",
            "audio",
            "transcripts",
            "summaries",
            "temp",
            "cache",
            "logs",
        )
        layout: Dict[str, Path] = {"base": root}
        for name in subdir_names:
            layout[name] = root / name
        return layout

    def ensure_directories(self):
        """Create every directory reported by get_storage_dirs().

        Missing parents are created and existing directories are left
        untouched (``parents=True, exist_ok=True``).
        """
        for directory in self.get_storage_dirs().values():
            directory.mkdir(parents=True, exist_ok=True)

    def get_method_priority(self) -> List[DownloadMethod]:
        """Return a new list of the enabled methods, in configured order."""
        return list(self.enabled_methods)

    def is_method_enabled(self, method: DownloadMethod) -> bool:
        """Tell whether ``method`` appears in ``enabled_methods``."""
        enabled = self.enabled_methods
        return method in enabled
# Default configuration instance, constructed once when this module is
# imported. get_video_download_config() below does not return this object;
# it builds a new VideoDownloadConfig on every call.
default_config = VideoDownloadConfig()
def get_video_download_config() -> VideoDownloadConfig:
    """Build and return a new VideoDownloadConfig.

    A fresh instance is constructed on every call; the module-level
    ``default_config`` object is not reused.
    """
    fresh_config = VideoDownloadConfig()
    return fresh_config
# Configuration validation
def validate_config(config: VideoDownloadConfig) -> List[str]:
    """Inspect a configuration and report anything that looks wrong.

    Args:
        config: The configuration to check.

    Returns:
        Human-readable warning messages, in the order the checks run.
        The list is empty when no check fires. Nothing is raised for a
        failed check; problems are only reported.
    """
    problems = []

    # Storage budget below 1 GB.
    if config.max_storage_gb < 1.0:
        problems.append("Storage limit is very low (< 1GB)")

    # An empty method list leaves nothing to download with.
    if not config.enabled_methods:
        problems.append("No download methods enabled")

    # Playwright is enabled: probe whether the package can be imported.
    if DownloadMethod.PLAYWRIGHT in config.enabled_methods:
        try:
            import playwright  # noqa: F401 - imported only as an availability probe
        except ImportError:
            problems.append("Playwright not installed but enabled in config")

    # Optional external tool path: only checked when one is configured.
    tool_path = config.fourk_video_downloader_path
    if tool_path and not tool_path.exists():
        problems.append(f"4K Video Downloader path does not exist: {tool_path}")

    # Optional cookies file: only checked when one is configured.
    cookies_path = config.ytdlp_cookies_file
    if cookies_path and not cookies_path.exists():
        problems.append(f"yt-dlp cookies file does not exist: {cookies_path}")

    return problems