# youtube-summarizer/backend/models/video_download.py
#
# 217 lines
# 6.3 KiB
# Python

"""
Video download models and data structures
"""
import asyncio
import time
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, HttpUrl, Field
class DownloadMethod(str, Enum):
    """Identifiers for the download strategies known to the system.

    The ``str`` mixin makes every member a real string, so a member compares
    equal to its raw value (``DownloadMethod.YT_DLP == "yt-dlp"``) and can be
    used directly as a dictionary key or serialized without conversion.
    """

    PYTUBEFIX = "pytubefix"
    YT_DLP = "yt-dlp"
    PLAYWRIGHT = "playwright"
    EXTERNAL_TOOL = "external_tool"
    WEB_SERVICE = "web_service"
    TRANSCRIPT_ONLY = "transcript_only"
    # NOTE(review): presumably a sentinel for "no method succeeded" rather
    # than an actual strategy -- confirm against the code that sets it.
    FAILED = "failed"
class VideoQuality(str, Enum):
    """Selectable quality levels for a download.

    Members are strings (``str`` mixin); the resolution members carry the
    conventional ``"<height>p"`` label as their value.
    """

    AUDIO_ONLY = "audio_only"
    LOW_480P = "480p"
    MEDIUM_720P = "720p"
    HIGH_1080P = "1080p"
    ULTRA_1440P = "1440p"
    MAX_2160P = "2160p"
    BEST = "best"
class DownloadStatus(str, Enum):
    """Lifecycle states of a download operation.

    Members are strings (``str`` mixin), so they compare equal to their raw
    values and serialize as plain text.
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    PARTIAL = "partial"  # Transcript only, no video
    CANCELLED = "cancelled"
class DownloadPreferences(BaseModel):
    """User preferences for video downloading.

    Every field has a default, so ``DownloadPreferences()`` yields a usable
    configuration. The defaults favor audio extraction without keeping the
    video file.
    """

    # Requested quality level.
    quality: VideoQuality = VideoQuality.MEDIUM_720P
    # For transcription, audio is sufficient.
    prefer_audio_only: bool = True
    # Skip very long videos (limit expressed in minutes).
    max_duration_minutes: int = 180
    fallback_to_transcript: bool = True
    extract_audio: bool = True
    # Off by default for storage optimization.
    save_video: bool = False
    # Free-form string; no validation of the format name is applied here.
    output_format: str = "mp4"
    enable_subtitles: bool = True
class VideoMetadata(BaseModel):
    """Video metadata from various sources.

    Only ``video_id`` is required; every other field is optional or has a
    default because not every source supplies every attribute.
    """

    video_id: str
    title: Optional[str] = None
    description: Optional[str] = None
    duration_seconds: Optional[int] = None
    view_count: Optional[int] = None
    # Stored as a plain string; the date format is not defined in this model.
    upload_date: Optional[str] = None
    uploader: Optional[str] = None
    thumbnail_url: Optional[str] = None
    # default_factory gives each instance its own list.
    tags: List[str] = Field(default_factory=list)
    language: Optional[str] = "en"
    availability: Optional[str] = None  # public, private, unlisted
    age_restricted: bool = False
class TranscriptData(BaseModel):
    """Transcript text together with information about its origin.

    ``text`` is the only required field.
    """

    text: str
    language: str = "en"
    is_auto_generated: bool = False
    # NOTE(review): presumably per-segment entries (text/timing); the dict
    # schema is not defined in this model -- confirm against the producer.
    segments: Optional[List[Dict[str, Any]]] = None
    # Source of transcript.
    source: str = "youtube-transcript-api"
class VideoDownloadResult(BaseModel):
    """Result of a video download operation.

    ``video_id``, ``video_url``, ``status`` and ``method`` are required; all
    other fields are optional or defaulted so that failed and partial results
    can be represented with the same model.
    """

    video_id: str
    video_url: str
    status: DownloadStatus
    method: DownloadMethod

    # File paths
    video_path: Optional[Path] = None
    audio_path: Optional[Path] = None

    # Content
    transcript: Optional[TranscriptData] = None
    metadata: Optional[VideoMetadata] = None

    # Performance metrics
    download_time_seconds: Optional[float] = None
    file_size_bytes: Optional[int] = None
    processing_time_seconds: Optional[float] = None

    # Error handling
    error_message: Optional[str] = None
    error_details: Optional[Dict[str, Any]] = None
    retry_count: int = 0

    # Flags
    is_partial: bool = False  # True if only transcript/metadata available
    from_cache: bool = False

    # Evaluated per instance; datetime.now() without a tz yields a naive
    # local-time timestamp.
    created_at: datetime = Field(default_factory=datetime.now)

    class Config:
        arbitrary_types_allowed = True
class DownloadJobStatus(BaseModel):
    """Status of a download job.

    ``job_id``, ``video_url`` and ``status`` are required. Both timestamps
    default to the moment the instance is created; nothing in this model
    refreshes ``updated_at`` automatically.
    """

    job_id: str
    video_url: str
    status: DownloadStatus
    # No range validation is applied to this value here.
    progress_percent: float = 0.0
    current_method: Optional[DownloadMethod] = None
    error_message: Optional[str] = None
    estimated_completion: Optional[datetime] = None
    # Naive local-time timestamps (datetime.now() without a tz).
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
class DownloadMetrics(BaseModel):
    """Download performance metrics.

    ``method_success_rates`` and ``method_attempt_counts`` are keyed by the
    ``DownloadMethod`` value string and are maintained by
    :meth:`update_success_rate`. The remaining counters and averages are
    plain fields; nothing in this class updates them.
    """

    total_attempts: int = 0
    successful_downloads: int = 0
    failed_downloads: int = 0
    partial_downloads: int = 0  # Transcript-only results

    # Method-specific success rates
    method_success_rates: Dict[str, float] = Field(default_factory=dict)
    method_attempt_counts: Dict[str, int] = Field(default_factory=dict)

    # Performance metrics
    average_download_time: float = 0.0
    average_file_size_mb: float = 0.0

    # Error analysis
    common_errors: Dict[str, int] = Field(default_factory=dict)

    last_updated: datetime = Field(default_factory=datetime.now)

    def update_success_rate(self, method: DownloadMethod, success: bool) -> None:
        """Record one attempt for ``method`` and refresh its success rate.

        The previous number of successes is reconstructed as
        ``rate * attempts``, the new outcome is folded in, and the rate is
        recomputed over ``attempts + 1``. ``last_updated`` is set to the
        current time.

        Args:
            method: The download method the attempt was made with; its
                ``value`` is used as the dictionary key.
            success: Whether the attempt succeeded.
        """
        key = method.value

        # Use .get() so a method present in only one of the two dicts (for
        # example after loading partially populated metrics) is treated as
        # having no history instead of raising KeyError.
        prior_attempts = self.method_attempt_counts.get(key, 0)
        prior_rate = self.method_success_rates.get(key, 0.0)

        successes = prior_rate * prior_attempts + (1 if success else 0)
        attempts = prior_attempts + 1  # always >= 1, so the division is safe

        self.method_attempt_counts[key] = attempts
        self.method_success_rates[key] = successes / attempts
        self.last_updated = datetime.now()
class HealthCheckResult(BaseModel):
    """Health check result for download system.

    ``overall_status``, ``healthy_methods``, ``total_methods`` and
    ``method_details`` are required.
    """

    # Free-form string (not validated here): healthy, degraded, unhealthy
    overall_status: str
    healthy_methods: int
    total_methods: int
    # NOTE(review): presumably keyed by method name; the inner dict schema is
    # not defined in this model -- confirm against the health checker.
    method_details: Dict[str, Dict[str, Any]]
    recommendations: List[str] = Field(default_factory=list)
    # Naive local-time timestamp taken when the instance is created.
    last_check: datetime = Field(default_factory=datetime.now)
class DownloaderException(Exception):
    """Base exception for download operations.

    The more specific download errors in this module derive from this class,
    so catching it handles any of them.
    """
class VideoNotAvailableError(DownloaderException):
    """Signals that the video is not available for download."""
class UnsupportedFormatError(DownloaderException):
    """Signals that the requested format is not supported."""
class DownloadTimeoutError(DownloaderException):
    """Signals that a download operation timed out."""
class QuotaExceededError(DownloaderException):
    """Signals that an API quota was exceeded."""
class NetworkError(DownloaderException):
    """Signals a network-related error."""
class AllMethodsFailedError(DownloaderException):
    """Signals that all download methods have failed."""