# youtube-summarizer/sdks/python/youtube_summarizer_sdk/models.py
#
# 247 lines
# 6.9 KiB
# Python

"""
Data models for the YouTube Summarizer SDK
"""
from typing import List, Optional, Dict, Any, Union
from datetime import datetime
from enum import Enum
from dataclasses import dataclass, field
from pydantic import BaseModel, HttpUrl, Field, validator
class TranscriptSource(str, Enum):
    """Selectable origins for a transcript.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (``TranscriptSource.YOUTUBE == "youtube"``).
    """

    YOUTUBE = "youtube"
    WHISPER = "whisper"
    # Requests both of the sources above.
    BOTH = "both"
class WhisperModelSize(str, Enum):
    """Named Whisper model sizes that a request may ask for.

    Members are also ``str`` instances and compare equal to their lowercase
    value.
    """

    TINY = "tiny"
    BASE = "base"
    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"
class ProcessingPriority(str, Enum):
    """Priority labels that can be attached to a processing request.

    Members are also ``str`` instances and compare equal to their lowercase
    value. The enum itself defines no ordering between the levels.
    """

    LOW = "low"
    NORMAL = "normal"
    HIGH = "high"
    URGENT = "urgent"
class JobStatus(str, Enum):
    """States a processing job can be reported in.

    Members are also ``str`` instances and compare equal to their lowercase
    value.
    """

    QUEUED = "queued"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
class WebhookEvent(str, Enum):
    """Event names carried by webhook notifications.

    Values are dotted ``<scope>.<event>`` strings; members are also ``str``
    instances and compare equal to those values.
    """

    # Per-job events
    JOB_STARTED = "job.started"
    JOB_PROGRESS = "job.progress"
    JOB_COMPLETED = "job.completed"
    JOB_FAILED = "job.failed"
    # Batch-level event
    BATCH_COMPLETED = "batch.completed"
# Request Models
class TranscriptRequest(BaseModel):
    """Request model for transcript extraction.

    Only ``video_url`` is required; every other field has a default.
    ``video_url`` must additionally pass :meth:`validate_youtube_url`.
    """

    video_url: HttpUrl = Field(..., description="YouTube video URL")
    transcript_source: TranscriptSource = Field(
        default=TranscriptSource.YOUTUBE,
        description="Transcript source"
    )
    whisper_model_size: Optional[WhisperModelSize] = Field(
        default=WhisperModelSize.SMALL,
        description="Whisper model size for AI transcription"
    )
    priority: ProcessingPriority = Field(
        default=ProcessingPriority.NORMAL,
        description="Processing priority"
    )
    webhook_url: Optional[HttpUrl] = Field(
        None,
        description="Webhook URL for notifications"
    )
    include_quality_analysis: bool = Field(
        default=True,
        description="Include transcript quality analysis"
    )
    custom_prompt: Optional[str] = Field(
        None,
        description="Custom processing prompt"
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Custom tags for organization"
    )

    @validator('video_url')
    def validate_youtube_url(cls, v):
        """Check that ``video_url`` really points at YouTube.

        The host and path are inspected separately instead of searching the
        whole URL text, so a YouTube-looking fragment inside the query string
        or path of another site (``https://evil.example/?u=youtube.com/watch``)
        or a look-alike domain (``notyoutube.com``) is rejected.

        Accepted forms:
            * ``youtube.com`` or any subdomain of it (``www.``, ``m.``, ...)
              with a path starting with ``/watch`` or ``/embed``
            * ``youtu.be`` (or a subdomain) with any path

        Args:
            v: The already URL-validated field value.

        Returns:
            ``v`` unchanged when it is accepted.

        Raises:
            ValueError: If the URL is not one of the accepted forms.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urlsplit

        parts = urlsplit(str(v))
        host = (parts.hostname or "").lower()
        path = parts.path

        if host == "youtu.be" or host.endswith(".youtu.be"):
            is_youtube = path.startswith("/")
        elif host == "youtube.com" or host.endswith(".youtube.com"):
            is_youtube = path.startswith(("/watch", "/embed"))
        else:
            is_youtube = False

        if not is_youtube:
            raise ValueError('Invalid YouTube URL format')
        return v
class BatchProcessingRequest(BaseModel):
    """Payload describing a batch of videos to process together.

    ``video_urls`` and ``batch_name`` are required; the remaining fields fall
    back to the defaults declared below.
    """

    # Between 1 and 1000 entries.
    # NOTE(review): entries are only validated as HttpUrl here; this class
    # has no YouTube-specific validator - confirm that is intended.
    video_urls: List[HttpUrl] = Field(
        ...,
        description="List of YouTube video URLs",
        min_items=1,
        max_items=1000,
    )
    batch_name: str = Field(..., description="Batch job name")
    transcript_source: TranscriptSource = Field(
        TranscriptSource.YOUTUBE,
        description="Transcript source for all videos",
    )
    priority: ProcessingPriority = Field(
        ProcessingPriority.NORMAL,
        description="Processing priority",
    )
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for batch notifications",
    )
    parallel_processing: bool = Field(
        False,
        description="Enable parallel processing",
    )
    # Constrained to the inclusive range 1..50.
    max_concurrent_jobs: int = Field(
        5,
        description="Maximum concurrent jobs",
        ge=1,
        le=50,
    )
# Response Models
class JobResponse(BaseModel):
    """Response model for job creation.

    Nullable fields carry an explicit ``None`` default. Under pydantic 1.x an
    ``Optional[...]`` annotation already implied that default, but under
    pydantic 2.x it would make the field required, so a payload omitting it
    would fail to parse. Spelling the default out keeps both versions
    behaving the same.
    """

    job_id: str
    status: JobStatus
    priority: ProcessingPriority
    created_at: datetime
    estimated_completion: Optional[datetime] = None
    # NOTE(review): scale (0-1 vs 0-100) is not visible in this file.
    progress_percentage: float
    current_stage: str
    webhook_url: Optional[str] = None
    metadata: Dict[str, Any]
class BatchJobResponse(BaseModel):
    """Response model for batch job creation.

    Nullable fields carry an explicit ``None`` default so that a payload
    omitting them parses identically under pydantic 1.x (where ``Optional``
    implied the default) and pydantic 2.x (where it would otherwise make the
    field required).
    """

    batch_id: str
    status: JobStatus
    video_count: int
    priority: ProcessingPriority
    estimated_completion: Optional[datetime] = None
    parallel_processing: bool
    webhook_url: Optional[str] = None
    metadata: Dict[str, Any]
class TranscriptResult(BaseModel):
    """Transcript extraction result.

    Nullable fields carry an explicit ``None`` default so that a payload
    omitting them parses identically under pydantic 1.x (where ``Optional``
    implied the default) and pydantic 2.x (where it would otherwise make the
    field required).
    """

    job_id: str
    video_id: str
    video_url: str
    transcript_source: TranscriptSource
    transcript: Optional[str] = None
    # Each segment is a free-form dict; its keys are not defined in this file.
    segments: Optional[List[Dict[str, Any]]] = None
    processing_time_seconds: float
    # NOTE(review): score ranges are not visible in this file.
    quality_score: Optional[float] = None
    confidence_score: Optional[float] = None
    metadata: Dict[str, Any]
class QualityComparison(BaseModel):
    """Side-by-side quality metrics for two transcript sources.

    All fields are required.
    """

    # NOTE(review): numeric ranges of the three scores are not visible here.
    similarity_score: float
    punctuation_improvement_score: float
    capitalization_improvement_score: float
    technical_terms_improved: List[str]
    # The source this comparison recommends.
    recommendation: TranscriptSource
class DualTranscriptResult(BaseModel):
    """Result for dual transcript extraction.

    Holds one transcript/segment pair per source plus an optional
    comparison. Nullable fields carry an explicit ``None`` default so that a
    payload omitting them parses identically under pydantic 1.x (where
    ``Optional`` implied the default) and pydantic 2.x (where it would
    otherwise make the field required).
    """

    job_id: str
    video_id: str
    video_url: str
    youtube_transcript: Optional[str] = None
    whisper_transcript: Optional[str] = None
    # Segments are free-form dicts; their keys are not defined in this file.
    youtube_segments: Optional[List[Dict[str, Any]]] = None
    whisper_segments: Optional[List[Dict[str, Any]]] = None
    quality_comparison: Optional[QualityComparison] = None
    processing_time_seconds: float
    metadata: Dict[str, Any]
class APIUsageStats(BaseModel):
    """Snapshot of API usage counters.

    All fields are required.
    """

    # Request counters
    total_requests: int
    requests_today: int
    requests_this_month: int

    # Aggregates
    # NOTE(review): the "_ms" suffix suggests milliseconds; the scale of
    # success_rate (fraction vs percent) is not visible here.
    average_response_time_ms: float
    success_rate: float

    # Rate limiting
    rate_limit_remaining: int
    quota_reset_time: datetime
class ProcessingTimeEstimate(BaseModel):
    """Processing time estimate.

    Nullable fields carry an explicit ``None`` default so that a payload
    omitting them parses identically under pydantic 1.x (where ``Optional``
    implied the default) and pydantic 2.x (where it would otherwise make the
    field required).
    """

    estimated_time_seconds: float
    # NOTE(review): currency/unit of the cost is not visible in this file.
    estimated_cost: Optional[float] = None
    # Mapping of factor name to a numeric value; the names are not defined here.
    factors: Optional[Dict[str, float]] = None
# Webhook Models
class WebhookPayload(BaseModel):
    """Body of a webhook notification.

    All fields are required.
    """

    # Which event this notification reports.
    event: WebhookEvent
    timestamp: datetime
    # Free-form event data; its keys are not defined in this file.
    data: Dict[str, Any]
@dataclass
class SDKConfig:
    """SDK configuration.

    Only ``api_key`` is required; every other field has a default.

    ``api_key`` is left out of the generated ``repr`` so the credential does
    not leak into logs or tracebacks when a config object is printed. It
    still takes part in equality comparison.
    """

    api_key: str = field(repr=False)
    base_url: str = "https://api.youtube-summarizer.com"
    # NOTE(review): timeout and retry_delay are presumably seconds - confirm
    # against the client that consumes this config.
    timeout: float = 60.0
    max_retries: int = 3
    retry_delay: float = 1.0
    verify_ssl: bool = True
    # A constant string is immutable, so it needs neither a default_factory
    # nor an f-string.
    user_agent: str = "youtube-summarizer-python-sdk/4.2.0"
@dataclass
class WebSocketConfig:
    """Settings for the real-time WebSocket connection.

    Every field has a default, so ``WebSocketConfig()`` is a valid
    configuration.
    """

    url: str = "wss://api.youtube-summarizer.com/ws"

    # Reconnection behaviour
    auto_reconnect: bool = True
    max_reconnect_attempts: int = 5

    # NOTE(review): both intervals are presumably seconds - confirm against
    # the WebSocket client that consumes this config.
    heartbeat_interval: float = 30.0
    reconnect_delay: float = 5.0
# MCP-specific models
class MCPToolRequest(BaseModel):
    """Request to invoke an MCP tool.

    Both fields are required.
    """

    # Name of the tool being requested.
    name: str
    # Arguments for the tool, keyed by string.
    arguments: Dict[str, Any]
class MCPToolResult(BaseModel):
    """Result of an MCP tool invocation.

    ``content`` is required; ``is_error`` defaults to ``False``.
    """

    # List of free-form content dicts; their keys are not defined in this file.
    content: List[Dict[str, Any]]
    is_error: bool = False
class MCPResourceRequest(BaseModel):
    """Request for an MCP resource, identified by a single required URI."""

    # Plain string; no URL validation is applied to it here.
    uri: str
class MCPResourceResult(BaseModel):
    """Result of an MCP resource request."""

    # Required list of free-form dicts; their keys are not defined in this file.
    contents: List[Dict[str, Any]]