# File listing metadata: 247 lines, 6.9 KiB, Python
"""
|
|
Data models for the YouTube Summarizer SDK
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlsplit

from pydantic import BaseModel, Field, HttpUrl, validator
|
|
|
|
class TranscriptSource(str, Enum):
    """Transcript source options.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    YOUTUBE = "youtube"
    WHISPER = "whisper"
    BOTH = "both"
|
|
|
|
class WhisperModelSize(str, Enum):
    """Whisper model size options.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    TINY = "tiny"
    BASE = "base"
    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"
|
|
|
|
class ProcessingPriority(str, Enum):
    """Processing priority levels.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    LOW = "low"
    NORMAL = "normal"
    HIGH = "high"
    URGENT = "urgent"
|
|
|
|
class JobStatus(str, Enum):
    """Job processing status.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    QUEUED = "queued"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
|
|
class WebhookEvent(str, Enum):
    """Webhook event types.

    Values are dotted ``<scope>.<event>`` strings; members are also ``str``
    instances, so they compare equal to those plain strings.
    """

    JOB_STARTED = "job.started"
    JOB_PROGRESS = "job.progress"
    JOB_COMPLETED = "job.completed"
    JOB_FAILED = "job.failed"
    BATCH_COMPLETED = "batch.completed"
|
|
|
|
# Request Models
|
|
class TranscriptRequest(BaseModel):
    """Request model for transcript extraction.

    ``video_url`` is validated both as an HTTP(S) URL and as a YouTube link
    (see ``validate_youtube_url``). All other fields are optional and fall
    back to the defaults declared below.
    """

    video_url: HttpUrl = Field(..., description="YouTube video URL")
    transcript_source: TranscriptSource = Field(
        default=TranscriptSource.YOUTUBE,
        description="Transcript source",
    )
    whisper_model_size: Optional[WhisperModelSize] = Field(
        default=WhisperModelSize.SMALL,
        description="Whisper model size for AI transcription",
    )
    priority: ProcessingPriority = Field(
        default=ProcessingPriority.NORMAL,
        description="Processing priority",
    )
    webhook_url: Optional[HttpUrl] = Field(
        None,
        description="Webhook URL for notifications",
    )
    include_quality_analysis: bool = Field(
        default=True,
        description="Include transcript quality analysis",
    )
    custom_prompt: Optional[str] = Field(
        None,
        description="Custom processing prompt",
    )
    tags: List[str] = Field(
        default_factory=list,
        description="Custom tags for organization",
    )

    @validator('video_url')
    def validate_youtube_url(cls, v):
        """Validate that the URL points at YouTube.

        Accepted forms:

        * ``youtube.com`` or any subdomain of it (``www.``, ``m.``, ...)
          with a path starting with ``/watch`` or ``/embed``;
        * ``youtu.be`` (or a subdomain of it) with any path.

        The check runs on the parsed hostname and path rather than on the
        raw string, so a foreign host that merely mentions
        ``youtube.com/watch`` in its path or query string is rejected.

        Args:
            v: The already type-validated URL value.

        Returns:
            ``v`` unchanged when it is a recognised YouTube URL.

        Raises:
            ValueError: If the URL is not a recognised YouTube URL.
        """
        parts = urlsplit(str(v))
        # ``hostname`` strips credentials and port; host names are
        # case-insensitive, so normalise before comparing.
        host = (parts.hostname or '').lower()
        path = parts.path

        if host == 'youtube.com' or host.endswith('.youtube.com'):
            is_valid = path.startswith(('/watch', '/embed'))
        elif host == 'youtu.be' or host.endswith('.youtu.be'):
            is_valid = path.startswith('/')
        else:
            is_valid = False

        if not is_valid:
            raise ValueError('Invalid YouTube URL format')

        return v
|
|
|
|
class BatchProcessingRequest(BaseModel):
    """Request model for batch processing.

    ``video_urls`` must contain between 1 and 1000 entries and
    ``max_concurrent_jobs`` must lie in the range 1..50. The URLs are only
    validated as HTTP(S) URLs here; this model applies no YouTube-specific
    check.
    """

    video_urls: List[HttpUrl] = Field(
        ...,
        min_items=1,
        max_items=1000,
        description="List of YouTube video URLs",
    )
    batch_name: str = Field(..., description="Batch job name")
    transcript_source: TranscriptSource = Field(
        default=TranscriptSource.YOUTUBE,
        description="Transcript source for all videos",
    )
    priority: ProcessingPriority = Field(
        default=ProcessingPriority.NORMAL,
        description="Processing priority",
    )
    webhook_url: Optional[HttpUrl] = Field(
        None,
        description="Webhook URL for batch notifications",
    )
    parallel_processing: bool = Field(
        default=False,
        description="Enable parallel processing",
    )
    max_concurrent_jobs: int = Field(
        default=5,
        ge=1,
        le=50,
        description="Maximum concurrent jobs",
    )
|
|
|
|
# Response Models
|
|
class JobResponse(BaseModel):
    """Response model for job creation.

    Carries the job identifier together with its status, priority, timing
    and progress information, plus a free-form ``metadata`` mapping.
    """

    job_id: str
    status: JobStatus
    priority: ProcessingPriority
    created_at: datetime
    estimated_completion: Optional[datetime]
    progress_percentage: float
    current_stage: str
    webhook_url: Optional[str]
    metadata: Dict[str, Any]
|
|
|
|
class BatchJobResponse(BaseModel):
    """Response model for batch job creation.

    Carries the batch identifier together with its status, size, priority
    and processing options, plus a free-form ``metadata`` mapping.
    """

    batch_id: str
    status: JobStatus
    video_count: int
    priority: ProcessingPriority
    estimated_completion: Optional[datetime]
    parallel_processing: bool
    webhook_url: Optional[str]
    metadata: Dict[str, Any]
|
|
|
|
class TranscriptResult(BaseModel):
    """Transcript extraction result.

    Holds the transcript text and/or its segments for one video, along with
    the source used, processing time and optional quality/confidence scores.
    """

    job_id: str
    video_id: str
    video_url: str
    transcript_source: TranscriptSource
    transcript: Optional[str]
    segments: Optional[List[Dict[str, Any]]]
    processing_time_seconds: float
    quality_score: Optional[float]
    confidence_score: Optional[float]
    metadata: Dict[str, Any]
|
|
|
|
class QualityComparison(BaseModel):
    """Quality comparison between transcript sources.

    Bundles the comparison scores, the list of improved technical terms and
    the recommended ``TranscriptSource``.
    """

    similarity_score: float
    punctuation_improvement_score: float
    capitalization_improvement_score: float
    technical_terms_improved: List[str]
    recommendation: TranscriptSource
|
|
|
|
class DualTranscriptResult(BaseModel):
    """Result for dual transcript extraction.

    Holds the YouTube and Whisper transcripts (text and segments) for one
    video side by side, with an optional ``QualityComparison`` between them.
    """

    job_id: str
    video_id: str
    video_url: str
    youtube_transcript: Optional[str]
    whisper_transcript: Optional[str]
    youtube_segments: Optional[List[Dict[str, Any]]]
    whisper_segments: Optional[List[Dict[str, Any]]]
    quality_comparison: Optional[QualityComparison]
    processing_time_seconds: float
    metadata: Dict[str, Any]
|
|
|
|
class APIUsageStats(BaseModel):
    """API usage statistics.

    Request counters, average response time, success rate and the remaining
    rate-limit allowance together with the quota reset time.
    """

    total_requests: int
    requests_today: int
    requests_this_month: int
    average_response_time_ms: float
    success_rate: float
    rate_limit_remaining: int
    quota_reset_time: datetime
|
|
|
|
class ProcessingTimeEstimate(BaseModel):
    """Processing time estimate.

    The estimated duration in seconds, with an optional cost estimate and an
    optional mapping of named factors to float values.
    """

    estimated_time_seconds: float
    estimated_cost: Optional[float]
    factors: Optional[Dict[str, float]]
|
|
|
|
# Webhook Models
|
|
class WebhookPayload(BaseModel):
    """Webhook notification payload.

    Pairs a ``WebhookEvent`` with its timestamp and a free-form ``data``
    mapping.
    """

    event: WebhookEvent
    timestamp: datetime
    data: Dict[str, Any]
|
|
|
|
@dataclass
class SDKConfig:
    """SDK configuration.

    Only ``api_key`` is required; every other setting has a default.
    """

    api_key: str
    base_url: str = "https://api.youtube-summarizer.com"
    timeout: float = 60.0
    max_retries: int = 3
    retry_delay: float = 1.0
    verify_ssl: bool = True
    # A string is immutable, so a plain default is enough here; no
    # default_factory (or f-string without placeholders) is needed.
    user_agent: str = "youtube-summarizer-python-sdk/4.2.0"
|
|
|
|
@dataclass
class WebSocketConfig:
    """WebSocket configuration for real-time updates.

    Every setting has a default, so ``WebSocketConfig()`` is valid as is.
    """

    url: str = "wss://api.youtube-summarizer.com/ws"
    auto_reconnect: bool = True
    max_reconnect_attempts: int = 5
    heartbeat_interval: float = 30.0
    reconnect_delay: float = 5.0
|
|
|
|
# MCP-specific models
|
|
class MCPToolRequest(BaseModel):
    """MCP tool request model.

    Names the tool and carries its arguments as a free-form mapping.
    """

    name: str
    arguments: Dict[str, Any]
|
|
|
|
class MCPToolResult(BaseModel):
    """MCP tool result model.

    ``content`` is a list of free-form mappings; ``is_error`` defaults to
    ``False``.
    """

    content: List[Dict[str, Any]]
    is_error: bool = False
|
|
|
|
class MCPResourceRequest(BaseModel):
    """MCP resource request model.

    Identifies the requested resource by its ``uri`` string.
    """

    uri: str
|
|
|
|
class MCPResourceResult(BaseModel):
    """MCP resource result model.

    ``contents`` is a list of free-form mappings.
    """

    contents: List[Dict[str, Any]]