# Transcript data models: single-source extraction plus dual (YouTube/Whisper)
# transcript comparison request/response schemas.
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum


class TranscriptSource(str, Enum):
    """Transcript source options for dual transcript functionality."""

    YOUTUBE = "youtube"  # transcript taken from YouTube
    WHISPER = "whisper"  # transcript produced by Whisper
    BOTH = "both"        # extract from both sources


class ExtractionMethod(str, Enum):
    """How a transcript was (or failed to be) obtained."""

    YOUTUBE_API = "youtube_api"
    AUTO_CAPTIONS = "auto_captions"
    WHISPER_AUDIO = "whisper_audio"
    WHISPER_API = "whisper_api"
    MOCK = "mock"      # placeholder/test data
    FAILED = "failed"  # extraction did not succeed


class TranscriptSegment(BaseModel):
    """A single timed piece of transcript text.

    Attributes:
        text: The segment's transcript text.
        start: Segment start time, in seconds.
        duration: Segment length, in seconds.
    """

    text: str
    start: float
    duration: float

    @property
    def end(self) -> float:
        """End time of the segment in seconds (start + duration)."""
        return self.start + self.duration


class TranscriptMetadata(BaseModel):
    """Summary statistics and provenance for an extracted transcript."""

    word_count: int
    estimated_reading_time: int  # in seconds
    language: str
    has_timestamps: bool
    extraction_method: ExtractionMethod
    processing_time_seconds: float


class TranscriptChunk(BaseModel):
    """A token-counted slice of a transcript, with optional timing bounds."""

    chunk_index: int
    text: str
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    token_count: int


class TranscriptResult(BaseModel):
    """Outcome of a single transcript extraction attempt.

    On failure, `success` is False and `error` carries the details; the
    transcript/segments/metadata fields may then be None.
    """

    video_id: str
    transcript: Optional[str] = None
    segments: Optional[List[TranscriptSegment]] = None
    metadata: Optional[TranscriptMetadata] = None
    method: ExtractionMethod
    success: bool
    from_cache: bool = False  # True when served from cache rather than extracted
    error: Optional[Dict[str, Any]] = None


class TranscriptRequest(BaseModel):
    """Incoming request for a single-source transcript extraction."""

    video_id: str = Field(..., description="YouTube video ID")
    language_preference: str = Field("en", description="Preferred transcript language")
    include_metadata: bool = Field(True, description="Include transcript metadata")


class TranscriptResponse(BaseModel):
    """API response payload for a single-source transcript extraction."""

    video_id: str
    transcript: Optional[str] = None
    segments: Optional[List[TranscriptSegment]] = None
    metadata: Optional[TranscriptMetadata] = None
    extraction_method: str
    language: str
    word_count: int
    cached: bool  # True when the result came from cache
    processing_time_seconds: float
    error: Optional[Dict[str, Any]] = None


class JobResponse(BaseModel):
    """Acknowledgement returned when an async job is accepted."""

    job_id: str
    status: str
    message: str


class JobStatusResponse(BaseModel):
    """Polling response describing the state of an async transcript job."""

    job_id: str
    status: str  # "pending", "processing", "completed", "failed"
    progress_percentage: int
    current_step: Optional[str] = None
    result: Optional[TranscriptResponse] = None  # populated once completed
    error: Optional[Dict[str, Any]] = None       # populated on failure


# Dual-transcript models for enhanced functionality
class DualTranscriptSegment(BaseModel):
    """Enhanced transcript segment with confidence and speaker info."""

    start_time: float
    end_time: float
    text: str
    confidence: Optional[float] = None  # recognizer confidence, when available
    speaker: Optional[str] = None       # speaker label, when available

    @property
    def duration(self) -> float:
        """Get duration of the segment in seconds."""
        return self.end_time - self.start_time


class DualTranscriptMetadata(BaseModel):
    """Enhanced metadata for dual transcript functionality."""

    video_id: str
    language: str
    word_count: int
    total_segments: int
    has_timestamps: bool
    extraction_method: str
    processing_time_seconds: float = 0.0
    quality_score: float = 0.0
    confidence_score: float = 0.0
    estimated_reading_time_minutes: Optional[float] = None

    def model_post_init(self, __context):
        """Calculate derived fields after initialization."""
        if self.estimated_reading_time_minutes is not None:
            return
        # Average reading speed: 200 words per minute
        self.estimated_reading_time_minutes = self.word_count / 200.0


class TranscriptComparison(BaseModel):
    """Comparison metrics between two transcripts."""

    word_count_difference: int
    similarity_score: float                  # 0-1 scale
    punctuation_improvement_score: float     # 0-1 scale
    capitalization_improvement_score: float  # 0-1 scale
    processing_time_ratio: float             # whisper_time / youtube_time
    quality_difference: float                # whisper_quality - youtube_quality
    confidence_difference: float             # whisper_confidence - youtube_confidence
    recommendation: str                      # "youtube", "whisper", or "both"
    significant_differences: List[str]
    technical_terms_improved: List[str]


class DualTranscriptResult(BaseModel):
    """Result from dual transcript extraction.

    Depending on `source`, one or both of the YouTube/Whisper transcript
    pairs are populated; `comparison` is present only when both were
    extracted and compared.
    """

    video_id: str
    source: TranscriptSource
    youtube_transcript: Optional[List[DualTranscriptSegment]] = None
    youtube_metadata: Optional[DualTranscriptMetadata] = None
    whisper_transcript: Optional[List[DualTranscriptSegment]] = None
    whisper_metadata: Optional[DualTranscriptMetadata] = None
    comparison: Optional[TranscriptComparison] = None
    processing_time_seconds: float
    success: bool
    error: Optional[str] = None

    @property
    def has_youtube(self) -> bool:
        """Check if YouTube transcript is available (non-None and non-empty)."""
        return bool(self.youtube_transcript)

    @property
    def has_whisper(self) -> bool:
        """Check if Whisper transcript is available (non-None and non-empty)."""
        return bool(self.whisper_transcript)

    @property
    def has_comparison(self) -> bool:
        """Check if comparison data is available."""
        return self.comparison is not None

    def get_transcript(self, source: str) -> Optional[List[DualTranscriptSegment]]:
        """Get transcript by source name ("youtube" or "whisper"); None otherwise."""
        by_source = {
            "youtube": self.youtube_transcript,
            "whisper": self.whisper_transcript,
        }
        return by_source.get(source)

    def get_metadata(self, source: str) -> Optional[DualTranscriptMetadata]:
        """Get metadata by source name ("youtube" or "whisper"); None otherwise."""
        by_source = {
            "youtube": self.youtube_metadata,
            "whisper": self.whisper_metadata,
        }
        return by_source.get(source)


class DualTranscriptRequest(BaseModel):
    """Request model for dual transcript extraction."""

    video_url: str
    transcript_source: TranscriptSource
    whisper_model_size: str = "small"  # For Whisper: tiny, base, small, medium, large
    include_metadata: bool = True
    include_comparison: bool = True    # Only relevant when source is BOTH


class ProcessingTimeEstimate(BaseModel):
    """Processing time estimates for different transcript sources."""

    youtube_seconds: Optional[float] = None
    whisper_seconds: Optional[float] = None
    total_seconds: Optional[float] = None
    estimated_completion: Optional[str] = None  # ISO timestamp


# Response models for API
class DualTranscriptResponse(BaseModel):
    """API response for dual transcript extraction.

    The `has_*` convenience flags are recomputed from the payload after
    validation, so any caller-supplied values are overwritten.
    """

    video_id: str
    source: TranscriptSource
    youtube_transcript: Optional[List[DualTranscriptSegment]] = None
    youtube_metadata: Optional[DualTranscriptMetadata] = None
    whisper_transcript: Optional[List[DualTranscriptSegment]] = None
    whisper_metadata: Optional[DualTranscriptMetadata] = None
    comparison: Optional[TranscriptComparison] = None
    processing_time_seconds: float
    success: bool
    error: Optional[str] = None
    has_youtube: bool = False
    has_whisper: bool = False
    has_comparison: bool = False

    def model_post_init(self, __context):
        """Calculate derived properties after initialization."""
        # bool() on an Optional[List] is equivalent to "is not None and non-empty".
        self.has_youtube = bool(self.youtube_transcript)
        self.has_whisper = bool(self.whisper_transcript)
        self.has_comparison = self.comparison is not None