# youtube-summarizer/backend/models/transcript.py
"""Pydantic models for transcript extraction, background jobs, and dual-transcript comparison."""


from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Any
from datetime import datetime
from enum import Enum


class TranscriptSource(str, Enum):
    """Transcript source options for dual transcript functionality."""
    YOUTUBE = "youtube"
    WHISPER = "whisper"
    BOTH = "both"


class ExtractionMethod(str, Enum):
    """How a transcript was obtained (or why extraction failed)."""
    YOUTUBE_API = "youtube_api"
    AUTO_CAPTIONS = "auto_captions"
    WHISPER_AUDIO = "whisper_audio"
    WHISPER_API = "whisper_api"
    MOCK = "mock"
    FAILED = "failed"


class TranscriptSegment(BaseModel):
    """A single timed segment of transcript text."""
    text: str
    start: float
    duration: float

    @property
    def end(self) -> float:
        """End time of the segment in seconds."""
        return self.start + self.duration
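
# Illustrative usage (hypothetical values), showing the derived `end` property:
#   seg = TranscriptSegment(text="welcome back to the channel", start=12.0, duration=3.5)
#   seg.end  # 15.5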


class TranscriptMetadata(BaseModel):
    """Summary statistics for an extracted transcript."""
    word_count: int
    estimated_reading_time: int  # in seconds
    language: str
    has_timestamps: bool
    extraction_method: ExtractionMethod
    processing_time_seconds: float


class TranscriptChunk(BaseModel):
    """A token-counted chunk of transcript text."""
    chunk_index: int
    text: str
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    token_count: int


class TranscriptResult(BaseModel):
    """Internal result of a single transcript extraction attempt."""
    video_id: str
    transcript: Optional[str] = None
    segments: Optional[List[TranscriptSegment]] = None
    metadata: Optional[TranscriptMetadata] = None
    method: ExtractionMethod
    success: bool
    from_cache: bool = False
    error: Optional[Dict[str, Any]] = None
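
# Illustrative sketch (hypothetical values, including the error keys): a successful result
# carries transcript text, while a failed one carries only an error payload.
#   TranscriptResult(video_id="abc123def45", transcript="...", method=ExtractionMethod.YOUTUBE_API, success=True)
#   TranscriptResult(video_id="abc123def45", method=ExtractionMethod.FAILED, success=False,
#                    error={"type": "NoTranscriptFound", "message": "No captions available"})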


class TranscriptRequest(BaseModel):
    """Request model for single-source transcript extraction."""
    video_id: str = Field(..., description="YouTube video ID")
    language_preference: str = Field("en", description="Preferred transcript language")
    include_metadata: bool = Field(True, description="Include transcript metadata")


class TranscriptResponse(BaseModel):
    """API response for single-source transcript extraction."""
    video_id: str
    transcript: Optional[str] = None
    segments: Optional[List[TranscriptSegment]] = None
    metadata: Optional[TranscriptMetadata] = None
    extraction_method: str
    language: str
    word_count: int
    cached: bool
    processing_time_seconds: float
    error: Optional[Dict[str, Any]] = None


class JobResponse(BaseModel):
    """Response returned when a background extraction job is created."""
    job_id: str
    status: str
    message: str


class JobStatusResponse(BaseModel):
    """Status of a background transcript extraction job."""
    job_id: str
    status: str  # "pending", "processing", "completed", "failed"
    progress_percentage: int
    current_step: Optional[str] = None
    result: Optional[TranscriptResponse] = None
    error: Optional[Dict[str, Any]] = None
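
# Illustrative job lifecycle (hypothetical values): `status` moves from "pending" through
# "processing" to "completed" or "failed"; `result` is populated only on completion.
#   JobStatusResponse(job_id="job-1", status="processing", progress_percentage=40,
#                     current_step="extracting transcript")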


# Dual Transcript Models for Enhanced Functionality
class DualTranscriptSegment(BaseModel):
    """Enhanced transcript segment with confidence and speaker info."""
    start_time: float
    end_time: float
    text: str
    confidence: Optional[float] = None
    speaker: Optional[str] = None

    @property
    def duration(self) -> float:
        """Get duration of the segment in seconds."""
        return self.end_time - self.start_time


class DualTranscriptMetadata(BaseModel):
    """Enhanced metadata for dual transcript functionality."""
    video_id: str
    language: str
    word_count: int
    total_segments: int
    has_timestamps: bool
    extraction_method: str
    processing_time_seconds: float = 0.0
    quality_score: float = 0.0
    confidence_score: float = 0.0
    estimated_reading_time_minutes: Optional[float] = None

    def model_post_init(self, __context):
        """Calculate derived fields after initialization."""
        if self.estimated_reading_time_minutes is None:
            # Average reading speed: 200 words per minute
            self.estimated_reading_time_minutes = self.word_count / 200.0
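
# Illustrative derivation (hypothetical values): with the assumed 200 words-per-minute
# reading speed, a 500-word transcript yields an estimated reading time of 2.5 minutes
# unless an explicit value is supplied.
#   meta = DualTranscriptMetadata(video_id="abc123def45", language="en", word_count=500,
#                                 total_segments=42, has_timestamps=True,
#                                 extraction_method="youtube_api")
#   meta.estimated_reading_time_minutes  # 2.5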


class TranscriptComparison(BaseModel):
    """Comparison metrics between two transcripts."""
    word_count_difference: int
    similarity_score: float  # 0-1 scale
    punctuation_improvement_score: float  # 0-1 scale
    capitalization_improvement_score: float  # 0-1 scale
    processing_time_ratio: float  # whisper_time / youtube_time
    quality_difference: float  # whisper_quality - youtube_quality
    confidence_difference: float  # whisper_confidence - youtube_confidence
    recommendation: str  # "youtube", "whisper", or "both"
    significant_differences: List[str]
    technical_terms_improved: List[str]
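
# Worked reading of the ratio/difference fields (hypothetical values): if Whisper took 45 s
# and the YouTube captions took 1.5 s, processing_time_ratio == 30.0; positive
# quality_difference and confidence_difference values favour the Whisper transcript.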


class DualTranscriptResult(BaseModel):
    """Result from dual transcript extraction."""
    video_id: str
    source: TranscriptSource
    youtube_transcript: Optional[List[DualTranscriptSegment]] = None
    youtube_metadata: Optional[DualTranscriptMetadata] = None
    whisper_transcript: Optional[List[DualTranscriptSegment]] = None
    whisper_metadata: Optional[DualTranscriptMetadata] = None
    comparison: Optional[TranscriptComparison] = None
    processing_time_seconds: float
    success: bool
    error: Optional[str] = None

    @property
    def has_youtube(self) -> bool:
        """Check if YouTube transcript is available."""
        return self.youtube_transcript is not None and len(self.youtube_transcript) > 0

    @property
    def has_whisper(self) -> bool:
        """Check if Whisper transcript is available."""
        return self.whisper_transcript is not None and len(self.whisper_transcript) > 0

    @property
    def has_comparison(self) -> bool:
        """Check if comparison data is available."""
        return self.comparison is not None

    def get_transcript(self, source: str) -> Optional[List[DualTranscriptSegment]]:
        """Get transcript by source name."""
        if source == "youtube":
            return self.youtube_transcript
        elif source == "whisper":
            return self.whisper_transcript
        else:
            return None

    def get_metadata(self, source: str) -> Optional[DualTranscriptMetadata]:
        """Get metadata by source name."""
        if source == "youtube":
            return self.youtube_metadata
        elif source == "whisper":
            return self.whisper_metadata
        else:
            return None
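
# Illustrative usage of the source-selector helpers (hypothetical values):
#   result.get_transcript("whisper")  # List[DualTranscriptSegment] or None
#   result.get_metadata("youtube")    # DualTranscriptMetadata or None
#   result.get_transcript("other")    # None for unrecognised source names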


class DualTranscriptRequest(BaseModel):
    """Request model for dual transcript extraction."""
    video_url: str
    transcript_source: TranscriptSource
    whisper_model_size: str = "small"  # For Whisper: tiny, base, small, medium, large
    include_metadata: bool = True
    include_comparison: bool = True  # Only relevant when source is BOTH


class ProcessingTimeEstimate(BaseModel):
    """Processing time estimates for different transcript sources."""
    youtube_seconds: Optional[float] = None
    whisper_seconds: Optional[float] = None
    total_seconds: Optional[float] = None
    estimated_completion: Optional[str] = None  # ISO timestamp


# Response models for API
class DualTranscriptResponse(BaseModel):
    """API response for dual transcript extraction."""
    video_id: str
    source: TranscriptSource
    youtube_transcript: Optional[List[DualTranscriptSegment]] = None
    youtube_metadata: Optional[DualTranscriptMetadata] = None
    whisper_transcript: Optional[List[DualTranscriptSegment]] = None
    whisper_metadata: Optional[DualTranscriptMetadata] = None
    comparison: Optional[TranscriptComparison] = None
    processing_time_seconds: float
    success: bool
    error: Optional[str] = None
    has_youtube: bool = False
    has_whisper: bool = False
    has_comparison: bool = False

    def model_post_init(self, __context):
        """Calculate derived properties after initialization."""
        self.has_youtube = self.youtube_transcript is not None and len(self.youtube_transcript) > 0
        self.has_whisper = self.whisper_transcript is not None and len(self.whisper_transcript) > 0
        self.has_comparison = self.comparison is not None
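

if __name__ == "__main__":
    # Minimal smoke-test sketch (hypothetical values, not part of the application):
    # builds a Whisper-only response from one segment and prints the derived flags.
    segment = DualTranscriptSegment(start_time=0.0, end_time=4.2, text="hello world", confidence=0.93)
    metadata = DualTranscriptMetadata(
        video_id="abc123def45",
        language="en",
        word_count=2,
        total_segments=1,
        has_timestamps=True,
        extraction_method="whisper_audio",
    )
    response = DualTranscriptResponse(
        video_id="abc123def45",
        source=TranscriptSource.WHISPER,
        whisper_transcript=[segment],
        whisper_metadata=metadata,
        processing_time_seconds=12.5,
        success=True,
    )
    print(response.has_whisper, response.has_youtube, response.has_comparison)  # True False False
    print(segment.duration)  # 4.2 (end_time - start_time)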