# youtube-summarizer/backend/services/dual_transcript_service.py
#
# 564 lines
# 23 KiB
# Python

"""
Dual transcript service that provides YouTube captions, Whisper AI transcription, or both.
Coordinates between different transcript sources and provides comparison functionality.
"""
import asyncio
import logging
import time
from typing import List, Dict, Optional, Tuple, Union
from enum import Enum
from .transcript_service import TranscriptService
from .faster_whisper_transcript_service import FasterWhisperTranscriptService
from ..config.video_download_config import VideoDownloadConfig
from ..models.transcript import (
DualTranscriptSegment,
DualTranscriptMetadata,
TranscriptSource,
DualTranscriptResult,
TranscriptComparison,
TranscriptSegment,
TranscriptMetadata
)
from ..core.config import settings
logger = logging.getLogger(__name__)
class TranscriptQuality(Enum):
    """Quality tiers for a transcript, named after the source that produces them."""

    # Captions taken from YouTube.
    STANDARD = "standard"
    # Whisper "small" / "base" models.
    HIGH = "high"
    # Whisper "medium" / "large" models.
    PREMIUM = "premium"
class DualTranscriptService:
    """Coordinates YouTube-caption and Whisper transcript extraction and compares the two."""

    def __init__(self):
        """Create the caption service and a faster-whisper service configured from VideoDownloadConfig."""
        self.transcript_service = TranscriptService()

        # Every faster-whisper option is read from the video download configuration.
        whisper_config = VideoDownloadConfig()
        whisper_options = {
            "model_size": whisper_config.whisper_model,
            "device": whisper_config.whisper_device,
            "compute_type": whisper_config.whisper_compute_type,
            "beam_size": whisper_config.whisper_beam_size,
            "vad_filter": whisper_config.whisper_vad_filter,
            "word_timestamps": whisper_config.whisper_word_timestamps,
            "temperature": whisper_config.whisper_temperature,
            "best_of": whisper_config.whisper_best_of,
        }
        self.whisper_service = FasterWhisperTranscriptService(**whisper_options)
async def get_transcript(
    self,
    video_id: str,
    video_url: str,
    source: TranscriptSource,
    progress_callback=None
) -> DualTranscriptResult:
    """
    Fetch a transcript from the requested source(s).

    Any exception raised while dispatching (including the ValueError for an
    unrecognised source) is logged and turned into a failed result instead of
    propagating to the caller.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL
        source: Which transcript source(s) to use
        progress_callback: Optional awaitable callback receiving progress messages

    Returns:
        DualTranscriptResult with the requested transcript data, or one with
        success=False and the error text when extraction failed
    """
    began_at = time.time()
    try:
        # Checked in order with ``==`` so matching behaves like an if/elif chain.
        routes = (
            (TranscriptSource.YOUTUBE, self._get_youtube_only),
            (TranscriptSource.WHISPER, self._get_whisper_only),
            (TranscriptSource.BOTH, self._get_both_transcripts),
        )
        for candidate, handler in routes:
            if source == candidate:
                return await handler(video_id, video_url, progress_callback)
        raise ValueError(f"Invalid transcript source: {source}")
    except Exception as e:
        logger.error(f"Failed to get transcript for video {video_id} from {source}: {e}")
        elapsed = time.time() - began_at
        return DualTranscriptResult(
            video_id=video_id,
            source=source,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=elapsed,
            success=False,
            error=str(e)
        )
async def _get_youtube_only(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """
    Extract YouTube captions only and wrap them in a DualTranscriptResult.

    Args:
        video_id: YouTube video ID passed to the caption service
        video_url: Full video URL (not used by this path)
        progress_callback: Optional awaitable callback receiving progress messages

    Returns:
        A successful DualTranscriptResult whose Whisper fields are None

    Raises:
        Exception: when the caption service reports failure or returns no
            transcript text; the error is logged and re-raised
    """
    began_at = time.time()
    try:
        if progress_callback:
            await progress_callback("Extracting YouTube captions...")

        # Delegate to the existing caption service.
        extraction = await self.transcript_service.extract_transcript(video_id)

        # Guard clause: a failed call or empty transcript text counts as a failure.
        if not (extraction.success and extraction.transcript):
            raise Exception(f"YouTube transcript extraction failed: {extraction.error}")

        caption_segments = self._convert_to_dual_segments(extraction)
        caption_metadata = self._convert_to_dual_metadata(extraction, video_id)

        if progress_callback:
            await progress_callback("YouTube captions extracted successfully")

        elapsed = time.time() - began_at
        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.YOUTUBE,
            youtube_transcript=caption_segments,
            youtube_metadata=caption_metadata,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=elapsed,
            success=True,
            error=None
        )
    except Exception as e:
        logger.error(f"YouTube transcript extraction failed: {e}")
        raise
async def _get_whisper_only(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """
    Run Whisper transcription only and wrap the output in a DualTranscriptResult.

    The elapsed wall-clock time measured here overwrites
    ``processing_time_seconds`` on the metadata returned by the Whisper service.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL
        progress_callback: Optional awaitable callback, also forwarded to the Whisper service

    Returns:
        A successful DualTranscriptResult whose YouTube fields are None

    Raises:
        Exception: any error from the Whisper service is logged and re-raised
    """
    began_at = time.time()
    try:
        if progress_callback:
            await progress_callback("Starting AI transcription with Whisper...")

        transcription = await self.whisper_service.transcribe_video(
            video_id, video_url, progress_callback
        )
        ai_segments, ai_metadata = transcription

        elapsed = time.time() - began_at
        ai_metadata.processing_time_seconds = elapsed

        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.WHISPER,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=ai_segments,
            whisper_metadata=ai_metadata,
            comparison=None,
            processing_time_seconds=elapsed,
            success=True,
            error=None
        )
    except Exception as e:
        logger.error(f"Whisper transcript extraction failed: {e}")
        raise
async def _get_both_transcripts(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """
    Get both YouTube and Whisper transcripts for comparison.

    The two extractions run concurrently. A failure (or cancellation) of one
    source does not fail the call: the other source's transcript is still
    returned and the failure is reported in ``error``. A comparison is only
    generated when both sources produced a non-empty segment list.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL
        progress_callback: Optional awaitable callback receiving progress messages

    Returns:
        DualTranscriptResult with ``success`` True when at least one source
        produced segments; ``error`` joins the per-source failure messages
        with "; " (None when nothing failed). Unexpected errors in this
        method itself yield a result with success=False.
    """
    start_time = time.time()

    def describe(failure: BaseException) -> str:
        # Some exceptions (e.g. CancelledError) stringify to "", so fall back to the type name.
        return str(failure) or type(failure).__name__

    try:
        if progress_callback:
            await progress_callback("Starting dual transcript extraction...")

        # Run both extractions in parallel
        youtube_task = asyncio.create_task(
            self._get_youtube_with_progress(video_id, video_url, progress_callback)
        )
        whisper_task = asyncio.create_task(
            self._get_whisper_with_progress(video_id, video_url, progress_callback)
        )

        # Wait for both to complete; failures come back as values, not raises
        youtube_result, whisper_result = await asyncio.gather(
            youtube_task, whisper_task, return_exceptions=True
        )

        youtube_segments, youtube_metadata = None, None
        whisper_segments, whisper_metadata = None, None
        errors = []

        # gather(return_exceptions=True) can also hand back asyncio.CancelledError,
        # which derives from BaseException (not Exception) on Python 3.8+. Checking
        # BaseException keeps a cancelled child from being tuple-unpacked below,
        # which would discard the other source's transcript.
        if isinstance(youtube_result, BaseException):
            logger.warning(f"YouTube extraction failed: {describe(youtube_result)}")
            errors.append(f"YouTube: {describe(youtube_result)}")
        else:
            youtube_segments, youtube_metadata = youtube_result

        if isinstance(whisper_result, BaseException):
            logger.warning(f"Whisper extraction failed: {describe(whisper_result)}")
            errors.append(f"Whisper: {describe(whisper_result)}")
        else:
            whisper_segments, whisper_metadata = whisper_result

        # Generate comparison if we have both transcripts
        comparison = None
        if youtube_segments and whisper_segments:
            if progress_callback:
                await progress_callback("Generating transcript comparison...")
            comparison = self._compare_transcripts(
                youtube_segments, youtube_metadata,
                whisper_segments, whisper_metadata
            )

        processing_time = time.time() - start_time
        # Done after the comparison so the comparison sees the Whisper service's own timing.
        if whisper_metadata:
            whisper_metadata.processing_time_seconds = processing_time

        # Determine success status
        success = (youtube_segments is not None) or (whisper_segments is not None)
        error_message = "; ".join(errors) if errors else None

        if progress_callback:
            if success:
                await progress_callback("Dual transcript extraction completed")
            else:
                await progress_callback("Dual transcript extraction failed")

        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.BOTH,
            youtube_transcript=youtube_segments,
            youtube_metadata=youtube_metadata,
            whisper_transcript=whisper_segments,
            whisper_metadata=whisper_metadata,
            comparison=comparison,
            processing_time_seconds=processing_time,
            success=success,
            error=error_message
        )
    except Exception as e:
        logger.error(f"Dual transcript extraction failed: {e}")
        processing_time = time.time() - start_time
        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.BOTH,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=processing_time,
            success=False,
            error=str(e)
        )
async def _get_youtube_with_progress(
self,
video_id: str,
video_url: str,
progress_callback=None
) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
"""Get YouTube transcript with progress updates."""
if progress_callback:
await progress_callback("Extracting YouTube captions...")
transcript_result = await self.transcript_service.extract_transcript(video_id)
if not transcript_result.success:
raise Exception(f"YouTube transcript extraction failed: {transcript_result.error}")
# Convert to dual transcript format
result = (
self._convert_to_dual_segments(transcript_result),
self._convert_to_dual_metadata(transcript_result, video_id)
)
if progress_callback:
await progress_callback("YouTube captions extracted")
return result
async def _get_whisper_with_progress(
self,
video_id: str,
video_url: str,
progress_callback=None
) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
"""Get Whisper transcript with progress updates."""
if progress_callback:
await progress_callback("Starting AI transcription...")
result = await self.whisper_service.transcribe_video(
video_id, video_url, progress_callback
)
if progress_callback:
await progress_callback("AI transcription completed")
return result
def _compare_transcripts(
    self,
    youtube_segments: List[TranscriptSegment],
    youtube_metadata: TranscriptMetadata,
    whisper_segments: List[TranscriptSegment],
    whisper_metadata: TranscriptMetadata
) -> TranscriptComparison:
    """
    Build a TranscriptComparison between the YouTube and Whisper transcripts.

    The similarity score is a simplified measure based only on how close the
    two word counts are, not on which words match.

    Args:
        youtube_segments: Segments of the YouTube transcript (each exposes ``text``)
        youtube_metadata: Metadata for the YouTube transcript
        whisper_segments: Segments of the Whisper transcript (each exposes ``text``)
        whisper_metadata: Metadata for the Whisper transcript

    Returns:
        TranscriptComparison holding the metrics and a recommendation
    """
    # Flatten each transcript into one space-joined string.
    youtube_text = " ".join(piece.text for piece in youtube_segments)
    whisper_text = " ".join(piece.text for piece in whisper_segments)

    youtube_word_total = len(youtube_text.split())
    whisper_word_total = len(whisper_text.split())

    # Count-based similarity; the trailing 1 avoids dividing by zero for two empty texts.
    count_gap = abs(youtube_word_total - whisper_word_total)
    larger_total = max(youtube_word_total, whisper_word_total, 1)
    similarity = 1.0 - (count_gap / larger_total)

    punct_gain = self._calculate_punctuation_improvement(youtube_text, whisper_text)
    cap_gain = self._calculate_capitalization_improvement(youtube_text, whisper_text)

    verdict = self._generate_recommendation(
        youtube_metadata, whisper_metadata, similarity,
        punct_gain, cap_gain
    )

    # The YouTube time is floored at 0.1s so a zero duration cannot divide by zero.
    youtube_time_floor = max(youtube_metadata.processing_time_seconds, 0.1)
    time_ratio = whisper_metadata.processing_time_seconds / youtube_time_floor

    return TranscriptComparison(
        word_count_difference=count_gap,
        similarity_score=similarity,
        punctuation_improvement_score=punct_gain,
        capitalization_improvement_score=cap_gain,
        processing_time_ratio=time_ratio,
        quality_difference=whisper_metadata.quality_score - youtube_metadata.quality_score,
        confidence_difference=whisper_metadata.confidence_score - youtube_metadata.confidence_score,
        recommendation=verdict,
        significant_differences=self._find_significant_differences(youtube_text, whisper_text),
        technical_terms_improved=self._find_technical_improvements(youtube_text, whisper_text)
    )
def _calculate_punctuation_improvement(self, youtube_text: str, whisper_text: str) -> float:
"""Calculate improvement in punctuation between transcripts."""
youtube_punct = sum(1 for c in youtube_text if c in '.,!?;:')
whisper_punct = sum(1 for c in whisper_text if c in '.,!?;:')
# Normalize by text length
youtube_punct_ratio = youtube_punct / max(len(youtube_text), 1)
whisper_punct_ratio = whisper_punct / max(len(whisper_text), 1)
# Return improvement score (0-1 scale)
improvement = whisper_punct_ratio - youtube_punct_ratio
return max(0.0, min(1.0, improvement * 10)) # Scale to 0-1
def _calculate_capitalization_improvement(self, youtube_text: str, whisper_text: str) -> float:
"""Calculate improvement in capitalization between transcripts."""
youtube_capitals = sum(1 for c in youtube_text if c.isupper())
whisper_capitals = sum(1 for c in whisper_text if c.isupper())
# Normalize by text length
youtube_cap_ratio = youtube_capitals / max(len(youtube_text), 1)
whisper_cap_ratio = whisper_capitals / max(len(whisper_text), 1)
# Return improvement score (0-1 scale)
improvement = whisper_cap_ratio - youtube_cap_ratio
return max(0.0, min(1.0, improvement * 5)) # Scale to 0-1
def _generate_recommendation(
self,
youtube_metadata: TranscriptMetadata,
whisper_metadata: TranscriptMetadata,
similarity: float,
punct_improvement: float,
cap_improvement: float
) -> str:
"""Generate recommendation based on comparison metrics."""
# If very similar and YouTube is much faster
if similarity > 0.95 and whisper_metadata.processing_time_seconds > youtube_metadata.processing_time_seconds * 10:
return "youtube"
# If significant quality improvement with Whisper
if (whisper_metadata.quality_score - youtube_metadata.quality_score) > 0.2:
return "whisper"
# If significant punctuation/capitalization improvement
if punct_improvement > 0.3 or cap_improvement > 0.3:
return "whisper"
# If low confidence in YouTube captions
if youtube_metadata.confidence_score < 0.6 and whisper_metadata.confidence_score > 0.7:
return "whisper"
# Default to YouTube for speed if quality is similar
return "youtube"
def _find_significant_differences(self, youtube_text: str, whisper_text: str) -> List[str]:
"""Find significant textual differences between transcripts."""
differences = []
# Simple difference detection (can be enhanced with difflib)
youtube_words = set(youtube_text.lower().split())
whisper_words = set(whisper_text.lower().split())
unique_to_whisper = whisper_words - youtube_words
unique_to_youtube = youtube_words - whisper_words
if len(unique_to_whisper) > 5:
differences.append(f"Whisper includes {len(unique_to_whisper)} additional unique words")
if len(unique_to_youtube) > 5:
differences.append(f"YouTube includes {len(unique_to_youtube)} words not in Whisper")
return differences[:5] # Limit to 5 most significant
def _find_technical_improvements(self, youtube_text: str, whisper_text: str) -> List[str]:
"""Find technical terms that were improved in Whisper transcript."""
improvements = []
# Common technical terms that might be improved
technical_patterns = [
("API", "a p i"),
("URL", "u r l"),
("HTTP", "h t t p"),
("JSON", "jason"),
("SQL", "sequel"),
("AI", "a i"),
("ML", "m l"),
("GPU", "g p u"),
("CPU", "c p u")
]
for correct, incorrect in technical_patterns:
if incorrect.lower() in youtube_text.lower() and correct.lower() in whisper_text.lower():
improvements.append(f"'{incorrect}''{correct}'")
return improvements[:3] # Limit to 3 most significant
def estimate_processing_time(
    self,
    video_duration_seconds: float,
    source: TranscriptSource
) -> Dict[str, float]:
    """
    Estimate how long transcript extraction will take.

    Args:
        video_duration_seconds: Duration of the video in seconds
        source: Which transcript source(s) to estimate for

    Returns:
        Dictionary of estimates in seconds: a "youtube" and/or "whisper" entry
        for each requested source, plus a "total" entry
    """
    wants_youtube = source in (TranscriptSource.YOUTUBE, TranscriptSource.BOTH)
    wants_whisper = source in (TranscriptSource.WHISPER, TranscriptSource.BOTH)

    timings = {}

    if wants_youtube:
        # 1% of the video length, kept within the 1-3 second band.
        one_percent = video_duration_seconds * 0.01
        timings["youtube"] = min(3.0, max(1.0, one_percent))

    if wants_whisper:
        # Base ratio of 0.3x real time, halved on CUDA and multiplied by 1.5 otherwise.
        realtime_ratio = 0.3
        if self.whisper_service.device == "cuda":
            hardware_factor = 0.5
        else:
            hardware_factor = 1.5
        timings["whisper"] = video_duration_seconds * realtime_ratio * hardware_factor

    if source == TranscriptSource.BOTH:
        # Sources run in parallel: the slower one dominates, plus 2s for the comparison.
        slowest = max(timings.get("youtube", 0), timings.get("whisper", 0))
        timings["total"] = slowest + 2.0
    else:
        timings["total"] = sum(timings.values())

    return timings
def _convert_to_dual_segments(self, transcript_result) -> List[DualTranscriptSegment]:
    """
    Convert a TranscriptResult into a list of DualTranscriptSegment.

    When the result carries segments, each is converted directly (end time is
    start plus duration). Otherwise the plain transcript text is split on
    ". " and given synthetic timings: 0.5 seconds per word, with a 0.5 second
    gap between sentences. Every segment gets a fixed confidence of 0.8.

    Args:
        transcript_result: Object exposing ``segments`` and ``transcript``

    Returns:
        The converted segments; an empty list when there are neither segments
        nor transcript text
    """
    default_confidence = 0.8  # Default confidence for YouTube captions

    if transcript_result.segments:
        return [
            DualTranscriptSegment(
                start_time=segment.start,
                end_time=segment.start + segment.duration,
                text=segment.text,
                confidence=default_confidence
            )
            for segment in transcript_result.segments
        ]

    if not transcript_result.transcript:
        return []

    # No timed segments: approximate them from the plain text.
    segments = []
    current_time = 0.0
    for chunk in transcript_result.transcript.split('. '):
        sentence = chunk.strip()
        if not sentence:
            continue

        # Estimate duration based on word count (rough estimate)
        duration = len(sentence.split()) * 0.5

        # Restore the period removed by the split. The check runs on the stripped
        # text and accepts '!' / '?', so "end.\n" does not become "end.." and
        # "Wow!" does not become "Wow!.".
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        segments.append(DualTranscriptSegment(
            start_time=current_time,
            end_time=current_time + duration,
            text=sentence,
            confidence=default_confidence
        ))
        current_time += duration + 0.5  # Small gap between segments
    return segments
def _convert_to_dual_metadata(self, transcript_result, video_id: str) -> DualTranscriptMetadata:
    """
    Build DualTranscriptMetadata for a YouTube TranscriptResult.

    Language and processing time come from the result's metadata when present
    (falling back to "en" and 0.0). Word count is the whitespace-split length
    of the transcript text. Quality and confidence are fixed defaults.

    Args:
        transcript_result: Object exposing ``transcript``, ``segments``, ``metadata`` and ``method``
        video_id: YouTube video ID recorded in the metadata

    Returns:
        The populated DualTranscriptMetadata
    """
    source_meta = transcript_result.metadata
    source_segments = transcript_result.segments
    full_text = transcript_result.transcript

    language = source_meta.language if source_meta else "en"
    word_count = len(full_text.split()) if full_text else 0
    segment_total = len(source_segments) if source_segments else 0
    timestamps_present = source_segments is not None and len(source_segments) > 0
    elapsed = source_meta.processing_time_seconds if source_meta else 0.0

    return DualTranscriptMetadata(
        video_id=video_id,
        language=language,
        word_count=word_count,
        total_segments=segment_total,
        has_timestamps=timestamps_present,
        extraction_method=transcript_result.method.value,
        processing_time_seconds=elapsed,
        quality_score=0.75,  # Default quality score for YouTube captions
        confidence_score=0.8  # Default confidence for YouTube captions
    )
async def cleanup(self):
    """Release transcript-service resources by awaiting the Whisper service's own cleanup."""
    whisper = self.whisper_service
    await whisper.cleanup()