# File listing header: 564 lines, 23 KiB, Python
"""
|
|
Dual transcript service that provides YouTube captions, Whisper AI transcription, or both.
|
|
Coordinates between different transcript sources and provides comparison functionality.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from typing import List, Dict, Optional, Tuple, Union
|
|
from enum import Enum
|
|
|
|
from .transcript_service import TranscriptService
|
|
from .faster_whisper_transcript_service import FasterWhisperTranscriptService
|
|
from ..config.video_download_config import VideoDownloadConfig
|
|
from ..models.transcript import (
|
|
DualTranscriptSegment,
|
|
DualTranscriptMetadata,
|
|
TranscriptSource,
|
|
DualTranscriptResult,
|
|
TranscriptComparison,
|
|
TranscriptSegment,
|
|
TranscriptMetadata
|
|
)
|
|
from ..core.config import settings
|
|
# Module-level logger, named after this module so records can be filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TranscriptQuality(Enum):
    """Quality tiers a transcript can be labelled with.

    The member values are the lowercase tier names.
    """

    # YouTube captions
    STANDARD = "standard"
    # Whisper small/base
    HIGH = "high"
    # Whisper medium/large
    PREMIUM = "premium"
|
|
|
|
|
|
class DualTranscriptService:
|
|
"""Service for managing dual transcript extraction and comparison."""
|
|
|
|
def __init__(self):
    """Create the YouTube caption service and the faster-whisper service.

    The Whisper service is configured from a freshly constructed
    ``VideoDownloadConfig``; each ``whisper_*`` setting is forwarded as the
    matching keyword argument.
    """
    self.transcript_service = TranscriptService()

    # Load configuration for faster-whisper
    whisper_config = VideoDownloadConfig()
    whisper_options = {
        "model_size": whisper_config.whisper_model,
        "device": whisper_config.whisper_device,
        "compute_type": whisper_config.whisper_compute_type,
        "beam_size": whisper_config.whisper_beam_size,
        "vad_filter": whisper_config.whisper_vad_filter,
        "word_timestamps": whisper_config.whisper_word_timestamps,
        "temperature": whisper_config.whisper_temperature,
        "best_of": whisper_config.whisper_best_of,
    }
    self.whisper_service = FasterWhisperTranscriptService(**whisper_options)
|
|
|
|
async def get_transcript(
    self,
    video_id: str,
    video_url: str,
    source: TranscriptSource,
    progress_callback=None
) -> DualTranscriptResult:
    """
    Fetch the transcript(s) for a video from the requested source.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL
        source: Which transcript source(s) to use
        progress_callback: Optional awaitable callback for progress updates

    Returns:
        DualTranscriptResult with the requested transcript data. Any
        exception raised along the way (including an unrecognised
        ``source``) is logged and reported as a result with
        ``success=False`` and ``error`` set to the exception text.
    """
    started_at = time.time()

    try:
        # Checked in order with ``==``, exactly like an if/elif chain.
        routes = (
            (TranscriptSource.YOUTUBE, self._get_youtube_only),
            (TranscriptSource.WHISPER, self._get_whisper_only),
            (TranscriptSource.BOTH, self._get_both_transcripts),
        )
        for candidate, fetch in routes:
            if source == candidate:
                return await fetch(video_id, video_url, progress_callback)

        raise ValueError(f"Invalid transcript source: {source}")

    except Exception as e:
        logger.error(f"Failed to get transcript for video {video_id} from {source}: {e}")
        elapsed = time.time() - started_at
        return DualTranscriptResult(
            video_id=video_id,
            source=source,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=elapsed,
            success=False,
            error=str(e)
        )
|
|
|
|
async def _get_youtube_only(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """Fetch YouTube captions only and wrap them in a DualTranscriptResult.

    Raises:
        Exception: when the underlying extraction reports failure or returns
            an empty transcript; any error is logged and re-raised.
    """
    started_at = time.time()

    try:
        if progress_callback:
            await progress_callback("Extracting YouTube captions...")

        # Get YouTube transcript via existing transcript service
        extraction = await self.transcript_service.extract_transcript(video_id)

        # Guard clause: both the success flag and a non-empty transcript are required.
        if not (extraction.success and extraction.transcript):
            raise Exception(f"YouTube transcript extraction failed: {extraction.error}")

        # Convert to dual transcript format
        segments = self._convert_to_dual_segments(extraction)
        metadata = self._convert_to_dual_metadata(extraction, video_id)

        if progress_callback:
            await progress_callback("YouTube captions extracted successfully")

        elapsed = time.time() - started_at

        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.YOUTUBE,
            youtube_transcript=segments,
            youtube_metadata=metadata,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=elapsed,
            success=True,
            error=None
        )

    except Exception as e:
        logger.error(f"YouTube transcript extraction failed: {e}")
        raise
|
|
|
|
async def _get_whisper_only(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """Run Whisper transcription only and wrap it in a DualTranscriptResult.

    The elapsed wall-clock time of this call is written onto the returned
    Whisper metadata as ``processing_time_seconds``. Any error is logged and
    re-raised.
    """
    started_at = time.time()

    try:
        if progress_callback:
            await progress_callback("Starting AI transcription with Whisper...")

        # Get Whisper transcript
        transcription = await self.whisper_service.transcribe_video(
            video_id, video_url, progress_callback
        )
        segments, metadata = transcription

        elapsed = time.time() - started_at
        metadata.processing_time_seconds = elapsed

        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.WHISPER,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=segments,
            whisper_metadata=metadata,
            comparison=None,
            processing_time_seconds=elapsed,
            success=True,
            error=None
        )

    except Exception as e:
        logger.error(f"Whisper transcript extraction failed: {e}")
        raise
|
|
|
|
async def _get_both_transcripts(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> DualTranscriptResult:
    """Get both YouTube and Whisper transcripts for comparison.

    Both extractions run concurrently. Each source may fail independently:
    the result is successful when at least one source produced segments,
    and ``error`` lists every failure (joined with "; "). A comparison is
    generated only when both sources returned non-empty segments.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL
        progress_callback: Optional awaitable callback for progress updates

    Returns:
        DualTranscriptResult with ``source=TranscriptSource.BOTH``. Unexpected
        errors are logged and reported as a result with ``success=False``.
    """
    start_time = time.time()

    try:
        # Progress tracking
        if progress_callback:
            await progress_callback("Starting dual transcript extraction...")

        # Run both extractions in parallel
        youtube_task = asyncio.create_task(
            self._get_youtube_with_progress(video_id, video_url, progress_callback)
        )
        whisper_task = asyncio.create_task(
            self._get_whisper_with_progress(video_id, video_url, progress_callback)
        )

        # Wait for both to complete; failures come back as values, not raises.
        youtube_result, whisper_result = await asyncio.gather(
            youtube_task, whisper_task, return_exceptions=True
        )

        youtube_segments, youtube_metadata = None, None
        whisper_segments, whisper_metadata = None, None
        errors = []

        # Check against BaseException, not Exception: a cancelled child task
        # yields asyncio.CancelledError, which derives from BaseException on
        # Python 3.8+. Treating it as a success value would fail on tuple
        # unpacking and discard the other source's result.
        if isinstance(youtube_result, BaseException):
            logger.warning(f"YouTube extraction failed: {youtube_result}")
            errors.append(f"YouTube: {youtube_result}")
        else:
            youtube_segments, youtube_metadata = youtube_result

        if isinstance(whisper_result, BaseException):
            logger.warning(f"Whisper extraction failed: {whisper_result}")
            errors.append(f"Whisper: {whisper_result}")
        else:
            whisper_segments, whisper_metadata = whisper_result

        # Generate comparison if we have both transcripts
        comparison = None
        if youtube_segments and whisper_segments:
            if progress_callback:
                await progress_callback("Generating transcript comparison...")

            # The comparison is an optional extra: if it fails, keep the two
            # transcripts that were already extracted and report the problem.
            try:
                comparison = self._compare_transcripts(
                    youtube_segments, youtube_metadata,
                    whisper_segments, whisper_metadata
                )
            except Exception as comparison_error:
                logger.warning(f"Transcript comparison failed: {comparison_error}")
                errors.append(f"Comparison: {comparison_error}")

        processing_time = time.time() - start_time
        if whisper_metadata:
            whisper_metadata.processing_time_seconds = processing_time

        # Determine success status
        success = (youtube_segments is not None) or (whisper_segments is not None)
        error_message = "; ".join(errors) if errors else None

        if progress_callback:
            if success:
                await progress_callback("Dual transcript extraction completed")
            else:
                await progress_callback("Dual transcript extraction failed")

        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.BOTH,
            youtube_transcript=youtube_segments,
            youtube_metadata=youtube_metadata,
            whisper_transcript=whisper_segments,
            whisper_metadata=whisper_metadata,
            comparison=comparison,
            processing_time_seconds=processing_time,
            success=success,
            error=error_message
        )

    except Exception as e:
        logger.error(f"Dual transcript extraction failed: {e}")
        processing_time = time.time() - start_time
        return DualTranscriptResult(
            video_id=video_id,
            source=TranscriptSource.BOTH,
            youtube_transcript=None,
            youtube_metadata=None,
            whisper_transcript=None,
            whisper_metadata=None,
            comparison=None,
            processing_time_seconds=processing_time,
            success=False,
            error=str(e)
        )
|
|
|
|
async def _get_youtube_with_progress(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
    """Extract YouTube captions and return them as ``(segments, metadata)``.

    Progress messages are sent before and after the extraction when a
    callback is supplied.

    Raises:
        Exception: when the extraction result is not marked successful.
    """
    notify = progress_callback

    if notify:
        await notify("Extracting YouTube captions...")

    extraction = await self.transcript_service.extract_transcript(video_id)
    if not extraction.success:
        raise Exception(f"YouTube transcript extraction failed: {extraction.error}")

    # Convert to dual transcript format
    segments = self._convert_to_dual_segments(extraction)
    metadata = self._convert_to_dual_metadata(extraction, video_id)

    if notify:
        await notify("YouTube captions extracted")

    return (segments, metadata)
|
|
|
|
async def _get_whisper_with_progress(
    self,
    video_id: str,
    video_url: str,
    progress_callback=None
) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
    """Run Whisper transcription and return whatever the Whisper service returns.

    Progress messages are sent before and after the transcription when a
    callback is supplied; the callback is also handed to the Whisper service.
    """
    notify = progress_callback

    if notify:
        await notify("Starting AI transcription...")

    transcription = await self.whisper_service.transcribe_video(
        video_id, video_url, notify
    )

    if notify:
        await notify("AI transcription completed")

    return transcription
|
|
|
|
def _compare_transcripts(
    self,
    youtube_segments: List[TranscriptSegment],
    youtube_metadata: TranscriptMetadata,
    whisper_segments: List[TranscriptSegment],
    whisper_metadata: TranscriptMetadata
) -> TranscriptComparison:
    """Build a TranscriptComparison for the YouTube and Whisper transcripts.

    The similarity score is a simplified measure derived only from the
    difference in word counts, not from the words themselves.
    """
    # Flatten each transcript into one string for the text-level metrics.
    youtube_text = " ".join(seg.text for seg in youtube_segments)
    whisper_text = " ".join(seg.text for seg in whisper_segments)

    youtube_word_total = len(youtube_text.split())
    whisper_word_total = len(whisper_text.split())

    # Word-count gap, normalised by the longer transcript (floor of 1 avoids /0).
    count_gap = abs(youtube_word_total - whisper_word_total)
    similarity = 1.0 - (count_gap / max(youtube_word_total, whisper_word_total, 1))

    punctuation_gain = self._calculate_punctuation_improvement(youtube_text, whisper_text)
    capitalization_gain = self._calculate_capitalization_improvement(youtube_text, whisper_text)

    recommendation = self._generate_recommendation(
        youtube_metadata, whisper_metadata, similarity,
        punctuation_gain, capitalization_gain
    )

    # YouTube time is floored at 0.1s so the ratio never divides by zero.
    time_ratio = whisper_metadata.processing_time_seconds / max(
        youtube_metadata.processing_time_seconds, 0.1
    )
    quality_gap = whisper_metadata.quality_score - youtube_metadata.quality_score
    confidence_gap = whisper_metadata.confidence_score - youtube_metadata.confidence_score

    return TranscriptComparison(
        word_count_difference=count_gap,
        similarity_score=similarity,
        punctuation_improvement_score=punctuation_gain,
        capitalization_improvement_score=capitalization_gain,
        processing_time_ratio=time_ratio,
        quality_difference=quality_gap,
        confidence_difference=confidence_gap,
        recommendation=recommendation,
        significant_differences=self._find_significant_differences(youtube_text, whisper_text),
        technical_terms_improved=self._find_technical_improvements(youtube_text, whisper_text)
    )
|
|
|
|
def _calculate_punctuation_improvement(self, youtube_text: str, whisper_text: str) -> float:
|
|
"""Calculate improvement in punctuation between transcripts."""
|
|
youtube_punct = sum(1 for c in youtube_text if c in '.,!?;:')
|
|
whisper_punct = sum(1 for c in whisper_text if c in '.,!?;:')
|
|
|
|
# Normalize by text length
|
|
youtube_punct_ratio = youtube_punct / max(len(youtube_text), 1)
|
|
whisper_punct_ratio = whisper_punct / max(len(whisper_text), 1)
|
|
|
|
# Return improvement score (0-1 scale)
|
|
improvement = whisper_punct_ratio - youtube_punct_ratio
|
|
return max(0.0, min(1.0, improvement * 10)) # Scale to 0-1
|
|
|
|
def _calculate_capitalization_improvement(self, youtube_text: str, whisper_text: str) -> float:
|
|
"""Calculate improvement in capitalization between transcripts."""
|
|
youtube_capitals = sum(1 for c in youtube_text if c.isupper())
|
|
whisper_capitals = sum(1 for c in whisper_text if c.isupper())
|
|
|
|
# Normalize by text length
|
|
youtube_cap_ratio = youtube_capitals / max(len(youtube_text), 1)
|
|
whisper_cap_ratio = whisper_capitals / max(len(whisper_text), 1)
|
|
|
|
# Return improvement score (0-1 scale)
|
|
improvement = whisper_cap_ratio - youtube_cap_ratio
|
|
return max(0.0, min(1.0, improvement * 5)) # Scale to 0-1
|
|
|
|
def _generate_recommendation(
    self,
    youtube_metadata: TranscriptMetadata,
    whisper_metadata: TranscriptMetadata,
    similarity: float,
    punct_improvement: float,
    cap_improvement: float
) -> str:
    """Pick "youtube" or "whisper" from the comparison metrics.

    Rules, checked in order:
      1. Near-identical transcripts (similarity > 0.95) where Whisper took
         more than 10x as long as YouTube -> "youtube".
      2. Whisper quality score exceeds YouTube's by more than 0.2 -> "whisper".
      3. Punctuation or capitalization improvement above 0.3 -> "whisper".
      4. YouTube confidence below 0.6 while Whisper's is above 0.7 -> "whisper".
      5. Otherwise -> "youtube" (the faster source).
    """
    whisper_much_slower = (
        similarity > 0.95
        and whisper_metadata.processing_time_seconds
        > youtube_metadata.processing_time_seconds * 10
    )
    if whisper_much_slower:
        return "youtube"

    # Short-circuit ``or`` keeps the same check order as separate ``if`` statements.
    whisper_is_better = (
        (whisper_metadata.quality_score - youtube_metadata.quality_score) > 0.2
        or punct_improvement > 0.3
        or cap_improvement > 0.3
        or (
            youtube_metadata.confidence_score < 0.6
            and whisper_metadata.confidence_score > 0.7
        )
    )

    return "whisper" if whisper_is_better else "youtube"
|
|
|
|
def _find_significant_differences(self, youtube_text: str, whisper_text: str) -> List[str]:
|
|
"""Find significant textual differences between transcripts."""
|
|
differences = []
|
|
|
|
# Simple difference detection (can be enhanced with difflib)
|
|
youtube_words = set(youtube_text.lower().split())
|
|
whisper_words = set(whisper_text.lower().split())
|
|
|
|
unique_to_whisper = whisper_words - youtube_words
|
|
unique_to_youtube = youtube_words - whisper_words
|
|
|
|
if len(unique_to_whisper) > 5:
|
|
differences.append(f"Whisper includes {len(unique_to_whisper)} additional unique words")
|
|
|
|
if len(unique_to_youtube) > 5:
|
|
differences.append(f"YouTube includes {len(unique_to_youtube)} words not in Whisper")
|
|
|
|
return differences[:5] # Limit to 5 most significant
|
|
|
|
def _find_technical_improvements(self, youtube_text: str, whisper_text: str) -> List[str]:
|
|
"""Find technical terms that were improved in Whisper transcript."""
|
|
improvements = []
|
|
|
|
# Common technical terms that might be improved
|
|
technical_patterns = [
|
|
("API", "a p i"),
|
|
("URL", "u r l"),
|
|
("HTTP", "h t t p"),
|
|
("JSON", "jason"),
|
|
("SQL", "sequel"),
|
|
("AI", "a i"),
|
|
("ML", "m l"),
|
|
("GPU", "g p u"),
|
|
("CPU", "c p u")
|
|
]
|
|
|
|
for correct, incorrect in technical_patterns:
|
|
if incorrect.lower() in youtube_text.lower() and correct.lower() in whisper_text.lower():
|
|
improvements.append(f"'{incorrect}' → '{correct}'")
|
|
|
|
return improvements[:3] # Limit to 3 most significant
|
|
|
|
def estimate_processing_time(
    self,
    video_duration_seconds: float,
    source: TranscriptSource
) -> Dict[str, float]:
    """
    Estimate processing time for different transcript sources.

    Args:
        video_duration_seconds: Duration of the video in seconds
        source: Which transcript source(s) to estimate for

    Returns:
        Dictionary with time estimates in seconds: a "youtube" and/or
        "whisper" entry for each requested source, plus a "total" entry
    """
    wants_youtube = source in (TranscriptSource.YOUTUBE, TranscriptSource.BOTH)
    wants_whisper = source in (TranscriptSource.WHISPER, TranscriptSource.BOTH)

    estimates = {}

    if wants_youtube:
        # 1% of the video duration, clamped to the 1-3 second range.
        scaled = video_duration_seconds * 0.01
        estimates["youtube"] = min(3.0, max(1.0, scaled))

    if wants_whisper:
        # Base ratio of 0.3x real-time, halved on CUDA and increased by 50% otherwise.
        base_ratio = 0.3
        if self.whisper_service.device == "cuda":
            device_multiplier = 0.5
        else:
            device_multiplier = 1.5
        estimates["whisper"] = video_duration_seconds * base_ratio * device_multiplier

    if source == TranscriptSource.BOTH:
        # Sources run in parallel: the slower one plus 2s of comparison overhead.
        slowest = max(estimates.get("youtube", 0), estimates.get("whisper", 0))
        estimates["total"] = slowest + 2.0
    else:
        estimates["total"] = sum(estimates.values())

    return estimates
|
|
|
|
def _convert_to_dual_segments(self, transcript_result) -> List[DualTranscriptSegment]:
    """Convert a transcript result into a list of DualTranscriptSegment.

    When the result carries timed segments they are converted one-to-one.
    Otherwise the plain transcript text is split on ". " and each sentence
    gets a synthetic timing estimate (0.5s per word, 0.5s gap between
    sentences). Every segment is given a default confidence of 0.8.

    Returns:
        The converted segments, or an empty list when the result has
        neither segments nor transcript text.
    """
    if transcript_result.segments:
        return [
            DualTranscriptSegment(
                start_time=segment.start,
                end_time=segment.start + segment.duration,
                text=segment.text,
                confidence=0.8  # Default confidence for YouTube captions
            )
            for segment in transcript_result.segments
        ]

    if not transcript_result.transcript:
        return []

    # No timed segments: build rough ones from the plain text.
    segments = []
    current_time = 0.0

    for chunk in transcript_result.transcript.split('. '):
        sentence = chunk.strip()
        if not sentence:
            continue

        # Estimate duration based on word count (rough estimate)
        duration = len(sentence.split()) * 0.5

        # Restore the period removed by the split. Check the stripped text
        # so trailing whitespace cannot cause a doubled period, and leave
        # sentences that already end in "!" or "?" alone.
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        segments.append(DualTranscriptSegment(
            start_time=current_time,
            end_time=current_time + duration,
            text=sentence,
            confidence=0.8  # Default confidence for YouTube
        ))
        current_time += duration + 0.5  # Small gap between segments

    return segments
|
|
|
|
def _convert_to_dual_metadata(self, transcript_result, video_id: str) -> DualTranscriptMetadata:
    """Build DualTranscriptMetadata from a transcript result.

    Language falls back to "en" and processing time to 0.0 when the result
    has no metadata. Quality and confidence scores are fixed defaults
    (0.75 and 0.8).
    """
    source_meta = transcript_result.metadata
    source_segments = transcript_result.segments
    source_text = transcript_result.transcript

    if source_text:
        word_count = len(source_text.split())
    else:
        word_count = 0

    if source_meta:
        language = source_meta.language
        processing_time = source_meta.processing_time_seconds
    else:
        language = "en"
        processing_time = 0.0

    segment_total = len(source_segments) if source_segments else 0
    has_timestamps = source_segments is not None and len(source_segments) > 0

    return DualTranscriptMetadata(
        video_id=video_id,
        language=language,
        word_count=word_count,
        total_segments=segment_total,
        has_timestamps=has_timestamps,
        extraction_method=transcript_result.method.value,
        processing_time_seconds=processing_time,
        quality_score=0.75,  # Default quality score for YouTube captions
        confidence_score=0.8  # Default confidence for YouTube captions
    )
|
|
|
|
async def cleanup(self):
    """Release transcript-service resources by delegating to the Whisper service's cleanup."""
    whisper = self.whisper_service
    await whisper.cleanup()