# youtube-summarizer/backend/services/transcript_service.py

import asyncio
import time
import logging
from typing import Optional, List, Dict, Any, TYPE_CHECKING
import json
import random
from datetime import datetime
if TYPE_CHECKING:
    from backend.core.websocket_manager import WebSocketManager

from backend.models.transcript import (
    TranscriptResult,
    TranscriptMetadata,
    TranscriptSegment,
    ExtractionMethod
)
from backend.core.exceptions import (
    TranscriptExtractionError,
    ErrorCode
)
from backend.services.mock_cache import MockCacheClient
from backend.services.intelligent_video_downloader import IntelligentVideoDownloader
from backend.models.video_download import DownloadPreferences, VideoQuality, DownloadStatus

logger = logging.getLogger(__name__)

class MockWhisperClient:
    """Mock Whisper client for audio transcription simulation"""

    async def transcribe(self, video_id: str) -> str:
        await asyncio.sleep(0.5)  # Simulate processing time
        return f"[Whisper transcription] This is a mock audio transcription for video {video_id}."


class TranscriptNotAvailableError(Exception):
    """Raised when transcript is not available through YouTube API"""
    pass


class CaptionsNotAvailableError(Exception):
    """Raised when auto-captions are not available"""
    pass


class AudioTranscriptionError(Exception):
    """Raised when audio transcription fails"""
    pass

class TranscriptService:
    """Service for extracting video transcripts with fallback methods"""

    # Mock transcript data for demonstration
    MOCK_TRANSCRIPTS = {
        "dQw4w9WgXcQ": {
            "text": """Welcome to this comprehensive tutorial on modern web development.
            Today we'll be exploring the fundamentals of building scalable applications.
            First, let's discuss the importance of choosing the right architecture.
            When building web applications, you need to consider factors like performance,
            maintainability, and user experience.
            The key components we'll cover include:
            - Frontend frameworks and their ecosystems
            - Backend API design patterns
            - Database optimization strategies
            - Deployment and DevOps best practices
            Throughout this video, we'll build a real-world application step by step,
            explaining each decision and trade-off along the way.
            By the end of this tutorial, you'll have a solid understanding of modern
            web development practices and be ready to build your own production-ready applications.""",
            "segments": [
                {"text": "Welcome to this comprehensive tutorial on modern web development.", "start": 0.0, "duration": 3.5},
                {"text": "Today we'll be exploring the fundamentals of building scalable applications.", "start": 3.5, "duration": 4.0},
                {"text": "First, let's discuss the importance of choosing the right architecture.", "start": 7.5, "duration": 3.8},
            ]
        },
        "test123": {
            "text": """This is a test video transcript for demonstration purposes.
            It contains sample content that can be used for testing the summarization system.
            The transcript includes multiple paragraphs and various topics to ensure
            the system can handle different types of content effectively.""",
            "segments": []
        }
    }

    def __init__(self, cache_client: Optional[MockCacheClient] = None,
                 whisper_client: Optional[MockWhisperClient] = None,
                 websocket_manager: Optional['WebSocketManager'] = None):
        self.cache_client = cache_client or MockCacheClient()
        self.whisper_client = whisper_client or MockWhisperClient()
        self.websocket_manager = websocket_manager
        self._method_success_rates = {
            "youtube_api": 0.7,    # 70% success rate for primary method
            "auto_captions": 0.5,  # 50% success rate for auto-captions
            "whisper_audio": 0.9   # 90% success rate for Whisper
        }

        # Check if we should use the real YouTube API based on environment settings
        from backend.core.config import settings
        self._use_real_youtube_api = not settings.USE_MOCK_SERVICES and settings.ENABLE_REAL_TRANSCRIPT_EXTRACTION
        self._using_real_whisper = whisper_client is not None and not isinstance(whisper_client, MockWhisperClient)

        # Initialize intelligent video downloader for additional fallback methods
        self.video_downloader = None

        # Store segments temporarily for passing to _create_result
        self._last_whisper_segments = None
        self._last_transcript_segments = None

        if self._use_real_youtube_api:
            try:
                self.video_downloader = IntelligentVideoDownloader(websocket_manager=websocket_manager)
                logger.info("Initialized IntelligentVideoDownloader with multiple fallback methods and WebSocket support")
            except Exception as e:
                logger.warning(f"Could not initialize IntelligentVideoDownloader: {e}")

        logger.info(f"TranscriptService initialized: use_real_youtube_api={self._use_real_youtube_api}, using_real_whisper={self._using_real_whisper}")

    async def extract_transcript(self, video_id: str,
                                 language_preference: str = "en") -> TranscriptResult:
        """
        Extract transcript using fallback chain with caching.

        Args:
            video_id: YouTube video ID
            language_preference: Preferred language code

        Returns:
            TranscriptResult with transcript data or error
        """
        start_time = time.time()

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info(f"Cache hit for video {video_id}")
            # The cached result may be a JSON string, so parse it if needed
            result_data = json.loads(cached_result) if isinstance(cached_result, str) else cached_result
            # Create TranscriptResult from cached data with the from_cache flag set
            return TranscriptResult(**result_data, from_cache=True)

        # Primary method: YouTube Transcript API (mock or real)
        try:
            transcript = await self._extract_youtube_transcript(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.YOUTUBE_API,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except TranscriptNotAvailableError:
            logger.info(f"YouTube API transcript not available for {video_id}")

        # Fallback 1: Auto-generated captions
        try:
            transcript = await self._extract_auto_captions(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.AUTO_CAPTIONS,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except CaptionsNotAvailableError:
            logger.info(f"Auto-captions not available for {video_id}")

        # Fallback 2: Audio transcription with Whisper
        try:
            transcript = await self._transcribe_audio(video_id, language_preference)
            result = await self._create_result(
                video_id, transcript, ExtractionMethod.WHISPER_AUDIO,
                language_preference, start_time
            )
            await self._cache_result(cache_key, result)
            return result
        except AudioTranscriptionError as e:
            logger.info(f"Whisper transcription failed for {video_id}, trying advanced fallback methods")
            last_error = str(e)

            # Fallbacks 3-8: IntelligentVideoDownloader with multiple methods
            # (pytubefix, yt-dlp, playwright, external tools, web services, transcript-only)
            if self.video_downloader:
                try:
                    transcript = await self._extract_with_video_downloader(video_id, language_preference)
                    result = await self._create_result(
                        video_id, transcript, ExtractionMethod.WHISPER_AUDIO,  # Mark as audio since it most likely came from audio
                        language_preference, start_time
                    )
                    await self._cache_result(cache_key, result)
                    return result
                except Exception as downloader_error:
                    logger.error(f"Advanced fallback methods failed for {video_id}: {downloader_error}")
                    last_error = str(downloader_error)

            logger.error(f"All transcript extraction methods failed for {video_id}")
            return TranscriptResult(
                video_id=video_id,
                transcript=None,
                method=ExtractionMethod.FAILED,
                success=False,
                error={
                    "code": ErrorCode.TRANSCRIPT_NOT_AVAILABLE,
                    "message": "Unable to extract transcript from video",
                    "details": {
                        "video_id": video_id,
                        "attempted_methods": ["youtube_api", "auto_captions", "whisper_audio"],
                        "last_error": last_error,
                        "suggestions": [
                            "Try a different video with captions available",
                            "Check if the video is public and accessible",
                            "Contact support if this video should have transcripts"
                        ]
                    }
                }
            )

    async def _extract_youtube_transcript(self, video_id: str,
                                          language: str) -> str:
        """YouTube Transcript API extraction (mock or real)"""
        # Use the real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                loop = asyncio.get_event_loop()

                def _fetch_transcript():
                    # Try multiple language preferences
                    languages = [language, 'en', 'en-US', 'en-GB']
                    for lang in languages:
                        try:
                            # Use the static method get_transcript directly
                            transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
                            # Store the raw transcript data for segments
                            self._last_transcript_segments = transcript_list
                            # Convert list of transcript entries to text
                            full_text = ' '.join([entry['text'] for entry in transcript_list])
                            return full_text
                        except Exception as e:
                            logger.debug(f"Failed to fetch transcript for language {lang}: {e}")
                            continue
                    raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

                transcript_text = await loop.run_in_executor(None, _fetch_transcript)
                return transcript_text
            except Exception as e:
                logger.error(f"Real YouTube transcript extraction failed: {e}")
                raise TranscriptNotAvailableError(f"Failed to extract transcript: {e}")

        # Mock implementation
        await asyncio.sleep(0.3)  # Simulate API call

        # Simulate success/failure based on probability
        if random.random() > self._method_success_rates["youtube_api"]:
            raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

        # Return mock transcript if available
        if video_id in self.MOCK_TRANSCRIPTS:
            return self.MOCK_TRANSCRIPTS[video_id]["text"]

        # Generate generic mock transcript
        return f"""This is a mock transcript extracted via YouTube API for video {video_id}.
        The content discusses various topics related to technology and innovation.
        This demonstration text shows how the transcript extraction service works."""

    async def _extract_auto_captions(self, video_id: str,
                                     language: str) -> str:
        """Auto-generated captions extraction"""
        # Use the real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                loop = asyncio.get_event_loop()

                def _fetch_auto_captions():
                    # Try to get auto-generated captions
                    try:
                        # List available transcripts for the video
                        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

                        # Try to find auto-generated captions for the requested language
                        for transcript in transcript_list:
                            if transcript.is_generated and transcript.language_code == language:
                                caption_list = transcript.fetch()
                                full_text = ' '.join([entry['text'] for entry in caption_list])
                                return f"[Auto-generated] {full_text}"

                        # Try English auto-generated captions as fallback
                        for transcript in transcript_list:
                            if transcript.is_generated and transcript.language_code in ['en', 'en-US']:
                                caption_list = transcript.fetch()
                                full_text = ' '.join([entry['text'] for entry in caption_list])
                                return f"[Auto-generated] {full_text}"

                        raise CaptionsNotAvailableError(f"No auto-generated captions available for {video_id}")
                    except Exception as e:
                        raise CaptionsNotAvailableError(f"Failed to fetch auto-captions: {e}")

                caption_text = await loop.run_in_executor(None, _fetch_auto_captions)
                return caption_text
            except Exception as e:
                logger.error(f"Real auto-caption extraction failed: {e}")
                raise CaptionsNotAvailableError(f"Failed to extract auto-captions: {e}")

        # Mock implementation fallback
        await asyncio.sleep(0.4)  # Simulate API call

        if random.random() > self._method_success_rates["auto_captions"]:
            raise CaptionsNotAvailableError(f"No auto-captions for {video_id}")

        return f"""[Auto-generated] This is a mock auto-caption transcript for video {video_id}.
        Auto-generated captions may contain errors but provide useful content.
        The transcript has been processed and cleaned for better readability."""

    async def _transcribe_audio(self, video_id: str,
                                language: str) -> str:
        """Audio transcription using Whisper (mock or real)"""
        # Use the real implementation if available
        if self._using_real_whisper and self.whisper_client and not isinstance(self.whisper_client, MockWhisperClient):
            try:
                # Use the real Whisper service
                logger.info(f"Using real Whisper service for video {video_id}")
                video_url = f"https://www.youtube.com/watch?v={video_id}"

                segments, metadata = await self.whisper_client.transcribe_video(
                    video_id, video_url
                )

                # Convert DualTranscriptSegment to TranscriptSegment for compatibility
                from backend.models.transcript import TranscriptSegment
                converted_segments = []
                for segment in segments:
                    converted_segments.append(TranscriptSegment(
                        text=segment.text,
                        start=segment.start_time,
                        duration=segment.end_time - segment.start_time
                    ))

                # Store converted segments for use in _create_result
                self._last_whisper_segments = converted_segments

                # Convert segments to text
                transcript_text = ' '.join([segment.text for segment in segments])
                logger.info(f"Successfully transcribed audio for {video_id} - {metadata.word_count} words")
                return transcript_text
            except Exception as e:
                logger.error(f"Real audio transcription failed: {e}")
                raise AudioTranscriptionError(f"Failed to transcribe audio: {e}")

        # Mock implementation
        await asyncio.sleep(0.8)  # Simulate longer processing time

        if random.random() > self._method_success_rates["whisper_audio"]:
            raise AudioTranscriptionError(f"Failed to transcribe audio for {video_id}")

        return await self.whisper_client.transcribe(video_id)

    async def _extract_with_video_downloader(self, video_id: str, language: str) -> str:
        """Use IntelligentVideoDownloader with multiple fallback methods"""
        if not self.video_downloader:
            raise Exception("Video downloader not available")

        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # Configure preferences for transcript extraction
        preferences = DownloadPreferences(
            quality=VideoQuality.AUDIO_ONLY,  # We only need audio for transcription
            prefer_audio_only=True,
            fallback_to_transcript=True
        )

        logger.info(f"Attempting advanced download methods for {video_id}")

        # The IntelligentVideoDownloader will try:
        # 1. pytubefix
        # 2. yt-dlp
        # 3. playwright (browser automation)
        # 4. external tools
        # 5. web services
        # 6. transcript-only fallback
        result = await self.video_downloader.download_video(video_url, preferences)

        if result.status in [DownloadStatus.COMPLETED, DownloadStatus.PARTIAL]:
            # If we got audio, transcribe it
            if result.audio_file and result.audio_file.exists():
                # Use Whisper to transcribe the downloaded audio
                if self._using_real_whisper and self.whisper_client:
                    segments, metadata = await self.whisper_client.transcribe_video(
                        video_id, video_url
                    )
                    return ' '.join([segment.text for segment in segments])
                else:
                    # Fall back to basic extraction
                    return f"[Advanced Download] Successfully downloaded audio for {video_id} using {result.method_used}"

            # If we only got transcript data
            if result.transcript:
                return result.transcript

            raise Exception("Download completed but no transcript available")
        else:
            raise Exception(f"All advanced download methods failed: {result.error_message}")

    async def _create_result(self, video_id: str, transcript: str,
                             method: ExtractionMethod, language: str,
                             start_time: float) -> TranscriptResult:
        """Create TranscriptResult with metadata"""
        processing_time = time.time() - start_time
        word_count = len(transcript.split())

        metadata = TranscriptMetadata(
            word_count=word_count,
            estimated_reading_time=int(word_count / 200 * 60),  # 200 WPM reading speed
            language=language,
            has_timestamps=method == ExtractionMethod.YOUTUBE_API,
            extraction_method=method,
            processing_time_seconds=processing_time
        )

        # Get segments if available
        segments = None
        # Check for real Whisper segments first
        if self._last_whisper_segments and method == ExtractionMethod.WHISPER_AUDIO:
            segments = self._last_whisper_segments
            self._last_whisper_segments = None  # Clear after use
        # Fall back to mock data segments
        elif video_id in self.MOCK_TRANSCRIPTS and self.MOCK_TRANSCRIPTS[video_id].get("segments"):
            segments = [TranscriptSegment(**seg) for seg in self.MOCK_TRANSCRIPTS[video_id]["segments"]]

        return TranscriptResult(
            video_id=video_id,
            transcript=transcript,
            segments=segments,
            metadata=metadata,
            method=method,
            success=True,
            from_cache=False
        )

    async def _cache_result(self, cache_key: str, result: TranscriptResult):
        """Cache the transcript result"""
        try:
            # Convert to dict for caching
            cache_data = result.model_dump(exclude={'from_cache'})
            await self.cache_client.set(cache_key, cache_data, ttl=86400)  # 24 hours
            logger.info(f"Cached transcript for key {cache_key}")
        except Exception as e:
            logger.error(f"Failed to cache transcript: {e}")

    def extract_metadata(self, transcript: str) -> Dict[str, Any]:
        """Extract metadata from transcript text"""
        word_count = len(transcript.split())
        char_count = len(transcript)
        line_count = len(transcript.split('\n'))

        return {
            "word_count": word_count,
            "character_count": char_count,
            "line_count": line_count,
            "estimated_reading_time_seconds": int(word_count / 200 * 60),
            "average_words_per_line": word_count / max(line_count, 1)
        }
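

# ---------------------------------------------------------------------------
# Illustrative usage (not part of the original service): a minimal sketch of
# exercising the fallback chain end to end, assuming the default configuration
# (mock cache, mock Whisper, USE_MOCK_SERVICES enabled). "dQw4w9WgXcQ" is one
# of the bundled MOCK_TRANSCRIPTS keys; other IDs fall through to generic mock
# text or a simulated failure result.
if __name__ == "__main__":
    async def _demo() -> None:
        service = TranscriptService()  # mock cache + mock Whisper by default
        result = await service.extract_transcript("dQw4w9WgXcQ", language_preference="en")
        print(f"success={result.success} method={result.method} from_cache={result.from_cache}")
        if result.transcript:
            # extract_metadata works on any plain transcript string
            print(service.extract_metadata(result.transcript))

    asyncio.run(_demo())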