477 lines
22 KiB
Python
477 lines
22 KiB
Python
import asyncio
|
|
import time
|
|
import logging
|
|
from typing import Optional, List, Dict, Any, TYPE_CHECKING
|
|
import json
|
|
import random
|
|
from datetime import datetime
|
|
|
|
if TYPE_CHECKING:
|
|
from backend.core.websocket_manager import WebSocketManager
|
|
|
|
from backend.models.transcript import (
|
|
TranscriptResult,
|
|
TranscriptMetadata,
|
|
TranscriptSegment,
|
|
ExtractionMethod
|
|
)
|
|
from backend.core.exceptions import (
|
|
TranscriptExtractionError,
|
|
ErrorCode
|
|
)
|
|
from backend.services.mock_cache import MockCacheClient
|
|
from backend.services.intelligent_video_downloader import IntelligentVideoDownloader
|
|
from backend.models.video_download import DownloadPreferences, VideoQuality, DownloadStatus
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MockWhisperClient:
    """Mock Whisper client for audio transcription simulation.

    Args:
        processing_delay: Seconds ``transcribe`` sleeps to imitate processing
            time. Defaults to 0.5; pass 0 to skip the wait (e.g. in tests).
    """

    # Class-level default so instances created without __init__ still work.
    processing_delay: float = 0.5

    def __init__(self, processing_delay: float = 0.5):
        self.processing_delay = processing_delay

    async def transcribe(self, video_id: str) -> str:
        """Return a canned transcription string for ``video_id``.

        Args:
            video_id: Identifier interpolated into the returned text.

        Returns:
            A fixed mock sentence mentioning ``video_id``.
        """
        await asyncio.sleep(self.processing_delay)  # Simulate processing time
        return f"[Whisper transcription] This is a mock audio transcription for video {video_id}."
|
|
|
|
|
|
class TranscriptNotAvailableError(Exception):
    """Signals that no transcript could be obtained through the YouTube API."""
|
|
|
|
|
|
class CaptionsNotAvailableError(Exception):
    """Signals that auto-generated captions could not be obtained."""
|
|
|
|
|
|
class AudioTranscriptionError(Exception):
    """Signals that transcribing the audio track failed."""
|
|
|
|
|
|
class TranscriptService:
    """Service for extracting video transcripts with fallback methods.

    ``extract_transcript`` tries, in order: the YouTube Transcript API,
    auto-generated captions, Whisper audio transcription and - when an
    ``IntelligentVideoDownloader`` could be created - the downloader's own
    fallback methods. Successful results are cached for 24 hours. Unless
    real extraction is enabled in settings, the first three steps are
    simulated with mock text and random failures.

    NOTE(review): segments are handed from the extractors to
    ``_create_result`` through instance attributes, so concurrent
    ``extract_transcript`` calls on one instance could interleave - confirm
    that instances are not shared across concurrent requests.
    """

    # Mock transcript data for demonstration
    MOCK_TRANSCRIPTS = {
        "dQw4w9WgXcQ": {
            "text": """Welcome to this comprehensive tutorial on modern web development.
Today we'll be exploring the fundamentals of building scalable applications.

First, let's discuss the importance of choosing the right architecture.
When building web applications, you need to consider factors like performance,
maintainability, and user experience.

The key components we'll cover include:
- Frontend frameworks and their ecosystems
- Backend API design patterns
- Database optimization strategies
- Deployment and DevOps best practices

Throughout this video, we'll build a real-world application step by step,
explaining each decision and trade-off along the way.

By the end of this tutorial, you'll have a solid understanding of modern
web development practices and be ready to build your own production-ready applications.""",
            "segments": [
                {"text": "Welcome to this comprehensive tutorial on modern web development.", "start": 0.0, "duration": 3.5},
                {"text": "Today we'll be exploring the fundamentals of building scalable applications.", "start": 3.5, "duration": 4.0},
                {"text": "First, let's discuss the importance of choosing the right architecture.", "start": 7.5, "duration": 3.8},
            ]
        },
        "test123": {
            "text": """This is a test video transcript for demonstration purposes.
It contains sample content that can be used for testing the summarization system.

The transcript includes multiple paragraphs and various topics to ensure
the system can handle different types of content effectively.""",
            "segments": []
        }
    }

    def __init__(self, cache_client: Optional[MockCacheClient] = None,
                 whisper_client: Optional[MockWhisperClient] = None,
                 websocket_manager: Optional['WebSocketManager'] = None):
        """Wire up cache, Whisper and (optionally) the video downloader.

        Args:
            cache_client: Cache used for transcript results; a
                ``MockCacheClient`` is created when omitted.
            whisper_client: Audio transcription client; a
                ``MockWhisperClient`` is created when omitted.
            websocket_manager: Passed through to the
                ``IntelligentVideoDownloader`` when one is created.
        """
        # Compare against None rather than truthiness so that a client which
        # happens to be falsy (e.g. an empty cache defining __len__) is kept.
        self.cache_client = cache_client if cache_client is not None else MockCacheClient()
        self.whisper_client = whisper_client if whisper_client is not None else MockWhisperClient()
        self.websocket_manager = websocket_manager
        self._method_success_rates = {
            "youtube_api": 0.7,    # 70% success rate for primary method
            "auto_captions": 0.5,  # 50% success rate for auto-captions
            "whisper_audio": 0.9   # 90% success rate for Whisper
        }
        # Check if we should use real YouTube API based on environment settings
        from backend.core.config import settings
        self._use_real_youtube_api = not settings.USE_MOCK_SERVICES and settings.ENABLE_REAL_TRANSCRIPT_EXTRACTION
        self._using_real_whisper = whisper_client is not None and not isinstance(whisper_client, MockWhisperClient)

        # Initialize intelligent video downloader for additional fallback methods
        self.video_downloader = None

        # Store segments temporarily for passing to _create_result
        self._last_whisper_segments = None
        self._last_transcript_segments = None
        if self._use_real_youtube_api:
            try:
                self.video_downloader = IntelligentVideoDownloader(websocket_manager=websocket_manager)
                logger.info("Initialized IntelligentVideoDownloader with multiple fallback methods and WebSocket support")
            except Exception as e:
                # Best effort: the service still works without the downloader.
                logger.warning(f"Could not initialize IntelligentVideoDownloader: {e}")

        logger.info(f"TranscriptService initialized: use_real_youtube_api={self._use_real_youtube_api}, using_real_whisper={self._using_real_whisper}")

    @staticmethod
    def _candidate_languages(language: str) -> List[str]:
        """Return the preferred language followed by English fallbacks, without duplicates."""
        return list(dict.fromkeys((language, 'en', 'en-US', 'en-GB')))

    @staticmethod
    def _reading_time_seconds(word_count: int) -> int:
        """Return the estimated reading time in whole seconds at 200 words per minute."""
        return int(word_count / 200 * 60)

    @staticmethod
    def _failure_details(video_id: str, attempted_methods: List[str],
                         last_error: Optional[BaseException]) -> Dict[str, Any]:
        """Build the ``details`` payload of the error returned when every method failed."""
        return {
            "video_id": video_id,
            "attempted_methods": list(attempted_methods),
            "last_error": str(last_error) if last_error is not None else "Unknown error",
            "suggestions": [
                "Try a different video with captions available",
                "Check if video is public and accessible",
                "Contact support if this video should have transcripts"
            ]
        }

    @staticmethod
    def _to_transcript_segments(segments) -> List[TranscriptSegment]:
        """Convert Whisper segments (``text``/``start_time``/``end_time``) to TranscriptSegment."""
        return [
            TranscriptSegment(
                text=segment.text,
                start=segment.start_time,
                duration=segment.end_time - segment.start_time
            )
            for segment in segments
        ]

    async def _build_and_cache(self, cache_key: str, video_id: str, transcript: str,
                               method: ExtractionMethod, language: str,
                               start_time: float) -> TranscriptResult:
        """Create the result for a successful extraction and store it in the cache."""
        result = await self._create_result(video_id, transcript, method, language, start_time)
        await self._cache_result(cache_key, result)
        return result

    async def extract_transcript(self, video_id: str,
                                 language_preference: str = "en") -> TranscriptResult:
        """
        Extract transcript using fallback chain with caching.

        Args:
            video_id: YouTube video ID
            language_preference: Preferred language code

        Returns:
            TranscriptResult with transcript data or error
        """
        start_time = time.time()

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)

        if cached_result:
            logger.info(f"Cache hit for video {video_id}")
            try:
                # The cached_result may be a JSON string or an already-parsed dict
                result_data = json.loads(cached_result) if isinstance(cached_result, str) else cached_result
                # Merge instead of passing from_cache twice, in case the
                # cached payload already carries that key.
                return TranscriptResult(**{**result_data, "from_cache": True})
            except (TypeError, ValueError) as exc:
                # json.JSONDecodeError is a ValueError; model validation
                # failures are expected to surface as ValueError/TypeError
                # too. A corrupt entry is treated as a cache miss.
                logger.warning(f"Ignoring unusable cache entry for {cache_key}: {exc}")

        # Track what was tried and why it failed. The name bound by
        # ``except ... as`` is unbound when the except clause ends, so the
        # last error must be kept in a separate variable.
        attempted_methods: List[str] = []
        last_error: Optional[Exception] = None

        # Primary method followed by fallbacks 1 and 2 (mock or real)
        fallback_chain = (
            ("youtube_api", self._extract_youtube_transcript,
             ExtractionMethod.YOUTUBE_API, TranscriptNotAvailableError,
             f"YouTube API transcript not available for {video_id}"),
            ("auto_captions", self._extract_auto_captions,
             ExtractionMethod.AUTO_CAPTIONS, CaptionsNotAvailableError,
             f"Auto-captions not available for {video_id}"),
            ("whisper_audio", self._transcribe_audio,
             ExtractionMethod.WHISPER_AUDIO, AudioTranscriptionError,
             f"Whisper transcription failed for {video_id}, trying advanced fallback methods"),
        )
        for name, extractor, method, expected_error, failure_note in fallback_chain:
            attempted_methods.append(name)
            try:
                transcript = await extractor(video_id, language_preference)
            except expected_error as exc:
                last_error = exc
                logger.info(failure_note)
                continue
            return await self._build_and_cache(
                cache_key, video_id, transcript, method,
                language_preference, start_time
            )

        # Fallback 3-8: Use IntelligentVideoDownloader with multiple methods
        # This includes: pytubefix, yt-dlp, playwright, external tools, web services
        if self.video_downloader:
            attempted_methods.append("video_downloader")
            try:
                transcript = await self._extract_with_video_downloader(video_id, language_preference)
                return await self._build_and_cache(
                    cache_key, video_id, transcript,
                    ExtractionMethod.WHISPER_AUDIO,  # Mark as audio since it's likely from audio
                    language_preference, start_time
                )
            except Exception as exc:
                last_error = exc
                logger.error(f"Advanced fallback methods failed for {video_id}: {exc}")

        logger.error(f"All transcript extraction methods failed for {video_id}")

        return TranscriptResult(
            video_id=video_id,
            transcript=None,
            method=ExtractionMethod.FAILED,
            success=False,
            error={
                "code": ErrorCode.TRANSCRIPT_NOT_AVAILABLE,
                "message": "Unable to extract transcript from video",
                "details": self._failure_details(video_id, attempted_methods, last_error)
            }
        )

    async def _extract_youtube_transcript(self, video_id: str,
                                          language: str) -> str:
        """YouTube Transcript API extraction (mock or real).

        Args:
            video_id: YouTube video ID.
            language: Preferred language code; English variants are tried next.

        Returns:
            The transcript text.

        Raises:
            TranscriptNotAvailableError: If no transcript could be obtained.
        """
        # Forget segments from any earlier call so they can never be attached
        # to a different video's result.
        self._last_transcript_segments = None

        # Use real implementation if available
        if self._use_real_youtube_api:
            try:
                # NOTE(review): uses the static get_transcript API; newer
                # youtube-transcript-api releases changed this interface -
                # confirm against the pinned version.
                from youtube_transcript_api import YouTubeTranscriptApi

                def _fetch_transcript():
                    # Try each candidate language once, preferred first
                    for lang in self._candidate_languages(language):
                        try:
                            transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
                        except Exception as e:
                            logger.debug(f"Failed to fetch transcript for language {lang}: {e}")
                            continue

                        # Store the raw transcript data for segments
                        self._last_transcript_segments = transcript_list

                        # Convert list of transcript entries to text
                        return ' '.join(entry['text'] for entry in transcript_list)

                    raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

                # The library call blocks, so run it in the default executor.
                loop = asyncio.get_running_loop()
                return await loop.run_in_executor(None, _fetch_transcript)

            except TranscriptNotAvailableError:
                # Already the right type; re-wrapping would only nest the message.
                raise
            except Exception as e:
                logger.error(f"Real YouTube transcript extraction failed: {e}")
                raise TranscriptNotAvailableError(f"Failed to extract transcript: {e}") from e

        # Mock implementation
        await asyncio.sleep(0.3)  # Simulate API call

        # Simulate success/failure based on probability
        if random.random() > self._method_success_rates["youtube_api"]:
            raise TranscriptNotAvailableError(f"No transcript available for {video_id}")

        # Return mock transcript if available
        if video_id in self.MOCK_TRANSCRIPTS:
            return self.MOCK_TRANSCRIPTS[video_id]["text"]

        # Generate generic mock transcript
        return f"""This is a mock transcript extracted via YouTube API for video {video_id}.
The content discusses various topics related to technology and innovation.
This demonstration text shows how the transcript extraction service works."""

    async def _extract_auto_captions(self, video_id: str,
                                     language: str) -> str:
        """Auto-generated captions extraction (mock or real).

        Args:
            video_id: YouTube video ID.
            language: Preferred caption language; English is the fallback.

        Returns:
            Caption text prefixed with ``[Auto-generated]``.

        Raises:
            CaptionsNotAvailableError: If no auto-generated captions could be obtained.
        """
        # Use real implementation if available
        if self._use_real_youtube_api:
            try:
                from youtube_transcript_api import YouTubeTranscriptApi

                def _fetch_auto_captions():
                    # List available transcripts and keep the auto-generated ones
                    generated = [
                        transcript
                        for transcript in YouTubeTranscriptApi.list_transcripts(video_id)
                        if transcript.is_generated
                    ]

                    # Requested language first, English auto-captions as fallback
                    candidates = (
                        [t for t in generated if t.language_code == language]
                        or [t for t in generated if t.language_code in ('en', 'en-US')]
                    )
                    if not candidates:
                        raise CaptionsNotAvailableError(f"No auto-generated captions available for {video_id}")

                    caption_list = candidates[0].fetch()
                    full_text = ' '.join(entry['text'] for entry in caption_list)
                    return f"[Auto-generated] {full_text}"

                # The library call blocks, so run it in the default executor.
                loop = asyncio.get_running_loop()
                return await loop.run_in_executor(None, _fetch_auto_captions)

            except CaptionsNotAvailableError:
                # Already the right type; re-wrapping would only nest the message.
                raise
            except Exception as e:
                logger.error(f"Real auto-caption extraction failed: {e}")
                raise CaptionsNotAvailableError(f"Failed to extract auto-captions: {e}") from e

        # Mock implementation fallback
        await asyncio.sleep(0.4)  # Simulate API call

        if random.random() > self._method_success_rates["auto_captions"]:
            raise CaptionsNotAvailableError(f"No auto-captions for {video_id}")

        return f"""[Auto-generated] This is a mock auto-caption transcript for video {video_id}.
Auto-generated captions may contain errors but provide useful content.
The transcript has been processed and cleaned for better readability."""

    async def _transcribe_audio(self, video_id: str,
                                language: str) -> str:
        """Audio transcription using Whisper (mock or real).

        Args:
            video_id: YouTube video ID.
            language: Accepted for signature parity with the other
                extractors; not used by this method.

        Returns:
            The transcribed text.

        Raises:
            AudioTranscriptionError: If transcription fails.
        """
        # Use real implementation if available
        if self._using_real_whisper and self.whisper_client and not isinstance(self.whisper_client, MockWhisperClient):
            try:
                # Use the real Whisper service
                logger.info(f"Using real Whisper service for video {video_id}")
                video_url = f"https://www.youtube.com/watch?v={video_id}"
                segments, metadata = await self.whisper_client.transcribe_video(
                    video_id, video_url
                )

                # Store converted segments for use in _create_result
                self._last_whisper_segments = self._to_transcript_segments(segments)

                # Convert segments to text
                transcript_text = ' '.join(segment.text for segment in segments)
                logger.info(f"Successfully transcribed audio for {video_id} - {metadata.word_count} words")
                return transcript_text

            except Exception as e:
                logger.error(f"Real audio transcription failed: {e}")
                raise AudioTranscriptionError(f"Failed to transcribe audio: {e}") from e

        # Mock implementation
        await asyncio.sleep(0.8)  # Simulate longer processing time

        if random.random() > self._method_success_rates["whisper_audio"]:
            raise AudioTranscriptionError(f"Failed to transcribe audio for {video_id}")

        return await self.whisper_client.transcribe(video_id)

    async def _extract_with_video_downloader(self, video_id: str, language: str) -> str:
        """Use IntelligentVideoDownloader with multiple fallback methods.

        Args:
            video_id: YouTube video ID.
            language: Accepted for signature parity with the other
                extractors; not used by this method.

        Returns:
            Transcript text from Whisper or from the downloader, or a
            placeholder message when only audio could be obtained.

        Raises:
            RuntimeError: If the downloader is missing, the download failed,
                or it produced neither audio nor a transcript.
        """
        if not self.video_downloader:
            raise RuntimeError("Video downloader not available")

        video_url = f"https://www.youtube.com/watch?v={video_id}"

        # Configure preferences for transcript extraction
        preferences = DownloadPreferences(
            quality=VideoQuality.AUDIO_ONLY,  # We only need audio for transcription
            prefer_audio_only=True,
            fallback_to_transcript=True
        )

        logger.info(f"Attempting advanced download methods for {video_id}")

        # The IntelligentVideoDownloader will try:
        # 1. pytubefix
        # 2. yt-dlp
        # 3. playwright (browser automation)
        # 4. external tools
        # 5. web services
        # 6. transcript only fallback
        result = await self.video_downloader.download_video(video_url, preferences)

        if result.status not in (DownloadStatus.COMPLETED, DownloadStatus.PARTIAL):
            raise RuntimeError(f"All advanced download methods failed: {result.error_message}")

        has_audio = bool(result.audio_file and result.audio_file.exists())

        # If we got audio and a real Whisper client, transcribe it
        if has_audio and self._using_real_whisper and self.whisper_client:
            # NOTE(review): this passes the video URL rather than the
            # downloaded audio file - confirm the Whisper client reuses it.
            segments, _metadata = await self.whisper_client.transcribe_video(
                video_id, video_url
            )
            # Keep the timestamps so _create_result can attach them.
            self._last_whisper_segments = self._to_transcript_segments(segments)
            return ' '.join(segment.text for segment in segments)

        # A real transcript from the downloader beats the placeholder below
        if result.transcript:
            return result.transcript

        if has_audio:
            # Fall back to basic extraction
            # NOTE(review): this placeholder is returned (and cached) as if
            # it were a transcript - confirm that is intended.
            return f"[Advanced Download] Successfully downloaded audio for {video_id} using {result.method_used}"

        raise RuntimeError("Download completed but no transcript available")

    async def _create_result(self, video_id: str, transcript: str,
                             method: ExtractionMethod, language: str,
                             start_time: float) -> TranscriptResult:
        """Create TranscriptResult with metadata.

        Args:
            video_id: YouTube video ID.
            transcript: Extracted transcript text.
            method: Extraction method that produced ``transcript``.
            language: Language code recorded in the metadata.
            start_time: ``time.time()`` value taken when extraction began.

        Returns:
            A successful, non-cached TranscriptResult. Segments are attached
            when the matching extractor stored them, or from mock data in
            mock mode.
        """
        processing_time = time.time() - start_time
        word_count = len(transcript.split())

        metadata = TranscriptMetadata(
            word_count=word_count,
            estimated_reading_time=self._reading_time_seconds(word_count),  # 200 WPM reading speed
            language=language,
            has_timestamps=method == ExtractionMethod.YOUTUBE_API,
            extraction_method=method,
            processing_time_seconds=processing_time
        )

        # Get segments if available
        segments = None

        if method == ExtractionMethod.WHISPER_AUDIO and self._last_whisper_segments:
            # Real Whisper segments stored by the transcription step
            segments = self._last_whisper_segments
            self._last_whisper_segments = None  # Clear after use
        elif method == ExtractionMethod.YOUTUBE_API and self._last_transcript_segments:
            # Raw entries stored by the real YouTube path; each entry is
            # indexed by 'text' there and is expected to carry 'start' and
            # 'duration' as well, per the youtube-transcript-api format.
            segments = [
                TranscriptSegment(
                    text=entry['text'],
                    start=entry['start'],
                    duration=entry['duration']
                )
                for entry in self._last_transcript_segments
            ]
            self._last_transcript_segments = None  # Clear after use
        elif (not self._use_real_youtube_api
              and video_id in self.MOCK_TRANSCRIPTS
              and self.MOCK_TRANSCRIPTS[video_id].get("segments")):
            # Mock segments only belong on mock transcripts
            segments = [TranscriptSegment(**seg) for seg in self.MOCK_TRANSCRIPTS[video_id]["segments"]]

        return TranscriptResult(
            video_id=video_id,
            transcript=transcript,
            segments=segments,
            metadata=metadata,
            method=method,
            success=True,
            from_cache=False
        )

    async def _cache_result(self, cache_key: str, result: TranscriptResult):
        """Cache the transcript result for 24 hours; failures are logged, not raised."""
        try:
            # Convert to dict for caching; from_cache is set again on read
            cache_data = result.model_dump(exclude={'from_cache'})
            await self.cache_client.set(cache_key, cache_data, ttl=86400)  # 24 hours
            logger.info(f"Cached transcript for key {cache_key}")
        except Exception as e:
            # Caching is best effort and must not fail the extraction.
            logger.error(f"Failed to cache transcript: {e}")

    def extract_metadata(self, transcript: str) -> Dict[str, Any]:
        """Extract metadata from transcript text.

        Args:
            transcript: Transcript text to analyse.

        Returns:
            Dict with ``word_count``, ``character_count``, ``line_count``,
            ``estimated_reading_time_seconds`` (at 200 words per minute) and
            ``average_words_per_line``.
        """
        word_count = len(transcript.split())
        char_count = len(transcript)
        line_count = len(transcript.split('\n'))

        return {
            "word_count": word_count,
            "character_count": char_count,
            "line_count": line_count,
            "estimated_reading_time_seconds": self._reading_time_seconds(word_count),
            # max() guards the division; split('\n') never returns an empty list
            "average_words_per_line": word_count / max(line_count, 1)
        }