# youtube-summarizer/backend/services/enhanced_transcript_service.py
#
# 405 lines
# 16 KiB
# Python

"""
Enhanced Transcript Service with local video file support.
Integrates with VideoDownloadService for local file-based transcription.
"""
import asyncio
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urlparse

from backend.models.transcript import (
    TranscriptResult,
    TranscriptMetadata,
    TranscriptSegment,
    ExtractionMethod,
)
from backend.core.exceptions import (
    TranscriptExtractionError,
    ErrorCode,
)
from backend.services.transcript_service import TranscriptService
from backend.services.video_download_service import VideoDownloadService, VideoDownloadError
from backend.services.mock_cache import MockCacheClient
logger = logging.getLogger(__name__)
class MockWhisperService:
    """Mock Whisper service for local audio transcription.

    Stands in for a real speech-to-text backend (in production this would use
    OpenAI Whisper or similar): it waits for a configurable delay and then
    returns a fixed, canned transcript whose first line names the audio file.
    """

    # Canned transcript body. A header line naming the audio file is
    # prepended per call in ``transcribe_audio``.
    _MOCK_TEXT_LINES = (
        "This is a high-quality transcription from the downloaded video.",
        "Local transcription provides better accuracy than online methods.",
        "The video discusses important topics including:",
        "- Advanced machine learning techniques",
        "- Modern software architecture patterns",
        "- Best practices for scalable applications",
        "- Performance optimization strategies",
        "Using local files ensures we can process videos even if they're removed from YouTube,",
        "and we get consistent quality across all transcriptions.",
        "This mock transcript demonstrates the enhanced capabilities of local processing,",
        "which would include proper timestamps and speaker detection in production.",
    )

    # Canned segments as (text, start seconds, end seconds).
    _MOCK_SEGMENTS = (
        ("This is a high-quality transcription from the downloaded video.", 0.0, 4.0),
        ("Local transcription provides better accuracy than online methods.", 4.0, 8.0),
        (
            "The video discusses important topics including advanced machine learning techniques.",
            8.0,
            13.0,
        ),
    )

    # Mock duration (seconds) reported for every file.
    _MOCK_DURATION = 120.0

    def __init__(
        self,
        model_name: str = "base",
        language: str = "en",
        processing_delay: float = 1.0,
    ) -> None:
        """
        Initialize the mock service.

        Args:
            model_name: Name of the (pretend) Whisper model.
            language: Language code reported in every transcription result.
            processing_delay: Seconds to wait before returning, simulating
                processing time. Pass 0 to return without a delay.
        """
        self.model_name = model_name
        self.language = language
        self.processing_delay = processing_delay

    async def transcribe_audio(self, audio_path: Path) -> Dict[str, Any]:
        """
        Mock transcription of audio file.

        The file is never opened; only its name is used, in the first line
        of the returned text.

        Args:
            audio_path: Path to audio file

        Returns:
            Dict with keys ``text`` (full transcript), ``segments`` (list of
            dicts with ``text``/``start``/``end``), ``language`` and
            ``duration``.
        """
        # Accept plain strings as well as Path objects.
        audio_file = Path(audio_path)

        # Simulate processing time without blocking the event loop.
        await asyncio.sleep(self.processing_delay)

        header = f"[Transcribed from local audio: {audio_file.name}]"
        return {
            "text": "\n".join((header, *self._MOCK_TEXT_LINES)),
            # Fresh dicts on every call so callers may mutate the result.
            "segments": [
                {"text": text, "start": start, "end": end}
                for text, start, end in self._MOCK_SEGMENTS
            ],
            "language": self.language,
            "duration": self._MOCK_DURATION,
        }
class EnhancedTranscriptService(TranscriptService):
    """
    Enhanced transcript service that prioritizes local video files.

    Extraction order used by ``extract_transcript``:
    1. Return a cached transcript if one exists
    2. Transcribe locally downloaded audio, if the video is already downloaded
    3. If ``force_download`` is set, download the video and transcribe it
    4. Fall back to the parent class's extraction (YouTube API methods)
    5. As a last resort, download the video and transcribe it with Whisper
    """

    # How long extracted transcripts stay in the cache (24 hours).
    _CACHE_TTL_SECONDS = 86400

    # First path segment of YouTube URLs that carry the video ID in the path,
    # e.g. ``/shorts/<id>`` or ``/embed/<id>``.
    _ID_PATH_PREFIXES = frozenset({"embed", "shorts", "live", "v"})

    def __init__(
        self,
        video_service: Optional[VideoDownloadService] = None,
        cache_client: Optional[MockCacheClient] = None,
        whisper_service: Optional[MockWhisperService] = None
    ):
        """
        Initialize enhanced transcript service.

        Args:
            video_service: Video download service for local files; a default
                ``VideoDownloadService`` is created when omitted
            cache_client: Cache client for transcript caching (passed to the
                parent class)
            whisper_service: Whisper service for local transcription; a
                ``MockWhisperService`` is created when omitted
        """
        super().__init__(cache_client=cache_client)

        # Compare against None instead of using ``or`` so a supplied service
        # is never replaced just because it happens to evaluate as falsy.
        self.video_service = (
            video_service if video_service is not None else VideoDownloadService()
        )
        self.whisper_service = (
            whisper_service if whisper_service is not None else MockWhisperService()
        )

        # Update success rates to prefer local files
        self._method_success_rates = {
            "local_file": 0.95,        # 95% success with local files
            "youtube_api": 0.7,        # 70% success with YouTube API
            "auto_captions": 0.5,      # 50% success with auto-captions
            "whisper_download": 0.9    # 90% success with download + Whisper
        }

    def _extract_video_id_from_url(self, url: str) -> str:
        """
        Extract video ID from YouTube URL.

        Handles ``youtube.com`` URLs with a ``v`` query parameter in any
        position, ``youtube.com`` path forms (``/embed/``, ``/shorts/``,
        ``/live/``, ``/v/``) and ``youtu.be`` short links. Query strings and
        fragments are never included in the returned ID.

        Args:
            url: YouTube URL, with or without a scheme, or a bare video ID

        Returns:
            The video ID, or ``url`` unchanged when no ID can be located
            (the input is then assumed to already be a video ID)
        """
        candidate = url.strip()
        # urlparse only recognises the host when a scheme is present.
        if "://" not in candidate:
            candidate = f"https://{candidate}"

        try:
            parsed = urlparse(candidate)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): treat as an ID.
            return url

        path_parts = [part for part in parsed.path.split("/") if part]

        if host == "youtu.be" or host.endswith(".youtu.be"):
            if path_parts:
                return path_parts[0]
        elif host == "youtube.com" or host.endswith(".youtube.com"):
            ids = parse_qs(parsed.query).get("v")
            if ids:
                return ids[0]
            if len(path_parts) >= 2 and path_parts[0] in self._ID_PATH_PREFIXES:
                return path_parts[1]

        # Assume it's already a video ID
        return url

    async def _cache_result(
        self,
        cache_key: str,
        result: TranscriptResult
    ) -> TranscriptResult:
        """Store ``result`` under ``cache_key`` and return it unchanged."""
        await self.cache_client.set(
            cache_key,
            result.model_dump_json(),
            ttl=self._CACHE_TTL_SECONDS
        )
        return result

    def _build_whisper_result(
        self,
        video_id: str,
        transcription: Dict[str, Any],
        processing_time: float
    ) -> TranscriptResult:
        """
        Convert a Whisper transcription dict into a ``TranscriptResult``.

        Args:
            video_id: YouTube video ID the transcription belongs to
            transcription: Dict with a required ``text`` key and optional
                ``segments`` (each with ``text``/``start``/``end``),
                ``language`` and ``duration`` keys
            processing_time: Value reported as the result's processing time

        Returns:
            TranscriptResult marked as produced by Whisper audio transcription

        Raises:
            KeyError: If ``text`` or a segment field is missing
        """
        segments = [
            TranscriptSegment(
                text=seg["text"],
                start=seg["start"],
                duration=seg["end"] - seg["start"]
            )
            for seg in transcription.get("segments", [])
        ]
        text = transcription["text"]
        language = transcription.get("language", "en")

        metadata = TranscriptMetadata(
            language=language,
            duration=transcription.get("duration", 0),
            word_count=len(text.split()),
            has_timestamps=bool(segments)
        )

        return TranscriptResult(
            video_id=video_id,
            transcript=text,
            segments=segments,
            metadata=metadata,
            method=ExtractionMethod.WHISPER_AUDIO,
            language=language,
            success=True,
            from_cache=False,
            processing_time=processing_time
        )

    async def extract_transcript(
        self,
        video_id_or_url: str,
        language_preference: str = "en",
        force_download: bool = False
    ) -> TranscriptResult:
        """
        Extract transcript with local file priority.

        Every successful result is cached for 24 hours under a key built from
        the video ID and ``language_preference``.

        Args:
            video_id_or_url: YouTube video ID or URL
            language_preference: Preferred language for transcript
            force_download: Try downloading and transcribing the video before
                the online (parent class) methods

        Returns:
            TranscriptResult with transcript and metadata

        Raises:
            TranscriptExtractionError: If every extraction method fails; the
                error from the online methods is attached as its cause
        """
        # Determine if input is URL or video ID
        if "youtube.com" in video_id_or_url or "youtu.be" in video_id_or_url:
            url = video_id_or_url
            video_id = self._extract_video_id_from_url(url)
        else:
            video_id = video_id_or_url
            url = f"https://www.youtube.com/watch?v={video_id}"

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info("Transcript cache hit for %s", video_id)
            return TranscriptResult.model_validate(json.loads(cached_result))

        # NOTE(review): the local and download paths below do not receive
        # language_preference, yet their results are cached under the
        # language-specific key — confirm this is intended.

        # Try local file first if available
        if self.video_service.is_video_downloaded(video_id):
            logger.info("Using local files for transcript extraction: %s", video_id)
            local_result = await self._extract_from_local_video(video_id)
            if local_result is not None:
                return await self._cache_result(cache_key, local_result)

        # If force_download, download the video first
        if force_download:
            logger.info("Force downloading video for transcription: %s", video_id)
            download_result = await self._download_and_transcribe(url, video_id)
            if download_result is not None:
                return await self._cache_result(cache_key, download_result)

        # Try YouTube API methods (from parent class)
        try:
            logger.info("Attempting YouTube API transcript extraction for %s", video_id)
            api_result = await super().extract_transcript(video_id, language_preference)
        except TranscriptExtractionError as e:
            logger.warning("YouTube API methods failed: %s", e)
            # Keep the error so the final failure can be chained to it.
            api_error = e
        else:
            return await self._cache_result(cache_key, api_result)

        # As last resort, download video and transcribe. With force_download
        # set this repeats the download attempt made above.
        logger.info("Falling back to download and transcribe for %s", video_id)
        download_result = await self._download_and_transcribe(url, video_id)
        if download_result is not None:
            return await self._cache_result(cache_key, download_result)

        # If all methods fail, raise error
        raise TranscriptExtractionError(
            message="Unable to extract transcript through any method",
            error_code=ErrorCode.TRANSCRIPT_UNAVAILABLE,
            details={
                "video_id": video_id,
                "attempted_methods": [
                    "local_file", "youtube_api", "auto_captions", "download_and_transcribe"
                ],
                "suggestions": [
                    "Check if video is available and public",
                    "Try again later",
                    "Enable captions on the video"
                ]
            }
        ) from api_error

    async def _extract_from_local_video(self, video_id: str) -> Optional[TranscriptResult]:
        """
        Extract transcript from locally stored video/audio files.

        Looks up the video's entry in the download service's cache and, when
        the recorded audio file exists on disk, transcribes it with the
        Whisper service. A video file without extracted audio is only logged.

        Args:
            video_id: YouTube video ID

        Returns:
            TranscriptResult or None if extraction fails
        """
        try:
            # Get cached video info
            video_hash = self.video_service._get_video_hash(video_id)
            cached_info = self.video_service.cache.get(video_hash)

            if not cached_info:
                logger.warning("No cache info for downloaded video %s", video_id)
                return None

            # Check for audio file
            audio_path = cached_info.get('audio_path')
            if audio_path:
                audio_file = Path(audio_path)
                if audio_file.exists():
                    logger.info("Transcribing from local audio: %s", audio_file)
                    # Transcribe using Whisper
                    transcription = await self.whisper_service.transcribe_audio(audio_file)
                    return self._build_whisper_result(
                        video_id,
                        transcription,
                        processing_time=1.0  # Mock processing time
                    )

            # If no audio file, check for video file
            video_path = cached_info.get('video_path')
            if video_path:
                video_file = Path(video_path)
                if video_file.exists():
                    logger.info("Video found but no audio extracted yet: %s", video_file)
                    # Could extract audio here if needed

            return None

        except Exception as e:
            # Best effort: any failure here lets the caller fall through to
            # the next extraction method. Log with traceback for debugging.
            logger.exception("Error extracting from local video %s: %s", video_id, e)
            return None

    async def _download_and_transcribe(self, url: str, video_id: str) -> Optional[TranscriptResult]:
        """
        Download video and transcribe the audio.

        Args:
            url: YouTube URL
            video_id: Video ID

        Returns:
            TranscriptResult or None if fails
        """
        try:
            logger.info("Downloading video for transcription: %s", video_id)

            # Download video with audio extraction
            video_path, audio_path = await self.video_service.download_video(
                url=url,
                extract_audio=True,
                force=False
            )

            if audio_path and audio_path.exists():
                logger.info("Audio extracted, transcribing: %s", audio_path)
                # Transcribe using Whisper
                transcription = await self.whisper_service.transcribe_audio(audio_path)
                return self._build_whisper_result(
                    video_id,
                    transcription,
                    processing_time=2.0  # Mock processing time
                )

            logger.warning("Download succeeded but no audio extracted for %s", video_id)
            return None

        except VideoDownloadError as e:
            logger.error("Failed to download video %s: %s", video_id, e)
            return None
        except Exception as e:
            # Best effort: report the failure as None so callers can move on.
            logger.exception("Error in download and transcribe for %s: %s", video_id, e)
            return None

    async def get_transcript_with_priority(
        self,
        video_id: str,
        prefer_local: bool = True,
        download_if_missing: bool = False
    ) -> TranscriptResult:
        """
        Get transcript with configurable priority.

        Unlike ``extract_transcript`` this method neither reads nor writes
        the transcript cache itself.

        Args:
            video_id: YouTube video ID
            prefer_local: Prefer local files over API
            download_if_missing: Download and transcribe the video when the
                API methods fail

        Returns:
            TranscriptResult

        Raises:
            TranscriptExtractionError: If the API methods fail and no
                download fallback is enabled or the download also fails
        """
        url = f"https://www.youtube.com/watch?v={video_id}"

        if prefer_local and self.video_service.is_video_downloaded(video_id):
            # Try local first
            local_result = await self._extract_from_local_video(video_id)
            if local_result is not None:
                return local_result

        # Try API methods
        try:
            return await super().extract_transcript(video_id)
        except TranscriptExtractionError:
            if download_if_missing:
                # Download and transcribe
                download_result = await self._download_and_transcribe(url, video_id)
                if download_result is not None:
                    return download_result
            # Nothing worked: surface the original API error.
            raise

    def get_extraction_stats(self) -> Dict[str, Any]:
        """
        Get statistics about extraction methods and success rates.

        Returns:
            Dict with the configured per-method success rates (a copy, so
            callers cannot alter the service's own table), the number of
            entries in the download cache, the total storage in MB reported
            by the download service, and the preferred method
            (``local_file`` when the download cache is non-empty, otherwise
            ``youtube_api``)
        """
        cache = self.video_service.cache
        return {
            "method_success_rates": dict(self._method_success_rates),
            "cached_videos": len(cache),
            "total_storage_mb": self.video_service.get_storage_stats()['total_size_mb'],
            "preferred_method": "local_file" if cache else "youtube_api"
        }