"""
Enhanced Transcript Service with local video file support.

Integrates with VideoDownloadService for local file-based transcription.
"""
|
|
|
|
import asyncio
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urlparse

from backend.core.exceptions import (
    ErrorCode,
    TranscriptExtractionError,
)
from backend.models.transcript import (
    ExtractionMethod,
    TranscriptMetadata,
    TranscriptResult,
    TranscriptSegment,
)
from backend.services.mock_cache import MockCacheClient
from backend.services.transcript_service import TranscriptService
from backend.services.video_download_service import VideoDownloadError, VideoDownloadService
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MockWhisperService:
    """
    Mock Whisper service for local audio transcription.

    Stands in for a real speech-to-text backend: it waits for a configurable
    delay and then returns a canned transcript whose first line names the
    audio file that was passed in.
    """

    def __init__(self, simulated_delay: float = 1.0):
        """
        Initialize the mock service.

        Args:
            simulated_delay: Seconds to wait inside ``transcribe_audio`` to
                imitate processing time. Defaults to one second.
        """
        self.model_name = "base"
        self.language = "en"
        self.simulated_delay = simulated_delay

    async def transcribe_audio(self, audio_path: Path) -> Dict[str, Any]:
        """
        Mock transcription of audio file.
        In production, this would use OpenAI Whisper or similar.

        Args:
            audio_path: Path to audio file. Only its name is used; the file
                is never opened.

        Returns:
            Transcription result with the keys ``text``, ``segments``
            (each with ``text``, ``start`` and ``end``), ``language`` and
            ``duration``.
        """
        await asyncio.sleep(self.simulated_delay)  # Simulate processing time

        # Assembled line by line so the literal carries no accidental
        # source-code indentation into the transcript text.
        transcript_lines = (
            f"[Transcribed from local audio: {audio_path.name}]",
            "This is a high-quality transcription from the downloaded video.",
            "Local transcription provides better accuracy than online methods.",
            "",
            "The video discusses important topics including:",
            "- Advanced machine learning techniques",
            "- Modern software architecture patterns",
            "- Best practices for scalable applications",
            "- Performance optimization strategies",
            "",
            "Using local files ensures we can process videos even if they're removed from YouTube,",
            "and we get consistent quality across all transcriptions.",
            "",
            "This mock transcript demonstrates the enhanced capabilities of local processing,",
            "which would include proper timestamps and speaker detection in production.",
        )

        return {
            "text": "\n".join(transcript_lines),
            "segments": [
                {
                    "text": "This is a high-quality transcription from the downloaded video.",
                    "start": 0.0,
                    "end": 4.0,
                },
                {
                    "text": "Local transcription provides better accuracy than online methods.",
                    "start": 4.0,
                    "end": 8.0,
                },
                {
                    "text": "The video discusses important topics including advanced machine learning techniques.",
                    "start": 8.0,
                    "end": 13.0,
                },
            ],
            # Report the configured language rather than a second hard-coded copy.
            "language": self.language,
            "duration": 120.0,  # Mock duration
        }
|
|
|
|
|
|
class EnhancedTranscriptService(TranscriptService):
    """
    Enhanced transcript service that prioritizes local video files.

    Extraction order used by ``extract_transcript``:
    1. Return a cached transcript when the cache client has one
    2. Transcribe locally downloaded audio if the video is already downloaded
    3. Download and transcribe up front when ``force_download`` is set
    4. Fall back to the parent ``TranscriptService`` extraction
    5. Download the video and transcribe its audio as a last resort
    """

    # Lifetime of transcripts written to the cache, in seconds (24 hours).
    _ENHANCED_CACHE_TTL_SECONDS = 86400

    # URL path prefixes whose following path segment is the video ID,
    # e.g. https://www.youtube.com/embed/<id>.
    _VIDEO_ID_PATH_PREFIXES = frozenset({"embed", "shorts", "live", "v"})

    def __init__(
        self,
        video_service: Optional[VideoDownloadService] = None,
        cache_client: Optional[MockCacheClient] = None,
        whisper_service: Optional[MockWhisperService] = None
    ):
        """
        Initialize enhanced transcript service.

        Args:
            video_service: Video download service for local files
            cache_client: Cache client for transcript caching
            whisper_service: Whisper service for local transcription
        """
        super().__init__(cache_client=cache_client)

        # Compare against None explicitly: ``x or Default()`` would silently
        # discard a supplied service object that happens to be falsy.
        self.video_service = (
            video_service if video_service is not None else VideoDownloadService()
        )
        self.whisper_service = (
            whisper_service if whisper_service is not None else MockWhisperService()
        )

        # Update success rates to prefer local files
        self._method_success_rates = {
            "local_file": 0.95,       # 95% success with local files
            "youtube_api": 0.7,       # 70% success with YouTube API
            "auto_captions": 0.5,     # 50% success with auto-captions
            "whisper_download": 0.9,  # 90% success with download + Whisper
        }

    def _extract_video_id_from_url(self, url: str) -> str:
        """
        Extract video ID from YouTube URL.

        Handles ``watch?v=<id>`` with the ``v`` parameter in any position,
        ``youtu.be/<id>`` short links, and ``/embed/``, ``/shorts/``,
        ``/live/`` and ``/v/`` paths. Query strings and fragments never leak
        into the returned ID.

        Args:
            url: YouTube URL, or a bare video ID

        Returns:
            The video ID, or ``url`` unchanged when no ID can be located
            (the input is then assumed to already be a video ID).
        """
        candidate = url.strip()
        if "youtube.com" not in candidate and "youtu.be" not in candidate:
            # Assume it's already a video ID
            return url

        # urlparse only recognises the host when a scheme is present.
        if "://" not in candidate:
            candidate = f"https://{candidate}"

        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
        path_parts = [part for part in parsed.path.split("/") if part]

        if host == "youtu.be" or host.endswith(".youtu.be"):
            if path_parts:
                return path_parts[0]
        elif host == "youtube.com" or host.endswith(".youtube.com"):
            video_ids = parse_qs(parsed.query).get("v")
            if video_ids and video_ids[0]:
                return video_ids[0]
            if len(path_parts) >= 2 and path_parts[0] in self._VIDEO_ID_PATH_PREFIXES:
                return path_parts[1]

        # Nothing recognisable: keep the historical fallback.
        return url

    async def _cache_enhanced_result(
        self,
        cache_key: str,
        result: TranscriptResult
    ) -> TranscriptResult:
        """Store ``result`` in the cache under ``cache_key`` and return it."""
        await self.cache_client.set(
            cache_key,
            result.model_dump_json(),
            ttl=self._ENHANCED_CACHE_TTL_SECONDS,
        )
        return result

    def _build_whisper_result(
        self,
        video_id: str,
        transcription: Dict[str, Any],
        processing_time: float
    ) -> TranscriptResult:
        """Convert a Whisper transcription dict into a ``TranscriptResult``."""
        text = transcription["text"]
        language = transcription.get("language", "en")

        # Whisper reports start/end per segment; the model stores start/duration.
        segments = [
            TranscriptSegment(
                text=seg["text"],
                start=seg["start"],
                duration=seg["end"] - seg["start"],
            )
            for seg in transcription.get("segments", [])
        ]

        metadata = TranscriptMetadata(
            language=language,
            duration=transcription.get("duration", 0),
            word_count=len(text.split()),
            has_timestamps=bool(segments),
        )

        return TranscriptResult(
            video_id=video_id,
            transcript=text,
            segments=segments,
            metadata=metadata,
            method=ExtractionMethod.WHISPER_AUDIO,
            language=language,
            success=True,
            from_cache=False,
            processing_time=processing_time,
        )

    async def extract_transcript(
        self,
        video_id_or_url: str,
        language_preference: str = "en",
        force_download: bool = False
    ) -> TranscriptResult:
        """
        Extract transcript with local file priority.

        Args:
            video_id_or_url: YouTube video ID or URL
            language_preference: Preferred language for transcript
            force_download: Attempt download + transcription before the
                parent extraction methods are tried

        Returns:
            TranscriptResult with transcript and metadata

        Raises:
            TranscriptExtractionError: When every method fails. The error
                raised by the parent extraction is attached as its cause.
        """
        # Determine if input is URL or video ID
        if "youtube.com" in video_id_or_url or "youtu.be" in video_id_or_url:
            url = video_id_or_url
            video_id = self._extract_video_id_from_url(url)
        else:
            video_id = video_id_or_url
            url = f"https://www.youtube.com/watch?v={video_id}"

        # Check cache first
        cache_key = f"transcript:{video_id}:{language_preference}"
        cached_result = await self.cache_client.get(cache_key)
        if cached_result:
            logger.info("Transcript cache hit for %s", video_id)
            return TranscriptResult.model_validate(json.loads(cached_result))

        # Try local file first if available
        if self.video_service.is_video_downloaded(video_id):
            logger.info("Using local files for transcript extraction: %s", video_id)
            local_result = await self._extract_from_local_video(video_id)
            if local_result:
                return await self._cache_enhanced_result(cache_key, local_result)

        # If force_download, download the video first
        if force_download:
            logger.info("Force downloading video for transcription: %s", video_id)
            download_result = await self._download_and_transcribe(url, video_id)
            if download_result:
                return await self._cache_enhanced_result(cache_key, download_result)

        # Try YouTube API methods (from parent class)
        api_error = None
        try:
            logger.info("Attempting YouTube API transcript extraction for %s", video_id)
            api_result = await super().extract_transcript(video_id, language_preference)
        except TranscriptExtractionError as e:
            logger.warning("YouTube API methods failed: %s", e)
            # Kept so the final error can report why the API path failed.
            api_error = e
        else:
            return await self._cache_enhanced_result(cache_key, api_result)

        # As last resort, download video and transcribe
        logger.info("Falling back to download and transcribe for %s", video_id)
        download_result = await self._download_and_transcribe(url, video_id)
        if download_result:
            return await self._cache_enhanced_result(cache_key, download_result)

        # If all methods fail, raise error
        raise TranscriptExtractionError(
            message="Unable to extract transcript through any method",
            error_code=ErrorCode.TRANSCRIPT_UNAVAILABLE,
            details={
                "video_id": video_id,
                "attempted_methods": [
                    "local_file", "youtube_api", "auto_captions", "download_and_transcribe"
                ],
                "suggestions": [
                    "Check if video is available and public",
                    "Try again later",
                    "Enable captions on the video"
                ]
            }
        ) from api_error

    async def _extract_from_local_video(self, video_id: str) -> Optional[TranscriptResult]:
        """
        Extract transcript from locally stored video/audio files.

        Only an existing audio file is transcribed. A video file without
        extracted audio is logged and treated as "nothing available".

        Args:
            video_id: YouTube video ID

        Returns:
            TranscriptResult or None if extraction fails
        """
        try:
            # Get cached video info
            video_hash = self.video_service._get_video_hash(video_id)
            cached_info = self.video_service.cache.get(video_hash)

            if not cached_info:
                logger.warning("No cache info for downloaded video %s", video_id)
                return None

            # Check for audio file
            audio_path = cached_info.get('audio_path')
            if audio_path:
                audio_file = Path(audio_path)
                if audio_file.exists():
                    logger.info("Transcribing from local audio: %s", audio_file)

                    # Transcribe using Whisper
                    transcription = await self.whisper_service.transcribe_audio(audio_file)
                    return self._build_whisper_result(
                        video_id,
                        transcription,
                        processing_time=1.0,  # Mock processing time
                    )

            # If no audio file, check for video file
            video_path = cached_info.get('video_path')
            if video_path:
                video_file = Path(video_path)
                if video_file.exists():
                    logger.info("Video found but no audio extracted yet: %s", video_file)
                    # Could extract audio here if needed

            return None

        except Exception as e:
            # Best-effort path: log with traceback and let the caller fall
            # back to the other extraction methods.
            logger.exception("Error extracting from local video %s: %s", video_id, e)
            return None

    async def _download_and_transcribe(self, url: str, video_id: str) -> Optional[TranscriptResult]:
        """
        Download video and transcribe the audio.

        Args:
            url: YouTube URL
            video_id: Video ID

        Returns:
            TranscriptResult or None if fails
        """
        try:
            logger.info("Downloading video for transcription: %s", video_id)

            # Download video with audio extraction
            video_path, audio_path = await self.video_service.download_video(
                url=url,
                extract_audio=True,
                force=False
            )

            if audio_path and audio_path.exists():
                logger.info("Audio extracted, transcribing: %s", audio_path)

                # Transcribe using Whisper
                transcription = await self.whisper_service.transcribe_audio(audio_path)
                return self._build_whisper_result(
                    video_id,
                    transcription,
                    processing_time=2.0,  # Mock processing time
                )

            logger.warning("Download succeeded but no audio extracted for %s", video_id)
            return None

        except VideoDownloadError as e:
            logger.error("Failed to download video %s: %s", video_id, e)
            return None
        except Exception as e:
            # Unexpected failure: keep the traceback, still report "no result".
            logger.exception("Error in download and transcribe for %s: %s", video_id, e)
            return None

    async def get_transcript_with_priority(
        self,
        video_id: str,
        prefer_local: bool = True,
        download_if_missing: bool = False
    ) -> TranscriptResult:
        """
        Get transcript with configurable priority.

        Unlike ``extract_transcript``, this method neither reads nor writes
        the transcript cache itself.

        Args:
            video_id: YouTube video ID
            prefer_local: Prefer local files over API
            download_if_missing: Download video if the API methods fail

        Returns:
            TranscriptResult

        Raises:
            TranscriptExtractionError: Re-raised from the parent extraction
                when no download fallback is requested or the fallback
                produces no result.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"

        if prefer_local and self.video_service.is_video_downloaded(video_id):
            # Try local first
            local_result = await self._extract_from_local_video(video_id)
            if local_result:
                return local_result

        # Try API methods
        try:
            return await super().extract_transcript(video_id)
        except TranscriptExtractionError:
            if download_if_missing:
                # Download and transcribe
                download_result = await self._download_and_transcribe(url, video_id)
                if download_result:
                    return download_result
            raise

    def get_extraction_stats(self) -> Dict[str, Any]:
        """
        Get statistics about extraction methods and success rates.

        Returns:
            Dict with ``method_success_rates`` (a copy, so callers cannot
            mutate the service's internal table), ``cached_videos``,
            ``total_storage_mb`` and ``preferred_method``.
        """
        return {
            "method_success_rates": dict(self._method_success_rates),
            "cached_videos": len(self.video_service.cache),
            "total_storage_mb": self.video_service.get_storage_stats()['total_size_mb'],
            "preferred_method": "local_file" if self.video_service.cache else "youtube_api"
        }