youtube-summarizer/backend/services/whisper_transcript_service.py

"""
Whisper transcription service for YouTube videos.
Adapted from archived personal-ai-assistant transcription service for YouTube video context.
"""
import os
import json
import logging
import tempfile
import asyncio
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from pathlib import Path

import torch
import whisper
from pydub import AudioSegment
import yt_dlp
import aiofiles
import aiohttp

from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
from ..core.config import settings
from ..config.video_download_config import VideoDownloadConfig

logger = logging.getLogger(__name__)

class WhisperTranscriptService:
    """Service for transcribing YouTube videos using OpenAI Whisper."""

    def __init__(self, model_size: str = "small", device: str = "auto"):
        """
        Initialize the Whisper transcription service.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
            device: Device to run on ("cpu", "cuda", "auto")
        """
        self.model_size = model_size
        self.device = self._get_device(device)
        self.model = None

        # Configuration
        self.chunk_duration = 30 * 60  # 30 minutes per chunk
        self.overlap_duration = 30  # 30 seconds overlap between chunks
        self.max_segment_length = 1000  # Maximum characters per segment

        # Use video storage configuration
        self.config = VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()
        self.temp_dir = self.storage_dirs["temp"]

    def _get_device(self, device: str) -> str:
        """Determine the appropriate device for processing."""
        if device == "auto":
            if torch.cuda.is_available():
                return "cuda"
            else:
                return "cpu"
        return device

    async def _load_model(self) -> whisper.Whisper:
        """Load the Whisper model on-demand."""
        if self.model is None:
            logger.info(f"Loading Whisper model '{self.model_size}' on device '{self.device}'")
            try:
                # Run model loading in executor to avoid blocking async loop
                loop = asyncio.get_event_loop()
                self.model = await loop.run_in_executor(
                    None,
                    lambda: whisper.load_model(self.model_size, device=self.device)
                )
                logger.info(f"Successfully loaded Whisper model '{self.model_size}'")
            except Exception as e:
                logger.error(f"Failed to load Whisper model: {e}")
                raise
        return self.model

    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting transcription...")

            logger.info(f"Starting Whisper transcription for video {video_id}")

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback
            )

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # Whisper auto-detects, but assume English for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="whisper_ai",
                processing_time_seconds=0,  # Will be calculated by caller
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments)
            )

            logger.info(f"Completed Whisper transcription for video {video_id}. Generated {len(segments)} segments.")

            # Save transcript to file
            await self._save_transcript(video_id, segments)

            return segments, metadata
        except Exception as e:
            logger.error(f"Whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up temporary WAV file, but keep MP3 for future re-transcription
            if 'audio_path' in locals() and audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')
                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
                else:
                    logger.info(f"Keeping audio file: {audio_path}")

    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from YouTube video using yt-dlp."""
        try:
            # Check if audio already exists (MP3 for storage)
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            wav_path = self.storage_dirs["audio"] / f"{video_id}.wav"

            # If MP3 exists, convert to WAV for Whisper
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                # Convert MP3 to WAV for Whisper processing
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in executor to avoid blocking
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts)
            )

            # Convert MP3 to WAV for Whisper processing
            if mp3_path.exists():
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            raise RuntimeError(f"Failed to download audio for {video_id}")
        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}")

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    async def _convert_audio(self, input_path: Path, output_path: Path):
        """Convert audio between formats using pydub."""
        try:
            loop = asyncio.get_event_loop()

            def convert():
                audio = AudioSegment.from_file(str(input_path))
                audio.export(str(output_path), format=output_path.suffix[1:])

            await loop.run_in_executor(None, convert)
            logger.info(f"Converted {input_path} to {output_path}")
        except Exception as e:
            logger.error(f"Audio conversion failed: {e}")
            raise

    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file with chunking for long videos.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get audio duration
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        if duration <= self.chunk_duration:
            # Process entire file at once for shorter videos
            return await self._transcribe_chunk(
                model, audio_path, 0, duration, progress_callback
            )
        else:
            # Process in chunks for longer videos
            return await self._transcribe_in_chunks(
                model, audio_path, duration, progress_callback
            )

    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get audio duration using pydub."""
        loop = asyncio.get_event_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # Convert milliseconds to seconds

    async def _transcribe_chunk(
        self,
        model: whisper.Whisper,
        audio_path: str,
        start_time: float,
        end_time: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a specific chunk of audio.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            start_time: Start time in seconds
            end_time: End time in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments for this chunk
        """
        try:
            # Extract audio chunk if needed
            if start_time > 0 or end_time < await self._get_audio_duration(audio_path):
                chunk_path = await self._extract_audio_chunk(
                    audio_path, start_time, end_time
                )
                time_offset = start_time
            else:
                chunk_path = audio_path
                time_offset = 0

            # Transcribe the chunk
            logger.info(f"Transcribing chunk {start_time:.1f}s - {end_time:.1f}s")
            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(
                None,
                lambda: model.transcribe(
                    chunk_path,
                    word_timestamps=True,
                    language="en",  # Can be made configurable
                    task="transcribe"
                )
            )

            # Convert to TranscriptSegment objects
            segments = []
            for whisper_segment in result["segments"]:
                # Adjust timestamps if this is a chunk
                adj_start = whisper_segment["start"] + time_offset
                adj_end = whisper_segment["end"] + time_offset

                # Split long segments
                text = whisper_segment["text"].strip()
                if len(text) > self.max_segment_length:
                    split_segments = self._split_long_segment(
                        text, adj_start, adj_end
                    )
                    segments.extend(split_segments)
                else:
                    segments.append(DualTranscriptSegment(
                        start_time=adj_start,
                        end_time=adj_end,
                        text=text,
                        confidence=whisper_segment.get("avg_logprob", 0.0)
                    ))

            # Clean up temporary chunk file
            if chunk_path != audio_path and os.path.exists(chunk_path):
                os.unlink(chunk_path)

            if progress_callback:
                await progress_callback(f"Transcribed chunk {start_time:.1f}s - {end_time:.1f}s")

            return segments
        except Exception as e:
            logger.error(f"Failed to transcribe chunk {start_time}-{end_time}: {e}")
            raise

    async def _extract_audio_chunk(
        self,
        audio_path: str,
        start_time: float,
        end_time: float
    ) -> str:
        """Extract a chunk of audio to a temporary file."""
        chunk_path = self.temp_dir / f"chunk_{start_time}_{end_time}.wav"
        loop = asyncio.get_event_loop()

        def extract_chunk():
            audio = AudioSegment.from_file(audio_path)
            chunk = audio[start_time*1000:end_time*1000]  # pydub uses milliseconds
            chunk.export(str(chunk_path), format="wav")

        await loop.run_in_executor(None, extract_chunk)
        return str(chunk_path)

    async def _transcribe_in_chunks(
        self,
        model: whisper.Whisper,
        audio_path: str,
        total_duration: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a long audio file in chunks with overlap.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            total_duration: Total duration in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        all_segments = []
        current_time = 0
        chunk_number = 1

        while current_time < total_duration:
            # Calculate chunk boundaries
            chunk_start = max(0, current_time - self.overlap_duration)
            chunk_end = min(total_duration, current_time + self.chunk_duration)

            logger.info(f"Processing chunk {chunk_number}: {chunk_start:.1f}s - {chunk_end:.1f}s")

            # Transcribe chunk
            chunk_segments = await self._transcribe_chunk(
                model, audio_path, chunk_start, chunk_end, progress_callback
            )

            # Filter overlapping segments (keep only new content)
            if current_time > 0:
                chunk_segments = [s for s in chunk_segments if s.start_time >= current_time]

            all_segments.extend(chunk_segments)

            # Move to next chunk
            current_time += self.chunk_duration
            chunk_number += 1

        return all_segments

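    # Illustrative walk-through of the chunking above (hypothetical numbers, not
    # executed code): for a 75-minute (4500 s) file with chunk_duration=1800 s
    # and overlap_duration=30 s, the loop transcribes 0-1800 s, then 1770-3600 s,
    # then 3570-4500 s; on the second and third passes, segments starting before
    # 1800 s and 3600 s respectively are dropped, so the 30 s overlap regions are
    # not duplicated in the final transcript.
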
    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split text by sentences or at word boundaries
        words = text.split()
        current_text = ""
        current_words = 0
        time_per_word = duration / len(words) if len(words) > 0 else 0

        for i, word in enumerate(words):
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Create segment
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word
                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip()
                ))
                current_text = word
            else:
                current_text += " " + word if current_text else word
            current_words += 1

        # Add final segment
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip()
            ))

        return segments

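    # Worked example of the timing interpolation above (hypothetical numbers): a
    # 1500-character segment spanning 0.0-60.0 s with 250 words gives
    # time_per_word = 60 / 250 = 0.24 s; if the first 200 words fill the
    # 1000-character budget, that sub-segment is stamped 0.0 s - 48.0 s and the
    # remaining words inherit the original end_time of 60.0 s.
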
    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate overall quality score based on segment characteristics."""
        if not segments:
            return 0.0

        # Simple quality heuristics
        total_confidence = sum(s.confidence for s in segments if s.confidence is not None)
        avg_confidence = total_confidence / len(segments)

        # Normalize confidence from log probability to 0-1 scale
        # Whisper typically gives log probabilities from -5 to 0
        normalized_confidence = max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))
        return normalized_confidence

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0

        avg_confidence = sum(confidences) / len(confidences)
        # Normalize from log probability to 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

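    # Example of the normalization used by both score methods above: an average
    # avg_logprob of -1.0 maps to (-1.0 + 5.0) / 5.0 = 0.8, anything at or below
    # -5.0 clamps to 0.0, and anything at or above 0.0 clamps to 1.0.
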
    async def _save_transcript(self, video_id: str, segments: List[DualTranscriptSegment]):
        """Save transcript and audio metadata to files for future use."""
        try:
            # Save audio metadata
            await self._save_audio_metadata(video_id)

            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}.txt"

            # Create human-readable transcript file
            transcript_lines = []
            for segment in segments:
                if segment.start_time is not None and segment.end_time is not None:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))
            logger.info(f"Saved transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}.json"
            segments_data = [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "text": seg.text,
                    "confidence": seg.confidence
                }
                for seg in segments
            ]
            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(segments_data, indent=2))
            logger.info(f"Saved transcript JSON to {json_path}")
        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str):
        """Save audio metadata for tracking and management."""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Get audio file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",
                "model_used": self.model_size,
                "device": self.device
            }

            # Try to get audio duration
            try:
                loop = asyncio.get_event_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                audio_info["duration_seconds"] = len(audio) / 1000.0
                audio_info["duration_formatted"] = f"{int(audio_info['duration_seconds'] // 60)}:{int(audio_info['duration_seconds'] % 60):02d}"
            except Exception:
                pass

            # Save metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(audio_info, indent=2))
            logger.info(f"Saved audio metadata to {metadata_path}")
        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def cleanup(self):
        """Clean up temporary files and resources."""
        try:
            # Don't delete the whole temp directory as it's shared;
            # just clean up old files periodically.

            # Unload model to free GPU memory
            if self.model is not None:
                del self.model
                self.model = None  # allow _load_model to reload on next use
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")