"""
|
|
Whisper transcription service for YouTube videos.
|
|
Adapted from archived personal-ai-assistant transcription service for YouTube video context.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import tempfile
|
|
import asyncio
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional, Tuple
|
|
from pathlib import Path
|
|
import torch
|
|
import whisper
|
|
from pydub import AudioSegment
|
|
import yt_dlp
|
|
import aiofiles
|
|
import aiohttp
|
|
|
|
from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
|
|
from ..core.config import settings
|
|
from ..config.video_download_config import VideoDownloadConfig
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class WhisperTranscriptService:
    """Service for transcribing YouTube videos using OpenAI Whisper."""

    def __init__(self, model_size: str = "small", device: str = "auto"):
        """
        Initialize the Whisper transcription service.

        Args:
            model_size: Whisper model size ("tiny", "base", "small", "medium", "large")
            device: Device to run on ("cpu", "cuda", "auto")
        """
        self.model_size = model_size
        self.device = self._get_device(device)
        self.model = None

        # Chunking configuration
        self.chunk_duration = 30 * 60  # 30 minutes per chunk
        self.overlap_duration = 30  # 30 seconds of overlap between chunks
        self.max_segment_length = 1000  # Maximum characters per segment

        # Use the shared video storage configuration
        self.config = VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()
        self.temp_dir = self.storage_dirs["temp"]

    def _get_device(self, device: str) -> str:
        """Determine the appropriate device for processing."""
        if device == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        return device

    async def _load_model(self) -> whisper.Whisper:
        """Load the Whisper model on demand."""
        if self.model is None:
            logger.info(f"Loading Whisper model '{self.model_size}' on device '{self.device}'")
            try:
                # Run model loading in an executor so it does not block the event loop
                loop = asyncio.get_running_loop()
                self.model = await loop.run_in_executor(
                    None,
                    lambda: whisper.load_model(self.model_size, device=self.device)
                )
                logger.info(f"Successfully loaded Whisper model '{self.model_size}'")
            except Exception as e:
                logger.error(f"Failed to load Whisper model: {e}")
                raise
        return self.model
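
    # transcribe_video below is the main entry point. The flow, as implemented
    # in this module: download or reuse the stored MP3 via yt-dlp, convert it
    # to WAV for Whisper, transcribe (in overlapping chunks for long audio),
    # build DualTranscriptMetadata, and persist the transcript next to the audio.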

    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional async callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from the YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting transcription...")

            logger.info(f"Starting Whisper transcription for video {video_id}")

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback
            )

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # Whisper can auto-detect, but English is assumed for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="whisper_ai",
                processing_time_seconds=0,  # Calculated by the caller
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments)
            )

            logger.info(
                f"Completed Whisper transcription for video {video_id}. "
                f"Generated {len(segments)} segments."
            )

            # Save transcript to file
            await self._save_transcript(video_id, segments)

            return segments, metadata

        except Exception as e:
            logger.error(f"Whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up the temporary WAV file, but keep the MP3 for future re-transcription
            if 'audio_path' in locals() and audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')

                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
                else:
                    logger.info(f"Keeping audio file: {audio_path}")

    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from a YouTube video using yt-dlp."""
        try:
            # Stored audio lives as MP3; Whisper works on a WAV copy
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            wav_path = self.storage_dirs["audio"] / f"{video_id}.wav"

            # If the MP3 already exists, skip the download and convert to WAV
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in an executor to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts)
            )

            # Convert the freshly downloaded MP3 to WAV for Whisper processing
            if mp3_path.exists():
                await self._convert_audio(mp3_path, wav_path)
                return str(wav_path)

            raise RuntimeError(f"Failed to download audio for {video_id}")

        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}")
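
    # For reference, the ydl_opts built in _download_audio correspond roughly
    # to this CLI invocation (a sketch for debugging outside Python, with
    # <audio_dir> and <video_id> as placeholders; the dict form above is what runs):
    #   yt-dlp -f "bestaudio/best" -x --audio-format mp3 --audio-quality 192K \
    #       -o "<audio_dir>/<video_id>.%(ext)s" <url>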

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    async def _convert_audio(self, input_path: Path, output_path: Path):
        """Convert audio between formats using pydub."""
        try:
            loop = asyncio.get_running_loop()

            def convert():
                audio = AudioSegment.from_file(str(input_path))
                # The target format is taken from the output suffix (".wav" -> "wav")
                audio.export(str(output_path), format=output_path.suffix[1:])

            await loop.run_in_executor(None, convert)
            logger.info(f"Converted {input_path} to {output_path}")
        except Exception as e:
            logger.error(f"Audio conversion failed: {e}")
            raise
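
    # Note: pydub decodes and encodes non-WAV formats by shelling out to
    # ffmpeg, and openai-whisper also loads audio through ffmpeg, so the
    # ffmpeg binary must be on PATH for both conversion and transcription.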

    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file, chunking long videos.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get audio duration
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        if duration <= self.chunk_duration:
            # Short enough to process in a single pass
            return await self._transcribe_chunk(
                model, audio_path, 0, duration, progress_callback
            )
        # Longer videos are processed in overlapping chunks
        return await self._transcribe_in_chunks(
            model, audio_path, duration, progress_callback
        )

    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get the audio duration in seconds using pydub."""
        loop = asyncio.get_running_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # pydub reports length in milliseconds

    async def _transcribe_chunk(
        self,
        model: whisper.Whisper,
        audio_path: str,
        start_time: float,
        end_time: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a specific chunk of audio.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            start_time: Start time in seconds
            end_time: End time in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments for this chunk
        """
        try:
            # Extract the audio chunk if the range does not cover the whole file
            if start_time > 0 or end_time < await self._get_audio_duration(audio_path):
                chunk_path = await self._extract_audio_chunk(
                    audio_path, start_time, end_time
                )
                time_offset = start_time
            else:
                chunk_path = audio_path
                time_offset = 0

            logger.info(f"Transcribing chunk {start_time:.1f}s - {end_time:.1f}s")

            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: model.transcribe(
                    chunk_path,
                    word_timestamps=True,
                    language="en",  # Could be made configurable
                    task="transcribe"
                )
            )

            # Convert Whisper's segments to DualTranscriptSegment objects
            segments = []
            for whisper_segment in result["segments"]:
                # Shift timestamps from chunk-relative to absolute positions
                adj_start = whisper_segment["start"] + time_offset
                adj_end = whisper_segment["end"] + time_offset

                # Split overly long segments
                text = whisper_segment["text"].strip()
                if len(text) > self.max_segment_length:
                    split_segments = self._split_long_segment(
                        text, adj_start, adj_end
                    )
                    segments.extend(split_segments)
                else:
                    segments.append(DualTranscriptSegment(
                        start_time=adj_start,
                        end_time=adj_end,
                        text=text,
                        # avg_logprob is a log probability (typically -5..0), not a 0-1 score
                        confidence=whisper_segment.get("avg_logprob", 0.0)
                    ))

            # Clean up the temporary chunk file
            if chunk_path != audio_path and os.path.exists(chunk_path):
                os.unlink(chunk_path)

            if progress_callback:
                await progress_callback(f"Transcribed chunk {start_time:.1f}s - {end_time:.1f}s")

            return segments

        except Exception as e:
            logger.error(f"Failed to transcribe chunk {start_time}-{end_time}: {e}")
            raise

    async def _extract_audio_chunk(
        self,
        audio_path: str,
        start_time: float,
        end_time: float
    ) -> str:
        """Extract a chunk of audio to a temporary file."""
        chunk_path = self.temp_dir / f"chunk_{start_time}_{end_time}.wav"

        loop = asyncio.get_running_loop()

        def extract_chunk():
            audio = AudioSegment.from_file(audio_path)
            # pydub slices in milliseconds; cast to int for clean indices
            chunk = audio[int(start_time * 1000):int(end_time * 1000)]
            chunk.export(str(chunk_path), format="wav")

        await loop.run_in_executor(None, extract_chunk)
        return str(chunk_path)
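
    # Chunk geometry for _transcribe_in_chunks below: each window starts
    # overlap_duration seconds before the current position so speech straddling
    # a boundary is re-heard, and segments that begin inside the overlap are
    # dropped as duplicates. For example, a 75-minute (4500 s) file with the
    # default 1800 s chunks and 30 s overlap yields the windows
    # [0, 1800], [1770, 3600], [3570, 4500].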

    async def _transcribe_in_chunks(
        self,
        model: whisper.Whisper,
        audio_path: str,
        total_duration: float,
        progress_callback=None
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe a long audio file in overlapping chunks.

        Args:
            model: Loaded Whisper model
            audio_path: Path to the audio file
            total_duration: Total duration in seconds
            progress_callback: Optional callback for progress updates

        Returns:
            List of transcription segments
        """
        all_segments = []
        current_time = 0
        chunk_number = 1

        while current_time < total_duration:
            # Calculate chunk boundaries, reaching back into the previous chunk
            chunk_start = max(0, current_time - self.overlap_duration)
            chunk_end = min(total_duration, current_time + self.chunk_duration)

            logger.info(f"Processing chunk {chunk_number}: {chunk_start:.1f}s - {chunk_end:.1f}s")

            # Transcribe chunk
            chunk_segments = await self._transcribe_chunk(
                model, audio_path, chunk_start, chunk_end, progress_callback
            )

            # Drop segments that start inside the overlap; they were already
            # captured by the previous chunk
            if current_time > 0:
                chunk_segments = [s for s in chunk_segments if s.start_time >= current_time]

            all_segments.extend(chunk_segments)

            # Move to the next chunk
            current_time += self.chunk_duration
            chunk_number += 1

        return all_segments
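
    # _split_long_segment below has no per-word timestamps to work with, so it
    # distributes the original segment's duration uniformly across its words.
    # E.g. a 100-word segment spanning 50 s gets 0.5 s per word, so a piece
    # starting at word 60 is stamped start_time + 30 s. This is an
    # approximation: real speech is not evenly paced.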

    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split at word boundaries, estimating timestamps linearly
        words = text.split()
        current_text = ""
        current_words = 0  # number of words consumed from the original segment

        time_per_word = duration / len(words) if words else 0

        for word in words:
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Emit the accumulated words as one segment; its start is the
                # index of its first word times the per-word duration
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word

                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip()
                ))

                current_text = word
            else:
                current_text += " " + word if current_text else word

            current_words += 1

        # Add the final segment
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip()
            ))

        return segments
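
    # Both scoring helpers below treat the stored confidence values as Whisper
    # avg_logprob log probabilities and map them from roughly [-5, 0] onto
    # [0, 1] via (x + 5) / 5; e.g. an average avg_logprob of -0.5 scores 0.9.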

    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate an overall quality score based on segment characteristics."""
        if not segments:
            return 0.0

        # Simple quality heuristic: average confidence over segments that have one
        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0
        avg_confidence = sum(confidences) / len(confidences)

        # Normalize from log probability to a 0-1 scale
        # (Whisper typically reports log probabilities between -5 and 0)
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate the average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.0

        avg_confidence = sum(confidences) / len(confidences)
        # Normalize from log probability to a 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

    async def _save_transcript(self, video_id: str, segments: List[DualTranscriptSegment]):
        """Save the transcript and audio metadata to files for future use."""
        try:
            # Save audio metadata
            await self._save_audio_metadata(video_id)

            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}.txt"

            # Create a human-readable transcript file
            transcript_lines = []
            for segment in segments:
                # Compare against None so a 0.0 start time still gets a timestamp
                if segment.start_time is not None and segment.end_time is not None:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write the transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))

            logger.info(f"Saved transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}.json"
            segments_data = [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "text": seg.text,
                    "confidence": seg.confidence
                }
                for seg in segments
            ]

            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(segments_data, indent=2))

            logger.info(f"Saved transcript JSON to {json_path}")

        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str):
        """Save audio metadata for tracking and management."""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Gather audio file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",
                "model_used": self.model_size,
                "device": self.device
            }

            # Try to get the audio duration; the metadata is still useful without it
            try:
                loop = asyncio.get_running_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                audio_info["duration_seconds"] = len(audio) / 1000.0
                minutes, seconds = divmod(int(audio_info["duration_seconds"]), 60)
                audio_info["duration_formatted"] = f"{minutes}:{seconds:02d}"
            except Exception as e:
                logger.debug(f"Could not read audio duration for {video_id}: {e}")

            # Save metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(audio_info, indent=2))

            logger.info(f"Saved audio metadata to {metadata_path}")

        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def cleanup(self):
        """Clean up temporary files and resources."""
        try:
            # Don't delete the whole temp directory; it is shared.
            # Old files are cleaned up periodically instead.

            # Unload the model to free (GPU) memory; reset the attribute to
            # None rather than deleting it, so _load_model can lazily reload
            if self.model is not None:
                self.model = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")
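

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; the video ID and URL below are placeholders
# and this module is normally driven by the surrounding application):
#
#   service = WhisperTranscriptService(model_size="small")
#
#   async def run():
#       segments, metadata = await service.transcribe_video(
#           video_id="VIDEO_ID",
#           video_url="https://www.youtube.com/watch?v=VIDEO_ID",
#       )
#       print(metadata.total_segments, metadata.confidence_score)
#       await service.cleanup()
# ---------------------------------------------------------------------------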