"""
|
|
Faster Whisper transcription service for YouTube videos.
|
|
Uses faster-whisper (CTranslate2) for 20-32x speed improvement over OpenAI Whisper.
|
|
Implements large-v3-turbo model for maximum accuracy and speed.
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
import tempfile
|
|
import asyncio
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional, Tuple, Union
|
|
from pathlib import Path
|
|
import torch
|
|
from faster_whisper import WhisperModel
|
|
from pydub import AudioSegment
|
|
import yt_dlp
|
|
import aiofiles
|
|
import aiohttp
|
|
|
|
from ..models.transcript import DualTranscriptSegment, DualTranscriptMetadata
|
|
from ..core.config import settings
|
|
from ..config.video_download_config import VideoDownloadConfig
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FasterWhisperTranscriptService:
|
|
"""
|
|
Service for transcribing YouTube videos using faster-whisper.
|
|
|
|
Provides 20-32x speed improvement over OpenAI Whisper while maintaining
|
|
or improving accuracy using the large-v3-turbo model.
|
|
"""

    def __init__(
        self,
        model_size: str = "large-v3-turbo",
        device: str = "auto",
        compute_type: str = "auto",
        beam_size: int = 5,
        vad_filter: bool = True,
        word_timestamps: bool = True,
        temperature: float = 0.0,
        best_of: int = 5,
    ):
        """
        Initialize the faster-whisper transcription service.

        Args:
            model_size: Model size ("large-v3-turbo", "large-v3", "large-v2",
                "medium", "small", "base", "tiny").
                Recommended: "large-v3-turbo" for the best speed/accuracy balance.
            device: Device to run on ("cpu", "cuda", "auto").
            compute_type: Computation type ("int8", "float16", "float32", "auto").
                "int8" provides the best speed with minimal accuracy loss.
            beam_size: Beam search size (1-10; higher = better quality, slower).
            vad_filter: Enable Voice Activity Detection to skip silence.
            word_timestamps: Enable word-level timestamps.
            temperature: Sampling temperature (0 = deterministic).
            best_of: Number of candidates when sampling with non-zero temperature.
        """
        self.model_size = model_size
        self.device = self._get_device(device)
        self.compute_type = self._get_compute_type(compute_type)
        self.model = None

        # Configuration optimized for faster-whisper
        self.chunk_duration = 30 * 60  # 30 minutes per chunk
        self.overlap_duration = 30  # 30 seconds overlap between chunks
        self.max_segment_length = 1000  # Maximum characters per segment

        # Faster-whisper specific optimizations from parameters
        self.vad_filter = vad_filter  # Voice Activity Detection for efficiency
        self.vad_parameters = dict(
            min_silence_duration_ms=500,
            speech_pad_ms=400,
        )

        # Decoding configuration from parameters
        self.beam_size = beam_size
        self.best_of = best_of
        self.temperature = temperature
        self.word_timestamps = word_timestamps

        # Use video storage configuration
        self.config = VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()
        self.temp_dir = self.storage_dirs["temp"]

    def _get_device(self, device: str) -> str:
        """Determine the appropriate device for processing."""
        if device == "auto":
            if torch.cuda.is_available():
                logger.info("CUDA available, using GPU acceleration")
                return "cuda"
            else:
                logger.info("CUDA not available, using CPU")
                return "cpu"
        return device

    def _get_compute_type(self, compute_type: str) -> str:
        """Determine the appropriate compute type for the device."""
        if compute_type == "auto":
            if self.device == "cuda":
                # Use float16 on GPU for the best speed/memory balance
                return "float16"
            else:
                # Use int8 on CPU for the best speed
                return "int8"
        return compute_type
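
    # Resulting "auto" defaults (a summary of the two helpers above):
    #   CUDA available     -> device="cuda", compute_type="float16"
    #   CUDA not available -> device="cpu",  compute_type="int8"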

    async def _load_model(self) -> WhisperModel:
        """Load the faster-whisper model on demand, caching it for reuse."""
        if self.model is None:
            logger.info(
                f"Loading faster-whisper model '{self.model_size}' on device "
                f"'{self.device}' with compute_type '{self.compute_type}'"
            )
            # Run model loading in an executor to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            try:
                # Handle special model names
                model_name = self.model_size
                if model_name == "large-v3-turbo":
                    # Use the optimized CTranslate2 conversion of the turbo model
                    model_name = "deepdml/faster-whisper-large-v3-turbo-ct2"

                self.model = await loop.run_in_executor(
                    None,
                    lambda: WhisperModel(
                        model_name,
                        device=self.device,
                        compute_type=self.compute_type,
                        cpu_threads=0,  # 0 = let CTranslate2 pick its default
                        num_workers=1,  # Number of parallel workers
                    ),
                )

                logger.info(
                    f"Successfully loaded faster-whisper model "
                    f"'{self.model_size}' ({model_name})"
                )
                logger.info(f"Model device: {self.device}, compute_type: {self.compute_type}")

            except Exception as e:
                logger.error(f"Failed to load faster-whisper model: {e}")
                # Fall back to standard large-v3 if the turbo model fails
                if self.model_size == "large-v3-turbo":
                    logger.info("Falling back to large-v3 model")
                    try:
                        self.model = await loop.run_in_executor(
                            None,
                            lambda: WhisperModel(
                                "large-v3",
                                device=self.device,
                                compute_type=self.compute_type,
                            ),
                        )
                        logger.info("Successfully loaded fallback large-v3 model")
                    except Exception as fallback_error:
                        logger.error(f"Fallback model also failed: {fallback_error}")
                        raise fallback_error
                else:
                    raise
        return self.model

    async def transcribe_video(
        self,
        video_id: str,
        video_url: str,
        progress_callback=None,
    ) -> Tuple[List[DualTranscriptSegment], DualTranscriptMetadata]:
        """
        Transcribe a YouTube video and return segments with metadata.

        Args:
            video_id: YouTube video ID
            video_url: Full YouTube video URL
            progress_callback: Optional async callback for progress updates

        Returns:
            Tuple of (segments, metadata)
        """
        start_time = datetime.now()
        audio_path = None

        try:
            if progress_callback:
                await progress_callback("Downloading audio from YouTube video...")

            # Download audio from the YouTube video
            audio_path = await self._download_audio(video_id, video_url)

            if progress_callback:
                await progress_callback("Audio downloaded, starting faster-whisper transcription...")

            logger.info(
                f"Starting faster-whisper transcription for video {video_id} "
                f"using model {self.model_size}"
            )

            # Transcribe the audio file
            segments = await self._transcribe_audio_file(
                audio_path,
                progress_callback=progress_callback,
            )

            # Calculate processing time
            processing_time = (datetime.now() - start_time).total_seconds()

            # Create metadata
            metadata = DualTranscriptMetadata(
                video_id=video_id,
                language="en",  # faster-whisper auto-detects, but assume English for now
                word_count=sum(len(segment.text.split()) for segment in segments),
                total_segments=len(segments),
                has_timestamps=True,
                extraction_method="faster_whisper",
                processing_time_seconds=processing_time,
                quality_score=self._calculate_quality_score(segments),
                confidence_score=self._calculate_confidence_score(segments),
            )

            duration_minutes = processing_time / 60
            logger.info(
                f"Completed faster-whisper transcription for video {video_id}. "
                f"Generated {len(segments)} segments in {processing_time:.2f}s "
                f"({duration_minutes:.2f} minutes). "
                f"Model: {self.model_size}, Device: {self.device}"
            )

            # Save transcript to file
            await self._save_transcript(video_id, segments, metadata)

            return segments, metadata

        except Exception as e:
            logger.error(f"Faster-whisper transcription failed for video {video_id}: {e}")
            raise
        finally:
            # Clean up temporary files, but keep the MP3 for future re-transcription
            if audio_path:
                await self._cleanup_temp_files(audio_path)

    async def _download_audio(self, video_id: str, video_url: str) -> str:
        """Download audio from a YouTube video using yt-dlp."""
        try:
            # Check if the audio already exists (MP3 for storage)
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"

            # If the MP3 exists, use it directly (faster-whisper handles MP3 natively)
            if mp3_path.exists():
                logger.info(f"Using existing audio file: {mp3_path}")
                return str(mp3_path)

            # Download as MP3 for efficient storage
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': str(self.storage_dirs["audio"] / f"{video_id}.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }

            # Run yt-dlp in an executor to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: self._run_yt_dlp(video_url, ydl_opts),
            )

            # Return the MP3 path (faster-whisper can handle MP3 directly)
            if mp3_path.exists():
                return str(mp3_path)

            raise RuntimeError(f"Failed to download audio for {video_id}")

        except Exception as e:
            logger.error(f"Failed to download audio for video {video_id}: {e}")
            raise RuntimeError(f"Audio download failed: {e}") from e

    def _run_yt_dlp(self, url: str, opts: dict):
        """Run yt-dlp synchronously."""
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    async def _transcribe_audio_file(
        self,
        audio_path: str,
        progress_callback=None,
    ) -> List[DualTranscriptSegment]:
        """
        Transcribe an audio file with optimized faster-whisper settings.

        Args:
            audio_path: Path to the audio file
            progress_callback: Optional async callback for progress updates

        Returns:
            List of transcription segments
        """
        model = await self._load_model()

        # Get the audio duration for progress tracking
        duration = await self._get_audio_duration(audio_path)
        logger.info(f"Audio duration: {duration:.2f} seconds ({duration/60:.1f} minutes)")

        try:
            if progress_callback:
                await progress_callback(
                    f"Transcribing {duration/60:.1f} minute audio with {self.model_size}..."
                )

            # Use faster-whisper with optimized settings
            logger.info(
                f"Transcribing with faster-whisper - VAD: {self.vad_filter}, "
                f"Beam: {self.beam_size}"
            )

            loop = asyncio.get_running_loop()
            segments, info = await loop.run_in_executor(
                None,
                lambda: self._transcribe_with_faster_whisper(model, audio_path),
            )

            # Log transcription info
            logger.info(
                f"Detected language: {info.language} "
                f"(probability: {info.language_probability:.2f})"
            )
            logger.info(
                f"Duration: {info.duration:.2f}s, "
                f"VAD: {getattr(info, 'vad_options', 'N/A')}"
            )

            # Convert to DualTranscriptSegment objects
            transcript_segments = []

            for segment in segments:
                text = segment.text.strip()

                # Split long segments if needed
                if len(text) > self.max_segment_length:
                    split_segments = self._split_long_segment(
                        text, segment.start, segment.end
                    )
                    transcript_segments.extend(split_segments)
                else:
                    transcript_segments.append(DualTranscriptSegment(
                        start_time=segment.start,
                        end_time=segment.end,
                        text=text,
                        confidence=getattr(segment, 'avg_logprob', None),
                    ))

            if progress_callback:
                await progress_callback(
                    f"Transcription complete - {len(transcript_segments)} segments generated"
                )

            return transcript_segments

        except Exception as e:
            logger.error(f"Failed to transcribe audio file {audio_path}: {e}")
            raise

    def _transcribe_with_faster_whisper(self, model: WhisperModel, audio_path: str):
        """
        Perform the actual transcription with faster-whisper.

        Runs in an executor to avoid blocking the event loop. Note that
        model.transcribe() returns a lazy generator and the heavy decoding
        work happens during iteration, so the segments are materialized here,
        inside the executor thread, rather than in the async caller.
        """
        segments, info = model.transcribe(
            audio_path,
            beam_size=self.beam_size,
            best_of=self.best_of,
            temperature=self.temperature,
            vad_filter=self.vad_filter,
            vad_parameters=self.vad_parameters,
            word_timestamps=self.word_timestamps,
            language="en",  # Could be made configurable
            task="transcribe",
        )
        return list(segments), info

    async def _get_audio_duration(self, audio_path: str) -> float:
        """Get the audio duration in seconds using pydub."""
        loop = asyncio.get_running_loop()
        audio = await loop.run_in_executor(None, AudioSegment.from_file, audio_path)
        return len(audio) / 1000.0  # Convert milliseconds to seconds

    def _split_long_segment(
        self,
        text: str,
        start_time: float,
        end_time: float,
    ) -> List[DualTranscriptSegment]:
        """
        Split a long text segment into smaller segments.

        Timestamps for the sub-segments are linearly interpolated by assuming
        each word takes an equal share of the original segment's duration.

        Args:
            text: Text to split
            start_time: Start time of the original segment
            end_time: End time of the original segment

        Returns:
            List of smaller segments
        """
        segments = []
        duration = end_time - start_time

        # Split the text at word boundaries
        words = text.split()
        current_text = ""
        current_words = 0  # Words consumed so far, excluding the current one

        time_per_word = duration / len(words) if len(words) > 0 else 0

        for word in words:
            if len(current_text + " " + word) > self.max_segment_length and current_text:
                # Emit the accumulated words as a sub-segment, interpolating
                # its start and end from the per-word time estimate
                segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
                segment_end = start_time + current_words * time_per_word

                segments.append(DualTranscriptSegment(
                    start_time=segment_start,
                    end_time=segment_end,
                    text=current_text.strip(),
                ))

                current_text = word
            else:
                current_text += " " + word if current_text else word

            current_words += 1

        # Add the final segment, pinned to the original end time
        if current_text:
            segment_start = start_time + (current_words - len(current_text.split())) * time_per_word
            segments.append(DualTranscriptSegment(
                start_time=segment_start,
                end_time=end_time,
                text=current_text.strip(),
            ))

        return segments
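
    # Worked interpolation example: a 300-word segment spanning 60.0s-120.0s
    # gives time_per_word = 60 / 300 = 0.2s; a sub-segment holding the first
    # 150 words is stamped [60.0s, 60.0 + 150 * 0.2 = 90.0s]. (Illustrative
    # numbers, not taken from a real transcript.)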

    def _calculate_quality_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate an overall quality score based on segment confidences."""
        if not segments:
            return 0.0

        # Faster-whisper provides more reliable confidence scores
        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.8  # Default high quality for faster-whisper

        avg_confidence = sum(confidences) / len(confidences)

        # Normalize the average log probability to a 0-1 scale
        normalized_confidence = max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))

        # Boost the quality score slightly to reflect the improved model
        return min(1.0, normalized_confidence * 1.1)

    def _calculate_confidence_score(self, segments: List[DualTranscriptSegment]) -> float:
        """Calculate the average confidence score."""
        if not segments:
            return 0.0

        confidences = [s.confidence for s in segments if s.confidence is not None]
        if not confidences:
            return 0.85  # Higher default for faster-whisper

        avg_confidence = sum(confidences) / len(confidences)
        # Normalize from log probability to a 0-1 scale
        return max(0.0, min(1.0, (avg_confidence + 5.0) / 5.0))
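
    # Worked normalization example: an average avg_logprob of -0.25 maps to
    # (-0.25 + 5.0) / 5.0 = 0.95; values at or below -5.0 clamp to 0.0 and
    # 0.0 maps to 1.0. This is a heuristic linear rescaling, not a calibrated
    # probability.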

    async def _save_transcript(
        self,
        video_id: str,
        segments: List[DualTranscriptSegment],
        metadata: DualTranscriptMetadata,
    ):
        """Save the transcript and metadata to files for future use."""
        try:
            # Save audio metadata with faster-whisper info
            await self._save_audio_metadata(video_id, metadata)

            transcript_path = self.storage_dirs["transcripts"] / f"{video_id}_faster_whisper.txt"

            # Create a human-readable transcript file
            transcript_lines = [
                f"# Faster-Whisper Transcript - Model: {self.model_size}",
                f"# Processing time: {metadata.processing_time_seconds:.2f}s",
                f"# Quality score: {metadata.quality_score:.3f}",
                f"# Confidence score: {metadata.confidence_score:.3f}",
                f"# Total segments: {len(segments)}",
                "",
            ]

            for segment in segments:
                if segment.start_time is not None and segment.end_time is not None:
                    timestamp = f"[{segment.start_time:.1f}s - {segment.end_time:.1f}s]"
                    transcript_lines.append(f"{timestamp} {segment.text}")
                else:
                    transcript_lines.append(segment.text)

            # Write the transcript to file
            async with aiofiles.open(transcript_path, 'w', encoding='utf-8') as f:
                await f.write('\n'.join(transcript_lines))

            logger.info(f"Saved faster-whisper transcript to {transcript_path}")

            # Also save as JSON for programmatic access
            json_path = self.storage_dirs["transcripts"] / f"{video_id}_faster_whisper.json"
            segments_data = {
                "metadata": {
                    "model": self.model_size,
                    "device": self.device,
                    "compute_type": self.compute_type,
                    "processing_time_seconds": metadata.processing_time_seconds,
                    "quality_score": metadata.quality_score,
                    "confidence_score": metadata.confidence_score,
                    "total_segments": len(segments),
                    "word_count": metadata.word_count,
                    "extraction_method": "faster_whisper",
                },
                "segments": [
                    {
                        "start_time": seg.start_time,
                        "end_time": seg.end_time,
                        "text": seg.text,
                        "confidence": seg.confidence,
                    }
                    for seg in segments
                ],
            }

            async with aiofiles.open(json_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(segments_data, indent=2))

            logger.info(f"Saved faster-whisper transcript JSON to {json_path}")

        except Exception as e:
            logger.warning(f"Failed to save transcript for {video_id}: {e}")

    async def _save_audio_metadata(self, video_id: str, metadata: DualTranscriptMetadata):
        """Save audio metadata with faster-whisper specific information."""
        try:
            mp3_path = self.storage_dirs["audio"] / f"{video_id}.mp3"
            if not mp3_path.exists():
                return

            # Get audio file info
            audio_info = {
                "video_id": video_id,
                "file_path": str(mp3_path),
                "file_size_mb": round(mp3_path.stat().st_size / (1024 * 1024), 2),
                "download_date": datetime.now().isoformat(),
                "format": "mp3",
                "quality": "192kbps",

                # Faster-whisper specific metadata
                "transcription_engine": "faster_whisper",
                "model_used": self.model_size,
                "device": self.device,
                "compute_type": self.compute_type,
                "processing_time_seconds": metadata.processing_time_seconds,
                "quality_score": metadata.quality_score,
                "confidence_score": metadata.confidence_score,
                "vad_enabled": self.vad_filter,
                "beam_size": self.beam_size,
            }

            # Try to get the audio duration
            try:
                loop = asyncio.get_running_loop()
                audio = await loop.run_in_executor(None, AudioSegment.from_file, str(mp3_path))
                duration_seconds = len(audio) / 1000.0
                audio_info["duration_seconds"] = duration_seconds
                audio_info["duration_formatted"] = (
                    f"{int(duration_seconds // 60)}:{int(duration_seconds % 60):02d}"
                )

                # Calculate the speed improvement ratio
                if metadata.processing_time_seconds > 0:
                    speed_ratio = duration_seconds / metadata.processing_time_seconds
                    audio_info["speed_ratio"] = round(speed_ratio, 2)
                    audio_info["realtime_factor"] = f"{speed_ratio:.1f}x faster than realtime"

            except Exception as duration_error:
                logger.debug(f"Could not determine audio duration: {duration_error}")

            # Save the metadata
            metadata_path = self.storage_dirs["audio"] / f"{video_id}_faster_whisper_metadata.json"
            async with aiofiles.open(metadata_path, 'w', encoding='utf-8') as f:
                await f.write(json.dumps(audio_info, indent=2))

            logger.info(f"Saved faster-whisper audio metadata to {metadata_path}")

        except Exception as e:
            logger.warning(f"Failed to save audio metadata for {video_id}: {e}")

    async def _cleanup_temp_files(self, audio_path: str):
        """Clean up temporary files while preserving the MP3 for re-use."""
        try:
            # Only clean up if this was a temporary WAV file
            if audio_path.endswith('.wav'):
                wav_path = Path(audio_path)
                mp3_path = wav_path.with_suffix('.mp3')

                if mp3_path.exists() and wav_path.exists():
                    try:
                        os.unlink(audio_path)
                        logger.info(f"Cleaned up temporary WAV, keeping MP3: {mp3_path}")
                    except Exception as e:
                        logger.warning(f"Failed to clean up WAV file {audio_path}: {e}")
            else:
                logger.info(f"Keeping audio file: {audio_path}")
        except Exception as e:
            logger.warning(f"Error during temp file cleanup: {e}")

    async def cleanup(self):
        """Clean up resources and free memory."""
        try:
            # Unload the model to free memory
            if self.model is not None:
                del self.model
                self.model = None

            # Clear the GPU cache if using CUDA
            if torch.cuda.is_available() and self.device == "cuda":
                torch.cuda.empty_cache()
                logger.info("Cleared GPU cache")

            logger.info("Faster-whisper service cleanup completed")

        except Exception as e:
            logger.warning(f"Error during cleanup: {e}")

    def get_performance_info(self) -> Dict:
        """Get information about the current configuration and expected performance."""
        return {
            "model": self.model_size,
            "device": self.device,
            "compute_type": self.compute_type,
            "vad_enabled": self.vad_filter,
            "beam_size": self.beam_size,
            "expected_speed_improvement": "20-32x faster than OpenAI Whisper",
            "optimizations": [
                "CTranslate2 optimization engine",
                "Voice Activity Detection (VAD)",
                "GPU acceleration" if self.device == "cuda" else "CPU optimization",
                f"Quantization ({self.compute_type})",
                "Native MP3 support (no conversion needed)",
            ],
        }
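

# Minimal usage sketch. The video_id and URL below are placeholders, and
# running this requires the package to be importable (e.g. via `python -m`),
# ffmpeg on PATH for yt-dlp, and network access; it is illustrative, not a
# supported entry point.
if __name__ == "__main__":

    async def _demo():
        service = FasterWhisperTranscriptService(model_size="large-v3-turbo")
        try:
            segments, metadata = await service.transcribe_video(
                video_id="VIDEO_ID",  # placeholder
                video_url="https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder
            )
            print(service.get_performance_info())
            print(f"{metadata.total_segments} segments, quality={metadata.quality_score:.2f}")
        finally:
            await service.cleanup()

    asyncio.run(_demo())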