# Audio Processing Architecture

## Overview

The audio processing pipeline handles the critical first step: converting various media formats into optimized audio suitable for transcription. This architecture ensures consistent, high-quality input for the Whisper model.
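
Concretely, the five stages described below chain into a single flow. A minimal orchestration sketch, assuming the class and method names defined in this document (`process_media` is an illustrative name, not part of the pipeline as specified; error handling, caching, and parallelism are covered later and omitted here):

```python
from typing import List

async def process_media(source: str) -> List[AudioChunk]:
    """Sketch of the full flow; names refer to the classes defined below."""
    media_path = await MediaAcquisition().acquire(source)          # Stage 1
    FormatValidator().validate_format(media_path)                  # Stage 2: raises if no audio stream
    audio_path = await AudioExtractor().extract_audio(media_path)  # Stage 3 (audio-only inputs could skip this)
    clean_path = await AudioPreprocessor().preprocess(audio_path)  # Stage 4
    return await AudioChunker().chunk_audio(clean_path)            # Stage 5
```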

## Pipeline Stages

### Stage 1: Media Download/Acquisition

```python
import tempfile
from pathlib import Path

import aiohttp


class MediaAcquisition:
    """Handle media from various sources"""

    async def acquire(self, source: str) -> Path:
        if source.startswith(('http://', 'https://')):
            return await self.download_media(source)
        elif Path(source).exists():
            return Path(source)
        else:
            raise ValueError(f"Invalid source: {source}")

    async def download_media(self, url: str) -> Path:
        """Download with progress tracking"""
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                # Stream to a temporary file (NamedTemporaryFile avoids the
                # race condition of the deprecated tempfile.mktemp)
                with tempfile.NamedTemporaryFile(suffix='.tmp', delete=False) as f:
                    async for chunk in response.content.iter_chunked(8192):
                        f.write(chunk)
                        await self.update_progress(f.tell(), total_size)
                    temp_file = Path(f.name)

        return temp_file
```

### Stage 2: Format Detection & Validation

```python
from pathlib import Path

import ffmpeg  # ffmpeg-python


class FormatValidator:
    """Validate and identify media formats"""

    SUPPORTED_FORMATS = {
        'video': ['.mp4', '.avi', '.mov', '.mkv', '.webm'],
        'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']
    }

    def validate_format(self, file_path: Path) -> MediaInfo:
        """Extract media information"""
        probe = ffmpeg.probe(str(file_path))

        # Check for an audio stream
        audio_streams = [
            s for s in probe['streams']
            if s['codec_type'] == 'audio'
        ]

        if not audio_streams:
            raise ValueError("No audio stream found")

        stream = audio_streams[0]
        return MediaInfo(
            format=probe['format']['format_name'],
            duration=float(probe['format']['duration']),
            sample_rate=int(stream['sample_rate']),
            channels=int(stream['channels']),
            codec=stream['codec_name'],
            bitrate=int(stream.get('bit_rate', 0))
        )
```
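
`MediaInfo` is referenced above but not defined in this document; a minimal sketch, with the fields implied by the constructor call:

```python
from dataclasses import dataclass


@dataclass
class MediaInfo:
    format: str        # e.g. 'mov,mp4,m4a,3gp,3g2,mj2' from ffprobe
    duration: float    # seconds
    sample_rate: int   # Hz
    channels: int
    codec: str
    bitrate: int       # bits/s; 0 if unknown
```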

### Stage 3: Audio Extraction

```python
import asyncio
from pathlib import Path

import ffmpeg  # ffmpeg-python


class AudioExtractor:
    """Extract audio from video files"""

    async def extract_audio(self, video_path: Path) -> Path:
        """Extract audio track from video"""
        output_path = video_path.with_suffix('.extracted.wav')

        # FFmpeg extraction command
        command = (
            ffmpeg
            .input(str(video_path))
            .output(
                str(output_path),
                acodec='pcm_s16le',  # 16-bit PCM
                ar=16000,            # 16kHz sample rate
                ac=1,                # Mono
                loglevel='error'
            )
            .overwrite_output()
        )

        # Run async (compile() already includes the 'ffmpeg' executable
        # as the first argv element, so it is not passed again)
        process = await asyncio.create_subprocess_exec(
            *command.compile(),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            raise ProcessingError(f"FFmpeg failed: {stderr.decode()}")

        return output_path
```
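
The output settings above (16 kHz, mono, 16-bit PCM) match the input format Whisper works with. When debugging, it can help to see the exact invocation ffmpeg-python builds; its `compile()` method returns the argv list, so a quick sketch using the `command` object from `extract_audio`:

```python
# Inspect the command line ffmpeg-python will run (argument order may differ).
print(' '.join(command.compile()))
# roughly: ffmpeg -i talk.mp4 -acodec pcm_s16le -ac 1 -ar 16000 -loglevel error talk.extracted.wav -y
```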

### Stage 4: Audio Preprocessing

```python
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf


class AudioPreprocessor:
    """Optimize audio for transcription"""

    def __init__(self):
        self.target_sample_rate = 16000
        self.target_channels = 1  # Mono
        self.target_format = 'wav'

    async def preprocess(self, audio_path: Path) -> Path:
        """Full preprocessing pipeline"""
        # Load audio
        audio, sr = librosa.load(
            str(audio_path),
            sr=self.target_sample_rate,
            mono=True
        )

        # Apply preprocessing chain
        audio = self.remove_silence(audio, sr)
        audio = self.normalize_volume(audio)
        audio = self.apply_noise_reduction(audio, sr)
        audio = self.compress_dynamic_range(audio)

        # Save processed audio
        output_path = audio_path.with_suffix('.preprocessed.wav')
        sf.write(str(output_path), audio, sr, subtype='PCM_16')

        return output_path

    def remove_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Remove leading/trailing silence"""
        # Use librosa's trim function
        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=20,  # Threshold in dB below peak
            frame_length=2048,
            hop_length=512
        )
        return trimmed

    def normalize_volume(self, audio: np.ndarray) -> np.ndarray:
        """Normalize to consistent volume"""
        # Peak normalization to -3dB
        peak = np.abs(audio).max()
        if peak > 0:
            target_peak = 10 ** (-3 / 20)  # -3dB in linear scale
            audio = audio * (target_peak / peak)
        return audio

    def apply_noise_reduction(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Reduce background noise"""
        # Simple spectral gating
        D = librosa.stft(audio)
        magnitude = np.abs(D)

        # Estimate noise floor (bottom 10% of spectral magnitudes)
        noise_floor = np.percentile(magnitude, 10)

        # Gate bins near or below the noise floor
        mask = magnitude > noise_floor * 1.5
        D_gated = D * mask

        # Reconstruct audio
        audio_denoised = librosa.istft(D_gated)

        return audio_denoised

    def compress_dynamic_range(self, audio: np.ndarray) -> np.ndarray:
        """Apply gentle compression"""
        # Simple compression algorithm
        threshold = 0.7
        ratio = 4.0

        # Attenuate only the peaks above the threshold
        mask = np.abs(audio) > threshold
        compressed = audio.copy()
        compressed[mask] = np.sign(audio[mask]) * (
            threshold + (np.abs(audio[mask]) - threshold) / ratio
        )

        return compressed
```
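
To make the compressor concrete: with `threshold=0.7` and `ratio=4.0`, a sample at 1.0 becomes 0.7 + (1.0 − 0.7) / 4 = 0.775, while anything at or below 0.7 passes through untouched. A quick check:

```python
import numpy as np

samples = np.array([0.5, 0.7, 1.0])
print(AudioPreprocessor().compress_dynamic_range(samples))
# [0.5   0.7   0.775]
```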

### Stage 5: Chunking for Long Audio

```python
from dataclasses import dataclass
from pathlib import Path
from typing import List

import ffmpeg  # ffmpeg-python


@dataclass
class AudioChunk:
    path: Path
    start: float
    end: float
    index: int


class AudioChunker:
    """Split long audio files for processing"""

    def __init__(self, chunk_duration: int = 600):  # 10 minutes
        self.chunk_duration = chunk_duration
        self.overlap = 2  # 2 second overlap

    async def chunk_audio(self, audio_path: Path) -> List[AudioChunk]:
        """Split audio into overlapping chunks"""
        # Get duration (get_audio_info is a shared helper, not shown)
        info = await self.get_audio_info(audio_path)
        duration = info.duration

        if duration <= self.chunk_duration:
            # No chunking needed
            return [AudioChunk(
                path=audio_path,
                start=0,
                end=duration,
                index=0
            )]

        # Calculate chunks
        chunks = []
        chunk_size = self.chunk_duration
        step = chunk_size - self.overlap

        for i, start in enumerate(range(0, int(duration), step)):
            end = min(start + chunk_size, duration)

            # Extract chunk
            chunk_path = await self.extract_chunk(
                audio_path, start, end - start, i
            )

            chunks.append(AudioChunk(
                path=chunk_path,
                start=start,
                end=end,
                index=i
            ))

            if end >= duration:
                break

        return chunks

    async def extract_chunk(
        self,
        audio_path: Path,
        start: float,
        duration: float,
        index: int
    ) -> Path:
        """Extract a specific chunk"""
        output_path = audio_path.parent / f"{audio_path.stem}_chunk_{index:03d}.wav"

        # Seek and copy without re-encoding (input is already PCM WAV)
        command = (
            ffmpeg
            .input(str(audio_path), ss=start, t=duration)
            .output(str(output_path), acodec='copy')
            .overwrite_output()
        )

        await self.run_ffmpeg(command)
        return output_path
```
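
The boundary arithmetic is easiest to see with numbers: a 1500-second file with 600-second chunks and a 2-second overlap steps forward 598 seconds at a time, producing three chunks whose edges each appear in two chunks:

```python
duration, chunk_size, overlap = 1500, 600, 2
step = chunk_size - overlap

bounds = []
for start in range(0, duration, step):
    end = min(start + chunk_size, duration)
    bounds.append((start, end))
    if end >= duration:
        break

print(bounds)  # [(0, 600), (598, 1198), (1196, 1500)]
```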

## Quality Assurance

### Audio Quality Metrics

```python
from pathlib import Path

import librosa
import numpy as np


class AudioQualityAnalyzer:
    """Analyze audio quality metrics"""

    def analyze(self, audio_path: Path) -> QualityReport:
        audio, sr = librosa.load(str(audio_path))

        return QualityReport(
            snr=self.calculate_snr(audio),
            silence_ratio=self.calculate_silence_ratio(audio),
            clipping_ratio=self.calculate_clipping(audio),
            frequency_range=self.analyze_frequency_range(audio, sr),
            recommended_action=self.recommend_action(audio, sr)
        )

    def calculate_snr(self, audio: np.ndarray) -> float:
        """Estimate signal-to-noise ratio in dB"""
        # Frame-based heuristic: treat the quietest frames as noise and the
        # loudest as signal (a rough estimate, not a true SNR measurement)
        frames = librosa.util.frame(audio, frame_length=2048, hop_length=512)
        frame_power = np.mean(frames ** 2, axis=0)

        noise_power = np.percentile(frame_power, 10)
        signal_power = np.percentile(frame_power, 90)

        if noise_power > 0:
            snr = 10 * np.log10(signal_power / noise_power)
        else:
            snr = float('inf')

        return snr

    def calculate_silence_ratio(self, audio: np.ndarray) -> float:
        """Fraction of samples that are silent (0.0 - 1.0)"""
        threshold = 0.01  # Silence threshold (linear amplitude)
        silence_samples = np.sum(np.abs(audio) < threshold)
        return silence_samples / len(audio)

    def calculate_clipping(self, audio: np.ndarray) -> float:
        """Fraction of clipped samples (0.0 - 1.0)"""
        clipping_threshold = 0.99
        clipped = np.sum(np.abs(audio) > clipping_threshold)
        return clipped / len(audio)
```
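
`QualityReport` and the `analyze_frequency_range`/`recommend_action` helpers aren't defined in this document; a minimal sketch of the report, with field types inferred from the methods above (the `frequency_range` and `recommended_action` shapes are assumptions):

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class QualityReport:
    snr: float                            # dB
    silence_ratio: float                  # 0.0 - 1.0
    clipping_ratio: float                 # 0.0 - 1.0
    frequency_range: Tuple[float, float]  # (low_hz, high_hz), assumed shape
    recommended_action: str               # e.g. 'ok' or 'denoise', assumed values
```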

## Performance Optimization

### Parallel Processing

```python
import asyncio
import logging
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)


class ParallelAudioProcessor:
    """Process multiple audio files in parallel"""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers
        self.semaphore = asyncio.Semaphore(max_workers)

    async def process_batch(self, audio_files: List[Path]) -> List[Path]:
        """Process multiple files concurrently"""
        tasks = [
            self.process_with_limit(audio_file)
            for audio_file in audio_files
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Handle errors
        processed = []
        for result, audio_file in zip(results, audio_files):
            if isinstance(result, Exception):
                logger.error(f"Failed to process {audio_file}: {result}")
            else:
                processed.append(result)

        return processed

    async def process_with_limit(self, audio_file: Path) -> Path:
        """Process with concurrency limit"""
        async with self.semaphore:
            return await self.process_single(audio_file)
```
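
`process_single` is left undefined above. One plausible wiring, shown purely as an assumption, is to delegate to the Stage 4 preprocessor:

```python
# Hypothetical body for ParallelAudioProcessor.process_single,
# delegating to the AudioPreprocessor from Stage 4.
async def process_single(self, audio_file: Path) -> Path:
    return await AudioPreprocessor().preprocess(audio_file)
```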

### Caching Preprocessed Audio

```python
import hashlib
import logging
import shutil
from pathlib import Path

logger = logging.getLogger(__name__)


class PreprocessedAudioCache:
    """Cache preprocessed audio files"""

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_path(self, original_path: Path) -> Path:
        """Generate cache file path"""
        file_hash = self.calculate_hash(original_path)
        return self.cache_dir / f"{file_hash}.preprocessed.wav"

    def calculate_hash(self, path: Path) -> str:
        """Content hash of the source file (SHA-256; implementation assumed)"""
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(1 << 20), b''):
                digest.update(block)
        return digest.hexdigest()

    async def get_or_process(
        self,
        audio_path: Path,
        processor: AudioPreprocessor
    ) -> Path:
        """Get from cache or process"""
        cache_path = self.get_cache_path(audio_path)

        if cache_path.exists():
            # Verify cache is newer than source
            if cache_path.stat().st_mtime > audio_path.stat().st_mtime:
                logger.info(f"Using cached preprocessed audio: {cache_path}")
                return cache_path

        # Process and cache
        processed = await processor.preprocess(audio_path)
        shutil.copy2(processed, cache_path)

        return cache_path
```

## Error Handling

### Common Audio Issues

```python
import logging
from pathlib import Path
from typing import Optional

import ffmpeg  # ffmpeg-python

logger = logging.getLogger(__name__)


class AudioErrorHandler:
    """Handle common audio processing errors"""

    async def handle_processing_error(
        self,
        error: Exception,
        audio_path: Path
    ) -> Optional[Path]:
        """Attempt recovery from errors"""

        if isinstance(error, CorruptedFileError):
            # Try to repair with FFmpeg
            return await self.repair_corrupted_file(audio_path)

        elif isinstance(error, UnsupportedFormatError):
            # Try alternative extraction method
            return await self.extract_with_alternative_method(audio_path)

        elif isinstance(error, SilentAudioError):
            # Audio is completely silent
            logger.warning(f"Audio file is silent: {audio_path}")
            return None

        else:
            # Unknown error: re-raise the passed-in exception
            # (a bare `raise` only works inside an except block)
            logger.error(f"Unhandled error: {error}")
            raise error

    async def repair_corrupted_file(self, audio_path: Path) -> Path:
        """Attempt to repair corrupted audio"""
        repaired_path = audio_path.with_suffix('.repaired.wav')

        # Use FFmpeg's error detection while re-encoding to clean PCM
        command = (
            ffmpeg
            .input(str(audio_path), err_detect='aggressive')
            .output(
                str(repaired_path),
                acodec='pcm_s16le',
                ar=16000,
                ac=1
            )
            .global_args('-xerror')  # Exit on error
            .overwrite_output()
        )

        try:
            await self.run_ffmpeg(command)
            return repaired_path
        except Exception as exc:
            raise RepairFailedError(f"Could not repair {audio_path}") from exc
```
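
The exception types used throughout (`ProcessingError`, `CorruptedFileError`, `UnsupportedFormatError`, `SilentAudioError`, `RepairFailedError`) aren't defined in this document; a minimal sketch, with the hierarchy as an assumption:

```python
class ProcessingError(Exception):
    """An FFmpeg or pipeline step failed"""


class CorruptedFileError(ProcessingError):
    """Media file could not be decoded"""


class UnsupportedFormatError(ProcessingError):
    """No extraction method available for this format"""


class SilentAudioError(ProcessingError):
    """Audio track contains no signal"""


class RepairFailedError(ProcessingError):
    """FFmpeg repair attempt did not produce usable audio"""
```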

## Testing Strategy

### Audio Processing Tests

```python
# tests/test_audio_processing.py
from pathlib import Path

import pytest

# (imports of the classes under test and the get_duration helper omitted)


class TestAudioProcessing:

    @pytest.fixture
    def test_audio_files(self):
        """Provide test audio files"""
        return {
            'clean': Path('tests/fixtures/audio/clean_speech.wav'),
            'noisy': Path('tests/fixtures/audio/noisy_speech.wav'),
            'music': Path('tests/fixtures/audio/music_and_speech.mp3'),
            'silent': Path('tests/fixtures/audio/silent.wav'),
            'corrupted': Path('tests/fixtures/audio/corrupted.mp4')
        }

    @pytest.mark.asyncio  # requires pytest-asyncio
    async def test_preprocessing_improves_quality(self, test_audio_files):
        """Test that preprocessing improves audio quality"""
        processor = AudioPreprocessor()

        original = test_audio_files['noisy']
        processed = await processor.preprocess(original)

        # Analyze both
        original_quality = AudioQualityAnalyzer().analyze(original)
        processed_quality = AudioQualityAnalyzer().analyze(processed)

        # Should improve SNR
        assert processed_quality.snr > original_quality.snr

        # Should reduce silence
        assert processed_quality.silence_ratio < original_quality.silence_ratio

    @pytest.mark.asyncio  # requires pytest-asyncio
    async def test_chunking_preserves_content(self, test_audio_files):
        """Test that chunking doesn't lose content"""
        chunker = AudioChunker(chunk_duration=30)  # 30 second chunks

        original = test_audio_files['clean']
        chunks = await chunker.chunk_audio(original)

        # Verify coverage
        original_duration = get_duration(original)
        chunk_coverage = sum(c.end - c.start for c in chunks)

        # Should cover entire file (with overlaps)
        assert chunk_coverage >= original_duration

        # Verify overlap: each chunk ends after the next one starts
        for i in range(len(chunks) - 1):
            assert chunks[i].end > chunks[i + 1].start
```

## Configuration

### Audio Processing Settings

```python
# config/audio.py
AUDIO_CONFIG = {
    # Target format for Whisper
    'target_sample_rate': 16000,
    'target_channels': 1,
    'target_format': 'wav',
    'target_bit_depth': 16,

    # Preprocessing
    'remove_silence': True,
    'silence_threshold_db': 20,
    'normalize_volume': True,
    'target_peak_db': -3,
    'apply_noise_reduction': True,
    'noise_gate_ratio': 1.5,

    # Chunking
    'max_chunk_duration': 600,  # 10 minutes
    'chunk_overlap': 2,  # seconds

    # Quality thresholds
    'min_snr_db': 10,
    'max_silence_ratio': 0.8,
    'max_clipping_ratio': 0.01,

    # Performance
    'max_parallel_processing': 4,
    'cache_preprocessed': True,
    'cache_directory': '/tmp/trax/audio_cache'
}
```
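
The quality thresholds pair naturally with the `AudioQualityAnalyzer` report; a sketch of a gate that applies them (the function and its wiring are assumptions, not part of the pipeline as specified):

```python
def passes_quality_gate(report: QualityReport, cfg: dict = AUDIO_CONFIG) -> bool:
    """Check a QualityReport against the configured thresholds (sketch)."""
    return (
        report.snr >= cfg['min_snr_db']
        and report.silence_ratio <= cfg['max_silence_ratio']
        and report.clipping_ratio <= cfg['max_clipping_ratio']
    )
```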

## Summary

The audio processing architecture ensures:

1. **Format flexibility** - Handle common video and audio formats
2. **Quality optimization** - Improve audio for transcription
3. **Reliability** - Handle errors gracefully
4. **Performance** - Parallel processing and caching
5. **Testability** - Comprehensive test coverage

This foundation enables accurate, efficient transcription across diverse media sources.

---

*Last Updated: 2024*
*Architecture Version: 1.0*