# Audio Processing Architecture

## Overview

The audio processing pipeline handles the critical first step: converting media in a variety of formats into optimized audio suitable for transcription. This architecture ensures consistent, high-quality input for the Whisper model.

## Pipeline Stages

### Stage 1: Media Download/Acquisition

```python
import asyncio
import tempfile
from pathlib import Path

import aiohttp


class MediaAcquisition:
    """Handle media from various sources"""

    async def acquire(self, source: str) -> Path:
        if source.startswith(('http://', 'https://')):
            return await self.download_media(source)
        elif Path(source).exists():
            return Path(source)
        else:
            raise ValueError(f"Invalid source: {source}")

    async def download_media(self, url: str) -> Path:
        """Download with progress tracking"""
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                total_size = int(response.headers.get('content-length', 0))

                # Stream to a temporary file (NamedTemporaryFile avoids the
                # race condition in the deprecated tempfile.mktemp)
                with tempfile.NamedTemporaryFile(suffix='.tmp', delete=False) as f:
                    temp_file = Path(f.name)
                    async for chunk in response.content.iter_chunked(8192):
                        f.write(chunk)
                        # update_progress is a progress-reporting hook defined elsewhere
                        await self.update_progress(f.tell(), total_size)

        return temp_file
```

### Stage 2: Format Detection & Validation

```python
import ffmpeg  # ffmpeg-python bindings


class FormatValidator:
    """Validate and identify media formats"""

    SUPPORTED_FORMATS = {
        'video': ['.mp4', '.avi', '.mov', '.mkv', '.webm'],
        'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']
    }

    def validate_format(self, file_path: Path) -> MediaInfo:
        """Extract media information"""
        probe = ffmpeg.probe(str(file_path))

        # Check for an audio stream
        audio_streams = [
            s for s in probe['streams']
            if s['codec_type'] == 'audio'
        ]

        if not audio_streams:
            raise ValueError("No audio stream found")

        stream = audio_streams[0]
        return MediaInfo(
            format=probe['format']['format_name'],
            duration=float(probe['format']['duration']),
            sample_rate=int(stream['sample_rate']),
            channels=int(stream['channels']),
            codec=stream['codec_name'],
            bitrate=int(stream.get('bit_rate', 0))
        )
```
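The `MediaInfo` value returned above is not defined in this document; the following is a minimal sketch of the container, with fields inferred from the constructor call (the `dataclass` form itself is an assumption):

```python
from dataclasses import dataclass


@dataclass
class MediaInfo:
    """Probe results for a media file (sketch; fields inferred from usage)"""
    format: str       # container format name from ffprobe
    duration: float   # total duration in seconds
    sample_rate: int  # Hz
    channels: int     # channel count
    codec: str        # audio codec name
    bitrate: int      # bits per second; 0 when unknown
```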
### Stage 3: Audio Extraction

```python
class AudioExtractor:
    """Extract audio from video files"""

    async def extract_audio(self, video_path: Path) -> Path:
        """Extract audio track from video"""
        output_path = video_path.with_suffix('.extracted.wav')

        # FFmpeg extraction command
        command = (
            ffmpeg
            .input(str(video_path))
            .output(
                str(output_path),
                acodec='pcm_s16le',  # 16-bit PCM
                ar=16000,            # 16 kHz sample rate
                ac=1,                # mono
                loglevel='error'
            )
            .overwrite_output()
        )

        # Run asynchronously; compile() already includes the 'ffmpeg'
        # binary name as the first argument
        process = await asyncio.create_subprocess_exec(
            *command.compile(),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            raise ProcessingError(f"FFmpeg failed: {stderr.decode()}")

        return output_path
```

### Stage 4: Audio Preprocessing

```python
import librosa
import numpy as np
import soundfile as sf


class AudioPreprocessor:
    """Optimize audio for transcription"""

    def __init__(self):
        self.target_sample_rate = 16000
        self.target_channels = 1  # mono
        self.target_format = 'wav'

    async def preprocess(self, audio_path: Path) -> Path:
        """Full preprocessing pipeline"""
        # Load audio, resampled to the target rate and downmixed to mono
        audio, sr = librosa.load(
            str(audio_path),
            sr=self.target_sample_rate,
            mono=True
        )

        # Apply preprocessing chain
        audio = self.remove_silence(audio, sr)
        audio = self.normalize_volume(audio)
        audio = self.apply_noise_reduction(audio, sr)
        audio = self.compress_dynamic_range(audio)

        # Save processed audio
        output_path = audio_path.with_suffix('.preprocessed.wav')
        sf.write(output_path, audio, sr, subtype='PCM_16')

        return output_path

    def remove_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Remove leading/trailing silence"""
        # Use librosa's trim function
        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=20,  # threshold in dB below peak
            frame_length=2048,
            hop_length=512
        )
        return trimmed

    def normalize_volume(self, audio: np.ndarray) -> np.ndarray:
        """Normalize to consistent volume"""
        # Peak normalization to -3 dB
        peak = np.abs(audio).max()
        if peak > 0:
            target_peak = 10 ** (-3 / 20)  # -3 dB in linear scale
            audio = audio * (target_peak / peak)
        return audio

    def apply_noise_reduction(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Reduce background noise via simple spectral gating"""
        D = librosa.stft(audio)
        magnitude = np.abs(D)

        # Estimate noise floor (bottom 10% of magnitudes)
        noise_floor = np.percentile(magnitude, 10)

        # Zero out bins close to the noise floor
        mask = magnitude > noise_floor * 1.5
        D_gated = D * mask

        # Reconstruct the time-domain signal
        audio_denoised = librosa.istft(D_gated)
        return audio_denoised

    def compress_dynamic_range(self, audio: np.ndarray) -> np.ndarray:
        """Apply gentle compression"""
        # Simple compression algorithm
        threshold = 0.7
        ratio = 4.0

        # Attenuate samples above the threshold by the compression ratio
        mask = np.abs(audio) > threshold
        compressed = audio.copy()
        compressed[mask] = np.sign(audio[mask]) * (
            threshold + (np.abs(audio[mask]) - threshold) / ratio
        )
        return compressed
```

### Stage 5: Chunking for Long Audio

```python
from typing import List


class AudioChunker:
    """Split long audio files for processing"""

    def __init__(self, chunk_duration: int = 600):  # 10 minutes
        self.chunk_duration = chunk_duration
        self.overlap = 2  # seconds of overlap between adjacent chunks

    async def chunk_audio(self, audio_path: Path) -> List[AudioChunk]:
        """Split audio into overlapping chunks"""
        # Get duration
        info = await self.get_audio_info(audio_path)
        duration = info.duration

        if duration <= self.chunk_duration:
            # No chunking needed
            return [AudioChunk(
                path=audio_path,
                start=0,
                end=duration,
                index=0
            )]

        # Calculate chunks
        chunks = []
        chunk_size = self.chunk_duration
        step = chunk_size - self.overlap

        for i, start in enumerate(range(0, int(duration), step)):
            end = min(start + chunk_size, duration)

            # Extract this chunk to its own file
            chunk_path = await self.extract_chunk(
                audio_path, start, end - start, i
            )

            chunks.append(AudioChunk(
                path=chunk_path,
                start=start,
                end=end,
                index=i
            ))

            if end >= duration:
                break

        return chunks

    async def extract_chunk(
        self,
        audio_path: Path,
        start: float,
        duration: float,
        index: int
    ) -> Path:
        """Extract a specific chunk"""
        output_path = audio_path.parent / f"{audio_path.stem}_chunk_{index:03d}.wav"

        # Seek on the input side and stream-copy the PCM data
        command = (
            ffmpeg
            .input(str(audio_path), ss=start, t=duration)
            .output(str(output_path), acodec='copy')
            .overwrite_output()
        )

        await self.run_ffmpeg(command)
        return output_path
```
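`AudioChunk` is likewise used but not defined in this document; a minimal sketch inferred from the constructor calls above:

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class AudioChunk:
    """One slice of a longer recording (sketch; fields inferred from usage)"""
    path: Path    # file containing the extracted chunk
    start: float  # offset into the original audio, in seconds
    end: float    # end offset, in seconds
    index: int    # position in the chunk sequence
```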
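The document does not show how the five stages are wired together, so the orchestrator below is a sketch under assumptions: the `AudioPipeline` class, its `run` method, and the suffix-based video check are hypothetical, while the stage classes are the ones defined above.

```python
from typing import List


class AudioPipeline:
    """Hypothetical orchestrator chaining the five pipeline stages"""

    def __init__(self):
        self.acquisition = MediaAcquisition()
        self.validator = FormatValidator()
        self.extractor = AudioExtractor()
        self.preprocessor = AudioPreprocessor()
        self.chunker = AudioChunker()

    async def run(self, source: str) -> List[AudioChunk]:
        # Stage 1: fetch a URL or resolve a local path
        media_path = await self.acquisition.acquire(source)

        # Stage 2: raises if no audio stream is present
        self.validator.validate_format(media_path)

        # Stage 3: pull the audio track out of video containers
        if media_path.suffix.lower() in FormatValidator.SUPPORTED_FORMATS['video']:
            media_path = await self.extractor.extract_audio(media_path)

        # Stage 4: trim, normalize, denoise, and compress
        processed = await self.preprocessor.preprocess(media_path)

        # Stage 5: split long recordings into overlapping chunks
        return await self.chunker.chunk_audio(processed)
```

A caller would then feed each `AudioChunk.path` to the transcription layer in sequence.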
## Quality Assurance

### Audio Quality Metrics

```python
class AudioQualityAnalyzer:
    """Analyze audio quality metrics"""

    def analyze(self, audio_path: Path) -> QualityReport:
        audio, sr = librosa.load(str(audio_path))

        return QualityReport(
            snr=self.calculate_snr(audio),
            silence_ratio=self.calculate_silence_ratio(audio),
            clipping_ratio=self.calculate_clipping(audio),
            frequency_range=self.analyze_frequency_range(audio, sr),
            recommended_action=self.recommend_action(audio, sr)
        )

    def calculate_snr(self, audio: np.ndarray) -> float:
        """Signal-to-noise ratio in dB"""
        # Rough median-based estimate
        signal_power = np.median(audio ** 2)
        noise_power = np.median((audio - np.median(audio)) ** 2)

        if noise_power > 0:
            snr = 10 * np.log10(signal_power / noise_power)
        else:
            snr = float('inf')

        return snr

    def calculate_silence_ratio(self, audio: np.ndarray) -> float:
        """Fraction of samples below the silence threshold"""
        threshold = 0.01  # silence threshold
        silence_samples = np.sum(np.abs(audio) < threshold)
        return silence_samples / len(audio)

    def calculate_clipping(self, audio: np.ndarray) -> float:
        """Fraction of clipped samples"""
        clipping_threshold = 0.99
        clipped = np.sum(np.abs(audio) > clipping_threshold)
        return clipped / len(audio)
```

## Performance Optimization

### Parallel Processing

```python
import logging

logger = logging.getLogger(__name__)


class ParallelAudioProcessor:
    """Process multiple audio files in parallel"""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers
        self.semaphore = asyncio.Semaphore(max_workers)

    async def process_batch(self, audio_files: List[Path]) -> List[Path]:
        """Process multiple files concurrently"""
        tasks = [
            self.process_with_limit(audio_file)
            for audio_file in audio_files
        ]

        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Log failures and keep only successful results
        processed = []
        for result, audio_file in zip(results, audio_files):
            if isinstance(result, Exception):
                logger.error(f"Failed to process {audio_file}: {result}")
            else:
                processed.append(result)

        return processed

    async def process_with_limit(self, audio_file: Path) -> Path:
        """Process a single file under the concurrency limit"""
        async with self.semaphore:
            return await self.process_single(audio_file)
```

### Caching Preprocessed Audio

```python
import shutil


class PreprocessedAudioCache:
    """Cache preprocessed audio files"""

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_path(self, original_path: Path) -> Path:
        """Generate cache file path"""
        file_hash = self.calculate_hash(original_path)
        return self.cache_dir / f"{file_hash}.preprocessed.wav"

    async def get_or_process(
        self,
        audio_path: Path,
        processor: AudioPreprocessor
    ) -> Path:
        """Get from cache or process"""
        cache_path = self.get_cache_path(audio_path)

        if cache_path.exists():
            # Only trust the cache if it is newer than the source
            if cache_path.stat().st_mtime > audio_path.stat().st_mtime:
                logger.info(f"Using cached preprocessed audio: {cache_path}")
                return cache_path

        # Process and cache
        processed = await processor.preprocess(audio_path)
        shutil.copy2(processed, cache_path)

        return cache_path
```
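`calculate_hash` is referenced by `get_cache_path` but not shown. Below is a minimal sketch, assuming the cache key is a SHA-256 digest of the file contents; it is written as a standalone helper, but inside `PreprocessedAudioCache` it would be the `calculate_hash` method with the same body:

```python
import hashlib
from pathlib import Path


def calculate_hash(path: Path, chunk_size: int = 1 << 20) -> str:
    """Content digest of a file (hypothetical helper; SHA-256 is assumed)"""
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        # Read in 1 MiB blocks so large media files never load whole
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()
```

Keying on content rather than the path means a re-downloaded copy of the same media still hits the cache.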
## Error Handling

### Common Audio Issues

```python
from typing import Optional


class AudioErrorHandler:
    """Handle common audio processing errors"""

    async def handle_processing_error(
        self,
        error: Exception,
        audio_path: Path
    ) -> Optional[Path]:
        """Attempt recovery from errors"""
        if isinstance(error, CorruptedFileError):
            # Try to repair with FFmpeg
            return await self.repair_corrupted_file(audio_path)

        elif isinstance(error, UnsupportedFormatError):
            # Try an alternative extraction method
            return await self.extract_with_alternative_method(audio_path)

        elif isinstance(error, SilentAudioError):
            # Audio is completely silent; nothing to transcribe
            logger.warning(f"Audio file is silent: {audio_path}")
            return None

        else:
            # Unknown error: re-raise for the caller
            logger.error(f"Unhandled error: {error}")
            raise error

    async def repair_corrupted_file(self, audio_path: Path) -> Path:
        """Attempt to repair corrupted audio"""
        repaired_path = audio_path.with_suffix('.repaired.wav')

        # Use FFmpeg's error detection while re-encoding to clean PCM
        command = (
            ffmpeg
            .input(str(audio_path), err_detect='aggressive')
            .output(
                str(repaired_path),
                acodec='pcm_s16le',
                ar=16000,
                ac=1
            )
            .global_args('-xerror')  # exit on error
            .overwrite_output()
        )

        try:
            await self.run_ffmpeg(command)
            return repaired_path
        except Exception:
            raise RepairFailedError(f"Could not repair {audio_path}")
```

## Testing Strategy

### Audio Processing Tests

```python
# tests/test_audio_processing.py
import pytest


class TestAudioProcessing:

    @pytest.fixture
    def test_audio_files(self):
        """Provide test audio files"""
        return {
            'clean': Path('tests/fixtures/audio/clean_speech.wav'),
            'noisy': Path('tests/fixtures/audio/noisy_speech.wav'),
            'music': Path('tests/fixtures/audio/music_and_speech.mp3'),
            'silent': Path('tests/fixtures/audio/silent.wav'),
            'corrupted': Path('tests/fixtures/audio/corrupted.mp4')
        }

    @pytest.mark.asyncio  # requires the pytest-asyncio plugin
    async def test_preprocessing_improves_quality(self, test_audio_files):
        """Test that preprocessing improves audio quality"""
        processor = AudioPreprocessor()

        original = test_audio_files['noisy']
        processed = await processor.preprocess(original)

        # Analyze both versions
        original_quality = AudioQualityAnalyzer().analyze(original)
        processed_quality = AudioQualityAnalyzer().analyze(processed)

        # Should improve SNR
        assert processed_quality.snr > original_quality.snr

        # Should reduce silence
        assert processed_quality.silence_ratio < original_quality.silence_ratio

    @pytest.mark.asyncio
    async def test_chunking_preserves_content(self, test_audio_files):
        """Test that chunking doesn't lose content"""
        chunker = AudioChunker(chunk_duration=30)  # 30-second chunks

        original = test_audio_files['clean']
        chunks = await chunker.chunk_audio(original)

        # Verify coverage
        original_duration = get_duration(original)
        chunk_coverage = sum(c.end - c.start for c in chunks)

        # Should cover the entire file (with overlaps)
        assert chunk_coverage >= original_duration

        # Verify adjacent chunks overlap
        for i in range(len(chunks) - 1):
            assert chunks[i].end > chunks[i + 1].start
```

## Configuration

### Audio Processing Settings

```python
# config/audio.py
AUDIO_CONFIG = {
    # Target format for Whisper
    'target_sample_rate': 16000,
    'target_channels': 1,
    'target_format': 'wav',
    'target_bit_depth': 16,

    # Preprocessing
    'remove_silence': True,
    'silence_threshold_db': 20,
    'normalize_volume': True,
    'target_peak_db': -3,
    'apply_noise_reduction': True,
    'noise_gate_ratio': 1.5,

    # Chunking
    'max_chunk_duration': 600,  # 10 minutes
    'chunk_overlap': 2,         # seconds

    # Quality thresholds
    'min_snr_db': 10,
    'max_silence_ratio': 0.8,
    'max_clipping_ratio': 0.01,

    # Performance
    'max_parallel_processing': 4,
    'cache_preprocessed': True,
    'cache_directory': '/tmp/trax/audio_cache'
}
```

## Summary

The audio processing architecture ensures:

1. **Format flexibility** - Handles a wide range of audio and video formats
2. **Quality optimization** - Improves audio before transcription
3. **Reliability** - Recovers gracefully from errors
4. **Performance** - Parallel processing and caching
5. **Testability** - Comprehensive test coverage

This foundation enables accurate, efficient transcription across diverse media sources.

---

*Last Updated: 2024*
*Architecture Version: 1.0*