trax/docs/architecture/audio-processing.md

17 KiB

Audio Processing Architecture

Overview

The audio processing pipeline handles the critical first step: converting various media formats into optimized audio suitable for transcription. This architecture ensures consistent, high-quality input for the Whisper model.

Pipeline Stages

Stage 1: Media Download/Acquisition

class MediaAcquisition:
    """Resolve a media source (URL or local path) to a file on disk.

    ``download_media`` reports progress through ``self.update_progress``,
    which is not defined in this class body; it is presumably supplied
    elsewhere (subclass or mixin) -- TODO confirm.
    """

    async def acquire(self, source: str) -> Path:
        """Return a local path for *source*.

        Args:
            source: An ``http://`` / ``https://`` URL, or the path of an
                existing local file.

        Returns:
            The downloaded temporary file for URLs, or ``Path(source)`` for
            an existing local file.

        Raises:
            ValueError: If *source* is neither an HTTP(S) URL nor an
                existing path.
        """
        if source.startswith(('http://', 'https://')):
            return await self.download_media(source)

        local_path = Path(source)
        if local_path.exists():
            return local_path

        raise ValueError(f"Invalid source: {source}")

    async def download_media(self, url: str) -> Path:
        """Stream *url* into a temporary file, reporting progress per chunk.

        Args:
            url: HTTP(S) URL of the media to fetch.

        Returns:
            Path of the temporary ``.tmp`` file holding the response body.
            The caller owns the file and is responsible for deleting it.

        Raises:
            aiohttp.ClientResponseError: If the server answers with an
                HTTP error status (4xx/5xx).
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Fail fast rather than saving an HTTP error page as "media".
                response.raise_for_status()

                # 0 means the server did not announce a size.
                total_size = int(response.headers.get('content-length', 0))

                # NamedTemporaryFile creates the file atomically; the old
                # tempfile.mktemp() only produced a name, leaving a window
                # in which another process could claim the path.
                handle = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False)
                temp_file = Path(handle.name)
                try:
                    with handle:
                        async for chunk in response.content.iter_chunked(8192):
                            handle.write(chunk)
                            await self.update_progress(handle.tell(), total_size)
                except BaseException:
                    # Do not leave a truncated download behind (this also
                    # covers task cancellation); then propagate unchanged.
                    if temp_file.exists():
                        temp_file.unlink()
                    raise

                return temp_file

Stage 2: Format Detection & Validation

class FormatValidator:
    """Probe media files and describe their first audio stream."""

    # File extensions grouped by media kind.
    SUPPORTED_FORMATS = {
        'video': ['.mp4', '.avi', '.mov', '.mkv', '.webm'],
        'audio': ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']
    }

    def validate_format(self, file_path: Path) -> MediaInfo:
        """Probe *file_path* and summarize its container and audio stream.

        Args:
            file_path: Media file to inspect with ``ffmpeg.probe``.

        Returns:
            A ``MediaInfo`` built from the container's format name and
            duration and from the first audio stream's sample rate,
            channel count, codec name and bit rate (0 when not reported).

        Raises:
            ValueError: If the probe result contains no audio stream.
        """
        probe_result = ffmpeg.probe(str(file_path))

        # Keep only the audio streams, in probe order.
        audio_only = list(
            filter(
                lambda entry: entry['codec_type'] == 'audio',
                probe_result['streams'],
            )
        )
        if len(audio_only) == 0:
            raise ValueError("No audio stream found")

        first_audio = audio_only[0]
        container = probe_result['format']

        format_name = container['format_name']
        duration_seconds = float(container['duration'])
        sample_rate = int(first_audio['sample_rate'])
        channel_count = int(first_audio['channels'])
        codec_name = first_audio['codec_name']
        bit_rate = int(first_audio.get('bit_rate', 0))

        return MediaInfo(
            format=format_name,
            duration=duration_seconds,
            sample_rate=sample_rate,
            channels=channel_count,
            codec=codec_name,
            bitrate=bit_rate,
        )

Stage 3: Audio Extraction

class AudioExtractor:
    """Extract the audio track of a video file as a WAV file."""

    async def extract_audio(self, video_path: Path) -> Path:
        """Extract audio from *video_path* by running FFmpeg as a subprocess.

        The output is written next to the input with the suffix
        ``.extracted.wav`` as 16-bit PCM, 16 kHz, mono. An existing output
        file is overwritten.

        Args:
            video_path: Video file to read.

        Returns:
            Path of the extracted WAV file.

        Raises:
            ProcessingError: If FFmpeg exits with a non-zero status; the
                message carries FFmpeg's stderr output.
        """
        output_path = video_path.with_suffix('.extracted.wav')

        # Build the FFmpeg invocation.
        command = (
            ffmpeg
            .input(str(video_path))
            .output(
                str(output_path),
                acodec='pcm_s16le',  # 16-bit PCM
                ar=16000,            # 16kHz sample rate
                ac=1,                # Mono
                loglevel='error'
            )
            .overwrite_output()
        )

        # compile() already returns the full argv, starting with the
        # 'ffmpeg' executable itself; prepending another 'ffmpeg' would make
        # FFmpeg treat the word "ffmpeg" as an output file argument.
        process = await asyncio.create_subprocess_exec(
            *command.compile(),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        _, stderr = await process.communicate()

        if process.returncode != 0:
            # FFmpeg output is not guaranteed to be valid UTF-8; a strict
            # decode could raise and hide the real failure.
            raise ProcessingError(
                f"FFmpeg failed: {stderr.decode(errors='replace')}"
            )

        return output_path

Stage 4: Audio Preprocessing

class AudioPreprocessor:
    """Optimize audio for transcription.

    The chain applied by ``preprocess`` is: silence trimming, peak
    normalization, spectral-gate noise reduction, then dynamic range
    compression.
    """

    def __init__(self):
        self.target_sample_rate = 16000
        self.target_channels = 1  # Mono
        self.target_format = 'wav'

    async def preprocess(self, audio_path: Path) -> Path:
        """Run the full preprocessing chain and write the result to disk.

        Args:
            audio_path: Audio file to load; it is resampled to
                ``self.target_sample_rate`` and mixed down to mono on load.

        Returns:
            Path of the processed file: *audio_path* with its suffix
            replaced by ``.preprocessed.wav``, written as 16-bit PCM.
        """
        # Load audio
        audio, sr = librosa.load(
            str(audio_path),
            sr=self.target_sample_rate,
            mono=True
        )

        # Apply preprocessing chain
        audio = self.remove_silence(audio, sr)
        audio = self.normalize_volume(audio)
        audio = self.apply_noise_reduction(audio, sr)
        audio = self.compress_dynamic_range(audio)

        # Save processed audio
        output_path = audio_path.with_suffix('.preprocessed.wav')
        sf.write(output_path, audio, sr, subtype='PCM_16')

        return output_path

    def remove_silence(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Remove leading/trailing silence using ``librosa.effects.trim``.

        Args:
            audio: Samples to trim.
            sr: Sample rate; accepted for a uniform signature, not used.

        Returns:
            The trimmed samples.
        """
        trimmed, _ = librosa.effects.trim(
            audio,
            top_db=20,  # Threshold in dB
            frame_length=2048,
            hop_length=512
        )
        return trimmed

    def normalize_volume(self, audio: np.ndarray) -> np.ndarray:
        """Peak-normalize *audio* to -3 dB.

        Args:
            audio: Samples to scale.

        Returns:
            The scaled samples. Empty or all-zero input is returned
            unchanged, since there is no peak to scale against.
        """
        # An empty array (e.g. everything was trimmed away as silence) has
        # no maximum; ndarray.max() would raise ValueError on it.
        if audio.size == 0:
            return audio

        peak = np.abs(audio).max()
        if peak > 0:
            target_peak = 10 ** (-3 / 20)  # -3dB in linear scale
            audio = audio * (target_peak / peak)
        return audio

    def apply_noise_reduction(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Reduce background noise with a simple spectral gate.

        STFT bins whose magnitude does not exceed 1.5x the 10th-percentile
        magnitude are zeroed before the signal is reconstructed.

        Args:
            audio: Samples to denoise.
            sr: Sample rate; accepted for a uniform signature, not used.

        Returns:
            The denoised samples, with the same length as *audio*.
        """
        # Nothing to analyze; also avoids handing an empty signal to the STFT.
        if audio.size == 0:
            return audio

        D = librosa.stft(audio)
        magnitude = np.abs(D)

        # Estimate noise floor (bottom 10%)
        noise_floor = np.percentile(magnitude, 10)

        # Gate frequencies below noise floor
        mask = magnitude > noise_floor * 1.5
        D_gated = D * mask

        # Reconstruct audio; pinning the length keeps the sample count
        # identical to the input instead of rounded to whole STFT frames.
        audio_denoised = librosa.istft(D_gated, length=len(audio))

        return audio_denoised

    def compress_dynamic_range(self, audio: np.ndarray) -> np.ndarray:
        """Apply gentle compression to peaks above a fixed threshold.

        Samples whose absolute value exceeds 0.7 have the excess over the
        threshold divided by 4 (a 4:1 ratio); their sign is preserved.

        Args:
            audio: Samples to compress. The input array is not modified.

        Returns:
            A new array containing the compressed samples.
        """
        threshold = 0.7
        ratio = 4.0

        # Apply compression to peaks
        mask = np.abs(audio) > threshold
        compressed = audio.copy()
        compressed[mask] = np.sign(audio[mask]) * (
            threshold + (np.abs(audio[mask]) - threshold) / ratio
        )

        return compressed

Stage 5: Chunking for Long Audio

class AudioChunker:
    """Split long audio files into overlapping chunks for processing.

    ``get_audio_info`` and ``run_ffmpeg`` are called on ``self`` but are not
    defined in this class body; they are presumably supplied elsewhere
    (subclass or mixin) -- TODO confirm.
    """

    def __init__(self, chunk_duration: int = 600, overlap: int = 2):
        """Configure chunk length and overlap.

        Args:
            chunk_duration: Maximum chunk length in seconds (default 600,
                i.e. 10 minutes).
            overlap: Seconds shared between consecutive chunks (default 2).

        Raises:
            ValueError: If *chunk_duration* is not positive, or *overlap*
                is negative or not smaller than *chunk_duration*. Such
                values would make the chunk step zero or negative, which
                previously surfaced as a cryptic ``range()`` error or as an
                empty chunk list.
        """
        if chunk_duration <= 0:
            raise ValueError(
                f"chunk_duration must be positive, got {chunk_duration}"
            )
        if not 0 <= overlap < chunk_duration:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_duration, "
                f"got overlap={overlap}, chunk_duration={chunk_duration}"
            )
        self.chunk_duration = chunk_duration
        self.overlap = overlap

    async def chunk_audio(self, audio_path: Path) -> List["AudioChunk"]:
        """Split *audio_path* into overlapping chunks.

        Args:
            audio_path: Audio file to split.

        Returns:
            A single chunk referencing *audio_path* itself when the file is
            no longer than ``chunk_duration``; otherwise one ``AudioChunk``
            per extracted file, ordered by start time, each carrying its
            start and end offsets in seconds and its index.
        """
        # Get duration
        info = await self.get_audio_info(audio_path)
        duration = info.duration

        if duration <= self.chunk_duration:
            # No chunking needed
            return [AudioChunk(
                path=audio_path,
                start=0,
                end=duration,
                index=0
            )]

        # Calculate chunks
        chunks = []
        chunk_size = self.chunk_duration
        # Each chunk starts `overlap` seconds before the previous one ends.
        step = chunk_size - self.overlap

        for i, start in enumerate(range(0, int(duration), step)):
            # The final chunk is clamped to the end of the file.
            end = min(start + chunk_size, duration)

            # Extract chunk
            chunk_path = await self.extract_chunk(
                audio_path, start, end - start, i
            )

            chunks.append(AudioChunk(
                path=chunk_path,
                start=start,
                end=end,
                index=i
            ))

            # The file is fully covered; a further chunk would lie entirely
            # inside the overlap region.
            if end >= duration:
                break

        return chunks

    async def extract_chunk(
        self,
        audio_path: Path,
        start: float,
        duration: float,
        index: int
    ) -> Path:
        """Extract one chunk of *audio_path* into its own WAV file.

        Args:
            audio_path: Source audio file.
            start: Chunk start offset in seconds.
            duration: Chunk length in seconds.
            index: Chunk number, used zero-padded to three digits in the
                output file name.

        Returns:
            Path of the chunk file, ``<stem>_chunk_<index>.wav`` in the
            same directory as *audio_path*.
        """
        output_path = audio_path.parent / f"{audio_path.stem}_chunk_{index:03d}.wav"

        # Stream copy: the audio is cut without re-encoding.
        command = (
            ffmpeg
            .input(str(audio_path), ss=start, t=duration)
            .output(str(output_path), acodec='copy')
            .overwrite_output()
        )

        await self.run_ffmpeg(command)
        return output_path

Quality Assurance

Audio Quality Metrics

class AudioQualityAnalyzer:
    """Compute quality metrics for an audio file."""

    def analyze(self, audio_path: Path) -> QualityReport:
        """Load *audio_path* and assemble its ``QualityReport``.

        Args:
            audio_path: Audio file to load with ``librosa.load``.

        Returns:
            A report holding SNR, silence ratio, clipping ratio, frequency
            range and a recommended action.
        """
        samples, sample_rate = librosa.load(str(audio_path))

        snr_db = self.calculate_snr(samples)
        silent_share = self.calculate_silence_ratio(samples)
        clipped_share = self.calculate_clipping(samples)
        freq_range = self.analyze_frequency_range(samples, sample_rate)
        action = self.recommend_action(samples, sample_rate)

        return QualityReport(
            snr=snr_db,
            silence_ratio=silent_share,
            clipping_ratio=clipped_share,
            frequency_range=freq_range,
            recommended_action=action,
        )

    def calculate_snr(self, audio: np.ndarray) -> float:
        """Estimate the signal-to-noise ratio in dB from medians.

        Signal power is the median of the squared samples; noise power is
        the median squared deviation from the median sample.

        Returns:
            ``10 * log10(signal / noise)``, or ``inf`` when the noise power
            is not greater than zero.
        """
        centre = np.median(audio)
        noise_power = np.median((audio - centre) ** 2)
        signal_power = np.median(audio ** 2)

        if not noise_power > 0:
            return float('inf')

        return 10 * np.log10(signal_power / noise_power)

    def calculate_silence_ratio(self, audio: np.ndarray) -> float:
        """Return the fraction of samples with absolute value below 0.01."""
        threshold = 0.01  # Silence threshold
        is_silent = np.abs(audio) < threshold
        return np.sum(is_silent) / len(audio)

    def calculate_clipping(self, audio: np.ndarray) -> float:
        """Return the fraction of samples with absolute value above 0.99."""
        clipping_threshold = 0.99
        is_clipped = np.abs(audio) > clipping_threshold
        return np.sum(is_clipped) / len(audio)

Performance Optimization

Parallel Processing

class ParallelAudioProcessor:
    """Run per-file audio processing concurrently, bounded by a semaphore."""

    def __init__(self, max_workers: int = 4):
        self.max_workers = max_workers
        # Caps how many files are inside process_single at the same time.
        self.semaphore = asyncio.Semaphore(max_workers)

    async def process_batch(self, audio_files: List[Path]) -> List[Path]:
        """Process every file in *audio_files* concurrently.

        Failures are logged and left out of the result, so the returned
        list may be shorter than the input; the order of the successful
        results follows the input order.
        """
        outcomes = await asyncio.gather(
            *map(self.process_with_limit, audio_files),
            return_exceptions=True,
        )

        processed = []
        for audio_file, result in zip(audio_files, outcomes):
            if not isinstance(result, Exception):
                processed.append(result)
                continue
            # Report the failure and carry on with the remaining files.
            logger.error(f"Failed to process {audio_file}: {result}")

        return processed

    async def process_with_limit(self, audio_file: Path) -> Path:
        """Process one file while holding a slot of the semaphore."""
        await self.semaphore.acquire()
        try:
            return await self.process_single(audio_file)
        finally:
            self.semaphore.release()

Caching Preprocessed Audio

class PreprocessedAudioCache:
    """Cache preprocessed audio files on disk.

    ``calculate_hash`` is called on ``self`` but is not defined in this
    class body; it is presumably supplied elsewhere -- TODO confirm what
    it hashes (content vs. path).
    """

    def __init__(self, cache_dir: Path):
        """Create the cache directory if it does not exist yet.

        Args:
            cache_dir: Directory that holds the cached files.
        """
        self.cache_dir = cache_dir
        # parents=True: a nested cache directory must be creatable even
        # when its parent directories do not exist yet.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_cache_path(self, original_path: Path) -> Path:
        """Return the cache location for *original_path*.

        The file name is ``<hash>.preprocessed.wav`` inside the cache
        directory, where the hash comes from ``self.calculate_hash``.
        """
        file_hash = self.calculate_hash(original_path)
        return self.cache_dir / f"{file_hash}.preprocessed.wav"

    async def get_or_process(
        self,
        audio_path: Path,
        processor: "AudioPreprocessor"
    ) -> Path:
        """Return the cached preprocessed file, producing it when needed.

        Args:
            audio_path: Source audio file.
            processor: Preprocessor whose ``preprocess`` coroutine is used
                on a cache miss.

        Returns:
            Path of the cached file. An existing entry is reused only when
            its modification time is newer than that of *audio_path*.
        """
        cache_path = self.get_cache_path(audio_path)

        if cache_path.exists():
            # Verify cache is newer than source
            if cache_path.stat().st_mtime > audio_path.stat().st_mtime:
                logger.info(f"Using cached preprocessed audio: {cache_path}")
                return cache_path

        # Process and cache; copy2 also copies the file's metadata.
        processed = await processor.preprocess(audio_path)
        shutil.copy2(processed, cache_path)

        return cache_path

Error Handling

Common Audio Issues

class AudioErrorHandler:
    """Attempt recovery from common audio processing errors.

    ``extract_with_alternative_method`` and ``run_ffmpeg`` are called on
    ``self`` but are not defined in this class body; they are presumably
    supplied elsewhere -- TODO confirm.
    """

    async def handle_processing_error(
        self,
        error: Exception,
        audio_path: Path
    ) -> Optional[Path]:
        """Dispatch on the type of *error* and try to recover.

        Args:
            error: The exception raised while processing *audio_path*.
            audio_path: The file that failed to process.

        Returns:
            Path of a recovered file for corrupted or unsupported input,
            or ``None`` when the audio is silent.

        Raises:
            Exception: *error* itself, when its type is not handled here.
        """
        if isinstance(error, CorruptedFileError):
            # Try to repair with FFmpeg
            return await self.repair_corrupted_file(audio_path)

        elif isinstance(error, UnsupportedFormatError):
            # Try alternative extraction method
            return await self.extract_with_alternative_method(audio_path)

        elif isinstance(error, SilentAudioError):
            # Audio is completely silent
            logger.warning(f"Audio file is silent: {audio_path}")
            return None

        else:
            # Unknown error
            logger.error(f"Unhandled error: {error}")
            # Raise the object we were given: a bare `raise` only works
            # while an exception is being handled and would otherwise fail
            # with "RuntimeError: No active exception to reraise".
            raise error

    async def repair_corrupted_file(self, audio_path: Path) -> Path:
        """Attempt to repair corrupted audio by re-encoding it with FFmpeg.

        The output is written next to the input with the suffix
        ``.repaired.wav`` as 16-bit PCM, 16 kHz, mono.

        Args:
            audio_path: The corrupted file.

        Returns:
            Path of the repaired file.

        Raises:
            RepairFailedError: If the FFmpeg run fails; the original
                failure is attached as ``__cause__``.
        """
        repaired_path = audio_path.with_suffix('.repaired.wav')

        # Use FFmpeg's error correction
        command = (
            ffmpeg
            .input(str(audio_path), err_detect='aggressive')
            .output(
                str(repaired_path),
                acodec='pcm_s16le',
                ar=16000,
                ac=1
            )
            .global_args('-xerror')  # Exit on error
            .overwrite_output()
        )

        try:
            await self.run_ffmpeg(command)
        except Exception as exc:
            # Chain the cause so the underlying FFmpeg failure stays
            # visible in tracebacks.
            raise RepairFailedError(f"Could not repair {audio_path}") from exc

        return repaired_path

Testing Strategy

Audio Processing Tests

# tests/test_audio_processing.py
class TestAudioProcessing:
    """Checks for the preprocessing and chunking stages."""

    @pytest.fixture
    def test_audio_files(self):
        """Map a short label to each audio fixture file."""
        fixture_dir = Path('tests/fixtures/audio')
        file_names = {
            'clean': 'clean_speech.wav',
            'noisy': 'noisy_speech.wav',
            'music': 'music_and_speech.mp3',
            'silent': 'silent.wav',
            'corrupted': 'corrupted.mp4',
        }
        return {label: fixture_dir / name for label, name in file_names.items()}

    async def test_preprocessing_improves_quality(self, test_audio_files):
        """Preprocessing a noisy file raises SNR and lowers silence ratio."""
        noisy_path = test_audio_files['noisy']
        cleaned_path = await AudioPreprocessor().preprocess(noisy_path)

        # Measure the file before and after preprocessing.
        before = AudioQualityAnalyzer().analyze(noisy_path)
        after = AudioQualityAnalyzer().analyze(cleaned_path)

        # Signal-to-noise ratio must go up ...
        assert after.snr > before.snr

        # ... and the share of silent samples must go down.
        assert after.silence_ratio < before.silence_ratio

    async def test_chunking_preserves_content(self, test_audio_files):
        """Chunks cover the whole file and neighbours overlap."""
        source = test_audio_files['clean']
        chunks = await AudioChunker(chunk_duration=30).chunk_audio(source)  # 30 second chunks

        # Summed chunk lengths (overlaps counted twice) must reach the
        # full duration of the source file.
        total_duration = get_duration(source)
        covered = sum(chunk.end - chunk.start for chunk in chunks)
        assert covered >= total_duration

        # Every chunk must end after its successor has started.
        for current, following in zip(chunks, chunks[1:]):
            assert current.end > following.start  # Overlap exists

Configuration

Audio Processing Settings

# config/audio.py
# Output format fed to Whisper.
_TARGET_FORMAT = {
    'target_sample_rate': 16000,
    'target_channels': 1,
    'target_format': 'wav',
    'target_bit_depth': 16,
}

# Switches and parameters of the preprocessing chain.
_PREPROCESSING = {
    'remove_silence': True,
    'silence_threshold_db': 20,
    'normalize_volume': True,
    'target_peak_db': -3,
    'apply_noise_reduction': True,
    'noise_gate_ratio': 1.5,
}

# Splitting of long recordings.
_CHUNKING = {
    'max_chunk_duration': 600,  # 10 minutes
    'chunk_overlap': 2,  # seconds
}

# Limits applied to the measured quality metrics.
_QUALITY_THRESHOLDS = {
    'min_snr_db': 10,
    'max_silence_ratio': 0.8,
    'max_clipping_ratio': 0.01,
}

# Concurrency and caching.
_PERFORMANCE = {
    'max_parallel_processing': 4,
    'cache_preprocessed': True,
    'cache_directory': '/tmp/trax/audio_cache',
}

# Flat settings mapping; key order follows the sections above.
AUDIO_CONFIG = {
    **_TARGET_FORMAT,
    **_PREPROCESSING,
    **_CHUNKING,
    **_QUALITY_THRESHOLDS,
    **_PERFORMANCE,
}

Summary

The audio processing architecture ensures:

  1. Format flexibility - Handle common video and audio formats, including audio tracks embedded in video files
  2. Quality optimization - Improve audio for transcription
  3. Reliability - Handle errors gracefully
  4. Performance - Parallel processing and caching
  5. Testability - Comprehensive test coverage

This foundation enables accurate, efficient transcription across diverse media sources.


Last Updated: 2024
Architecture Version: 1.0