#!/usr/bin/env python3
"""
Test Adaptive Chunk Sizing for Transcription Optimization.

TDD tests for dynamic chunk sizing based on audio characteristics.
Expected 1.5-2x speed improvement from intelligent chunking.
"""

import pytest
import numpy as np
from pathlib import Path
from typing import List, Tuple

import librosa

from src.services.adaptive_chunking import (
    AdaptiveChunker,
    ChunkInfo,
    AudioCharacteristics,
    ChunkingStrategy,
)


class TestAdaptiveChunking:
    """Test suite for adaptive chunk sizing - 1.5-2x speed improvement."""

    @pytest.fixture
    def sample_audio_with_silence(self):
        """Create audio with silence patterns for testing.

        Returns a 2-minute mono signal: 4 cycles of 20 s simulated speech
        (Gaussian noise) followed by 10 s of true silence, plus the sample rate.
        """
        sample_rate = 16000

        # Create audio with alternating speech and silence
        audio = []
        for _ in range(4):
            # 20 seconds of speech (simulated with noise)
            speech = np.random.randn(sample_rate * 20) * 0.3
            audio.extend(speech)
            # 10 seconds of silence
            silence = np.zeros(sample_rate * 10)
            audio.extend(silence)

        return np.array(audio, dtype=np.float32), sample_rate

    @pytest.fixture
    def sample_audio_continuous(self):
        """Create continuous speech audio without breaks."""
        sample_rate = 16000
        duration = 120  # 2 minutes

        # Continuous speech simulation (noise with no silent gaps)
        audio = np.random.randn(sample_rate * duration) * 0.3
        return audio.astype(np.float32), sample_rate

    def test_detects_audio_characteristics(self, sample_audio_with_silence):
        """Test detection of audio characteristics for adaptive chunking."""
        audio, sample_rate = sample_audio_with_silence

        chunker = AdaptiveChunker()
        characteristics = chunker.analyze_audio(audio, sample_rate)

        assert isinstance(characteristics, AudioCharacteristics)
        assert characteristics.duration > 0
        assert characteristics.has_silence_patterns
        assert len(characteristics.silence_segments) > 0
        assert characteristics.speech_density < 1.0  # Not 100% speech
        assert characteristics.average_segment_length > 0

    def test_adapts_chunk_size_based_on_duration(self):
        """Test chunk size adapts to audio duration."""
        chunker = AdaptiveChunker()

        # Short audio (30 seconds) - smaller chunks
        short_size = chunker.determine_chunk_size(duration_seconds=30)
        assert 10 <= short_size <= 15

        # Medium audio (5 minutes) - medium chunks
        medium_size = chunker.determine_chunk_size(duration_seconds=300)
        assert 25 <= medium_size <= 35

        # Long audio (30 minutes) - larger chunks
        long_size = chunker.determine_chunk_size(duration_seconds=1800)
        assert 45 <= long_size <= 60

        # Verify progressive increase
        assert short_size < medium_size < long_size

    def test_chunks_at_silence_boundaries(self, sample_audio_with_silence):
        """Test that chunks are split at natural silence boundaries."""
        audio, sample_rate = sample_audio_with_silence

        chunker = AdaptiveChunker(prefer_silence_splits=True)
        chunks = chunker.create_adaptive_chunks(audio, sample_rate)

        # Should create chunks that align with silence
        assert len(chunks) >= 4  # At least 4 natural segments

        for chunk in chunks:
            assert isinstance(chunk, ChunkInfo)
            assert chunk.start_sample < chunk.end_sample
            assert chunk.confidence > 0

            # Check if chunk boundaries are near silence
            if chunk.split_at_silence:
                # Verify the boundary is actually at low energy
                boundary_region = audio[chunk.end_sample - 100:chunk.end_sample + 100]
                assert np.mean(np.abs(boundary_region)) < 0.1

    def test_handles_continuous_speech(self, sample_audio_continuous):
        """Test chunking of continuous speech without natural breaks."""
        audio, sample_rate = sample_audio_continuous

        chunker = AdaptiveChunker(prefer_silence_splits=True)
        chunks = chunker.create_adaptive_chunks(audio, sample_rate)

        # Should fall back to time-based chunking
        assert len(chunks) > 1

        # Chunks should be roughly equal size
        chunk_sizes = [c.end_sample - c.start_sample for c in chunks]
        avg_size = np.mean(chunk_sizes)
        std_size = np.std(chunk_sizes)

        # Standard deviation should be small (uniform chunks)
        assert std_size / avg_size < 0.2

    def test_speech_density_affects_chunk_size(self):
        """Test that speech density influences chunk sizing."""
        chunker = AdaptiveChunker()

        # High density speech - smaller chunks for accuracy
        high_density_size = chunker.determine_chunk_size(
            duration_seconds=300,
            speech_density=0.95  # 95% speech
        )

        # Low density speech - larger chunks acceptable
        low_density_size = chunker.determine_chunk_size(
            duration_seconds=300,
            speech_density=0.50  # 50% speech
        )

        assert high_density_size < low_density_size

    def test_respects_min_max_constraints(self):
        """Test that chunk sizes respect min/max constraints."""
        chunker = AdaptiveChunker(
            min_chunk_seconds=10,
            max_chunk_seconds=60
        )

        # Very short audio
        size = chunker.determine_chunk_size(duration_seconds=5)
        assert size == 10  # Minimum constraint

        # Very long audio
        size = chunker.determine_chunk_size(duration_seconds=3600)
        assert size == 60  # Maximum constraint

    def test_overlap_adjusts_with_chunk_size(self):
        """Test that overlap duration scales with chunk size."""
        chunker = AdaptiveChunker()

        # Exercise chunk creation with an explicit small target size
        # (result unused; the assertions below check determine_overlap directly)
        chunker.create_adaptive_chunks(
            np.zeros(16000 * 30), 16000,  # 30 seconds
            target_chunk_size=10
        )

        # Small chunks - smaller overlap
        small_overlap = chunker.determine_overlap(10)
        assert 0.5 <= small_overlap <= 1.5

        # Large chunks - larger overlap
        large_overlap = chunker.determine_overlap(60)
        assert 2 <= large_overlap <= 4

    def test_chunking_strategy_selection(self):
        """Test selection of appropriate chunking strategy."""
        chunker = AdaptiveChunker()

        # Short audio - time-based strategy
        strategy = chunker.select_strategy(
            duration_seconds=30,
            has_silence=False
        )
        assert strategy == ChunkingStrategy.TIME_BASED

        # Long audio with silence - silence-based strategy
        strategy = chunker.select_strategy(
            duration_seconds=600,
            has_silence=True
        )
        assert strategy == ChunkingStrategy.SILENCE_BASED

        # Medium audio with high speech density - hybrid strategy
        strategy = chunker.select_strategy(
            duration_seconds=300,
            has_silence=True,
            speech_density=0.9
        )
        assert strategy == ChunkingStrategy.HYBRID

    def test_performance_improvement(self, sample_audio_with_silence):
        """Test that adaptive chunking provides 1.5-2x improvement."""
        audio, sample_rate = sample_audio_with_silence

        # Fixed size chunking
        fixed_chunker = AdaptiveChunker(adaptive=False, fixed_chunk_size=30)
        fixed_chunks = fixed_chunker.create_adaptive_chunks(audio, sample_rate)

        # Adaptive chunking
        adaptive_chunker = AdaptiveChunker(adaptive=True)
        adaptive_chunks = adaptive_chunker.create_adaptive_chunks(audio, sample_rate)

        # Adaptive should create more efficient chunks
        # Measured by total processing overhead (overlaps)
        fixed_overhead = sum(c.overlap_duration for c in fixed_chunks)
        adaptive_overhead = sum(c.overlap_duration for c in adaptive_chunks)

        # Adaptive should have less overhead
        improvement = fixed_overhead / adaptive_overhead
        assert improvement >= 1.5  # At least 1.5x improvement

    def test_chunk_info_metadata(self):
        """Test that chunk info contains useful metadata."""
        chunker = AdaptiveChunker()
        audio = np.random.randn(16000 * 60).astype(np.float32)

        chunks = chunker.create_adaptive_chunks(audio, 16000)

        for chunk in chunks:
            assert hasattr(chunk, 'start_sample')
            assert hasattr(chunk, 'end_sample')
            assert hasattr(chunk, 'start_time')
            assert hasattr(chunk, 'end_time')
            assert hasattr(chunk, 'duration')
            assert hasattr(chunk, 'overlap_duration')
            assert hasattr(chunk, 'confidence')
            assert hasattr(chunk, 'split_at_silence')
            assert hasattr(chunk, 'strategy_used')

    def test_energy_based_splitting(self):
        """Test energy-based split point detection."""
        chunker = AdaptiveChunker()

        # Create audio with clear energy variation
        sample_rate = 16000
        loud = np.random.randn(sample_rate * 5) * 0.5    # Loud section
        quiet = np.random.randn(sample_rate * 5) * 0.05  # Quiet section
        audio = np.concatenate([loud, quiet, loud])

        # Find best split points
        split_points = chunker.find_energy_valleys(audio, sample_rate)

        assert len(split_points) > 0
        # Should identify the quiet section as a split point
        assert any(
            sample_rate * 4 < point < sample_rate * 6
            for point in split_points
        )

    def test_handles_very_short_audio(self):
        """Test handling of audio shorter than minimum chunk size."""
        chunker = AdaptiveChunker(min_chunk_seconds=30)

        # 10-second audio
        short_audio = np.random.randn(16000 * 10).astype(np.float32)
        chunks = chunker.create_adaptive_chunks(short_audio, 16000)

        # Should create single chunk
        assert len(chunks) == 1
        assert chunks[0].duration == 10

    def test_progressive_chunk_sizing(self):
        """Test progressive increase in chunk size for very long audio."""
        chunker = AdaptiveChunker(progressive_sizing=True)

        # 1-hour audio
        chunks = chunker.plan_progressive_chunks(duration_seconds=3600)

        # Early chunks should be smaller
        assert chunks[0]['size'] < chunks[-1]['size']

        # Size should increase progressively
        for i in range(1, len(chunks)):
            assert chunks[i]['size'] >= chunks[i - 1]['size']


class TestChunkingOptimization:
    """Test optimization benefits of adaptive chunking."""

    def test_reduces_redundant_processing(self):
        """Test that adaptive chunking reduces redundant overlap processing."""
        chunker = AdaptiveChunker()

        duration = 600  # 10 minutes

        # Fixed 30-second chunks with 2-second overlap
        fixed_chunks = chunker.calculate_fixed_chunks(duration, 30, 2)
        fixed_overlap_total = len(fixed_chunks) * 2

        # Adaptive chunks with variable overlap
        adaptive_chunks = chunker.calculate_adaptive_chunks(duration)
        adaptive_overlap_total = sum(c['overlap'] for c in adaptive_chunks)

        # Adaptive should have less total overlap
        reduction = (fixed_overlap_total - adaptive_overlap_total) / fixed_overlap_total
        assert reduction > 0.3  # At least 30% reduction in overlap

    def test_memory_efficiency(self):
        """Test that adaptive chunking improves memory efficiency."""
        chunker = AdaptiveChunker()

        # Large audio file simulation
        audio_size_mb = 500  # 500MB audio file

        fixed_memory = chunker.estimate_memory_usage(
            audio_size_mb,
            strategy='fixed',
            chunk_size=30
        )

        adaptive_memory = chunker.estimate_memory_usage(
            audio_size_mb,
            strategy='adaptive'
        )

        # Adaptive should use less peak memory
        assert adaptive_memory < fixed_memory * 0.8  # 20% less memory