# trax/tests/test_adaptive_chunking.py
#!/usr/bin/env python3
"""
Test Adaptive Chunk Sizing for Transcription Optimization.
TDD tests for dynamic chunk sizing based on audio characteristics.
Expected 1.5-2x speed improvement from intelligent chunking.
"""
import pytest
import numpy as np
from pathlib import Path
from typing import List, Tuple
import librosa
from src.services.adaptive_chunking import (
AdaptiveChunker,
ChunkInfo,
AudioCharacteristics,
ChunkingStrategy
)
class TestAdaptiveChunking:
    """Test suite for adaptive chunk sizing - 1.5-2x speed improvement."""

    @pytest.fixture
    def sample_audio_with_silence(self):
        """Create audio with silence patterns for testing."""
        sample_rate = 16000
        duration = 120  # 2 minutes total: 4 x (20 s speech + 10 s silence)
        # Create audio with alternating speech and silence
        audio = []
        for _ in range(4):
            # 20 seconds of speech (simulated with noise)
            speech = np.random.randn(sample_rate * 20) * 0.3
            audio.extend(speech)
            # 10 seconds of silence
            silence = np.zeros(sample_rate * 10)
            audio.extend(silence)
        return np.array(audio, dtype=np.float32), sample_rate

    @pytest.fixture
    def sample_audio_continuous(self):
        """Create continuous speech audio without breaks."""
        sample_rate = 16000
        duration = 120  # 2 minutes
        # Continuous speech simulation
        audio = np.random.randn(sample_rate * duration) * 0.3
        return audio.astype(np.float32), sample_rate

    def test_detects_audio_characteristics(self, sample_audio_with_silence):
        """Test detection of audio characteristics for adaptive chunking."""
        audio, sample_rate = sample_audio_with_silence
        chunker = AdaptiveChunker()
        characteristics = chunker.analyze_audio(audio, sample_rate)
        assert isinstance(characteristics, AudioCharacteristics)
        assert characteristics.duration > 0
        assert characteristics.has_silence_patterns
        assert len(characteristics.silence_segments) > 0
        assert characteristics.speech_density < 1.0  # Not 100% speech
        assert characteristics.average_segment_length > 0

    def test_adapts_chunk_size_based_on_duration(self):
        """Test chunk size adapts to audio duration."""
        chunker = AdaptiveChunker()
        # Short audio (30 seconds) - smaller chunks
        short_size = chunker.determine_chunk_size(duration_seconds=30)
        assert 10 <= short_size <= 15
        # Medium audio (5 minutes) - medium chunks
        medium_size = chunker.determine_chunk_size(duration_seconds=300)
        assert 25 <= medium_size <= 35
        # Long audio (30 minutes) - larger chunks
        long_size = chunker.determine_chunk_size(duration_seconds=1800)
        assert 45 <= long_size <= 60
        # Verify progressive increase
        assert short_size < medium_size < long_size

    def test_chunks_at_silence_boundaries(self, sample_audio_with_silence):
        """Test that chunks are split at natural silence boundaries."""
        audio, sample_rate = sample_audio_with_silence
        chunker = AdaptiveChunker(prefer_silence_splits=True)
        chunks = chunker.create_adaptive_chunks(audio, sample_rate)
        # Should create chunks that align with silence
        assert len(chunks) >= 4  # At least 4 natural segments
        for chunk in chunks:
            assert isinstance(chunk, ChunkInfo)
            assert chunk.start_sample < chunk.end_sample
            assert chunk.confidence > 0
            # Check if chunk boundaries are near silence
            if chunk.split_at_silence:
                # Verify the boundary is actually at low energy
                boundary_region = audio[chunk.end_sample-100:chunk.end_sample+100]
                assert np.mean(np.abs(boundary_region)) < 0.1

    def test_handles_continuous_speech(self, sample_audio_continuous):
        """Test chunking of continuous speech without natural breaks."""
        audio, sample_rate = sample_audio_continuous
        chunker = AdaptiveChunker(prefer_silence_splits=True)
        chunks = chunker.create_adaptive_chunks(audio, sample_rate)
        # Should fall back to time-based chunking
        assert len(chunks) > 1
        # Chunks should be roughly equal size
        chunk_sizes = [c.end_sample - c.start_sample for c in chunks]
        avg_size = np.mean(chunk_sizes)
        std_size = np.std(chunk_sizes)
        # Standard deviation should be small (uniform chunks)
        assert std_size / avg_size < 0.2

    def test_speech_density_affects_chunk_size(self):
        """Test that speech density influences chunk sizing."""
        chunker = AdaptiveChunker()
        # High density speech - smaller chunks for accuracy
        high_density_size = chunker.determine_chunk_size(
            duration_seconds=300,
            speech_density=0.95  # 95% speech
        )
        # Low density speech - larger chunks acceptable
        low_density_size = chunker.determine_chunk_size(
            duration_seconds=300,
            speech_density=0.50  # 50% speech
        )
        assert high_density_size < low_density_size

    def test_respects_min_max_constraints(self):
        """Test that chunk sizes respect min/max constraints."""
        chunker = AdaptiveChunker(
            min_chunk_seconds=10,
            max_chunk_seconds=60
        )
        # Very short audio
        size = chunker.determine_chunk_size(duration_seconds=5)
        assert size == 10  # Minimum constraint
        # Very long audio
        size = chunker.determine_chunk_size(duration_seconds=3600)
        assert size == 60  # Maximum constraint

    def test_overlap_adjusts_with_chunk_size(self):
        """Test that overlap duration scales with chunk size."""
        chunker = AdaptiveChunker()
        # Small chunks - smaller overlap
        # NOTE(review): return value unused; call kept to exercise chunking path
        small_chunks = chunker.create_adaptive_chunks(
            np.zeros(16000 * 30), 16000,  # 30 seconds
            target_chunk_size=10
        )
        small_overlap = chunker.determine_overlap(10)
        assert 0.5 <= small_overlap <= 1.5
        # Large chunks - larger overlap
        large_overlap = chunker.determine_overlap(60)
        assert 2 <= large_overlap <= 4

    def test_chunking_strategy_selection(self):
        """Test selection of appropriate chunking strategy."""
        chunker = AdaptiveChunker()
        # Short audio - time-based strategy
        strategy = chunker.select_strategy(
            duration_seconds=30,
            has_silence=False
        )
        assert strategy == ChunkingStrategy.TIME_BASED
        # Long audio with silence - silence-based strategy
        strategy = chunker.select_strategy(
            duration_seconds=600,
            has_silence=True
        )
        assert strategy == ChunkingStrategy.SILENCE_BASED
        # Medium audio with high speech density - hybrid strategy
        strategy = chunker.select_strategy(
            duration_seconds=300,
            has_silence=True,
            speech_density=0.9
        )
        assert strategy == ChunkingStrategy.HYBRID

    def test_performance_improvement(self, sample_audio_with_silence):
        """Test that adaptive chunking provides 1.5-2x improvement."""
        audio, sample_rate = sample_audio_with_silence
        # Fixed size chunking
        fixed_chunker = AdaptiveChunker(adaptive=False, fixed_chunk_size=30)
        fixed_chunks = fixed_chunker.create_adaptive_chunks(audio, sample_rate)
        # Adaptive chunking
        adaptive_chunker = AdaptiveChunker(adaptive=True)
        adaptive_chunks = adaptive_chunker.create_adaptive_chunks(audio, sample_rate)
        # Adaptive should create more efficient chunks
        # Measured by total processing overhead (overlaps)
        fixed_overhead = sum(c.overlap_duration for c in fixed_chunks)
        adaptive_overhead = sum(c.overlap_duration for c in adaptive_chunks)
        # Adaptive should have less overhead
        improvement = fixed_overhead / adaptive_overhead
        assert improvement >= 1.5  # At least 1.5x improvement

    def test_chunk_info_metadata(self):
        """Test that chunk info contains useful metadata."""
        chunker = AdaptiveChunker()
        audio = np.random.randn(16000 * 60).astype(np.float32)
        chunks = chunker.create_adaptive_chunks(audio, 16000)
        for chunk in chunks:
            assert hasattr(chunk, 'start_sample')
            assert hasattr(chunk, 'end_sample')
            assert hasattr(chunk, 'start_time')
            assert hasattr(chunk, 'end_time')
            assert hasattr(chunk, 'duration')
            assert hasattr(chunk, 'overlap_duration')
            assert hasattr(chunk, 'confidence')
            assert hasattr(chunk, 'split_at_silence')
            assert hasattr(chunk, 'strategy_used')

    def test_energy_based_splitting(self):
        """Test energy-based split point detection."""
        chunker = AdaptiveChunker()
        # Create audio with clear energy variation
        sample_rate = 16000
        loud = np.random.randn(sample_rate * 5) * 0.5  # Loud section
        quiet = np.random.randn(sample_rate * 5) * 0.05  # Quiet section
        audio = np.concatenate([loud, quiet, loud])
        # Find best split points
        split_points = chunker.find_energy_valleys(audio, sample_rate)
        assert len(split_points) > 0
        # Should identify the quiet section as a split point
        assert any(
            sample_rate * 4 < point < sample_rate * 6
            for point in split_points
        )

    def test_handles_very_short_audio(self):
        """Test handling of audio shorter than minimum chunk size."""
        chunker = AdaptiveChunker(min_chunk_seconds=30)
        # 10-second audio
        short_audio = np.random.randn(16000 * 10).astype(np.float32)
        chunks = chunker.create_adaptive_chunks(short_audio, 16000)
        # Should create single chunk
        assert len(chunks) == 1
        assert chunks[0].duration == 10

    def test_progressive_chunk_sizing(self):
        """Test progressive increase in chunk size for very long audio."""
        chunker = AdaptiveChunker(progressive_sizing=True)
        # 1-hour audio
        chunks = chunker.plan_progressive_chunks(duration_seconds=3600)
        # Early chunks should be smaller
        assert chunks[0]['size'] < chunks[-1]['size']
        # Size should increase progressively
        for i in range(1, len(chunks)):
            assert chunks[i]['size'] >= chunks[i-1]['size']
class TestChunkingOptimization:
    """Test optimization benefits of adaptive chunking."""

    def test_reduces_redundant_processing(self):
        """Test that adaptive chunking reduces redundant overlap processing."""
        chunker = AdaptiveChunker()
        duration = 600  # 10 minutes
        # Fixed 30-second chunks with 2-second overlap
        fixed_chunks = chunker.calculate_fixed_chunks(duration, 30, 2)
        fixed_overlap_total = len(fixed_chunks) * 2
        # Adaptive chunks with variable overlap
        adaptive_chunks = chunker.calculate_adaptive_chunks(duration)
        adaptive_overlap_total = sum(c['overlap'] for c in adaptive_chunks)
        # Adaptive should have less total overlap
        reduction = (fixed_overlap_total - adaptive_overlap_total) / fixed_overlap_total
        assert reduction > 0.3  # At least 30% reduction in overlap

    def test_memory_efficiency(self):
        """Test that adaptive chunking improves memory efficiency."""
        chunker = AdaptiveChunker()
        # Large audio file simulation
        audio_size_mb = 500  # 500MB audio file
        fixed_memory = chunker.estimate_memory_usage(
            audio_size_mb,
            strategy='fixed',
            chunk_size=30
        )
        adaptive_memory = chunker.estimate_memory_usage(
            audio_size_mb,
            strategy='adaptive'
        )
        # Adaptive should use less peak memory
        assert adaptive_memory < fixed_memory * 0.8  # 20% less memory