trax/examples/batch_processing.py

274 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""Example: Batch processing with circuit breaker pattern.
This example demonstrates:
- Using BatchProcessor for parallel file processing
- Circuit breaker pattern for fault tolerance
- Progress tracking and statistics
- Error recovery strategies
"""
import asyncio
import logging
from pathlib import Path
from typing import Dict, Any, List
import random
# Add parent directory to path for imports
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.base.processors import BatchProcessor, AudioProcessor, CircuitBreaker, CircuitBreakerState
# Root logging setup: INFO and above, each record stamped with time,
# logger name, and severity. The module-level logger is named after this module.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class SimulatedAudioProcessor(AudioProcessor):
    """Stand-in audio processor that drives the batch-processing demos.

    Each call sleeps for a random interval, may raise a simulated failure
    during the first few calls, and otherwise creates an empty
    ``<stem>_processed.wav`` file next to the input.
    """

    def __init__(self, failure_rate: float = 0.1, **kwargs):
        """Set up the processor.

        Args:
            failure_rate: Threshold compared against ``random.random()``;
                larger values make simulated failures more likely.
            **kwargs: Forwarded unchanged to ``AudioProcessor.__init__``.
        """
        super().__init__(**kwargs)
        self.failure_rate = failure_rate
        self.processed_count = 0

    async def _process_audio(self, input_path: Path) -> Path:
        """Pretend to process ``input_path`` and return the output path.

        Raises:
            Exception: When the random roll falls below ``failure_rate``
                while fewer than five calls have been counted.
        """
        self.processed_count += 1

        # Stand in for real work with a 0.5-2.0 second pause.
        delay = random.uniform(0.5, 2.0)
        await asyncio.sleep(delay)

        # Only the earliest calls are allowed to fail, so the circuit
        # breaker demo sees its failures up front.
        roll = random.random()
        if roll < self.failure_rate and self.processed_count < 5:
            raise Exception(f"Processing failed for {input_path.name}")

        # Drop an empty placeholder beside the input to act as the result.
        processed_name = f"{input_path.stem}_processed.wav"
        processed_path = input_path.parent / processed_name
        processed_path.touch()
        logger.info(f"✓ Processed {input_path.name} -> {processed_path.name}")
        return processed_path
async def process_with_circuit_breaker(files: List[Path]) -> Dict[str, Any]:
    """Run files one by one through a processor guarded by a circuit breaker.

    While the breaker reports ``OPEN``, files are skipped and recorded as
    rejections instead of being processed. Any exception raised while
    handling a file is logged and recorded; it never aborts the loop.

    Args:
        files: Paths to process, in order.

    Returns:
        A dict with the keys ``successful`` (input/output path pairs),
        ``failed`` (file, error text, breaker state at failure time),
        ``circuit_breaker_rejections`` (skipped paths) and
        ``circuit_breaker_stats`` (final breaker counters).
    """
    # Breaker: trips after 3 failures, recovery timeout of 10.
    breaker = CircuitBreaker(
        failure_threshold=3,
        recovery_timeout=10,
        expected_exception=Exception,
    )
    # A higher failure rate than the other demos, to exercise the breaker.
    processor = SimulatedAudioProcessor(failure_rate=0.3)

    succeeded = []
    failed = []
    rejected = []
    results = {
        "successful": succeeded,
        "failed": failed,
        "circuit_breaker_rejections": rejected,
    }

    for path in files:
        try:
            # Skip outright while the breaker is open.
            if breaker.state == CircuitBreakerState.OPEN:
                logger.warning(f"⚡ Circuit breaker OPEN - skipping {path.name}")
                rejected.append(str(path))
                continue

            # The breaker observes the outcome of the guarded call.
            async with breaker:
                output = await processor.process(path)
                record = {"input": str(path), "output": str(output)}
                succeeded.append(record)
        except Exception as e:
            logger.error(f"✗ Failed to process {path.name}: {e}")
            failure = {
                "file": str(path),
                "error": str(e),
                "breaker_state": breaker.state.value,
            }
            failed.append(failure)

    # Snapshot the breaker's counters once the loop is done.
    stats = {
        "final_state": breaker.state.value,
        "failure_count": breaker.failure_count,
        "success_count": breaker.success_count,
    }
    if breaker.last_failure_time:
        stats["last_failure_time"] = breaker.last_failure_time.isoformat()
    else:
        stats["last_failure_time"] = None
    results["circuit_breaker_stats"] = stats

    return results
def _print_banner(title: str) -> None:
    """Print a section title framed by two 60-character rules."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


def _cleanup_test_files(test_dir: Path, test_files: List[Path]) -> None:
    """Delete the demo inputs, their ``*_processed.wav`` outputs, and ``test_dir``.

    Only files this example created are removed. If ``test_dir`` cannot be
    removed (for example because it existed before the run and still holds
    unrelated files), it is left in place and a note is printed. This runs
    from a ``finally`` block, so raising here would mask the real error.
    """
    for file_path in test_files:
        processed = file_path.parent / f"{file_path.stem}_processed.wav"
        for leftover in (file_path, processed):
            if leftover.exists():
                leftover.unlink()
    if test_dir.exists():
        try:
            test_dir.rmdir()
        except OSError as exc:
            print(f"\n⚠ Could not remove {test_dir}: {exc}")


async def main():
    """Run the four batch-processing demonstrations on throwaway files.

    Creates 20 empty ``audio_NNN.mp3`` files under ``test_media`` and then
    runs, in order: a parallel batch over the first 10 files, the circuit
    breaker demo over the last 10, a batch with periodic progress output
    over the first 6, and a retry of whatever failed in the first batch.
    The created files are always cleaned up, and any exception raised by an
    example propagates to the caller after cleanup.
    """
    test_dir = Path("test_media")
    test_files = [test_dir / f"audio_{i:03d}.mp3" for i in range(20)]

    try:
        # Create the inputs inside the try so that a failure partway
        # through still triggers cleanup of what was already written.
        test_dir.mkdir(exist_ok=True)
        for file_path in test_files:
            file_path.touch()

        # Example 1: Basic batch processing with parallel execution
        _print_banner("Example 1: Parallel Batch Processing")
        audio_processor = SimulatedAudioProcessor(failure_rate=0.1)
        batch_processor = BatchProcessor(config={
            "max_parallel": 4,
            "batch_size": 5,
            "max_retries": 2,
        })
        first_batch = test_files[:10]
        print(f"Processing {len(first_batch)} files with max 4 parallel...")
        results = await batch_processor.process_batch(first_batch, audio_processor)
        print("\nBatch Processing Results:")
        print(f" Total: {results['total']}")
        print(f" Successful: {results['successful']}")
        print(f" Failed: {results['failed']}")
        print(f" Time: {results['elapsed_seconds']:.2f}s")
        print(f" Speed: {results['files_per_second']:.2f} files/sec")
        if results['errors']:
            print("\nErrors encountered:")
            for error in results['errors'][:3]:  # Show first 3 errors
                print(f" - {Path(error['file']).name}: {error['error']}")

        # Example 2: Circuit breaker pattern for fault tolerance
        _print_banner("Example 2: Circuit Breaker Pattern")
        print("Processing with circuit breaker (simulating failures)...")
        breaker_results = await process_with_circuit_breaker(test_files[10:20])
        print("\nCircuit Breaker Results:")
        print(f" Successful: {len(breaker_results['successful'])}")
        print(f" Failed: {len(breaker_results['failed'])}")
        print(f" Rejected by breaker: {len(breaker_results['circuit_breaker_rejections'])}")
        print(f" Final breaker state: {breaker_results['circuit_breaker_stats']['final_state']}")

        # Example 3: Progress tracking and statistics
        _print_banner("Example 3: Progress Tracking and Statistics")
        batch_processor_with_progress = BatchProcessor(config={
            "max_parallel": 2,
            "batch_size": 3,
        })
        print("Processing with progress tracking...")

        async def process_with_progress():
            """Run the batch as a task and print its stats every 0.5 seconds."""
            task = asyncio.create_task(
                batch_processor_with_progress.process_batch(test_files[:6], audio_processor)
            )
            while not task.done():
                await asyncio.sleep(0.5)
                stats = batch_processor_with_progress.get_stats()
                if stats.get("current_batch"):
                    progress = stats["current_batch"]["progress"]
                    success_rate = stats["current_batch"]["success_rate"]
                    # "\r" keeps the progress on a single, overwritten line.
                    print(f" Progress: {progress} | Success rate: {success_rate:.1%}", end="\r")
            # Awaiting the finished task re-raises any error from the batch.
            return await task

        await process_with_progress()
        final_stats = batch_processor_with_progress.get_stats()
        print("\n\nFinal Statistics:")
        print(f" Total processed: {final_stats['total_processed']}")
        print(f" Total successful: {final_stats['total_successful']}")
        print(f" Total failed: {final_stats['total_failed']}")
        print(f" Overall success rate: {final_stats['overall_success_rate']:.1%}")

        # Example 4: Error recovery strategies
        _print_banner("Example 4: Error Recovery Strategies")
        # Retry only the Example 1 failures whose input file is still present.
        failed_files = [
            Path(error['file'])
            for error in results.get('errors', [])
            if Path(error['file']).exists()
        ]
        if failed_files:
            print(f"Retrying {len(failed_files)} failed files with reduced parallelism...")
            # More conservative settings: sequential, with extra retries.
            recovery_processor = BatchProcessor(config={
                "max_parallel": 1,
                "batch_size": 2,
                "max_retries": 5,
            })
            # Use a processor with a lower failure rate for the retry.
            reliable_processor = SimulatedAudioProcessor(failure_rate=0.05)
            recovery_results = await recovery_processor.process_batch(
                failed_files,
                reliable_processor
            )
            print("Recovery Results:")
            print(f" Recovered: {recovery_results['successful']}/{len(failed_files)}")
            print(f" Still failing: {recovery_results['failed']}")
        else:
            print("No failed files to retry")
    finally:
        _cleanup_test_files(test_dir, test_files)
        print("\n✓ Cleanup complete")
if __name__ == "__main__":
    # Print the three-line header, then drive the async entry point.
    for header_line in (
        "Trax Batch Processing Example",
        "Using AI Assistant Library for resilient batch operations",
        "-" * 60,
    ):
        print(header_line)
    asyncio.run(main())