"""Enhanced multi-level intelligent caching system for YouTube Summarizer.""" import hashlib import json import time import asyncio import logging from datetime import datetime, timedelta from typing import Dict, List, Optional, Any, Union from enum import Enum from dataclasses import dataclass, asdict import redis from redis import asyncio as aioredis logger = logging.getLogger(__name__) class CacheLevel(Enum): """Cache storage levels.""" L1_MEMORY = "l1_memory" # Redis - fastest, volatile L2_DATABASE = "l2_database" # SQLite/PostgreSQL - persistent, structured class CachePolicy(Enum): """Cache write policies.""" WRITE_THROUGH = "write_through" # Write to all levels immediately WRITE_BACK = "write_back" # Write to fast cache first, sync later WRITE_AROUND = "write_around" # Skip cache on write, read from storage @dataclass class CacheConfig: """Cache configuration settings.""" transcript_ttl_hours: int = 168 # 7 days summary_ttl_hours: int = 72 # 3 days memory_max_size_mb: int = 512 # Redis memory limit warming_batch_size: int = 50 # Videos per warming batch cleanup_interval_hours: int = 6 # Cleanup frequency hit_rate_alert_threshold: float = 0.7 # Alert if hit rate drops below redis_url: Optional[str] = None # Redis connection URL enable_warming: bool = False # Enable cache warming enable_analytics: bool = True # Enable analytics collection @dataclass class CacheMetrics: """Cache performance metrics.""" hits: int = 0 misses: int = 0 write_operations: int = 0 evictions: int = 0 errors: int = 0 total_size_bytes: int = 0 average_response_time_ms: float = 0.0 @property def hit_rate(self) -> float: """Calculate cache hit rate.""" total = self.hits + self.misses return self.hits / total if total > 0 else 0.0 @property def total_operations(self) -> int: """Total cache operations.""" return self.hits + self.misses + self.write_operations def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { **asdict(self), 'hit_rate': self.hit_rate, 'total_operations': self.total_operations } class EnhancedCacheManager: """Enhanced multi-level intelligent caching system.""" def __init__(self, config: Optional[CacheConfig] = None): """Initialize enhanced cache manager. Args: config: Cache configuration settings """ self.config = config or CacheConfig() self.metrics = CacheMetrics() self.redis_client: Optional[aioredis.Redis] = None self._memory_cache: Dict[str, Dict[str, Any]] = {} # Fallback memory cache # Cache key prefixes self.TRANSCRIPT_PREFIX = "yt:transcript:" self.SUMMARY_PREFIX = "yt:summary:" self.METADATA_PREFIX = "yt:meta:" self.ANALYTICS_PREFIX = "yt:analytics:" # Background tasks self._cleanup_task: Optional[asyncio.Task] = None self._warming_task: Optional[asyncio.Task] = None self._initialized = False async def initialize(self) -> None: """Initialize cache connections and background tasks.""" if self._initialized: return # Initialize Redis connection if available if self.config.redis_url: try: self.redis_client = await aioredis.from_url( self.config.redis_url, encoding="utf-8", decode_responses=True ) await self.redis_client.ping() logger.info("Redis connection established") except Exception as e: logger.warning(f"Redis connection failed, using memory cache: {e}") self.redis_client = None else: logger.info("Redis URL not configured, using memory cache") # Start background tasks await self.start_background_tasks() self._initialized = True async def start_background_tasks(self) -> None: """Start background cache management tasks.""" self._cleanup_task = asyncio.create_task(self._periodic_cleanup()) if self.config.enable_warming: self._warming_task = asyncio.create_task(self._cache_warming_scheduler()) async def stop_background_tasks(self) -> None: """Stop background tasks gracefully.""" if self._cleanup_task: self._cleanup_task.cancel() try: await self._cleanup_task except asyncio.CancelledError: pass if self._warming_task: self._warming_task.cancel() try: await self._warming_task except asyncio.CancelledError: pass async def close(self) -> None: """Close cache connections and cleanup.""" await self.stop_background_tasks() if self.redis_client: await self.redis_client.close() self._initialized = False # Transcript Caching Methods async def get_cached_transcript( self, video_id: str, language: str = "en" ) -> Optional[Dict[str, Any]]: """Retrieve cached transcript with multi-level fallback. Args: video_id: YouTube video ID language: Transcript language code Returns: Cached transcript data or None if not found """ cache_key = self._generate_transcript_key(video_id, language) start_time = time.time() try: # Try Redis first if available if self.redis_client: cached_data = await self._get_from_redis(cache_key) if cached_data: self._record_cache_hit("transcript", "l1_memory", start_time) return cached_data else: # Fallback to memory cache cached_data = self._memory_cache.get(cache_key) if cached_data and self._is_cache_valid(cached_data): self._record_cache_hit("transcript", "memory", start_time) return cached_data["data"] self._record_cache_miss("transcript", start_time) return None except Exception as e: self.metrics.errors += 1 logger.error(f"Cache retrieval error: {e}") return None async def cache_transcript( self, video_id: str, language: str, transcript_data: Dict[str, Any], policy: CachePolicy = CachePolicy.WRITE_THROUGH ) -> bool: """Cache transcript with specified write policy. Args: video_id: YouTube video ID language: Transcript language code transcript_data: Transcript data to cache policy: Cache write policy Returns: True if caching succeeded """ cache_key = self._generate_transcript_key(video_id, language) start_time = time.time() try: ttl_seconds = self.config.transcript_ttl_hours * 3600 success = True if policy == CachePolicy.WRITE_THROUGH: # Write to all cache levels if self.redis_client: success &= await self._set_in_redis(cache_key, transcript_data, ttl_seconds) else: # Use memory cache as fallback self._set_in_memory(cache_key, transcript_data, ttl_seconds) elif policy == CachePolicy.WRITE_BACK: # Write to fastest cache first if self.redis_client: success = await self._set_in_redis(cache_key, transcript_data, ttl_seconds) else: self._set_in_memory(cache_key, transcript_data, ttl_seconds) self.metrics.write_operations += 1 self._record_cache_operation("transcript_write", start_time) return success except Exception as e: self.metrics.errors += 1 logger.error(f"Cache write error: {e}") return False # Summary Caching Methods async def get_cached_summary( self, transcript_hash: str, config_hash: str ) -> Optional[Dict[str, Any]]: """Retrieve cached summary by content and configuration hash. Args: transcript_hash: Hash of transcript content config_hash: Hash of summary configuration Returns: Cached summary data or None if not found """ cache_key = self._generate_summary_key(transcript_hash, config_hash) start_time = time.time() try: # Try Redis first if self.redis_client: cached_data = await self._get_from_redis(cache_key) if cached_data and self._is_summary_valid(cached_data): self._record_cache_hit("summary", "l1_memory", start_time) return cached_data else: # Fallback to memory cache cached_data = self._memory_cache.get(cache_key) if cached_data and self._is_cache_valid(cached_data) and self._is_summary_valid(cached_data["data"]): self._record_cache_hit("summary", "memory", start_time) return cached_data["data"] self._record_cache_miss("summary", start_time) return None except Exception as e: self.metrics.errors += 1 logger.error(f"Summary cache retrieval error: {e}") return None async def cache_summary( self, transcript_hash: str, config_hash: str, summary_data: Dict[str, Any] ) -> bool: """Cache summary result with metadata. Args: transcript_hash: Hash of transcript content config_hash: Hash of summary configuration summary_data: Summary data to cache Returns: True if caching succeeded """ cache_key = self._generate_summary_key(transcript_hash, config_hash) # Add versioning and timestamp metadata enhanced_data = { **summary_data, "_cache_metadata": { "cached_at": datetime.utcnow().isoformat(), "ai_model_version": summary_data.get("model", "claude-3-5-haiku-20241022"), "prompt_version": "v1.0", "cache_version": "1.0" } } try: ttl_seconds = self.config.summary_ttl_hours * 3600 if self.redis_client: success = await self._set_in_redis(cache_key, enhanced_data, ttl_seconds) else: self._set_in_memory(cache_key, enhanced_data, ttl_seconds) success = True self.metrics.write_operations += 1 return success except Exception as e: self.metrics.errors += 1 logger.error(f"Summary cache write error: {e}") return False # Cache Key Generation def _generate_transcript_key(self, video_id: str, language: str) -> str: """Generate unique cache key for transcript.""" return f"{self.TRANSCRIPT_PREFIX}{video_id}:{language}" def _generate_summary_key(self, transcript_hash: str, config_hash: str) -> str: """Generate unique cache key for summary.""" return f"{self.SUMMARY_PREFIX}{transcript_hash}:{config_hash}" def generate_content_hash(self, content: str) -> str: """Generate stable hash for content.""" return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16] def generate_config_hash(self, config: Dict[str, Any]) -> str: """Generate stable hash for configuration.""" # Sort keys for consistent hashing config_str = json.dumps(config, sort_keys=True) return hashlib.sha256(config_str.encode('utf-8')).hexdigest()[:16] # Redis Operations async def _get_from_redis(self, key: str) -> Optional[Dict[str, Any]]: """Get data from Redis with error handling.""" if not self.redis_client: return None try: data = await self.redis_client.get(key) if data: return json.loads(data) return None except Exception as e: logger.error(f"Redis get error: {e}") return None async def _set_in_redis(self, key: str, data: Dict[str, Any], ttl_seconds: int) -> bool: """Set data in Redis with TTL.""" if not self.redis_client: return False try: serialized = json.dumps(data) await self.redis_client.setex(key, ttl_seconds, serialized) return True except Exception as e: logger.error(f"Redis set error: {e}") return False async def _delete_from_redis(self, key: str) -> bool: """Delete key from Redis.""" if not self.redis_client: return False try: await self.redis_client.delete(key) return True except Exception as e: logger.error(f"Redis delete error: {e}") return False # Memory Cache Operations (Fallback) def _set_in_memory(self, key: str, data: Dict[str, Any], ttl_seconds: int) -> None: """Set data in memory cache with expiration.""" expires_at = datetime.utcnow() + timedelta(seconds=ttl_seconds) self._memory_cache[key] = { "data": data, "expires_at": expires_at.isoformat() } def _is_cache_valid(self, cache_entry: Dict[str, Any]) -> bool: """Check if memory cache entry is still valid.""" expires_at_str = cache_entry.get("expires_at") if not expires_at_str: return False expires_at = datetime.fromisoformat(expires_at_str) return datetime.utcnow() < expires_at # Cache Validation def _is_summary_valid(self, cached_data: Dict[str, Any]) -> bool: """Check if cached summary is still valid based on versioning.""" metadata = cached_data.get("_cache_metadata", {}) # Check cache version compatibility cached_version = metadata.get("cache_version", "0.0") if cached_version != "1.0": return False # Check age (additional validation beyond TTL) cached_at = metadata.get("cached_at") if cached_at: cached_time = datetime.fromisoformat(cached_at) age_hours = (datetime.utcnow() - cached_time).total_seconds() / 3600 if age_hours > self.config.summary_ttl_hours: return False return True # Background Tasks async def _periodic_cleanup(self): """Background task for cache cleanup and maintenance.""" while True: try: await asyncio.sleep(self.config.cleanup_interval_hours * 3600) # Clean up memory cache await self._cleanup_memory_cache() # Clean up Redis if available if self.redis_client: await self._cleanup_redis_memory() logger.info("Cache cleanup completed") except asyncio.CancelledError: break except Exception as e: logger.error(f"Cache cleanup error: {e}") async def _cleanup_memory_cache(self): """Remove expired entries from memory cache.""" now = datetime.utcnow() expired_keys = [] for key, entry in self._memory_cache.items(): if not self._is_cache_valid(entry): expired_keys.append(key) for key in expired_keys: del self._memory_cache[key] if expired_keys: logger.info(f"Cleaned up {len(expired_keys)} expired memory cache entries") async def _cleanup_redis_memory(self): """Monitor and manage Redis memory usage.""" if not self.redis_client: return try: info = await self.redis_client.info('memory') used_memory_mb = info.get('used_memory', 0) / (1024 * 1024) if used_memory_mb > self.config.memory_max_size_mb * 0.8: # 80% threshold logger.warning(f"Redis memory usage high: {used_memory_mb:.1f}MB") # Redis will handle eviction based on maxmemory-policy except Exception as e: logger.error(f"Redis memory check error: {e}") async def _cache_warming_scheduler(self): """Background task for intelligent cache warming.""" while True: try: await asyncio.sleep(3600) # Run hourly # Get popular videos for warming popular_videos = await self._get_popular_videos() for video_batch in self._batch_videos(popular_videos, self.config.warming_batch_size): await self._warm_video_batch(video_batch) await asyncio.sleep(5) # Rate limiting except asyncio.CancelledError: break except Exception as e: logger.error(f"Cache warming error: {e}") async def _get_popular_videos(self) -> List[str]: """Get list of popular video IDs for cache warming.""" # TODO: Integrate with analytics service return [] def _batch_videos(self, videos: List[str], batch_size: int) -> List[List[str]]: """Split videos into batches for processing.""" return [videos[i:i + batch_size] for i in range(0, len(videos), batch_size)] async def _warm_video_batch(self, video_ids: List[str]): """Warm cache for a batch of videos.""" # TODO: Implement cache warming logic pass # Metrics and Analytics def _record_cache_hit(self, cache_type: str, level: str, start_time: float): """Record cache hit metrics.""" self.metrics.hits += 1 response_time = (time.time() - start_time) * 1000 self._update_average_response_time(response_time) def _record_cache_miss(self, cache_type: str, start_time: float): """Record cache miss metrics.""" self.metrics.misses += 1 response_time = (time.time() - start_time) * 1000 self._update_average_response_time(response_time) def _record_cache_operation(self, operation_type: str, start_time: float): """Record cache operation metrics.""" response_time = (time.time() - start_time) * 1000 self._update_average_response_time(response_time) def _update_average_response_time(self, response_time: float): """Update rolling average response time.""" total_ops = self.metrics.total_operations if total_ops > 1: self.metrics.average_response_time_ms = ( (self.metrics.average_response_time_ms * (total_ops - 1) + response_time) / total_ops ) else: self.metrics.average_response_time_ms = response_time async def get_cache_analytics(self) -> Dict[str, Any]: """Get comprehensive cache analytics.""" # Get Redis info if available redis_info = {} if self.redis_client: try: memory_info = await self.redis_client.info('memory') redis_info = { "used_memory_mb": memory_info.get('used_memory', 0) / (1024 * 1024), "max_memory_mb": self.config.memory_max_size_mb, "memory_usage_percent": (memory_info.get('used_memory', 0) / (1024 * 1024)) / self.config.memory_max_size_mb * 100 } except Exception as e: redis_info = {"error": str(e)} # Memory cache info memory_cache_info = { "entries": len(self._memory_cache), "estimated_size_mb": sum(len(json.dumps(v)) for v in self._memory_cache.values()) / (1024 * 1024) } return { "performance_metrics": self.metrics.to_dict(), "redis_usage": redis_info if self.redis_client else None, "memory_cache_usage": memory_cache_info, "configuration": { "transcript_ttl_hours": self.config.transcript_ttl_hours, "summary_ttl_hours": self.config.summary_ttl_hours, "memory_max_size_mb": self.config.memory_max_size_mb, "using_redis": bool(self.redis_client) } } async def invalidate_cache(self, pattern: Optional[str] = None) -> int: """Invalidate cache entries matching pattern. Args: pattern: Optional pattern to match cache keys Returns: Number of entries invalidated """ count = 0 # Clear memory cache if pattern: keys_to_delete = [k for k in self._memory_cache.keys() if pattern in k] for key in keys_to_delete: del self._memory_cache[key] count += 1 else: count = len(self._memory_cache) self._memory_cache.clear() # Clear Redis if available if self.redis_client: try: if pattern: # Use SCAN to find matching keys cursor = 0 while True: cursor, keys = await self.redis_client.scan(cursor, match=f"*{pattern}*") if keys: await self.redis_client.delete(*keys) count += len(keys) if cursor == 0: break else: # Clear all cache keys await self.redis_client.flushdb() except Exception as e: logger.error(f"Redis invalidation error: {e}") logger.info(f"Invalidated {count} cache entries") return count # Compatibility methods with existing CacheManager async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool: """Cache pipeline result (compatibility method).""" cache_key = f"pipeline:{job_id}" ttl_seconds = ttl or self.config.summary_ttl_hours * 3600 if hasattr(result, '__dataclass_fields__'): result_data = asdict(result) else: result_data = result if self.redis_client: return await self._set_in_redis(cache_key, result_data, ttl_seconds) else: self._set_in_memory(cache_key, result_data, ttl_seconds) return True async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]: """Get cached pipeline result (compatibility method).""" cache_key = f"pipeline:{job_id}" if self.redis_client: return await self._get_from_redis(cache_key) else: entry = self._memory_cache.get(cache_key) if entry and self._is_cache_valid(entry): return entry["data"] return None async def get_cache_stats(self) -> Dict[str, Any]: """Get cache statistics (compatibility method).""" return await self.get_cache_analytics()