"""Cache management service for pipeline results and intermediate data.""" import json import hashlib from datetime import datetime, timedelta from typing import Dict, Optional, Any from dataclasses import asdict class CacheManager: """Manages caching of pipeline results and intermediate data.""" def __init__(self, default_ttl: int = 3600): """Initialize cache manager. Args: default_ttl: Default time-to-live for cache entries in seconds """ self.default_ttl = default_ttl # In-memory cache for now (would use Redis in production) self._cache: Dict[str, Dict[str, Any]] = {} def _generate_key(self, prefix: str, identifier: str) -> str: """Generate cache key with prefix.""" return f"{prefix}:{identifier}" def _is_expired(self, entry: Dict[str, Any]) -> bool: """Check if cache entry is expired.""" expires_at = entry.get("expires_at") if not expires_at: return False return datetime.fromisoformat(expires_at) < datetime.utcnow() def _cleanup_expired(self): """Remove expired entries from cache.""" expired_keys = [ key for key, entry in self._cache.items() if self._is_expired(entry) ] for key in expired_keys: del self._cache[key] async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool: """Cache pipeline result. Args: job_id: Pipeline job ID result: Pipeline result object ttl: Time-to-live in seconds (uses default if None) Returns: True if cached successfully """ try: key = self._generate_key("pipeline_result", job_id) expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl) # Convert result to dict if it's a dataclass if hasattr(result, '__dataclass_fields__'): result_data = asdict(result) else: result_data = result self._cache[key] = { "data": result_data, "expires_at": expires_at.isoformat(), "cached_at": datetime.utcnow().isoformat() } # Cleanup expired entries periodically if len(self._cache) % 100 == 0: self._cleanup_expired() return True except Exception as e: print(f"Failed to cache pipeline result: {e}") return False async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]: """Get cached pipeline result. Args: job_id: Pipeline job ID Returns: Cached result data or None if not found/expired """ key = self._generate_key("pipeline_result", job_id) entry = self._cache.get(key) if not entry: return None if self._is_expired(entry): del self._cache[key] return None return entry["data"] async def cache_transcript(self, video_id: str, transcript: str, metadata: Dict[str, Any] = None, ttl: Optional[int] = None) -> bool: """Cache transcript data. Args: video_id: YouTube video ID transcript: Transcript text metadata: Optional metadata ttl: Time-to-live in seconds Returns: True if cached successfully """ try: key = self._generate_key("transcript", video_id) expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl) self._cache[key] = { "data": { "transcript": transcript, "metadata": metadata or {}, "video_id": video_id }, "expires_at": expires_at.isoformat(), "cached_at": datetime.utcnow().isoformat() } return True except Exception as e: print(f"Failed to cache transcript: {e}") return False async def get_cached_transcript(self, video_id: str) -> Optional[Dict[str, Any]]: """Get cached transcript. 
    async def cache_video_metadata(
        self, video_id: str, metadata: Dict[str, Any], ttl: Optional[int] = None
    ) -> bool:
        """Cache video metadata.

        Args:
            video_id: YouTube video ID
            metadata: Video metadata
            ttl: Time-to-live in seconds

        Returns:
            True if cached successfully
        """
        try:
            key = self._generate_key("video_metadata", video_id)
            expires_at = datetime.now(timezone.utc) + timedelta(
                seconds=ttl or self.default_ttl
            )

            self._cache[key] = {
                "data": metadata,
                "expires_at": expires_at.isoformat(),
                "cached_at": datetime.now(timezone.utc).isoformat(),
            }
            return True
        except Exception as e:
            logger.warning("Failed to cache video metadata: %s", e)
            return False

    async def get_cached_video_metadata(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Get cached video metadata.

        Args:
            video_id: YouTube video ID

        Returns:
            Cached metadata or None if not found/expired
        """
        key = self._generate_key("video_metadata", video_id)
        entry = self._cache.get(key)

        if not entry:
            return None

        if self._is_expired(entry):
            del self._cache[key]
            return None

        return entry["data"]

    async def cache_summary(
        self, cache_key: str, summary_data: Dict[str, Any], ttl: Optional[int] = None
    ) -> bool:
        """Cache summary data with custom key.

        Args:
            cache_key: Custom cache key (e.g., hash of transcript + config)
            summary_data: Summary result data
            ttl: Time-to-live in seconds

        Returns:
            True if cached successfully
        """
        try:
            key = self._generate_key("summary", cache_key)
            expires_at = datetime.now(timezone.utc) + timedelta(
                seconds=ttl or self.default_ttl
            )

            self._cache[key] = {
                "data": summary_data,
                "expires_at": expires_at.isoformat(),
                "cached_at": datetime.now(timezone.utc).isoformat(),
            }
            return True
        except Exception as e:
            logger.warning("Failed to cache summary: %s", e)
            return False

    async def get_cached_summary(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached summary data.

        Args:
            cache_key: Custom cache key

        Returns:
            Cached summary data or None if not found/expired
        """
        key = self._generate_key("summary", cache_key)
        entry = self._cache.get(key)

        if not entry:
            return None

        if self._is_expired(entry):
            del self._cache[key]
            return None

        return entry["data"]

    def generate_summary_cache_key(self, video_id: str, config: Dict[str, Any]) -> str:
        """Generate cache key for summary based on video ID and configuration.

        Args:
            video_id: YouTube video ID
            config: Summary configuration

        Returns:
            Cache key string
        """
        # Create a deterministic key from the video ID and config. The video ID
        # is kept as a visible prefix so invalidate_video_cache() can match
        # summary entries for a video without having to reverse the hash.
        config_str = json.dumps(config, sort_keys=True)
        digest = hashlib.sha256(f"{video_id}:{config_str}".encode()).hexdigest()[:16]
        return f"{video_id}:{digest}"

    async def invalidate_video_cache(self, video_id: str) -> int:
        """Invalidate all cache entries for a video.

        Args:
            video_id: YouTube video ID

        Returns:
            Number of entries invalidated
        """
        patterns = [
            self._generate_key("transcript", video_id),
            self._generate_key("video_metadata", video_id),
            # Pipeline results are keyed by job ID, so this only matches
            # jobs whose ID happens to equal the video ID.
            self._generate_key("pipeline_result", video_id),
        ]

        # Also find summary cache entries whose key embeds this video ID
        # (see generate_summary_cache_key).
        summary_prefix = self._generate_key("summary", f"{video_id}:")
        summary_keys = [key for key in self._cache if key.startswith(summary_prefix)]

        all_keys = patterns + summary_keys
        removed_count = 0
        for key in all_keys:
            if key in self._cache:
                del self._cache[key]
                removed_count += 1

        return removed_count
    async def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            Cache statistics dictionary
        """
        self._cleanup_expired()

        total_entries = len(self._cache)
        entries_by_type = {}

        for key in self._cache:
            prefix = key.split(":", 1)[0]
            entries_by_type[prefix] = entries_by_type.get(prefix, 0) + 1

        return {
            "total_entries": total_entries,
            "entries_by_type": entries_by_type,
            "default_ttl_seconds": self.default_ttl,
        }

    async def clear_cache(self) -> int:
        """Clear all cache entries.

        Returns:
            Number of entries cleared
        """
        count = len(self._cache)
        self._cache.clear()
        return count
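
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the service API): exercises the
# transcript round-trip, deterministic summary keys, and per-video invalidation
# with a short TTL. The video ID and config values below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        cache = CacheManager(default_ttl=60)

        # Cache a transcript and read it back.
        await cache.cache_transcript(
            "dQw4w9WgXcQ", "example transcript text", {"lang": "en"}
        )
        cached = await cache.get_cached_transcript("dQw4w9WgXcQ")
        print(cached["transcript"] if cached else "miss")

        # Summary keys are deterministic: the same video + config always
        # produces the same key, so repeat requests hit the cache.
        key = cache.generate_summary_cache_key(
            "dQw4w9WgXcQ", {"style": "bullet", "max_words": 150}
        )
        await cache.cache_summary(key, {"summary": "Example summary."})
        print(await cache.get_cached_summary(key))

        # Drop everything associated with the video in one call.
        removed = await cache.invalidate_video_cache("dQw4w9WgXcQ")
        print(f"invalidated {removed} entries")
        print(await cache.get_cache_stats())

    asyncio.run(_demo())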