# youtube-summarizer/backend/services/cache_manager.py
"""Cache management service for pipeline results and intermediate data."""
import hashlib
import json
import logging
from dataclasses import asdict
from datetime import datetime, timedelta
from typing import Dict, Optional, Any
class CacheManager:
    """Manages caching of pipeline results and intermediate data.

    Entries live in a plain in-process dict keyed by "<prefix>:<identifier>";
    a shared store such as Redis would replace this in production.
    """

    def __init__(self, default_ttl: int = 3600):
        """Set up an empty cache.

        Args:
            default_ttl: Fallback time-to-live, in seconds, applied whenever
                a caller does not supply an explicit TTL.
        """
        self.default_ttl = default_ttl
        # key -> {"data": ..., "expires_at": iso-string, "cached_at": iso-string}
        self._cache: Dict[str, Dict[str, Any]] = {}
def _generate_key(self, prefix: str, identifier: str) -> str:
"""Generate cache key with prefix."""
return f"{prefix}:{identifier}"
def _is_expired(self, entry: Dict[str, Any]) -> bool:
"""Check if cache entry is expired."""
expires_at = entry.get("expires_at")
if not expires_at:
return False
return datetime.fromisoformat(expires_at) < datetime.utcnow()
def _cleanup_expired(self):
"""Remove expired entries from cache."""
expired_keys = [
key for key, entry in self._cache.items()
if self._is_expired(entry)
]
for key in expired_keys:
del self._cache[key]
async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool:
"""Cache pipeline result.
Args:
job_id: Pipeline job ID
result: Pipeline result object
ttl: Time-to-live in seconds (uses default if None)
Returns:
True if cached successfully
"""
try:
key = self._generate_key("pipeline_result", job_id)
expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl)
# Convert result to dict if it's a dataclass
if hasattr(result, '__dataclass_fields__'):
result_data = asdict(result)
else:
result_data = result
self._cache[key] = {
"data": result_data,
"expires_at": expires_at.isoformat(),
"cached_at": datetime.utcnow().isoformat()
}
# Cleanup expired entries periodically
if len(self._cache) % 100 == 0:
self._cleanup_expired()
return True
except Exception as e:
print(f"Failed to cache pipeline result: {e}")
return False
async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]:
"""Get cached pipeline result.
Args:
job_id: Pipeline job ID
Returns:
Cached result data or None if not found/expired
"""
key = self._generate_key("pipeline_result", job_id)
entry = self._cache.get(key)
if not entry:
return None
if self._is_expired(entry):
del self._cache[key]
return None
return entry["data"]
async def cache_transcript(self, video_id: str, transcript: str, metadata: Dict[str, Any] = None, ttl: Optional[int] = None) -> bool:
"""Cache transcript data.
Args:
video_id: YouTube video ID
transcript: Transcript text
metadata: Optional metadata
ttl: Time-to-live in seconds
Returns:
True if cached successfully
"""
try:
key = self._generate_key("transcript", video_id)
expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl)
self._cache[key] = {
"data": {
"transcript": transcript,
"metadata": metadata or {},
"video_id": video_id
},
"expires_at": expires_at.isoformat(),
"cached_at": datetime.utcnow().isoformat()
}
return True
except Exception as e:
print(f"Failed to cache transcript: {e}")
return False
async def get_cached_transcript(self, video_id: str) -> Optional[Dict[str, Any]]:
"""Get cached transcript.
Args:
video_id: YouTube video ID
Returns:
Cached transcript data or None if not found/expired
"""
key = self._generate_key("transcript", video_id)
entry = self._cache.get(key)
if not entry:
return None
if self._is_expired(entry):
del self._cache[key]
return None
return entry["data"]
async def cache_video_metadata(self, video_id: str, metadata: Dict[str, Any], ttl: Optional[int] = None) -> bool:
"""Cache video metadata.
Args:
video_id: YouTube video ID
metadata: Video metadata
ttl: Time-to-live in seconds
Returns:
True if cached successfully
"""
try:
key = self._generate_key("video_metadata", video_id)
expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl)
self._cache[key] = {
"data": metadata,
"expires_at": expires_at.isoformat(),
"cached_at": datetime.utcnow().isoformat()
}
return True
except Exception as e:
print(f"Failed to cache video metadata: {e}")
return False
async def get_cached_video_metadata(self, video_id: str) -> Optional[Dict[str, Any]]:
"""Get cached video metadata.
Args:
video_id: YouTube video ID
Returns:
Cached metadata or None if not found/expired
"""
key = self._generate_key("video_metadata", video_id)
entry = self._cache.get(key)
if not entry:
return None
if self._is_expired(entry):
del self._cache[key]
return None
return entry["data"]
async def cache_summary(self, cache_key: str, summary_data: Dict[str, Any], ttl: Optional[int] = None) -> bool:
"""Cache summary data with custom key.
Args:
cache_key: Custom cache key (e.g., hash of transcript + config)
summary_data: Summary result data
ttl: Time-to-live in seconds
Returns:
True if cached successfully
"""
try:
key = self._generate_key("summary", cache_key)
expires_at = datetime.utcnow() + timedelta(seconds=ttl or self.default_ttl)
self._cache[key] = {
"data": summary_data,
"expires_at": expires_at.isoformat(),
"cached_at": datetime.utcnow().isoformat()
}
return True
except Exception as e:
print(f"Failed to cache summary: {e}")
return False
async def get_cached_summary(self, cache_key: str) -> Optional[Dict[str, Any]]:
"""Get cached summary data.
Args:
cache_key: Custom cache key
Returns:
Cached summary data or None if not found/expired
"""
key = self._generate_key("summary", cache_key)
entry = self._cache.get(key)
if not entry:
return None
if self._is_expired(entry):
del self._cache[key]
return None
return entry["data"]
def generate_summary_cache_key(self, video_id: str, config: Dict[str, Any]) -> str:
"""Generate cache key for summary based on video ID and configuration.
Args:
video_id: YouTube video ID
config: Summary configuration
Returns:
Cache key string
"""
# Create deterministic key from video ID and config
config_str = json.dumps(config, sort_keys=True)
key_input = f"{video_id}:{config_str}"
return hashlib.sha256(key_input.encode()).hexdigest()[:16]
async def invalidate_video_cache(self, video_id: str) -> int:
"""Invalidate all cache entries for a video.
Args:
video_id: YouTube video ID
Returns:
Number of entries invalidated
"""
patterns = [
self._generate_key("transcript", video_id),
self._generate_key("video_metadata", video_id),
self._generate_key("pipeline_result", video_id)
]
# Also find summary cache entries that start with video_id
summary_keys = [
key for key in self._cache.keys()
if key.startswith(self._generate_key("summary", "")) and video_id in key
]
all_keys = patterns + summary_keys
removed_count = 0
for key in all_keys:
if key in self._cache:
del self._cache[key]
removed_count += 1
return removed_count
async def get_cache_stats(self) -> Dict[str, Any]:
"""Get cache statistics.
Returns:
Cache statistics dictionary
"""
self._cleanup_expired()
total_entries = len(self._cache)
entries_by_type = {}
for key in self._cache.keys():
prefix = key.split(":", 1)[0]
entries_by_type[prefix] = entries_by_type.get(prefix, 0) + 1
return {
"total_entries": total_entries,
"entries_by_type": entries_by_type,
"default_ttl_seconds": self.default_ttl
}
async def clear_cache(self) -> int:
"""Clear all cache entries.
Returns:
Number of entries cleared
"""
count = len(self._cache)
self._cache.clear()
return count