"""Cache management service for pipeline results and intermediate data."""
import hashlib
import json
import logging
from dataclasses import asdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, Optional


class CacheManager:
    """Manages caching of pipeline results and intermediate data.

    Entries live in an in-memory dict (a shared store such as Redis would
    replace this in production). Each entry carries an absolute expiry
    timestamp that is checked lazily on read and swept periodically on write.
    """

    # Class-level logger replaces the previous ad-hoc print() diagnostics.
    _logger = logging.getLogger(__name__)

    # Sweep expired entries whenever the cache size crosses a multiple of this.
    _CLEANUP_INTERVAL = 100

    def __init__(self, default_ttl: int = 3600):
        """Initialize cache manager.

        Args:
            default_ttl: Default time-to-live for cache entries in seconds.
        """
        self.default_ttl = default_ttl
        # In-memory cache for now (would use Redis in production).
        # Maps "prefix:identifier" -> {"data", "expires_at", "cached_at"}.
        self._cache: Dict[str, Dict[str, Any]] = {}

    @staticmethod
    def _now() -> datetime:
        """Return the current UTC time (timezone-aware; utcnow() is deprecated)."""
        return datetime.now(timezone.utc)

    def _generate_key(self, prefix: str, identifier: str) -> str:
        """Generate cache key with prefix."""
        return f"{prefix}:{identifier}"

    def _is_expired(self, entry: Dict[str, Any]) -> bool:
        """Check if cache entry is expired; entries without an expiry never are."""
        expires_at = entry.get("expires_at")
        if not expires_at:
            return False
        return datetime.fromisoformat(expires_at) < self._now()

    def _cleanup_expired(self) -> None:
        """Remove expired entries from cache."""
        expired_keys = [
            key for key, entry in self._cache.items() if self._is_expired(entry)
        ]
        for key in expired_keys:
            del self._cache[key]

    def _store(self, key: str, data: Any, ttl: Optional[int]) -> None:
        """Insert *data* under *key* with an absolute expiry timestamp.

        Uses ``ttl if ttl is not None else default`` so an explicit ttl of 0
        (expire immediately) is honored; the previous ``ttl or default``
        silently replaced 0 with the default TTL.
        """
        seconds = self.default_ttl if ttl is None else ttl
        now = self._now()
        self._cache[key] = {
            "data": data,
            "expires_at": (now + timedelta(seconds=seconds)).isoformat(),
            "cached_at": now.isoformat(),
        }
        # Sweep periodically on every write path (previously only pipeline
        # results triggered cleanup) so the dict cannot grow without bound.
        if len(self._cache) % self._CLEANUP_INTERVAL == 0:
            self._cleanup_expired()

    def _fetch(self, key: str) -> Optional[Any]:
        """Return cached data for *key*, evicting expired entries on the way."""
        entry = self._cache.get(key)
        if not entry:
            return None
        if self._is_expired(entry):
            del self._cache[key]
            return None
        return entry["data"]

    async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool:
        """Cache pipeline result.

        Args:
            job_id: Pipeline job ID
            result: Pipeline result object (dataclasses are serialized via asdict)
            ttl: Time-to-live in seconds (uses default if None)

        Returns:
            True if cached successfully
        """
        try:
            if hasattr(result, "__dataclass_fields__"):
                result_data = asdict(result)
            else:
                result_data = result
            self._store(self._generate_key("pipeline_result", job_id), result_data, ttl)
            return True
        except Exception:
            # Best-effort cache: log the failure instead of propagating.
            self._logger.exception("Failed to cache pipeline result for job %s", job_id)
            return False

    async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get cached pipeline result.

        Args:
            job_id: Pipeline job ID

        Returns:
            Cached result data or None if not found/expired
        """
        return self._fetch(self._generate_key("pipeline_result", job_id))

    async def cache_transcript(self, video_id: str, transcript: str, metadata: Optional[Dict[str, Any]] = None, ttl: Optional[int] = None) -> bool:
        """Cache transcript data.

        Args:
            video_id: YouTube video ID
            transcript: Transcript text
            metadata: Optional metadata
            ttl: Time-to-live in seconds

        Returns:
            True if cached successfully
        """
        try:
            payload = {
                "transcript": transcript,
                "metadata": metadata or {},
                "video_id": video_id,
            }
            self._store(self._generate_key("transcript", video_id), payload, ttl)
            return True
        except Exception:
            self._logger.exception("Failed to cache transcript for video %s", video_id)
            return False

    async def get_cached_transcript(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Get cached transcript.

        Args:
            video_id: YouTube video ID

        Returns:
            Cached transcript data or None if not found/expired
        """
        return self._fetch(self._generate_key("transcript", video_id))

    async def cache_video_metadata(self, video_id: str, metadata: Dict[str, Any], ttl: Optional[int] = None) -> bool:
        """Cache video metadata.

        Args:
            video_id: YouTube video ID
            metadata: Video metadata
            ttl: Time-to-live in seconds

        Returns:
            True if cached successfully
        """
        try:
            self._store(self._generate_key("video_metadata", video_id), metadata, ttl)
            return True
        except Exception:
            self._logger.exception("Failed to cache metadata for video %s", video_id)
            return False

    async def get_cached_video_metadata(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Get cached video metadata.

        Args:
            video_id: YouTube video ID

        Returns:
            Cached metadata or None if not found/expired
        """
        return self._fetch(self._generate_key("video_metadata", video_id))

    async def cache_summary(self, cache_key: str, summary_data: Dict[str, Any], ttl: Optional[int] = None) -> bool:
        """Cache summary data with custom key.

        Args:
            cache_key: Custom cache key (e.g., hash of transcript + config)
            summary_data: Summary result data
            ttl: Time-to-live in seconds

        Returns:
            True if cached successfully
        """
        try:
            self._store(self._generate_key("summary", cache_key), summary_data, ttl)
            return True
        except Exception:
            self._logger.exception("Failed to cache summary %s", cache_key)
            return False

    async def get_cached_summary(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached summary data.

        Args:
            cache_key: Custom cache key

        Returns:
            Cached summary data or None if not found/expired
        """
        return self._fetch(self._generate_key("summary", cache_key))

    def generate_summary_cache_key(self, video_id: str, config: Dict[str, Any]) -> str:
        """Generate cache key for summary based on video ID and configuration.

        Args:
            video_id: YouTube video ID
            config: Summary configuration

        Returns:
            Cache key string (first 16 hex chars of a SHA-256 digest)
        """
        # sort_keys makes the key deterministic regardless of dict ordering.
        config_str = json.dumps(config, sort_keys=True)
        key_input = f"{video_id}:{config_str}"
        return hashlib.sha256(key_input.encode()).hexdigest()[:16]

    async def invalidate_video_cache(self, video_id: str) -> int:
        """Invalidate all cache entries for a video.

        Args:
            video_id: YouTube video ID

        Returns:
            Number of entries invalidated
        """
        candidates = [
            self._generate_key("transcript", video_id),
            self._generate_key("video_metadata", video_id),
            self._generate_key("pipeline_result", video_id),
        ]
        # NOTE(review): summary keys come from generate_summary_cache_key and
        # are hex digests, so `video_id in key` can never match them; this
        # scan is preserved for compatibility but likely never invalidates a
        # summary — confirm intent and consider a reverse index if needed.
        summary_prefix = self._generate_key("summary", "")
        candidates += [
            key for key in self._cache
            if key.startswith(summary_prefix) and video_id in key
        ]

        removed_count = 0
        for key in candidates:
            if key in self._cache:
                del self._cache[key]
                removed_count += 1
        return removed_count

    async def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            Cache statistics dictionary with total entry count, a per-prefix
            breakdown, and the configured default TTL.
        """
        self._cleanup_expired()

        entries_by_type: Dict[str, int] = {}
        for key in self._cache:
            prefix = key.split(":", 1)[0]
            entries_by_type[prefix] = entries_by_type.get(prefix, 0) + 1

        return {
            "total_entries": len(self._cache),
            "entries_by_type": entries_by_type,
            "default_ttl_seconds": self.default_ttl,
        }

    async def clear_cache(self) -> int:
        """Clear all cache entries.

        Returns:
            Number of entries cleared
        """
        count = len(self._cache)
        self._cache.clear()
        return count