# youtube-summarizer/backend/services/cache_manager.py
"""Cache management service for pipeline results and intermediate data."""
import hashlib
import json
import logging
import sys
import os
import time
from dataclasses import asdict
from datetime import datetime, timedelta
from typing import Any, Dict, Optional
# Make the shared AI Assistant Library importable from this service module.
lib_path = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', '..', '..', '..', 'lib')
)
if lib_path not in sys.path:
    sys.path.insert(0, lib_path)
try:
    from ai_assistant_lib.utils.helpers.cache import MemoryCache
except ImportError:
    import time  # local: only needed by the fallback implementation

    # Fallback: minimal in-memory cache matching the library's async interface.
    class MemoryCache:
        """Dict-backed async cache honoring per-entry TTL and a max size.

        The previous fallback stored values in a bare dict and ignored both
        ``default_ttl`` and the per-call ``ttl``, so entries never expired
        and the cache grew without bound. Entries are now stored as
        ``(value, expiry)`` pairs keyed on ``time.monotonic()``.
        """

        def __init__(self, default_ttl=3600, max_size=10000):
            # key -> (value, monotonic expiry timestamp)
            self._cache = {}
            self.default_ttl = default_ttl
            self.max_size = max_size

        async def get(self, key):
            """Return the cached value, or None if missing or expired."""
            entry = self._cache.get(key)
            if entry is None:
                return None
            value, expiry = entry
            if time.monotonic() >= expiry:
                # Lazily purge expired entries on access.
                del self._cache[key]
                return None
            return value

        async def set(self, key, value, ttl=None):
            """Store ``value`` under ``key`` with ``ttl`` (default TTL if None)."""
            # Crude capacity guard: evict the oldest-inserted entry when full
            # (dicts preserve insertion order).
            if key not in self._cache and len(self._cache) >= self.max_size:
                del self._cache[next(iter(self._cache))]
            effective_ttl = self.default_ttl if ttl is None else ttl
            self._cache[key] = (value, time.monotonic() + effective_ttl)

        async def delete(self, key):
            """Remove ``key``; return True if an entry was removed."""
            return self._cache.pop(key, None) is not None

        async def clear(self):
            """Drop every cached entry."""
            self._cache.clear()

        async def stats(self):
            """Return basic statistics (size may include not-yet-purged expired entries)."""
            return {"size": len(self._cache), "hit_rate": 0.0, "miss_rate": 0.0}
class CacheManager:
    """Manages caching of pipeline results and intermediate data using AI Assistant Library."""

    # Shared module logger; structured logging replaces the previous print() calls.
    _logger = logging.getLogger(__name__)

    def __init__(self, default_ttl: int = 3600):
        """Initialize cache manager.

        Args:
            default_ttl: Default time-to-live for cache entries in seconds.
        """
        self.default_ttl = default_ttl
        self._cache = MemoryCache(default_ttl=default_ttl, max_size=10000)

    def _generate_key(self, prefix: str, identifier: str) -> str:
        """Build a namespaced cache key of the form ``prefix:identifier``."""
        return f"{prefix}:{identifier}"

    def _effective_ttl(self, ttl: Optional[int]) -> int:
        """Resolve a per-call TTL, falling back to the default only when None.

        The previous ``ttl or self.default_ttl`` form silently replaced an
        explicit ``ttl=0`` with the default; ``is None`` keeps 0 meaningful.
        """
        return self.default_ttl if ttl is None else ttl

    async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool:
        """Cache pipeline result.

        Args:
            job_id: Pipeline job ID.
            result: Pipeline result object (dataclass instances are stored as dicts).
            ttl: Time-to-live in seconds (uses default if None).

        Returns:
            True if cached successfully.
        """
        try:
            key = self._generate_key("pipeline_result", job_id)
            # Dataclasses are converted so cached data is a plain, serializable dict.
            if hasattr(result, '__dataclass_fields__'):
                result_data = asdict(result)
            else:
                result_data = result
            # MemoryCache handles TTL and expiration automatically.
            await self._cache.set(key, result_data, ttl=self._effective_ttl(ttl))
            return True
        except Exception:
            self._logger.exception("Failed to cache pipeline result for job %s", job_id)
            return False

    async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get cached pipeline result.

        Args:
            job_id: Pipeline job ID.

        Returns:
            Cached result data or None if not found/expired.
        """
        key = self._generate_key("pipeline_result", job_id)
        return await self._cache.get(key)

    async def cache_transcript(self, video_id: str, transcript: str,
                               metadata: Optional[Dict[str, Any]] = None,
                               ttl: Optional[int] = None) -> bool:
        """Cache transcript data.

        Args:
            video_id: YouTube video ID.
            transcript: Transcript text.
            metadata: Optional metadata.
            ttl: Time-to-live in seconds (uses default if None).

        Returns:
            True if cached successfully.
        """
        try:
            key = self._generate_key("transcript", video_id)
            data = {
                "transcript": transcript,
                "metadata": metadata or {},
                "video_id": video_id
            }
            await self._cache.set(key, data, ttl=self._effective_ttl(ttl))
            return True
        except Exception:
            self._logger.exception("Failed to cache transcript for video %s", video_id)
            return False

    async def get_cached_transcript(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Get cached transcript.

        Args:
            video_id: YouTube video ID.

        Returns:
            Cached transcript data or None if not found/expired.
        """
        key = self._generate_key("transcript", video_id)
        return await self._cache.get(key)

    async def cache_video_metadata(self, video_id: str, metadata: Dict[str, Any],
                                   ttl: Optional[int] = None) -> bool:
        """Cache video metadata.

        Args:
            video_id: YouTube video ID.
            metadata: Video metadata.
            ttl: Time-to-live in seconds (uses default if None).

        Returns:
            True if cached successfully.
        """
        try:
            key = self._generate_key("video_metadata", video_id)
            await self._cache.set(key, metadata, ttl=self._effective_ttl(ttl))
            return True
        except Exception:
            self._logger.exception("Failed to cache video metadata for video %s", video_id)
            return False

    async def get_cached_video_metadata(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Get cached video metadata.

        Args:
            video_id: YouTube video ID.

        Returns:
            Cached metadata or None if not found/expired.
        """
        key = self._generate_key("video_metadata", video_id)
        return await self._cache.get(key)

    async def cache_summary(self, cache_key: str, summary_data: Dict[str, Any],
                            ttl: Optional[int] = None) -> bool:
        """Cache summary data with custom key.

        Args:
            cache_key: Custom cache key (e.g. hash of transcript + config;
                see :meth:`generate_summary_cache_key`).
            summary_data: Summary result data.
            ttl: Time-to-live in seconds (uses default if None).

        Returns:
            True if cached successfully.
        """
        try:
            key = self._generate_key("summary", cache_key)
            await self._cache.set(key, summary_data, ttl=self._effective_ttl(ttl))
            return True
        except Exception:
            self._logger.exception("Failed to cache summary for key %s", cache_key)
            return False

    async def get_cached_summary(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached summary data.

        Args:
            cache_key: Custom cache key.

        Returns:
            Cached summary data or None if not found/expired.
        """
        key = self._generate_key("summary", cache_key)
        return await self._cache.get(key)

    def generate_summary_cache_key(self, video_id: str, config: Dict[str, Any]) -> str:
        """Generate cache key for summary based on video ID and configuration.

        Args:
            video_id: YouTube video ID.
            config: Summary configuration (must be JSON-serializable).

        Returns:
            16-hex-character cache key string.
        """
        # sort_keys makes the key deterministic regardless of dict ordering.
        config_str = json.dumps(config, sort_keys=True)
        key_input = f"{video_id}:{config_str}"
        return hashlib.sha256(key_input.encode()).hexdigest()[:16]

    async def invalidate_video_cache(self, video_id: str) -> int:
        """Invalidate all cache entries for a video.

        Args:
            video_id: YouTube video ID.

        Returns:
            Number of entries invalidated.
        """
        keys = (
            self._generate_key("transcript", video_id),
            self._generate_key("video_metadata", video_id),
            self._generate_key("pipeline_result", video_id),
        )
        removed_count = 0
        for key in keys:
            if await self._cache.delete(key):
                removed_count += 1
        # NOTE: summary entries are keyed by a content hash, not the video ID,
        # so they cannot be invalidated without key-scanning support in
        # MemoryCache; production should use pattern-based invalidation.
        return removed_count

    async def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics.

        Returns:
            Cache statistics dictionary.
        """
        stats = await self._cache.stats()
        return {
            "total_entries": stats.get("size", 0),
            "entries_by_type": {},  # not available in current MemoryCache interface
            "default_ttl_seconds": self.default_ttl,
            "hit_rate": stats.get("hit_rate", 0.0),
            "miss_rate": stats.get("miss_rate", 0.0)
        }

    async def clear_cache(self) -> int:
        """Clear all cache entries.

        Returns:
            Number of entries cleared (size reported just before clearing).
        """
        stats = await self._cache.stats()
        count = stats.get("size", 0)
        await self._cache.clear()
        return count