youtube-summarizer/backend/services/enhanced_cache_manager.py
"""Enhanced multi-level intelligent caching system for YouTube Summarizer."""
import hashlib
import json
import time
import asyncio
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Union
from enum import Enum
from dataclasses import dataclass, asdict
import redis
from redis import asyncio as aioredis
logger = logging.getLogger(__name__)


class CacheLevel(Enum):
    """Cache storage levels."""
    L1_MEMORY = "l1_memory"      # Redis - fastest, volatile
    L2_DATABASE = "l2_database"  # SQLite/PostgreSQL - persistent, structured


class CachePolicy(Enum):
    """Cache write policies."""
    WRITE_THROUGH = "write_through"  # Write to all levels immediately
    WRITE_BACK = "write_back"        # Write to the fast cache first, sync later
    WRITE_AROUND = "write_around"    # Skip the cache on write, read from storage


@dataclass
class CacheConfig:
    """Cache configuration settings."""
    transcript_ttl_hours: int = 168        # 7 days
    summary_ttl_hours: int = 72            # 3 days
    memory_max_size_mb: int = 512          # Redis memory limit
    warming_batch_size: int = 50           # Videos per warming batch
    cleanup_interval_hours: int = 6        # Cleanup frequency
    hit_rate_alert_threshold: float = 0.7  # Alert if hit rate drops below this
    redis_url: Optional[str] = None        # Redis connection URL
    enable_warming: bool = False           # Enable cache warming
    enable_analytics: bool = True          # Enable analytics collection


@dataclass
class CacheMetrics:
    """Cache performance metrics."""
    hits: int = 0
    misses: int = 0
    write_operations: int = 0
    evictions: int = 0
    errors: int = 0
    total_size_bytes: int = 0
    average_response_time_ms: float = 0.0

    @property
    def hit_rate(self) -> float:
        """Calculate cache hit rate."""
        total = self.hits + self.misses
        return self.hits / total if total > 0 else 0.0

    @property
    def total_operations(self) -> int:
        """Total cache operations."""
        return self.hits + self.misses + self.write_operations

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary, including derived properties."""
        return {
            **asdict(self),
            'hit_rate': self.hit_rate,
            'total_operations': self.total_operations,
        }


class EnhancedCacheManager:
    """Enhanced multi-level intelligent caching system."""

    def __init__(self, config: Optional[CacheConfig] = None):
        """Initialize enhanced cache manager.

        Args:
            config: Cache configuration settings
        """
        self.config = config or CacheConfig()
        self.metrics = CacheMetrics()
        self.redis_client: Optional[aioredis.Redis] = None
        self._memory_cache: Dict[str, Dict[str, Any]] = {}  # Fallback in-process cache

        # Cache key prefixes
        self.TRANSCRIPT_PREFIX = "yt:transcript:"
        self.SUMMARY_PREFIX = "yt:summary:"
        self.METADATA_PREFIX = "yt:meta:"
        self.ANALYTICS_PREFIX = "yt:analytics:"

        # Background tasks
        self._cleanup_task: Optional[asyncio.Task] = None
        self._warming_task: Optional[asyncio.Task] = None
        self._initialized = False
    async def initialize(self) -> None:
        """Initialize cache connections and background tasks."""
        if self._initialized:
            return

        # Initialize the Redis connection if configured. Note that
        # redis.asyncio.from_url() is synchronous; only ping() is awaited.
        if self.config.redis_url:
            try:
                self.redis_client = aioredis.from_url(
                    self.config.redis_url,
                    encoding="utf-8",
                    decode_responses=True
                )
                await self.redis_client.ping()
                logger.info("Redis connection established")
            except Exception as e:
                logger.warning(f"Redis connection failed, using memory cache: {e}")
                self.redis_client = None
        else:
            logger.info("Redis URL not configured, using memory cache")

        # Start background tasks
        await self.start_background_tasks()
        self._initialized = True
    async def start_background_tasks(self) -> None:
        """Start background cache management tasks."""
        self._cleanup_task = asyncio.create_task(self._periodic_cleanup())
        if self.config.enable_warming:
            self._warming_task = asyncio.create_task(self._cache_warming_scheduler())

    async def stop_background_tasks(self) -> None:
        """Stop background tasks gracefully."""
        if self._cleanup_task:
            self._cleanup_task.cancel()
            try:
                await self._cleanup_task
            except asyncio.CancelledError:
                pass
        if self._warming_task:
            self._warming_task.cancel()
            try:
                await self._warming_task
            except asyncio.CancelledError:
                pass

    async def close(self) -> None:
        """Close cache connections and clean up."""
        await self.stop_background_tasks()
        if self.redis_client:
            await self.redis_client.close()
        self._initialized = False
    # Transcript Caching Methods

    async def get_cached_transcript(
        self,
        video_id: str,
        language: str = "en"
    ) -> Optional[Dict[str, Any]]:
        """Retrieve cached transcript with multi-level fallback.

        Args:
            video_id: YouTube video ID
            language: Transcript language code

        Returns:
            Cached transcript data or None if not found
        """
        cache_key = self._generate_transcript_key(video_id, language)
        start_time = time.time()
        try:
            if self.redis_client:
                # Try Redis first when available
                cached_data = await self._get_from_redis(cache_key)
                if cached_data:
                    self._record_cache_hit("transcript", "l1_memory", start_time)
                    return cached_data
            else:
                # Fall back to the in-process memory cache
                cached_data = self._memory_cache.get(cache_key)
                if cached_data and self._is_cache_valid(cached_data):
                    self._record_cache_hit("transcript", "memory", start_time)
                    return cached_data["data"]
            self._record_cache_miss("transcript", start_time)
            return None
        except Exception as e:
            self.metrics.errors += 1
            logger.error(f"Cache retrieval error: {e}")
            return None
    async def cache_transcript(
        self,
        video_id: str,
        language: str,
        transcript_data: Dict[str, Any],
        policy: CachePolicy = CachePolicy.WRITE_THROUGH
    ) -> bool:
        """Cache transcript with the specified write policy.

        Args:
            video_id: YouTube video ID
            language: Transcript language code
            transcript_data: Transcript data to cache
            policy: Cache write policy

        Returns:
            True if caching succeeded
        """
        if policy == CachePolicy.WRITE_AROUND:
            # Intentionally skip the cache; callers read from primary storage.
            return True

        cache_key = self._generate_transcript_key(video_id, language)
        start_time = time.time()
        try:
            ttl_seconds = self.config.transcript_ttl_hours * 3600
            success = True
            if policy == CachePolicy.WRITE_THROUGH:
                # Write to all available cache levels
                if self.redis_client:
                    success &= await self._set_in_redis(cache_key, transcript_data, ttl_seconds)
                else:
                    # Use the memory cache as a fallback
                    self._set_in_memory(cache_key, transcript_data, ttl_seconds)
            elif policy == CachePolicy.WRITE_BACK:
                # Write to the fastest cache first
                if self.redis_client:
                    success = await self._set_in_redis(cache_key, transcript_data, ttl_seconds)
                else:
                    self._set_in_memory(cache_key, transcript_data, ttl_seconds)
            self.metrics.write_operations += 1
            self._record_cache_operation("transcript_write", start_time)
            return success
        except Exception as e:
            self.metrics.errors += 1
            logger.error(f"Cache write error: {e}")
            return False
    # Summary Caching Methods

    async def get_cached_summary(
        self,
        transcript_hash: str,
        config_hash: str
    ) -> Optional[Dict[str, Any]]:
        """Retrieve cached summary by content and configuration hash.

        Args:
            transcript_hash: Hash of transcript content
            config_hash: Hash of summary configuration

        Returns:
            Cached summary data or None if not found
        """
        cache_key = self._generate_summary_key(transcript_hash, config_hash)
        start_time = time.time()
        try:
            if self.redis_client:
                # Try Redis first
                cached_data = await self._get_from_redis(cache_key)
                if cached_data and self._is_summary_valid(cached_data):
                    self._record_cache_hit("summary", "l1_memory", start_time)
                    return cached_data
            else:
                # Fall back to the in-process memory cache
                cached_data = self._memory_cache.get(cache_key)
                if cached_data and self._is_cache_valid(cached_data) and self._is_summary_valid(cached_data["data"]):
                    self._record_cache_hit("summary", "memory", start_time)
                    return cached_data["data"]
            self._record_cache_miss("summary", start_time)
            return None
        except Exception as e:
            self.metrics.errors += 1
            logger.error(f"Summary cache retrieval error: {e}")
            return None
    async def cache_summary(
        self,
        transcript_hash: str,
        config_hash: str,
        summary_data: Dict[str, Any]
    ) -> bool:
        """Cache summary result with versioning metadata.

        Args:
            transcript_hash: Hash of transcript content
            config_hash: Hash of summary configuration
            summary_data: Summary data to cache

        Returns:
            True if caching succeeded
        """
        cache_key = self._generate_summary_key(transcript_hash, config_hash)
        # Add versioning and timestamp metadata
        enhanced_data = {
            **summary_data,
            "_cache_metadata": {
                "cached_at": datetime.utcnow().isoformat(),
                "ai_model_version": summary_data.get("model", "claude-3-5-haiku-20241022"),
                "prompt_version": "v1.0",
                "cache_version": "1.0"
            }
        }
        try:
            ttl_seconds = self.config.summary_ttl_hours * 3600
            if self.redis_client:
                success = await self._set_in_redis(cache_key, enhanced_data, ttl_seconds)
            else:
                self._set_in_memory(cache_key, enhanced_data, ttl_seconds)
                success = True
            self.metrics.write_operations += 1
            return success
        except Exception as e:
            self.metrics.errors += 1
            logger.error(f"Summary cache write error: {e}")
            return False
    # Cache Key Generation

    def _generate_transcript_key(self, video_id: str, language: str) -> str:
        """Generate a unique cache key for a transcript."""
        return f"{self.TRANSCRIPT_PREFIX}{video_id}:{language}"

    def _generate_summary_key(self, transcript_hash: str, config_hash: str) -> str:
        """Generate a unique cache key for a summary."""
        return f"{self.SUMMARY_PREFIX}{transcript_hash}:{config_hash}"

    def generate_content_hash(self, content: str) -> str:
        """Generate a stable, truncated SHA-256 hash for content."""
        return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]

    def generate_config_hash(self, config: Dict[str, Any]) -> str:
        """Generate a stable hash for a configuration dict."""
        # Sort keys so logically equal configs hash identically
        config_str = json.dumps(config, sort_keys=True)
        return hashlib.sha256(config_str.encode('utf-8')).hexdigest()[:16]
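
    # Illustrative example (hypothetical values): for video "dQw4w9WgXcQ" in
    # English, the transcript key is "yt:transcript:dQw4w9WgXcQ:en". A summary
    # key pairs generate_content_hash(transcript_text) with
    # generate_config_hash({"model": "...", "length": "short"}), so the same
    # transcript summarized under a different configuration gets its own entry.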
    # Redis Operations

    async def _get_from_redis(self, key: str) -> Optional[Dict[str, Any]]:
        """Get data from Redis with error handling."""
        if not self.redis_client:
            return None
        try:
            data = await self.redis_client.get(key)
            if data:
                return json.loads(data)
            return None
        except Exception as e:
            logger.error(f"Redis get error: {e}")
            return None

    async def _set_in_redis(self, key: str, data: Dict[str, Any], ttl_seconds: int) -> bool:
        """Set data in Redis with a TTL."""
        if not self.redis_client:
            return False
        try:
            serialized = json.dumps(data)
            await self.redis_client.setex(key, ttl_seconds, serialized)
            return True
        except Exception as e:
            logger.error(f"Redis set error: {e}")
            return False

    async def _delete_from_redis(self, key: str) -> bool:
        """Delete a key from Redis."""
        if not self.redis_client:
            return False
        try:
            await self.redis_client.delete(key)
            return True
        except Exception as e:
            logger.error(f"Redis delete error: {e}")
            return False
    # Memory Cache Operations (Fallback)

    def _set_in_memory(self, key: str, data: Dict[str, Any], ttl_seconds: int) -> None:
        """Set data in the memory cache with an expiration timestamp."""
        expires_at = datetime.utcnow() + timedelta(seconds=ttl_seconds)
        self._memory_cache[key] = {
            "data": data,
            "expires_at": expires_at.isoformat()
        }

    def _is_cache_valid(self, cache_entry: Dict[str, Any]) -> bool:
        """Check whether a memory cache entry is still valid."""
        expires_at_str = cache_entry.get("expires_at")
        if not expires_at_str:
            return False
        expires_at = datetime.fromisoformat(expires_at_str)
        return datetime.utcnow() < expires_at
    # Cache Validation

    def _is_summary_valid(self, cached_data: Dict[str, Any]) -> bool:
        """Check whether a cached summary is still valid based on versioning."""
        metadata = cached_data.get("_cache_metadata", {})
        # Check cache version compatibility
        cached_version = metadata.get("cache_version", "0.0")
        if cached_version != "1.0":
            return False
        # Check age (additional validation beyond the storage TTL)
        cached_at = metadata.get("cached_at")
        if cached_at:
            cached_time = datetime.fromisoformat(cached_at)
            age_hours = (datetime.utcnow() - cached_time).total_seconds() / 3600
            if age_hours > self.config.summary_ttl_hours:
                return False
        return True
    # Background Tasks

    async def _periodic_cleanup(self):
        """Background task for cache cleanup and maintenance."""
        while True:
            try:
                await asyncio.sleep(self.config.cleanup_interval_hours * 3600)
                # Clean up the memory cache
                await self._cleanup_memory_cache()
                # Check Redis memory if available
                if self.redis_client:
                    await self._cleanup_redis_memory()
                logger.info("Cache cleanup completed")
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Cache cleanup error: {e}")
    async def _cleanup_memory_cache(self):
        """Remove expired entries from the memory cache."""
        expired_keys = [
            key for key, entry in self._memory_cache.items()
            if not self._is_cache_valid(entry)
        ]
        for key in expired_keys:
            del self._memory_cache[key]
        if expired_keys:
            logger.info(f"Cleaned up {len(expired_keys)} expired memory cache entries")
    async def _cleanup_redis_memory(self):
        """Monitor Redis memory usage and warn when it runs high."""
        if not self.redis_client:
            return
        try:
            info = await self.redis_client.info('memory')
            used_memory_mb = info.get('used_memory', 0) / (1024 * 1024)
            if used_memory_mb > self.config.memory_max_size_mb * 0.8:  # 80% threshold
                logger.warning(f"Redis memory usage high: {used_memory_mb:.1f}MB")
                # Redis handles eviction itself based on its maxmemory-policy
        except Exception as e:
            logger.error(f"Redis memory check error: {e}")
    async def _cache_warming_scheduler(self):
        """Background task for intelligent cache warming."""
        while True:
            try:
                await asyncio.sleep(3600)  # Run hourly
                # Get popular videos for warming
                popular_videos = await self._get_popular_videos()
                for video_batch in self._batch_videos(popular_videos, self.config.warming_batch_size):
                    await self._warm_video_batch(video_batch)
                    await asyncio.sleep(5)  # Rate limiting between batches
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Cache warming error: {e}")

    async def _get_popular_videos(self) -> List[str]:
        """Get a list of popular video IDs for cache warming."""
        # TODO: Integrate with analytics service
        return []

    def _batch_videos(self, videos: List[str], batch_size: int) -> List[List[str]]:
        """Split videos into batches for processing."""
        return [videos[i:i + batch_size] for i in range(0, len(videos), batch_size)]
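
    # Worked example: with the default warming_batch_size of 50, a list of 120
    # video IDs yields batches of 50, 50 and 20; in miniature,
    # _batch_videos(["a", "b", "c"], batch_size=2) -> [["a", "b"], ["c"]].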
    async def _warm_video_batch(self, video_ids: List[str]):
        """Warm the cache for a batch of videos."""
        # TODO: Implement cache warming logic
        pass
    # Metrics and Analytics

    def _record_cache_hit(self, cache_type: str, level: str, start_time: float):
        """Record cache hit metrics."""
        self.metrics.hits += 1
        response_time = (time.time() - start_time) * 1000
        self._update_average_response_time(response_time)

    def _record_cache_miss(self, cache_type: str, start_time: float):
        """Record cache miss metrics."""
        self.metrics.misses += 1
        response_time = (time.time() - start_time) * 1000
        self._update_average_response_time(response_time)

    def _record_cache_operation(self, operation_type: str, start_time: float):
        """Record cache operation metrics."""
        response_time = (time.time() - start_time) * 1000
        self._update_average_response_time(response_time)

    def _update_average_response_time(self, response_time: float):
        """Update the running average response time incrementally:
        new_avg = (old_avg * (n - 1) + sample) / n."""
        total_ops = self.metrics.total_operations
        if total_ops > 1:
            self.metrics.average_response_time_ms = (
                (self.metrics.average_response_time_ms * (total_ops - 1) + response_time) / total_ops
            )
        else:
            self.metrics.average_response_time_ms = response_time
    async def get_cache_analytics(self) -> Dict[str, Any]:
        """Get comprehensive cache analytics."""
        # Gather Redis info if available
        redis_info = {}
        if self.redis_client:
            try:
                memory_info = await self.redis_client.info('memory')
                used_memory_mb = memory_info.get('used_memory', 0) / (1024 * 1024)
                redis_info = {
                    "used_memory_mb": used_memory_mb,
                    "max_memory_mb": self.config.memory_max_size_mb,
                    "memory_usage_percent": used_memory_mb / self.config.memory_max_size_mb * 100
                }
            except Exception as e:
                redis_info = {"error": str(e)}

        # Memory cache info
        memory_cache_info = {
            "entries": len(self._memory_cache),
            "estimated_size_mb": sum(len(json.dumps(v)) for v in self._memory_cache.values()) / (1024 * 1024)
        }

        return {
            "performance_metrics": self.metrics.to_dict(),
            "redis_usage": redis_info if self.redis_client else None,
            "memory_cache_usage": memory_cache_info,
            "configuration": {
                "transcript_ttl_hours": self.config.transcript_ttl_hours,
                "summary_ttl_hours": self.config.summary_ttl_hours,
                "memory_max_size_mb": self.config.memory_max_size_mb,
                "using_redis": bool(self.redis_client)
            }
        }
    async def invalidate_cache(self, pattern: Optional[str] = None) -> int:
        """Invalidate cache entries matching a pattern.

        Args:
            pattern: Optional substring to match against cache keys

        Returns:
            Number of entries invalidated
        """
        count = 0
        # Clear the memory cache
        if pattern:
            keys_to_delete = [k for k in self._memory_cache.keys() if pattern in k]
            for key in keys_to_delete:
                del self._memory_cache[key]
                count += 1
        else:
            count = len(self._memory_cache)
            self._memory_cache.clear()

        # Clear Redis if available
        if self.redis_client:
            try:
                if pattern:
                    # Use SCAN to find matching keys without blocking Redis
                    cursor = 0
                    while True:
                        cursor, keys = await self.redis_client.scan(cursor, match=f"*{pattern}*")
                        if keys:
                            await self.redis_client.delete(*keys)
                            count += len(keys)
                        if cursor == 0:
                            break
                else:
                    # FLUSHDB clears the entire database; this assumes the
                    # Redis DB is dedicated to this cache.
                    await self.redis_client.flushdb()
            except Exception as e:
                logger.error(f"Redis invalidation error: {e}")

        logger.info(f"Invalidated {count} cache entries")
        return count
    # Compatibility methods with the existing CacheManager

    async def cache_pipeline_result(self, job_id: str, result: Any, ttl: Optional[int] = None) -> bool:
        """Cache a pipeline result (compatibility method)."""
        cache_key = f"pipeline:{job_id}"
        ttl_seconds = ttl or self.config.summary_ttl_hours * 3600
        # Convert dataclass results to plain dicts for JSON serialization
        if hasattr(result, '__dataclass_fields__'):
            result_data = asdict(result)
        else:
            result_data = result
        if self.redis_client:
            return await self._set_in_redis(cache_key, result_data, ttl_seconds)
        else:
            self._set_in_memory(cache_key, result_data, ttl_seconds)
            return True

    async def get_cached_pipeline_result(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Get a cached pipeline result (compatibility method)."""
        cache_key = f"pipeline:{job_id}"
        if self.redis_client:
            return await self._get_from_redis(cache_key)
        else:
            entry = self._memory_cache.get(cache_key)
            if entry and self._is_cache_valid(entry):
                return entry["data"]
            return None

    async def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics (compatibility method)."""
        return await self.get_cache_analytics()
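

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the service wiring).
# It assumes a local Redis at redis://localhost:6379/0; if Redis is not
# reachable, the manager silently falls back to the in-process memory cache.
# The video ID and transcript payload below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        manager = EnhancedCacheManager(CacheConfig(redis_url="redis://localhost:6379/0"))
        await manager.initialize()
        try:
            # Cache and retrieve a transcript
            await manager.cache_transcript(
                "dQw4w9WgXcQ", "en", {"text": "example transcript", "segments": []}
            )
            transcript = await manager.get_cached_transcript("dQw4w9WgXcQ", "en")

            # Cache a summary keyed by content + configuration hashes
            t_hash = manager.generate_content_hash("example transcript")
            c_hash = manager.generate_config_hash({"model": "claude-3-5-haiku-20241022", "length": "short"})
            await manager.cache_summary(t_hash, c_hash, {"summary": "An example summary."})
            summary = await manager.get_cached_summary(t_hash, c_hash)

            analytics = await manager.get_cache_analytics()
            print(transcript is not None, summary is not None,
                  analytics["performance_metrics"]["hit_rate"])
        finally:
            await manager.close()

    asyncio.run(_demo())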