# youtube-summarizer/backend/services/chroma_service.py

"""ChromaDB service for vector storage and similarity search."""
import asyncio
import logging
from typing import List, Dict, Any, Optional, Tuple
import uuid
import hashlib
import json
from datetime import datetime
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import numpy as np
from backend.core.exceptions import ServiceError
logger = logging.getLogger(__name__)
class ChromaDBError(ServiceError):
"""ChromaDB specific errors."""
pass
class ChromaService:
"""Service for ChromaDB vector database operations."""
    def __init__(
        self,
        persist_directory: str = "./data/chromadb",
        embedding_model: str = "all-MiniLM-L6-v2",
        collection_name: str = "youtube_transcripts"
    ):
        """Initialize ChromaDB service.

        Args:
            persist_directory: Directory for persistent storage
            embedding_model: SentenceTransformers model name
            collection_name: ChromaDB collection name
        """
        self.persist_directory = persist_directory
        self.embedding_model_name = f"sentence-transformers/{embedding_model}"
        self.collection_name = collection_name

        # Lazily initialized components
        self._client = None
        self._collection = None
        self._embedding_model = None
        self._embedding_function = None

        # Performance metrics
        self.stats = {
            'documents_added': 0,
            'queries_executed': 0,
            'total_embedding_time': 0.0,
            'total_search_time': 0.0
        }
    async def initialize(self) -> None:
        """Initialize ChromaDB client and collection."""
        try:
            logger.info(f"Initializing ChromaDB with persist_directory: {self.persist_directory}")

            # Initialize ChromaDB client with persistent storage
            self._client = chromadb.PersistentClient(
                path=self.persist_directory,
                settings=Settings(
                    anonymized_telemetry=False,
                    allow_reset=True
                )
            )

            # Initialize embedding function
            self._embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name=self.embedding_model_name
            )

            # Load embedding model for manual operations; run the (slow)
            # model load in a worker thread so it doesn't block the event loop
            self._embedding_model = await asyncio.to_thread(
                SentenceTransformer, self.embedding_model_name
            )

            # Get or create collection
            try:
                self._collection = self._client.get_collection(
                    name=self.collection_name,
                    embedding_function=self._embedding_function
                )
                logger.info(
                    f"Loaded existing collection '{self.collection_name}' "
                    f"with {self._collection.count()} documents"
                )
            except Exception:
                self._collection = self._client.create_collection(
                    name=self.collection_name,
                    embedding_function=self._embedding_function,
                    metadata={"description": "YouTube video transcript chunks for RAG"}
                )
                logger.info(f"Created new collection '{self.collection_name}'")
        except Exception as e:
            logger.error(f"Failed to initialize ChromaDB: {e}")
            raise ChromaDBError(f"ChromaDB initialization failed: {e}") from e
    async def add_document_chunks(
        self,
        video_id: str,
        chunks: List[Dict[str, Any]]
    ) -> List[str]:
        """Add document chunks to ChromaDB.

        Args:
            video_id: YouTube video ID
            chunks: List of chunk dictionaries with content and metadata

        Returns:
            List of ChromaDB document IDs
        """
        if not self._collection:
            await self.initialize()
        try:
            start_time = datetime.now()

            # Prepare documents for ChromaDB
            documents = []
            metadatas = []
            ids = []
            for chunk in chunks:
                # Generate unique ID for ChromaDB
                chunk_id = str(uuid.uuid4())
                ids.append(chunk_id)

                # Document content
                content = chunk.get('content', '')
                documents.append(content)

                # Metadata for filtering and context
                metadata = {
                    'video_id': video_id,
                    'chunk_type': chunk.get('chunk_type', 'transcript'),
                    'chunk_index': chunk.get('chunk_index', 0),
                    'start_timestamp': chunk.get('start_timestamp'),
                    'end_timestamp': chunk.get('end_timestamp'),
                    'content_length': len(content),
                    'content_hash': hashlib.sha256(content.encode()).hexdigest(),
                    'created_at': datetime.now().isoformat(),
                    'embedding_model': self.embedding_model_name
                }

                # Add optional metadata, JSON-encoded because ChromaDB
                # metadata values must be scalars
                if 'keywords' in chunk:
                    metadata['keywords'] = json.dumps(chunk['keywords'])
                if 'entities' in chunk:
                    metadata['entities'] = json.dumps(chunk['entities'])

                # ChromaDB rejects None metadata values, so drop absent fields
                metadatas.append({k: v for k, v in metadata.items() if v is not None})

            # Add to ChromaDB collection; embedding happens inside this
            # synchronous client call, so run it in a worker thread
            await asyncio.to_thread(
                self._collection.add,
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )

            # Update statistics (add() embeds internally, so the elapsed
            # time approximates embedding time)
            processing_time = (datetime.now() - start_time).total_seconds()
            self.stats['documents_added'] += len(documents)
            self.stats['total_embedding_time'] += processing_time
            logger.info(f"Added {len(documents)} chunks to ChromaDB in {processing_time:.3f}s")
            return ids
        except Exception as e:
            logger.error(f"Failed to add documents to ChromaDB: {e}")
            raise ChromaDBError(f"Failed to add documents: {e}") from e
    async def search_similar(
        self,
        query: str,
        video_id: Optional[str] = None,
        chunk_types: Optional[List[str]] = None,
        n_results: int = 5,
        similarity_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """Search for similar content using vector similarity.

        Args:
            query: Search query text
            video_id: Optional filter by video ID
            chunk_types: Optional filter by chunk types
            n_results: Number of results to return
            similarity_threshold: Minimum similarity score

        Returns:
            List of search results with content, metadata, and scores
        """
        if not self._collection:
            await self.initialize()
        try:
            start_time = datetime.now()

            # Build where clause for filtering (ChromaDB requires an explicit
            # $and when combining more than one condition)
            conditions = []
            if video_id:
                conditions.append({'video_id': video_id})
            if chunk_types:
                conditions.append({'chunk_type': {"$in": chunk_types}})
            if not conditions:
                where = None
            elif len(conditions) == 1:
                where = conditions[0]
            else:
                where = {"$and": conditions}

            # Perform similarity search; the client call is synchronous, so
            # run it in a worker thread to keep the event loop responsive
            results = await asyncio.to_thread(
                self._collection.query,
                query_texts=[query],
                n_results=n_results,
                where=where,
                include=['metadatas', 'documents', 'distances']
            )

            # Process and format results
            formatted_results = []
            if results['documents'] and results['documents'][0]:
                for i, (doc, metadata, distance) in enumerate(zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0]
                )):
                    # Convert distance to similarity score; see the scoring
                    # note after this method
                    similarity_score = max(0.0, 1.0 - (distance / 2.0))
                    if similarity_score >= similarity_threshold:
                        result = {
                            'content': doc,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1,
                            'video_id': metadata.get('video_id'),
                            'chunk_type': metadata.get('chunk_type'),
                            'start_timestamp': metadata.get('start_timestamp'),
                            'end_timestamp': metadata.get('end_timestamp'),
                            'chunk_index': metadata.get('chunk_index')
                        }

                        # Format timestamp for display and build a deep link
                        # into the video at that offset
                        if result['start_timestamp'] is not None:
                            timestamp = result['start_timestamp']
                            hours = int(timestamp // 3600)
                            minutes = int((timestamp % 3600) // 60)
                            seconds = int(timestamp % 60)
                            result['timestamp_formatted'] = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
                            result['youtube_link'] = (
                                f"https://youtube.com/watch?v={result['video_id']}"
                                f"&t={int(timestamp)}s"
                            )
                        formatted_results.append(result)

            # Update statistics
            search_time = (datetime.now() - start_time).total_seconds()
            self.stats['queries_executed'] += 1
            self.stats['total_search_time'] += search_time
            logger.info(f"Search completed in {search_time:.3f}s, found {len(formatted_results)} results")
            return formatted_results
        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise ChromaDBError(f"Search failed: {e}") from e
    async def get_collection_stats(self) -> Dict[str, Any]:
        """Get collection statistics and health metrics."""
        if not self._collection:
            await self.initialize()
        try:
            count = self._collection.count()
            return {
                'collection_name': self.collection_name,
                'total_documents': count,
                'embedding_model': self.embedding_model_name,
                'persist_directory': self.persist_directory,
                **self.stats
            }
        except Exception as e:
            logger.error(f"Failed to get collection stats: {e}")
            return {'error': str(e)}
    async def delete_video_chunks(self, video_id: str) -> int:
        """Delete all chunks for a specific video.

        Args:
            video_id: YouTube video ID

        Returns:
            Number of deleted documents
        """
        if not self._collection:
            await self.initialize()
        try:
            # Look up matching IDs only (IDs are always returned, so nothing
            # else needs to be included)
            results = self._collection.get(
                where={'video_id': video_id},
                include=[]
            )
            if results['ids']:
                # Delete documents
                self._collection.delete(ids=results['ids'])
                deleted_count = len(results['ids'])
                logger.info(f"Deleted {deleted_count} chunks for video {video_id}")
                return deleted_count
            return 0
        except Exception as e:
            logger.error(f"Failed to delete video chunks: {e}")
            raise ChromaDBError(f"Failed to delete video chunks: {e}") from e
    async def reset_collection(self) -> None:
        """Reset the collection (delete all documents)."""
        if not self._client:
            await self.initialize()
        try:
            # Delete and recreate collection
            self._client.delete_collection(self.collection_name)
            self._collection = self._client.create_collection(
                name=self.collection_name,
                embedding_function=self._embedding_function,
                metadata={"description": "YouTube video transcript chunks for RAG"}
            )

            # Reset stats
            self.stats = {
                'documents_added': 0,
                'queries_executed': 0,
                'total_embedding_time': 0.0,
                'total_search_time': 0.0
            }
            logger.info("ChromaDB collection reset successfully")
        except Exception as e:
            logger.error(f"Failed to reset collection: {e}")
            raise ChromaDBError(f"Failed to reset collection: {e}") from e
    async def health_check(self) -> Dict[str, Any]:
        """Perform health check on ChromaDB service."""
        try:
            if not self._collection:
                await self.initialize()

            # Test basic operations
            count = self._collection.count()

            # Test embedding generation (off the event loop)
            test_embedding = await asyncio.to_thread(
                self._embedding_model.encode, ["test query"]
            )
            return {
                'status': 'healthy',
                'collection_count': count,
                'embedding_model': self.embedding_model_name,
                'embedding_dimension': len(test_embedding[0]),
                'persist_directory': self.persist_directory
            }
        except Exception as e:
            logger.error(f"ChromaDB health check failed: {e}")
            return {
                'status': 'unhealthy',
                'error': str(e)
            }
    def __del__(self):
        """Cleanup resources."""
        # ChromaDB's PersistentClient flushes to disk automatically, so no
        # explicit cleanup is required here.
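

# --- Usage sketch ------------------------------------------------------------
# A minimal, illustrative driver for local experimentation: the chunk payload,
# video ID, and demo persist directory below are hypothetical placeholders,
# and in the real backend this service would be constructed by the
# application's wiring rather than run standalone.
if __name__ == "__main__":
    async def _demo() -> None:
        service = ChromaService(persist_directory="./data/chromadb_demo")
        await service.initialize()

        # Index one fake transcript chunk (placeholder content and video ID)
        await service.add_document_chunks(
            video_id="abc123XYZ_0",
            chunks=[{
                'content': "In this section we explain how vector search works.",
                'chunk_type': 'transcript',
                'chunk_index': 0,
                'start_timestamp': 0.0,
                'end_timestamp': 12.5,
            }]
        )

        # Query it back and print the formatted hits
        hits = await service.search_similar("how does vector search work?", n_results=3)
        for hit in hits:
            print(hit.get('timestamp_formatted', ''),
                  f"{hit['similarity_score']:.2f}",
                  hit['content'])

    asyncio.run(_demo())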