"""Semantic search service combining ChromaDB vector search with traditional search methods.""" import logging import asyncio import hashlib from typing import List, Dict, Any, Optional, Tuple, Union from datetime import datetime, timedelta import json import uuid from sqlalchemy import and_, or_, func from sqlalchemy.orm import Session from backend.core.exceptions import ServiceError from backend.models.chat import VideoChunk from backend.models.rag_models import RAGChunk, SemanticSearchResult from backend.models.summary import Summary from backend.services.chroma_service import ChromaService, ChromaDBError from backend.core.database_registry import registry logger = logging.getLogger(__name__) class SemanticSearchError(ServiceError): """Semantic search specific errors.""" pass class SearchMode: """Search mode constants.""" VECTOR_ONLY = "vector" HYBRID = "hybrid" TRADITIONAL = "traditional" class SemanticSearchService: """Service for semantic search across video content using multiple methods.""" def __init__( self, chroma_service: Optional[ChromaService] = None, default_search_mode: str = SearchMode.HYBRID ): """Initialize semantic search service. Args: chroma_service: ChromaDB service instance default_search_mode: Default search strategy """ self.chroma_service = chroma_service or ChromaService() self.default_search_mode = default_search_mode # Search performance metrics self.metrics = { 'total_searches': 0, 'vector_searches': 0, 'hybrid_searches': 0, 'traditional_searches': 0, 'avg_search_time': 0.0, 'cache_hits': 0 } # Query cache for performance self._query_cache = {} self._cache_ttl = 300 # 5 minutes async def initialize(self) -> None: """Initialize the search service.""" try: await self.chroma_service.initialize() logger.info("Semantic search service initialized successfully") except Exception as e: logger.error(f"Failed to initialize semantic search service: {e}") raise SemanticSearchError(f"Initialization failed: {e}") async def search( self, query: str, video_id: Optional[str] = None, search_mode: Optional[str] = None, max_results: int = 10, similarity_threshold: float = 0.3, user_id: Optional[str] = None, include_metadata: bool = True ) -> Dict[str, Any]: """Perform semantic search across video content. 
    async def search(
        self,
        query: str,
        video_id: Optional[str] = None,
        search_mode: Optional[str] = None,
        max_results: int = 10,
        similarity_threshold: float = 0.3,
        user_id: Optional[str] = None,
        include_metadata: bool = True
    ) -> Dict[str, Any]:
        """Perform semantic search across video content.

        Args:
            query: Search query text
            video_id: Optional filter by specific video
            search_mode: Search strategy (vector, hybrid, traditional)
            max_results: Maximum number of results
            similarity_threshold: Minimum similarity score for vector search
            user_id: Optional user ID for analytics
            include_metadata: Whether to include detailed metadata

        Returns:
            Search results with content, scores, and metadata
        """
        start_time = datetime.now()
        search_mode = search_mode or self.default_search_mode

        try:
            logger.info(f"Starting {search_mode} search for query: '{query[:50]}...'")

            # Check cache first
            cache_key = self._generate_cache_key(query, video_id, search_mode, max_results)
            cached_result = self._get_cached_result(cache_key)
            if cached_result:
                self.metrics['cache_hits'] += 1
                logger.info("Returning cached search result")
                return cached_result

            # Perform search based on mode
            if search_mode == SearchMode.VECTOR_ONLY:
                results = await self._vector_search(
                    query, video_id, max_results, similarity_threshold
                )
            elif search_mode == SearchMode.HYBRID:
                results = await self._hybrid_search(
                    query, video_id, max_results, similarity_threshold
                )
            else:  # TRADITIONAL
                results = await self._traditional_search(query, video_id, max_results)

            # Enhance results with metadata if requested
            if include_metadata:
                results = await self._enhance_results_with_metadata(results)

            # Log search analytics
            await self._log_search_analytics(
                query, search_mode, len(results.get('results', [])), user_id, start_time
            )

            # Prepare final response
            search_response = {
                'query': query,
                'search_mode': search_mode,
                'video_id': video_id,
                'total_results': len(results.get('results', [])),
                'search_time_seconds': (datetime.now() - start_time).total_seconds(),
                'similarity_threshold': similarity_threshold,
                'results': results.get('results', []),
                'metadata': {
                    'cached': False,
                    'timestamp': datetime.now().isoformat(),
                    'service_metrics': self._get_current_metrics()
                }
            }

            # Cache result
            self._cache_result(cache_key, search_response)

            # Update metrics
            self._update_metrics(search_mode, start_time)

            return search_response

        except Exception as e:
            logger.error(f"Search failed: {e}")
            raise SemanticSearchError(f"Search failed: {e}") from e
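    # Shape of the dict returned by search() above (values illustrative):
    #
    #     {
    #         "query": "attention mechanism",
    #         "search_mode": "hybrid",
    #         "video_id": None,
    #         "total_results": 5,
    #         "search_time_seconds": 0.042,
    #         "similarity_threshold": 0.3,
    #         "results": [...],  # per-hit dicts; keys vary by search_method
    #         "metadata": {"cached": False, "timestamp": "...", "service_metrics": {...}},
    #     }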
    async def _vector_search(
        self,
        query: str,
        video_id: Optional[str],
        max_results: int,
        similarity_threshold: float
    ) -> Dict[str, Any]:
        """Perform pure vector similarity search.

        Args:
            query: Search query
            video_id: Optional video filter
            max_results: Maximum results
            similarity_threshold: Minimum similarity

        Returns:
            Vector search results
        """
        try:
            # Perform ChromaDB search
            chroma_results = await self.chroma_service.search_similar(
                query=query,
                video_id=video_id,
                n_results=max_results,
                similarity_threshold=similarity_threshold
            )

            # Format results
            formatted_results = []
            for result in chroma_results:
                formatted_results.append({
                    'content': result['content'],
                    'similarity_score': result['similarity_score'],
                    'video_id': result['video_id'],
                    'chunk_type': result['chunk_type'],
                    'start_timestamp': result.get('start_timestamp'),
                    'end_timestamp': result.get('end_timestamp'),
                    'timestamp_formatted': result.get('timestamp_formatted'),
                    'youtube_link': result.get('youtube_link'),
                    'rank': result['rank'],
                    'search_method': 'vector',
                    'metadata': result['metadata']
                })

            return {
                'results': formatted_results,
                'search_method': 'vector_only',
                'vector_results_count': len(formatted_results)
            }

        except ChromaDBError as e:
            logger.error(f"Vector search failed: {e}")
            return {'results': [], 'error': str(e)}

    async def _hybrid_search(
        self,
        query: str,
        video_id: Optional[str],
        max_results: int,
        similarity_threshold: float
    ) -> Dict[str, Any]:
        """Perform hybrid search combining vector and traditional methods.

        Args:
            query: Search query
            video_id: Optional video filter
            max_results: Maximum results
            similarity_threshold: Minimum similarity

        Returns:
            Hybrid search results
        """
        try:
            # Run both searches in parallel
            vector_task = asyncio.create_task(
                self._vector_search(query, video_id, max_results, similarity_threshold)
            )
            traditional_task = asyncio.create_task(
                self._traditional_search(query, video_id, max_results // 2)
            )

            vector_results, traditional_results = await asyncio.gather(
                vector_task, traditional_task
            )

            # Combine and rank results
            combined_results = self._combine_and_rank_results(
                vector_results.get('results', []),
                traditional_results.get('results', []),
                max_results
            )

            return {
                'results': combined_results,
                'search_method': 'hybrid',
                'vector_results_count': len(vector_results.get('results', [])),
                'traditional_results_count': len(traditional_results.get('results', [])),
                'combined_results_count': len(combined_results)
            }

        except Exception as e:
            logger.error(f"Hybrid search failed: {e}")
            # Fall back to vector search only
            return await self._vector_search(query, video_id, max_results, similarity_threshold)
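    # The traditional search below ORs one case-insensitive LIKE clause per
    # query term, so "vector database" matches rows where lower(content)
    # contains '%vector%' OR '%database%'. Ranking then comes from
    # _calculate_text_relevance in Python rather than from SQL.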
    async def _traditional_search(
        self,
        query: str,
        video_id: Optional[str],
        max_results: int
    ) -> Dict[str, Any]:
        """Perform traditional text-based search using the database.

        Args:
            query: Search query
            video_id: Optional video filter
            max_results: Maximum results

        Returns:
            Traditional search results
        """
        try:
            with registry.get_session() as session:
                # Build query
                base_query = session.query(RAGChunk)

                if video_id:
                    base_query = base_query.filter(RAGChunk.video_id == video_id)

                # Text search using LIKE (SQLite compatible)
                search_terms = query.lower().split()
                text_conditions = [
                    func.lower(RAGChunk.content).like(f'%{term}%')
                    for term in search_terms
                ]

                if text_conditions:
                    base_query = base_query.filter(or_(*text_conditions))

                # Execute query
                chunks = base_query.limit(max_results).all()

                # Format results
                formatted_results = []
                for i, chunk in enumerate(chunks):
                    # Calculate relevance score based on term frequency
                    relevance_score = self._calculate_text_relevance(query, chunk.content)

                    result = {
                        'content': chunk.content,
                        'relevance_score': relevance_score,
                        'video_id': chunk.video_id,
                        'chunk_type': chunk.chunk_type,
                        'start_timestamp': chunk.start_timestamp,
                        'end_timestamp': chunk.end_timestamp,
                        'rank': i + 1,
                        'search_method': 'traditional',
                        'chunk_id': str(chunk.id)
                    }

                    # Add formatted timestamp and deep link
                    if chunk.start_timestamp is not None:
                        timestamp = chunk.start_timestamp
                        hours = int(timestamp // 3600)
                        minutes = int((timestamp % 3600) // 60)
                        seconds = int(timestamp % 60)
                        result['timestamp_formatted'] = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
                        result['youtube_link'] = (
                            f"https://youtube.com/watch?v={chunk.video_id}&t={int(timestamp)}s"
                        )

                    formatted_results.append(result)

                return {
                    'results': formatted_results,
                    'search_method': 'traditional',
                    'traditional_results_count': len(formatted_results)
                }

        except Exception as e:
            logger.error(f"Traditional search failed: {e}")
            return {'results': [], 'error': str(e)}

    def _combine_and_rank_results(
        self,
        vector_results: List[Dict[str, Any]],
        traditional_results: List[Dict[str, Any]],
        max_results: int
    ) -> List[Dict[str, Any]]:
        """Combine and rank results from different search methods.

        Args:
            vector_results: Vector search results
            traditional_results: Traditional search results
            max_results: Maximum final results

        Returns:
            Combined and ranked results
        """
        combined = {}

        # Add vector results with boosted scores
        for result in vector_results:
            key = self._get_result_key(result)
            result['combined_score'] = result.get('similarity_score', 0.0) * 1.2  # Boost vector scores
            result['sources'] = ['vector']
            combined[key] = result

        # Add traditional results, merging when the chunk was already found
        for result in traditional_results:
            key = self._get_result_key(result)
            if key in combined:
                # Merge scores from different methods
                existing_score = combined[key]['combined_score']
                new_score = result.get('relevance_score', 0.0) * 0.8
                combined[key]['combined_score'] = max(
                    existing_score, existing_score + new_score * 0.5
                )
                combined[key]['sources'].append('traditional')
            else:
                result['combined_score'] = result.get('relevance_score', 0.0) * 0.8
                result['sources'] = ['traditional']
                combined[key] = result

        # Sort by combined score and keep the top results
        sorted_results = sorted(
            combined.values(),
            key=lambda x: x['combined_score'],
            reverse=True
        )

        # Re-rank and add final rank
        final_results = []
        for i, result in enumerate(sorted_results[:max_results]):
            result['final_rank'] = i + 1
            result['search_method'] = 'hybrid'
            final_results.append(result)

        return final_results
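    # Worked example of the fusion above: a chunk found by both methods with
    # similarity_score=0.9 and relevance_score=0.5 scores
    # max(0.9 * 1.2, 0.9 * 1.2 + (0.5 * 0.8) * 0.5) = max(1.08, 1.28) = 1.28,
    # so agreement between methods can only raise a chunk's combined score.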
    def _get_result_key(self, result: Dict[str, Any]) -> str:
        """Generate unique key for result deduplication.

        Args:
            result: Search result

        Returns:
            Unique key string
        """
        video_id = result.get('video_id', '')
        start_time = result.get('start_timestamp', 0) or 0
        # Builtin hash() is only stable within a single process, which is
        # sufficient here because deduplication happens per request.
        content_hash = hash(result.get('content', '')[:100])
        return f"{video_id}:{start_time}:{content_hash}"

    def _calculate_text_relevance(self, query: str, content: str) -> float:
        """Calculate relevance score for traditional text search.

        Args:
            query: Search query
            content: Content to score

        Returns:
            Relevance score between 0 and 1
        """
        query_terms = set(query.lower().split())
        content_terms = set(content.lower().split())

        if not query_terms:
            return 0.0

        # Simple term-overlap scoring
        matches = len(query_terms.intersection(content_terms))
        score = matches / len(query_terms)

        # Boost for exact phrase matches
        if query.lower() in content.lower():
            score += 0.3

        return min(score, 1.0)

    async def _enhance_results_with_metadata(
        self,
        results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Enhance search results with additional metadata.

        Args:
            results: Raw search results

        Returns:
            Enhanced results with metadata
        """
        try:
            enhanced_results = []
            # Use a single session for all lookups instead of one per result
            with registry.get_session() as session:
                for result in results.get('results', []):
                    video_id = result.get('video_id')
                    if video_id:
                        summary = session.query(Summary).filter(
                            Summary.video_id == video_id
                        ).first()

                        if summary:
                            result['video_metadata'] = {
                                'title': summary.video_title,
                                'duration': getattr(summary, 'video_duration', None),
                                'channel': getattr(summary, 'channel_name', None),
                                'summary_created': (
                                    summary.created_at.isoformat()
                                    if summary.created_at else None
                                )
                            }

                    enhanced_results.append(result)

            results['results'] = enhanced_results
            return results

        except Exception as e:
            logger.warning(f"Failed to enhance results with metadata: {e}")
            return results

    async def _log_search_analytics(
        self,
        query: str,
        search_mode: str,
        results_count: int,
        user_id: Optional[str],
        start_time: datetime
    ) -> None:
        """Log search analytics.

        Args:
            query: Search query
            search_mode: Search method used
            results_count: Number of results returned
            user_id: Optional user ID
            start_time: Search start time
        """
        try:
            search_time = (datetime.now() - start_time).total_seconds()
            query_id = str(uuid.uuid4())

            # This would typically write to a search_analytics table
            logger.info(
                f"Search analytics: id={query_id}, query='{query[:50]}...', "
                f"mode={search_mode}, results={results_count}, time={search_time:.3f}s"
            )

        except Exception as e:
            logger.warning(f"Failed to log search analytics: {e}")

    def _generate_cache_key(
        self,
        query: str,
        video_id: Optional[str],
        search_mode: str,
        max_results: int
    ) -> str:
        """Generate cache key for a query.

        Args:
            query: Search query
            video_id: Optional video filter
            search_mode: Search method
            max_results: Maximum results

        Returns:
            Cache key string
        """
        key_components = [
            query.lower(),
            video_id or "all",
            search_mode,
            str(max_results)
        ]
        return hashlib.sha256("|".join(key_components).encode()).hexdigest()[:16]

    def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached search result if still valid.

        Args:
            cache_key: Cache key

        Returns:
            Cached result or None
        """
        if cache_key in self._query_cache:
            cached_item = self._query_cache[cache_key]
            if datetime.now() - cached_item['timestamp'] < timedelta(seconds=self._cache_ttl):
                cached_item['data']['metadata']['cached'] = True
                return cached_item['data']
            else:
                del self._query_cache[cache_key]
        return None
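    # Cache semantics: an entry written at t=0 is served until t=300s
    # (self._cache_ttl) and deleted lazily on the next lookup after that.
    # _cache_result below also caps the map at 100 entries by evicting the
    # entry with the oldest write time (FIFO-style rather than true LRU,
    # since reads do not refresh the timestamp).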
    def _cache_result(self, cache_key: str, result: Dict[str, Any]) -> None:
        """Cache search result.

        Args:
            cache_key: Cache key
            result: Search result to cache
        """
        self._query_cache[cache_key] = {
            'timestamp': datetime.now(),
            'data': result
        }

        # Cap cache size by evicting the oldest entry (FIFO-style)
        if len(self._query_cache) > 100:
            oldest_key = min(
                self._query_cache.keys(),
                key=lambda k: self._query_cache[k]['timestamp']
            )
            del self._query_cache[oldest_key]

    def _update_metrics(self, search_mode: str, start_time: datetime) -> None:
        """Update search metrics.

        Args:
            search_mode: Search method used
            start_time: Search start time
        """
        search_time = (datetime.now() - start_time).total_seconds()

        self.metrics['total_searches'] += 1

        if search_mode == SearchMode.VECTOR_ONLY:
            self.metrics['vector_searches'] += 1
        elif search_mode == SearchMode.HYBRID:
            self.metrics['hybrid_searches'] += 1
        else:
            self.metrics['traditional_searches'] += 1

        # Update running average search time
        total_time = self.metrics['avg_search_time'] * (self.metrics['total_searches'] - 1)
        self.metrics['avg_search_time'] = (
            (total_time + search_time) / self.metrics['total_searches']
        )

    def _get_current_metrics(self) -> Dict[str, Any]:
        """Get current search metrics.

        Returns:
            Current metrics dictionary
        """
        return dict(self.metrics)

    async def get_search_suggestions(
        self,
        query: str,
        video_id: Optional[str] = None,
        max_suggestions: int = 5
    ) -> List[str]:
        """Get search suggestions based on query and available content.

        Args:
            query: Partial query text
            video_id: Optional video filter
            max_suggestions: Maximum suggestions

        Returns:
            List of suggested queries
        """
        try:
            # This would typically query search logs and content analysis;
            # for now, return basic template-based suggestions.
            suggestions = []
            if len(query) >= 2:
                base_suggestions = [
                    f"{query} explanation",
                    f"{query} examples",
                    f"{query} benefits",
                    f"how to {query}",
                    f"what is {query}"
                ]
                suggestions.extend(base_suggestions[:max_suggestions])

            return suggestions[:max_suggestions]

        except Exception as e:
            logger.warning(f"Failed to get search suggestions: {e}")
            return []

    async def health_check(self) -> Dict[str, Any]:
        """Perform health check on the search service.

        Returns:
            Health check results
        """
        try:
            # Check ChromaDB health
            chroma_health = await self.chroma_service.health_check()

            # Check database connectivity
            db_healthy = True
            try:
                with registry.get_session() as session:
                    # text() is required for raw SQL in SQLAlchemy 1.4+/2.0
                    session.execute(text("SELECT 1")).fetchone()
            except Exception as e:
                db_healthy = False
                logger.error(f"Database health check failed: {e}")

            return {
                'status': (
                    'healthy'
                    if chroma_health.get('status') == 'healthy' and db_healthy
                    else 'degraded'
                ),
                'chroma_status': chroma_health.get('status'),
                'database_status': 'healthy' if db_healthy else 'unhealthy',
                'metrics': self.metrics,
                'cache_size': len(self._query_cache)
            }

        except Exception as e:
            logger.error(f"Search service health check failed: {e}")
            return {
                'status': 'unhealthy',
                'error': str(e)
            }
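
# --- Hedged usage sketch ------------------------------------------------------
# A minimal smoke test, assuming a reachable ChromaDB instance behind
# ChromaService and a populated database behind `registry`. The query string is
# hypothetical and this block is illustrative, not part of the service API.
if __name__ == "__main__":
    async def _demo() -> None:
        service = SemanticSearchService()
        await service.initialize()
        response = await service.search("vector databases", max_results=5)
        print(f"{response['total_results']} hits in "
              f"{response['search_time_seconds']:.3f}s")
        for hit in response["results"]:
            # Hybrid results carry 'final_rank'; single-method results carry 'rank'
            rank = hit.get("final_rank") or hit.get("rank")
            print(rank, hit["content"][:80])

    asyncio.run(_demo())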