youtube-summarizer/backend/services/semantic_search_service.py

"""Semantic search service combining ChromaDB vector search with traditional search methods."""
import logging
import asyncio
import hashlib
from typing import List, Dict, Any, Optional, Tuple, Union
from datetime import datetime, timedelta
import json
import uuid
from sqlalchemy import and_, or_, func, text
from sqlalchemy.orm import Session
from backend.core.exceptions import ServiceError
from backend.models.chat import VideoChunk
from backend.models.rag_models import RAGChunk, SemanticSearchResult
from backend.models.summary import Summary
from backend.services.chroma_service import ChromaService, ChromaDBError
from backend.core.database_registry import registry
logger = logging.getLogger(__name__)
class SemanticSearchError(ServiceError):
"""Semantic search specific errors."""
pass
class SearchMode:
"""Search mode constants."""
VECTOR_ONLY = "vector"
HYBRID = "hybrid"
TRADITIONAL = "traditional"
class SemanticSearchService:
"""Service for semantic search across video content using multiple methods."""
def __init__(
self,
chroma_service: Optional[ChromaService] = None,
default_search_mode: str = SearchMode.HYBRID
):
"""Initialize semantic search service.
Args:
chroma_service: ChromaDB service instance
default_search_mode: Default search strategy
"""
self.chroma_service = chroma_service or ChromaService()
self.default_search_mode = default_search_mode
# Search performance metrics
self.metrics = {
'total_searches': 0,
'vector_searches': 0,
'hybrid_searches': 0,
'traditional_searches': 0,
'avg_search_time': 0.0,
'cache_hits': 0
}
# Query cache for performance
self._query_cache = {}
self._cache_ttl = 300 # 5 minutes
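        # Note: the cache is per-instance and in-memory only; entries are not
        # shared across worker processes and do not survive restarts.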
async def initialize(self) -> None:
"""Initialize the search service."""
try:
await self.chroma_service.initialize()
logger.info("Semantic search service initialized successfully")
except Exception as e:
logger.error(f"Failed to initialize semantic search service: {e}")
raise SemanticSearchError(f"Initialization failed: {e}")
async def search(
self,
query: str,
video_id: Optional[str] = None,
search_mode: Optional[str] = None,
max_results: int = 10,
similarity_threshold: float = 0.3,
user_id: Optional[str] = None,
include_metadata: bool = True
) -> Dict[str, Any]:
"""Perform semantic search across video content.
Args:
query: Search query text
video_id: Optional filter by specific video
search_mode: Search strategy (vector, hybrid, traditional)
max_results: Maximum number of results
similarity_threshold: Minimum similarity score for vector search
user_id: Optional user ID for analytics
include_metadata: Whether to include detailed metadata
Returns:
Search results with content, scores, and metadata
"""
start_time = datetime.now()
search_mode = search_mode or self.default_search_mode
try:
logger.info(f"Starting {search_mode} search for query: '{query[:50]}...'")
# Check cache first
            cache_key = self._generate_cache_key(
                query, video_id, search_mode, max_results, similarity_threshold
            )
cached_result = self._get_cached_result(cache_key)
if cached_result:
self.metrics['cache_hits'] += 1
logger.info("Returning cached search result")
return cached_result
# Perform search based on mode
if search_mode == SearchMode.VECTOR_ONLY:
results = await self._vector_search(
query, video_id, max_results, similarity_threshold
)
elif search_mode == SearchMode.HYBRID:
results = await self._hybrid_search(
query, video_id, max_results, similarity_threshold
)
else: # TRADITIONAL
results = await self._traditional_search(
query, video_id, max_results
)
# Enhance results with metadata if requested
if include_metadata:
results = await self._enhance_results_with_metadata(results)
# Log search analytics
await self._log_search_analytics(
query, search_mode, len(results.get('results', [])),
user_id, start_time
)
# Prepare final response
search_response = {
'query': query,
'search_mode': search_mode,
'video_id': video_id,
'total_results': len(results.get('results', [])),
'search_time_seconds': (datetime.now() - start_time).total_seconds(),
'similarity_threshold': similarity_threshold,
'results': results.get('results', []),
'metadata': {
'cached': False,
'timestamp': datetime.now().isoformat(),
'service_metrics': self._get_current_metrics()
}
}
# Cache result
self._cache_result(cache_key, search_response)
# Update metrics
self._update_metrics(search_mode, start_time)
return search_response
except Exception as e:
logger.error(f"Search failed: {e}")
raise SemanticSearchError(f"Search failed: {e}")
async def _vector_search(
self,
query: str,
video_id: Optional[str],
max_results: int,
similarity_threshold: float
) -> Dict[str, Any]:
"""Perform pure vector similarity search.
Args:
query: Search query
video_id: Optional video filter
max_results: Maximum results
similarity_threshold: Minimum similarity
Returns:
Vector search results
"""
try:
# Perform ChromaDB search
chroma_results = await self.chroma_service.search_similar(
query=query,
video_id=video_id,
n_results=max_results,
similarity_threshold=similarity_threshold
)
# Format results
formatted_results = []
for result in chroma_results:
formatted_results.append({
'content': result['content'],
'similarity_score': result['similarity_score'],
'video_id': result['video_id'],
'chunk_type': result['chunk_type'],
'start_timestamp': result.get('start_timestamp'),
'end_timestamp': result.get('end_timestamp'),
'timestamp_formatted': result.get('timestamp_formatted'),
'youtube_link': result.get('youtube_link'),
'rank': result['rank'],
'search_method': 'vector',
'metadata': result['metadata']
})
return {
'results': formatted_results,
'search_method': 'vector_only',
'vector_results_count': len(formatted_results)
}
except ChromaDBError as e:
logger.error(f"Vector search failed: {e}")
return {'results': [], 'error': str(e)}
async def _hybrid_search(
self,
query: str,
video_id: Optional[str],
max_results: int,
similarity_threshold: float
) -> Dict[str, Any]:
"""Perform hybrid search combining vector and traditional methods.
Args:
query: Search query
video_id: Optional video filter
max_results: Maximum results
similarity_threshold: Minimum similarity
Returns:
Hybrid search results
"""
try:
# Run both searches in parallel
vector_task = asyncio.create_task(
self._vector_search(query, video_id, max_results, similarity_threshold)
)
traditional_task = asyncio.create_task(
self._traditional_search(query, video_id, max_results // 2)
)
vector_results, traditional_results = await asyncio.gather(
vector_task, traditional_task
)
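            # Caveat: _traditional_search performs blocking DB I/O inside an
            # async def, so the two tasks interleave on the event loop rather
            # than running truly concurrently; wrapping the DB call in
            # asyncio.to_thread() would be needed for real parallelism.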
# Combine and rank results
combined_results = self._combine_and_rank_results(
vector_results.get('results', []),
traditional_results.get('results', []),
max_results
)
return {
'results': combined_results,
'search_method': 'hybrid',
'vector_results_count': len(vector_results.get('results', [])),
'traditional_results_count': len(traditional_results.get('results', [])),
'combined_results_count': len(combined_results)
}
except Exception as e:
logger.error(f"Hybrid search failed: {e}")
# Fallback to vector search only
return await self._vector_search(query, video_id, max_results, similarity_threshold)
async def _traditional_search(
self,
query: str,
video_id: Optional[str],
max_results: int
) -> Dict[str, Any]:
"""Perform traditional text-based search using database.
Args:
query: Search query
video_id: Optional video filter
max_results: Maximum results
Returns:
Traditional search results
"""
try:
with registry.get_session() as session:
# Build query
base_query = session.query(RAGChunk)
if video_id:
base_query = base_query.filter(RAGChunk.video_id == video_id)
# Text search using LIKE (SQLite compatible)
search_terms = query.lower().split()
text_conditions = []
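                # OR semantics below: a chunk matches if it contains ANY query
                # term; relevance scoring later rewards multi-term matches.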
                for term in search_terms:
                    # Escape LIKE wildcards so user-supplied '%' and '_' match literally.
                    escaped = term.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
                    text_conditions.append(
                        func.lower(RAGChunk.content).like(f'%{escaped}%', escape='\\')
                    )
if text_conditions:
base_query = base_query.filter(or_(*text_conditions))
# Execute query
chunks = base_query.limit(max_results).all()
# Format results
formatted_results = []
for i, chunk in enumerate(chunks):
# Calculate relevance score based on term frequency
relevance_score = self._calculate_text_relevance(query, chunk.content)
result = {
'content': chunk.content,
'relevance_score': relevance_score,
'video_id': chunk.video_id,
'chunk_type': chunk.chunk_type,
'start_timestamp': chunk.start_timestamp,
'end_timestamp': chunk.end_timestamp,
'rank': i + 1,
'search_method': 'traditional',
'chunk_id': str(chunk.id)
}
# Add formatted timestamp
if chunk.start_timestamp is not None:
timestamp = chunk.start_timestamp
hours = int(timestamp // 3600)
minutes = int((timestamp % 3600) // 60)
seconds = int(timestamp % 60)
result['timestamp_formatted'] = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
result['youtube_link'] = f"https://youtube.com/watch?v={chunk.video_id}&t={int(timestamp)}s"
formatted_results.append(result)
return {
'results': formatted_results,
'search_method': 'traditional',
'traditional_results_count': len(formatted_results)
}
except Exception as e:
logger.error(f"Traditional search failed: {e}")
return {'results': [], 'error': str(e)}
def _combine_and_rank_results(
self,
vector_results: List[Dict[str, Any]],
traditional_results: List[Dict[str, Any]],
max_results: int
) -> List[Dict[str, Any]]:
"""Combine and rank results from different search methods.
Args:
vector_results: Vector search results
traditional_results: Traditional search results
max_results: Maximum final results
Returns:
Combined and ranked results
"""
combined = {}
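        # Weighting scheme (heuristic constants, not tuned on data): vector
        # hits are boosted x1.2 since embedding similarity is usually the
        # stronger signal; traditional hits are down-weighted x0.8; a chunk
        # found by both methods gains half its traditional score as a bonus.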
# Add vector results with boosted scores
for result in vector_results:
key = self._get_result_key(result)
result['combined_score'] = result.get('similarity_score', 0.0) * 1.2 # Boost vector scores
result['sources'] = ['vector']
combined[key] = result
# Add traditional results, merge if already exists
for result in traditional_results:
key = self._get_result_key(result)
            if key in combined:
                # Found by both methods: add half of the down-weighted
                # traditional score on top of the vector score (the added
                # term is non-negative, so the score never decreases).
                new_score = result.get('relevance_score', 0.0) * 0.8
                combined[key]['combined_score'] += new_score * 0.5
                combined[key]['sources'].append('traditional')
else:
result['combined_score'] = result.get('relevance_score', 0.0) * 0.8
result['sources'] = ['traditional']
combined[key] = result
# Sort by combined score and return top results
sorted_results = sorted(
combined.values(),
key=lambda x: x['combined_score'],
reverse=True
)
# Re-rank and add final rank
final_results = []
for i, result in enumerate(sorted_results[:max_results]):
result['final_rank'] = i + 1
result['search_method'] = 'hybrid'
final_results.append(result)
return final_results
def _get_result_key(self, result: Dict[str, Any]) -> str:
"""Generate unique key for result deduplication.
Args:
result: Search result
Returns:
Unique key string
"""
video_id = result.get('video_id', '')
start_time = result.get('start_timestamp', 0) or 0
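        # Built-in hash() is salted per process (PYTHONHASHSEED), which is
        # fine here: keys are only compared within a single search call.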
content_hash = hash(result.get('content', '')[:100])
return f"{video_id}:{start_time}:{content_hash}"
def _calculate_text_relevance(self, query: str, content: str) -> float:
"""Calculate relevance score for traditional text search.
Args:
query: Search query
content: Content to score
Returns:
Relevance score between 0 and 1
"""
query_terms = set(query.lower().split())
content_terms = set(content.lower().split())
if not query_terms:
return 0.0
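        # Worked example: query "vector database" against "a vector database
        # stores embeddings" matches 2/2 terms and the exact phrase, so the
        # score is min(1.0 + 0.3, 1.0) = 1.0.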
# Simple term frequency scoring
matches = len(query_terms.intersection(content_terms))
score = matches / len(query_terms)
# Boost for exact phrase matches
if query.lower() in content.lower():
score += 0.3
return min(score, 1.0)
async def _enhance_results_with_metadata(
self,
results: Dict[str, Any]
) -> Dict[str, Any]:
"""Enhance search results with additional metadata.
Args:
results: Raw search results
Returns:
Enhanced results with metadata
"""
        try:
            enhanced_results = []
            # A single session for the whole batch, plus a per-video cache so
            # repeated chunks from the same video don't re-query its summary.
            summary_cache: Dict[str, Optional[Summary]] = {}
            with registry.get_session() as session:
                for result in results.get('results', []):
                    video_id = result.get('video_id')
                    if video_id:
                        if video_id not in summary_cache:
                            summary_cache[video_id] = session.query(Summary).filter(
                                Summary.video_id == video_id
                            ).first()
                        summary = summary_cache[video_id]
                        if summary:
                            result['video_metadata'] = {
                                'title': summary.video_title,
                                'duration': getattr(summary, 'video_duration', None),
                                'channel': getattr(summary, 'channel_name', None),
                                'summary_created': summary.created_at.isoformat() if summary.created_at else None
                            }
                    enhanced_results.append(result)
            results['results'] = enhanced_results
            return results
except Exception as e:
logger.warning(f"Failed to enhance results with metadata: {e}")
return results
async def _log_search_analytics(
self,
query: str,
search_mode: str,
results_count: int,
user_id: Optional[str],
start_time: datetime
) -> None:
"""Log search analytics to database.
Args:
query: Search query
search_mode: Search method used
results_count: Number of results returned
user_id: Optional user ID
start_time: Search start time
"""
try:
search_time = (datetime.now() - start_time).total_seconds()
query_id = str(uuid.uuid4())
# This would typically log to a search_analytics table
logger.info(f"Search analytics: query='{query[:50]}...', mode={search_mode}, "
f"results={results_count}, time={search_time:.3f}s")
except Exception as e:
logger.warning(f"Failed to log search analytics: {e}")
    def _generate_cache_key(
        self,
        query: str,
        video_id: Optional[str],
        search_mode: str,
        max_results: int,
        similarity_threshold: float
    ) -> str:
        """Generate cache key for query.
        Args:
            query: Search query
            video_id: Optional video filter
            search_mode: Search method
            max_results: Maximum results
            similarity_threshold: Minimum similarity score
        Returns:
            Cache key string
        """
        # Include the threshold so searches that differ only in threshold do
        # not share a cached result.
        key_components = [
            query.lower(),
            video_id or "all",
            search_mode,
            str(max_results),
            str(similarity_threshold)
        ]
        return hashlib.sha256("|".join(key_components).encode()).hexdigest()[:16]
def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
"""Get cached search result if valid.
Args:
cache_key: Cache key
Returns:
Cached result or None
"""
if cache_key in self._query_cache:
cached_item = self._query_cache[cache_key]
if datetime.now() - cached_item['timestamp'] < timedelta(seconds=self._cache_ttl):
cached_item['data']['metadata']['cached'] = True
return cached_item['data']
else:
del self._query_cache[cache_key]
return None
def _cache_result(self, cache_key: str, result: Dict[str, Any]) -> None:
"""Cache search result.
Args:
cache_key: Cache key
result: Search result to cache
"""
self._query_cache[cache_key] = {
'timestamp': datetime.now(),
'data': result
}
        # Evict the oldest entry when the cache grows too large
        # (FIFO by insertion time, not a true LRU)
if len(self._query_cache) > 100:
oldest_key = min(
self._query_cache.keys(),
key=lambda k: self._query_cache[k]['timestamp']
)
del self._query_cache[oldest_key]
def _update_metrics(self, search_mode: str, start_time: datetime) -> None:
"""Update search metrics.
Args:
search_mode: Search method used
start_time: Search start time
"""
search_time = (datetime.now() - start_time).total_seconds()
self.metrics['total_searches'] += 1
if search_mode == SearchMode.VECTOR_ONLY:
self.metrics['vector_searches'] += 1
elif search_mode == SearchMode.HYBRID:
self.metrics['hybrid_searches'] += 1
else:
self.metrics['traditional_searches'] += 1
# Update average search time
total_time = self.metrics['avg_search_time'] * (self.metrics['total_searches'] - 1)
self.metrics['avg_search_time'] = (total_time + search_time) / self.metrics['total_searches']
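        # i.e. new_avg = (old_avg * (n - 1) + latest) / n, an incremental
        # mean that avoids storing every individual timing.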
def _get_current_metrics(self) -> Dict[str, Any]:
"""Get current search metrics.
Returns:
Current metrics dictionary
"""
return dict(self.metrics)
async def get_search_suggestions(
self,
query: str,
video_id: Optional[str] = None,
max_suggestions: int = 5
) -> List[str]:
"""Get search suggestions based on query and available content.
Args:
query: Partial query text
video_id: Optional video filter
max_suggestions: Maximum suggestions
Returns:
List of suggested queries
"""
try:
# Get recent searches and popular terms from content
suggestions = []
# This would typically query search logs and content analysis
# For now, return basic suggestions
if len(query) >= 2:
base_suggestions = [
f"{query} explanation",
f"{query} examples",
f"{query} benefits",
f"how to {query}",
f"what is {query}"
]
suggestions.extend(base_suggestions[:max_suggestions])
return suggestions[:max_suggestions]
except Exception as e:
logger.warning(f"Failed to get search suggestions: {e}")
return []
async def health_check(self) -> Dict[str, Any]:
"""Perform health check on search service.
Returns:
Health check results
"""
try:
# Check ChromaDB health
chroma_health = await self.chroma_service.health_check()
# Check database connectivity
db_healthy = True
try:
with registry.get_session() as session:
session.execute("SELECT 1").fetchone()
except Exception as e:
db_healthy = False
logger.error(f"Database health check failed: {e}")
return {
'status': 'healthy' if chroma_health.get('status') == 'healthy' and db_healthy else 'degraded',
'chroma_status': chroma_health.get('status'),
'database_status': 'healthy' if db_healthy else 'unhealthy',
'metrics': self.metrics,
'cache_size': len(self._query_cache)
}
except Exception as e:
logger.error(f"Search service health check failed: {e}")
return {
'status': 'unhealthy',
'error': str(e)
}