677 lines
24 KiB
Python
677 lines
24 KiB
Python
"""Semantic search service combining ChromaDB vector search with traditional search methods."""
|
|
|
|
import asyncio
import hashlib
import json
import logging
import uuid
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple, Union

from sqlalchemy import and_, func, or_, text
from sqlalchemy.orm import Session

from backend.core.database_registry import registry
from backend.core.exceptions import ServiceError
from backend.models.chat import VideoChunk
from backend.models.rag_models import RAGChunk, SemanticSearchResult
from backend.models.summary import Summary
from backend.services.chroma_service import ChromaDBError, ChromaService
|
|
|
|
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SemanticSearchError(ServiceError):
    """Raised when a semantic search operation fails."""
|
|
|
|
|
|
class SearchMode:
    """String constants naming the supported search strategies."""

    # Pure vector-similarity search via ChromaDB.
    VECTOR_ONLY = "vector"
    # Vector search blended with traditional text matching.
    HYBRID = "hybrid"
    # Database LIKE-based text search only.
    TRADITIONAL = "traditional"
|
|
|
|
|
|
class SemanticSearchService:
    """Service for semantic search across video content using multiple methods."""

    def __init__(
        self,
        chroma_service: Optional[ChromaService] = None,
        default_search_mode: str = SearchMode.HYBRID
    ):
        """Initialize semantic search service.

        Args:
            chroma_service: ChromaDB service instance; a fresh ChromaService
                is constructed when none is supplied.
            default_search_mode: Default search strategy.
        """
        self.chroma_service = chroma_service or ChromaService()
        self.default_search_mode = default_search_mode

        # Running counters used for lightweight performance reporting.
        self.metrics = {
            'total_searches': 0,
            'vector_searches': 0,
            'hybrid_searches': 0,
            'traditional_searches': 0,
            'avg_search_time': 0.0,
            'cache_hits': 0,
        }

        # In-memory query cache; entries expire after _cache_ttl seconds.
        self._query_cache = {}
        self._cache_ttl = 300  # 5 minutes
|
|
|
|
async def initialize(self) -> None:
    """Bring up the underlying ChromaDB service.

    Raises:
        SemanticSearchError: If ChromaDB initialization fails.
    """
    try:
        await self.chroma_service.initialize()
    except Exception as e:
        logger.error(f"Failed to initialize semantic search service: {e}")
        raise SemanticSearchError(f"Initialization failed: {e}")
    logger.info("Semantic search service initialized successfully")
|
|
|
|
async def search(
    self,
    query: str,
    video_id: Optional[str] = None,
    search_mode: Optional[str] = None,
    max_results: int = 10,
    similarity_threshold: float = 0.3,
    user_id: Optional[str] = None,
    include_metadata: bool = True
) -> Dict[str, Any]:
    """Perform semantic search across video content.

    Args:
        query: Search query text.
        video_id: Optional filter by specific video.
        search_mode: Search strategy (vector, hybrid, traditional);
            falls back to ``self.default_search_mode`` when omitted.
        max_results: Maximum number of results.
        similarity_threshold: Minimum similarity score for vector search.
        user_id: Optional user ID for analytics.
        include_metadata: Whether to include detailed video metadata.

    Returns:
        Search results with content, scores, and metadata.

    Raises:
        SemanticSearchError: If the search fails for any reason.
    """
    start_time = datetime.now()
    search_mode = search_mode or self.default_search_mode

    try:
        logger.info(f"Starting {search_mode} search for query: '{query[:50]}...'")

        # Serve from the in-memory cache when a fresh entry exists.
        cache_key = self._generate_cache_key(query, video_id, search_mode, max_results)
        cached_result = self._get_cached_result(cache_key)
        if cached_result:
            self.metrics['cache_hits'] += 1
            logger.info("Returning cached search result")
            return cached_result

        # Dispatch to the selected search strategy.
        if search_mode == SearchMode.VECTOR_ONLY:
            results = await self._vector_search(
                query, video_id, max_results, similarity_threshold
            )
        elif search_mode == SearchMode.HYBRID:
            results = await self._hybrid_search(
                query, video_id, max_results, similarity_threshold
            )
        else:  # TRADITIONAL
            results = await self._traditional_search(query, video_id, max_results)

        # Enhance results with video-level metadata if requested.
        if include_metadata:
            results = await self._enhance_results_with_metadata(results)

        # Best-effort analytics logging (the helper never raises).
        await self._log_search_analytics(
            query, search_mode, len(results.get('results', [])),
            user_id, start_time
        )

        # Prepare the final response envelope.
        search_response = {
            'query': query,
            'search_mode': search_mode,
            'video_id': video_id,
            'total_results': len(results.get('results', [])),
            'search_time_seconds': (datetime.now() - start_time).total_seconds(),
            'similarity_threshold': similarity_threshold,
            'results': results.get('results', []),
            'metadata': {
                'cached': False,
                'timestamp': datetime.now().isoformat(),
                'service_metrics': self._get_current_metrics()
            }
        }

        # Cache the response and update the running metrics.
        self._cache_result(cache_key, search_response)
        self._update_metrics(search_mode, start_time)

        return search_response

    except SemanticSearchError:
        # Already our domain error -- re-raise rather than double-wrap.
        raise
    except Exception as e:
        logger.error(f"Search failed: {e}")
        # Chain the original exception so the root cause is preserved.
        raise SemanticSearchError(f"Search failed: {e}") from e
|
|
|
|
async def _vector_search(
|
|
self,
|
|
query: str,
|
|
video_id: Optional[str],
|
|
max_results: int,
|
|
similarity_threshold: float
|
|
) -> Dict[str, Any]:
|
|
"""Perform pure vector similarity search.
|
|
|
|
Args:
|
|
query: Search query
|
|
video_id: Optional video filter
|
|
max_results: Maximum results
|
|
similarity_threshold: Minimum similarity
|
|
|
|
Returns:
|
|
Vector search results
|
|
"""
|
|
try:
|
|
# Perform ChromaDB search
|
|
chroma_results = await self.chroma_service.search_similar(
|
|
query=query,
|
|
video_id=video_id,
|
|
n_results=max_results,
|
|
similarity_threshold=similarity_threshold
|
|
)
|
|
|
|
# Format results
|
|
formatted_results = []
|
|
for result in chroma_results:
|
|
formatted_results.append({
|
|
'content': result['content'],
|
|
'similarity_score': result['similarity_score'],
|
|
'video_id': result['video_id'],
|
|
'chunk_type': result['chunk_type'],
|
|
'start_timestamp': result.get('start_timestamp'),
|
|
'end_timestamp': result.get('end_timestamp'),
|
|
'timestamp_formatted': result.get('timestamp_formatted'),
|
|
'youtube_link': result.get('youtube_link'),
|
|
'rank': result['rank'],
|
|
'search_method': 'vector',
|
|
'metadata': result['metadata']
|
|
})
|
|
|
|
return {
|
|
'results': formatted_results,
|
|
'search_method': 'vector_only',
|
|
'vector_results_count': len(formatted_results)
|
|
}
|
|
|
|
except ChromaDBError as e:
|
|
logger.error(f"Vector search failed: {e}")
|
|
return {'results': [], 'error': str(e)}
|
|
|
|
async def _hybrid_search(
|
|
self,
|
|
query: str,
|
|
video_id: Optional[str],
|
|
max_results: int,
|
|
similarity_threshold: float
|
|
) -> Dict[str, Any]:
|
|
"""Perform hybrid search combining vector and traditional methods.
|
|
|
|
Args:
|
|
query: Search query
|
|
video_id: Optional video filter
|
|
max_results: Maximum results
|
|
similarity_threshold: Minimum similarity
|
|
|
|
Returns:
|
|
Hybrid search results
|
|
"""
|
|
try:
|
|
# Run both searches in parallel
|
|
vector_task = asyncio.create_task(
|
|
self._vector_search(query, video_id, max_results, similarity_threshold)
|
|
)
|
|
traditional_task = asyncio.create_task(
|
|
self._traditional_search(query, video_id, max_results // 2)
|
|
)
|
|
|
|
vector_results, traditional_results = await asyncio.gather(
|
|
vector_task, traditional_task
|
|
)
|
|
|
|
# Combine and rank results
|
|
combined_results = self._combine_and_rank_results(
|
|
vector_results.get('results', []),
|
|
traditional_results.get('results', []),
|
|
max_results
|
|
)
|
|
|
|
return {
|
|
'results': combined_results,
|
|
'search_method': 'hybrid',
|
|
'vector_results_count': len(vector_results.get('results', [])),
|
|
'traditional_results_count': len(traditional_results.get('results', [])),
|
|
'combined_results_count': len(combined_results)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Hybrid search failed: {e}")
|
|
# Fallback to vector search only
|
|
return await self._vector_search(query, video_id, max_results, similarity_threshold)
|
|
|
|
async def _traditional_search(
    self,
    query: str,
    video_id: Optional[str],
    max_results: int
) -> Dict[str, Any]:
    """Perform traditional text-based search using the database.

    Splits the query into lowercase terms and matches chunks whose
    content contains any term (case-insensitive LIKE, SQLite compatible).
    Each hit is scored by term frequency via _calculate_text_relevance.

    Args:
        query: Search query.
        video_id: Optional video filter.
        max_results: Maximum results.

    Returns:
        Dict with 'results', 'search_method', 'traditional_results_count';
        on failure, an empty result list plus an 'error' key.
    """
    try:
        with registry.get_session() as session:
            chunk_query = session.query(RAGChunk)

            if video_id:
                chunk_query = chunk_query.filter(RAGChunk.video_id == video_id)

            # OR together one case-insensitive LIKE clause per query term.
            terms = query.lower().split()
            conditions = [
                func.lower(RAGChunk.content).like(f'%{term}%') for term in terms
            ]
            if conditions:
                chunk_query = chunk_query.filter(or_(*conditions))

            chunks = chunk_query.limit(max_results).all()

            formatted_results = []
            for rank, chunk in enumerate(chunks, start=1):
                entry = {
                    'content': chunk.content,
                    'relevance_score': self._calculate_text_relevance(query, chunk.content),
                    'video_id': chunk.video_id,
                    'chunk_type': chunk.chunk_type,
                    'start_timestamp': chunk.start_timestamp,
                    'end_timestamp': chunk.end_timestamp,
                    'rank': rank,
                    'search_method': 'traditional',
                    'chunk_id': str(chunk.id),
                }

                # Attach a human-readable timestamp and a deep link when
                # the chunk knows where in the video it starts.
                if chunk.start_timestamp is not None:
                    ts = chunk.start_timestamp
                    hours = int(ts // 3600)
                    minutes = int((ts % 3600) // 60)
                    seconds = int(ts % 60)
                    entry['timestamp_formatted'] = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
                    entry['youtube_link'] = f"https://youtube.com/watch?v={chunk.video_id}&t={int(ts)}s"

                formatted_results.append(entry)

            return {
                'results': formatted_results,
                'search_method': 'traditional',
                'traditional_results_count': len(formatted_results),
            }

    except Exception as e:
        logger.error(f"Traditional search failed: {e}")
        return {'results': [], 'error': str(e)}
|
|
|
|
def _combine_and_rank_results(
|
|
self,
|
|
vector_results: List[Dict[str, Any]],
|
|
traditional_results: List[Dict[str, Any]],
|
|
max_results: int
|
|
) -> List[Dict[str, Any]]:
|
|
"""Combine and rank results from different search methods.
|
|
|
|
Args:
|
|
vector_results: Vector search results
|
|
traditional_results: Traditional search results
|
|
max_results: Maximum final results
|
|
|
|
Returns:
|
|
Combined and ranked results
|
|
"""
|
|
combined = {}
|
|
|
|
# Add vector results with boosted scores
|
|
for result in vector_results:
|
|
key = self._get_result_key(result)
|
|
result['combined_score'] = result.get('similarity_score', 0.0) * 1.2 # Boost vector scores
|
|
result['sources'] = ['vector']
|
|
combined[key] = result
|
|
|
|
# Add traditional results, merge if already exists
|
|
for result in traditional_results:
|
|
key = self._get_result_key(result)
|
|
if key in combined:
|
|
# Merge scores from different methods
|
|
existing_score = combined[key]['combined_score']
|
|
new_score = result.get('relevance_score', 0.0) * 0.8
|
|
combined[key]['combined_score'] = max(existing_score, existing_score + new_score * 0.5)
|
|
combined[key]['sources'].append('traditional')
|
|
else:
|
|
result['combined_score'] = result.get('relevance_score', 0.0) * 0.8
|
|
result['sources'] = ['traditional']
|
|
combined[key] = result
|
|
|
|
# Sort by combined score and return top results
|
|
sorted_results = sorted(
|
|
combined.values(),
|
|
key=lambda x: x['combined_score'],
|
|
reverse=True
|
|
)
|
|
|
|
# Re-rank and add final rank
|
|
final_results = []
|
|
for i, result in enumerate(sorted_results[:max_results]):
|
|
result['final_rank'] = i + 1
|
|
result['search_method'] = 'hybrid'
|
|
final_results.append(result)
|
|
|
|
return final_results
|
|
|
|
def _get_result_key(self, result: Dict[str, Any]) -> str:
|
|
"""Generate unique key for result deduplication.
|
|
|
|
Args:
|
|
result: Search result
|
|
|
|
Returns:
|
|
Unique key string
|
|
"""
|
|
video_id = result.get('video_id', '')
|
|
start_time = result.get('start_timestamp', 0) or 0
|
|
content_hash = hash(result.get('content', '')[:100])
|
|
return f"{video_id}:{start_time}:{content_hash}"
|
|
|
|
def _calculate_text_relevance(self, query: str, content: str) -> float:
|
|
"""Calculate relevance score for traditional text search.
|
|
|
|
Args:
|
|
query: Search query
|
|
content: Content to score
|
|
|
|
Returns:
|
|
Relevance score between 0 and 1
|
|
"""
|
|
query_terms = set(query.lower().split())
|
|
content_terms = set(content.lower().split())
|
|
|
|
if not query_terms:
|
|
return 0.0
|
|
|
|
# Simple term frequency scoring
|
|
matches = len(query_terms.intersection(content_terms))
|
|
score = matches / len(query_terms)
|
|
|
|
# Boost for exact phrase matches
|
|
if query.lower() in content.lower():
|
|
score += 0.3
|
|
|
|
return min(score, 1.0)
|
|
|
|
async def _enhance_results_with_metadata(
    self,
    results: Dict[str, Any]
) -> Dict[str, Any]:
    """Enhance search results with video-level metadata from the database.

    Uses a single database session and memoizes one Summary lookup per
    video_id (the previous version opened a new session and re-ran the
    same query for every individual result).

    Args:
        results: Raw search results.

    Returns:
        The results dict with 'video_metadata' attached where available;
        returned unmodified if enhancement fails (best-effort).
    """
    try:
        result_list = results.get('results', [])
        if not result_list:
            return results

        # One session for all lookups; memoize per video_id so many
        # chunks from the same video cost a single query.
        summaries: Dict[str, Any] = {}
        with registry.get_session() as session:
            for result in result_list:
                video_id = result.get('video_id')
                if not video_id:
                    continue

                if video_id not in summaries:
                    summaries[video_id] = session.query(Summary).filter(
                        Summary.video_id == video_id
                    ).first()

                summary = summaries[video_id]
                if summary:
                    result['video_metadata'] = {
                        'title': summary.video_title,
                        # Optional columns -- absent on older schema rows.
                        'duration': getattr(summary, 'video_duration', None),
                        'channel': getattr(summary, 'channel_name', None),
                        'summary_created': summary.created_at.isoformat() if summary.created_at else None
                    }

        return results

    except Exception as e:
        # Enhancement is best-effort; never fail the search over it.
        logger.warning(f"Failed to enhance results with metadata: {e}")
        return results
|
|
|
|
async def _log_search_analytics(
    self,
    query: str,
    search_mode: str,
    results_count: int,
    user_id: Optional[str],
    start_time: datetime
) -> None:
    """Log search analytics (best-effort; never raises).

    Args:
        query: Search query.
        search_mode: Search method used.
        results_count: Number of results returned.
        user_id: Optional user ID (currently unused; reserved for the
            future search_analytics table).
        start_time: Search start time.
    """
    try:
        search_time = (datetime.now() - start_time).total_seconds()

        # This would typically persist to a search_analytics table; for
        # now it only logs. (A uuid that was generated but never used
        # has been removed.)
        logger.info(f"Search analytics: query='{query[:50]}...', mode={search_mode}, "
                    f"results={results_count}, time={search_time:.3f}s")

    except Exception as e:
        logger.warning(f"Failed to log search analytics: {e}")
|
|
|
|
def _generate_cache_key(
|
|
self,
|
|
query: str,
|
|
video_id: Optional[str],
|
|
search_mode: str,
|
|
max_results: int
|
|
) -> str:
|
|
"""Generate cache key for query.
|
|
|
|
Args:
|
|
query: Search query
|
|
video_id: Optional video filter
|
|
search_mode: Search method
|
|
max_results: Maximum results
|
|
|
|
Returns:
|
|
Cache key string
|
|
"""
|
|
key_components = [
|
|
query.lower(),
|
|
video_id or "all",
|
|
search_mode,
|
|
str(max_results)
|
|
]
|
|
return hashlib.sha256("|".join(key_components).encode()).hexdigest()[:16]
|
|
|
|
def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
|
|
"""Get cached search result if valid.
|
|
|
|
Args:
|
|
cache_key: Cache key
|
|
|
|
Returns:
|
|
Cached result or None
|
|
"""
|
|
if cache_key in self._query_cache:
|
|
cached_item = self._query_cache[cache_key]
|
|
if datetime.now() - cached_item['timestamp'] < timedelta(seconds=self._cache_ttl):
|
|
cached_item['data']['metadata']['cached'] = True
|
|
return cached_item['data']
|
|
else:
|
|
del self._query_cache[cache_key]
|
|
return None
|
|
|
|
def _cache_result(self, cache_key: str, result: Dict[str, Any]) -> None:
|
|
"""Cache search result.
|
|
|
|
Args:
|
|
cache_key: Cache key
|
|
result: Search result to cache
|
|
"""
|
|
self._query_cache[cache_key] = {
|
|
'timestamp': datetime.now(),
|
|
'data': result
|
|
}
|
|
|
|
# Cleanup old cache entries (simple LRU)
|
|
if len(self._query_cache) > 100:
|
|
oldest_key = min(
|
|
self._query_cache.keys(),
|
|
key=lambda k: self._query_cache[k]['timestamp']
|
|
)
|
|
del self._query_cache[oldest_key]
|
|
|
|
def _update_metrics(self, search_mode: str, start_time: datetime) -> None:
    """Update per-mode search counters and the running average time.

    Args:
        search_mode: Search method used.
        start_time: Search start time.
    """
    elapsed = (datetime.now() - start_time).total_seconds()

    self.metrics['total_searches'] += 1

    # Bump the counter for whichever mode ran.
    if search_mode == SearchMode.VECTOR_ONLY:
        self.metrics['vector_searches'] += 1
    elif search_mode == SearchMode.HYBRID:
        self.metrics['hybrid_searches'] += 1
    else:
        self.metrics['traditional_searches'] += 1

    # Incrementally recompute the running average search time.
    count = self.metrics['total_searches']
    previous_total = self.metrics['avg_search_time'] * (count - 1)
    self.metrics['avg_search_time'] = (previous_total + elapsed) / count
|
|
|
|
def _get_current_metrics(self) -> Dict[str, Any]:
|
|
"""Get current search metrics.
|
|
|
|
Returns:
|
|
Current metrics dictionary
|
|
"""
|
|
return dict(self.metrics)
|
|
|
|
async def get_search_suggestions(
    self,
    query: str,
    video_id: Optional[str] = None,
    max_suggestions: int = 5
) -> List[str]:
    """Get search suggestions based on a partial query.

    Currently template-based; this would typically draw on search logs
    and content analysis. Queries shorter than 2 characters yield no
    suggestions.

    Args:
        query: Partial query text.
        video_id: Optional video filter (currently unused).
        max_suggestions: Maximum suggestions.

    Returns:
        List of suggested queries (possibly empty).
    """
    try:
        suggestions: List[str] = []

        if len(query) >= 2:
            templates = [
                f"{query} explanation",
                f"{query} examples",
                f"{query} benefits",
                f"how to {query}",
                f"what is {query}",
            ]
            suggestions.extend(templates[:max_suggestions])

        return suggestions[:max_suggestions]

    except Exception as e:
        logger.warning(f"Failed to get search suggestions: {e}")
        return []
|
|
|
|
async def health_check(self) -> Dict[str, Any]:
    """Perform a health check on the search service.

    Checks ChromaDB and database connectivity; overall status is
    'healthy' only when both pass, 'degraded' when either fails, and
    'unhealthy' when the check itself errors out.

    Returns:
        Health check results.
    """
    try:
        # Check ChromaDB health.
        chroma_health = await self.chroma_service.health_check()

        # Check database connectivity with a trivial round-trip query.
        db_healthy = True
        try:
            with registry.get_session() as session:
                # text() is required here: SQLAlchemy 1.4+/2.0 rejects
                # raw SQL strings passed to Session.execute().
                session.execute(text("SELECT 1")).fetchone()
        except Exception as e:
            db_healthy = False
            logger.error(f"Database health check failed: {e}")

        return {
            'status': 'healthy' if chroma_health.get('status') == 'healthy' and db_healthy else 'degraded',
            'chroma_status': chroma_health.get('status'),
            'database_status': 'healthy' if db_healthy else 'unhealthy',
            'metrics': self.metrics,
            'cache_size': len(self._query_cache)
        }

    except Exception as e:
        logger.error(f"Search service health check failed: {e}")
        return {
            'status': 'unhealthy',
            'error': str(e)
        }