"""Database models for RAG (Retrieval-Augmented Generation) functionality."""
|
|
|
|
from sqlalchemy import Column, String, Integer, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON
|
|
from sqlalchemy.orm import relationship
|
|
from sqlalchemy.sql import func
|
|
from sqlalchemy.dialects.postgresql import UUID
|
|
from sqlalchemy.types import TypeDecorator, CHAR
|
|
import uuid
|
|
from datetime import datetime
|
|
|
|
from backend.models.base import Model
|
|
|
|
|
|
class GUID(TypeDecorator):
    """Platform-independent GUID column type.

    Uses PostgreSQL's native UUID type when the active dialect is
    PostgreSQL; on every other backend (e.g. SQLite) the value is stored
    as a 32-character hexadecimal string in a CHAR(32) column. Values
    read back are always returned as ``uuid.UUID`` instances.
    """

    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        """Select the concrete storage type for the active dialect."""
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        """Coerce an outgoing Python value into the dialect's storage form."""
        if value is None:
            return value
        if dialect.name == 'postgresql':
            # The PostgreSQL driver accepts the canonical string form.
            return str(value)
        # Other backends store bare 32-char lowercase hex (no dashes).
        as_uuid = value if isinstance(value, uuid.UUID) else uuid.UUID(value)
        return "%.32x" % as_uuid.int

    def process_result_value(self, value, dialect):
        """Convert a stored value back into a ``uuid.UUID`` (or None)."""
        if value is None:
            return value
        if isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(value)
|
|
|
|
|
|
class RAGChunk(Model):
    """Text chunks for RAG processing and vector embeddings.

    One row per chunk of text derived from a YouTube video — a transcript
    slice, summary fragment, agent analysis, or metadata — that can be
    embedded (see VectorEmbedding) and retrieved via semantic search.
    """
    __tablename__ = "rag_chunks"

    # Dialect-aware UUID primary key (native UUID on PostgreSQL, CHAR(32) elsewhere).
    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    # Optional link back to the summary this chunk was derived from.
    summary_id = Column(GUID, ForeignKey("summaries.id"), nullable=True)
    video_id = Column(String(20), nullable=False)  # YouTube video ID

    # Chunk metadata
    chunk_type = Column(String(50), nullable=False)  # transcript, summary, agent_analysis, metadata
    chunk_index = Column(Integer, nullable=False)  # Order within the source document
    start_timestamp = Column(Float)  # For transcript chunks, start time in seconds
    end_timestamp = Column(Float)  # For transcript chunks, end time in seconds

    # Content
    content = Column(Text, nullable=False)  # The actual text content
    content_hash = Column(String(64))  # SHA-256 hash for deduplication
    word_count = Column(Integer)
    character_count = Column(Integer)

    # Preprocessing metadata
    language = Column(String(10), default="en")  # ISO language code; defaults to English
    cleaned_content = Column(Text)  # Preprocessed content for embedding
    keywords = Column(JSON)  # Extracted keywords
    entities = Column(JSON)  # Named entities

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())  # set by the DB on insert
    embedding_created_at = Column(DateTime)  # when the chunk was last embedded
    last_accessed = Column(DateTime)
    access_count = Column(Integer, default=0)  # usage counter for retrieval analytics

    # Relationships (fully-qualified class paths avoid registry name clashes).
    summary = relationship("backend.models.summary.Summary")
    # Deleting a chunk also deletes its embeddings (delete-orphan cascade).
    embeddings = relationship("backend.models.rag_models.VectorEmbedding", back_populates="chunk", cascade="all, delete-orphan")
    search_results = relationship("backend.models.rag_models.SemanticSearchResult", back_populates="chunk")

    # Indexes for efficient querying
    __table_args__ = (
        Index('ix_rag_chunks_video_id', 'video_id'),
        Index('ix_rag_chunks_type_video', 'chunk_type', 'video_id'),
        Index('ix_rag_chunks_hash', 'content_hash'),
        Index('ix_rag_chunks_timestamps', 'start_timestamp', 'end_timestamp'),
        {'extend_existing': True}
    )

    def __repr__(self):
        """Concise debug representation identifying the chunk and its source video."""
        return f"<RAGChunk(id={self.id}, video_id={self.video_id}, type={self.chunk_type})>"
|
|
|
|
|
|
class VectorEmbedding(Model):
    """Vector embeddings for semantic search.

    One embedding row per (chunk, model) combination; deleted automatically
    when the owning RAGChunk is removed (FK declares ON DELETE CASCADE).
    """
    __tablename__ = "vector_embeddings"

    # Dialect-aware UUID primary key.
    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    chunk_id = Column(GUID, ForeignKey("rag_chunks.id", ondelete="CASCADE"), nullable=False)

    # Embedding metadata
    model_name = Column(String(100), nullable=False)  # e.g., 'sentence-transformers/all-MiniLM-L6-v2'
    model_version = Column(String(50))
    embedding_dimension = Column(Integer, nullable=False)  # length of embedding_vector

    # Vector data (stored as JSON array for SQLite compatibility)
    embedding_vector = Column(JSON, nullable=False)

    # Embedding quality metrics
    confidence_score = Column(Float)
    norm = Column(Float)  # L2 norm of the vector

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())  # set by the DB on insert
    processing_time_ms = Column(Integer)  # embedding generation latency

    # Relationships
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="embeddings")

    # Indexes for efficient vector operations
    __table_args__ = (
        Index('ix_vector_embeddings_chunk_id', 'chunk_id'),
        Index('ix_vector_embeddings_model', 'model_name', 'model_version'),
        {'extend_existing': True}
    )

    def __repr__(self):
        """Concise debug representation with model name and dimensionality."""
        return f"<VectorEmbedding(id={self.id}, model={self.model_name}, dim={self.embedding_dimension})>"
|
|
|
|
|
|
class SemanticSearchResult(Model):
    """Results from semantic search queries.

    One row per (query, chunk) hit, recording the similarity score, rank,
    optional user feedback, and timing metadata for later analysis.
    """
    __tablename__ = "semantic_search_results"

    # Dialect-aware UUID primary key.
    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    query_id = Column(String(100), nullable=False)  # Identifier for the search query
    chunk_id = Column(GUID, ForeignKey("rag_chunks.id"), nullable=False)

    # Query metadata
    query_text = Column(Text, nullable=False)
    query_embedding = Column(JSON)  # Optional: store query embedding
    query_type = Column(String(50))  # question, keyword, semantic, hybrid

    # Search results
    similarity_score = Column(Float, nullable=False)  # Cosine similarity or other metric
    rank_position = Column(Integer, nullable=False)  # Position in search results (1-based)
    relevance_score = Column(Float)  # Combined relevance score

    # Context enhancement
    context_window = Column(Text)  # Surrounding text for better context
    highlight_spans = Column(JSON)  # Character ranges to highlight

    # User interaction (all optional: results may be logged for anonymous queries)
    user_id = Column(GUID, ForeignKey("users.id"), nullable=True)
    clicked = Column(Boolean, default=False)
    helpful_rating = Column(Integer)  # 1-5 rating from user

    # Metadata
    search_timestamp = Column(DateTime, server_default=func.now())  # set by the DB on insert
    response_time_ms = Column(Integer)  # end-to-end search latency
    model_used = Column(String(100))  # embedding/search model that produced this hit

    # Relationships (fully-qualified class paths avoid registry name clashes).
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="search_results")
    user = relationship("backend.models.user.User")

    # Indexes for efficient search result retrieval
    __table_args__ = (
        Index('ix_search_results_query_id', 'query_id'),
        Index('ix_search_results_similarity', 'similarity_score'),
        Index('ix_search_results_timestamp', 'search_timestamp'),
        Index('ix_search_results_user', 'user_id', 'search_timestamp'),
        {'extend_existing': True}
    )

    def __repr__(self):
        """Concise debug representation with query id and similarity score."""
        return f"<SemanticSearchResult(id={self.id}, query_id={self.query_id}, score={self.similarity_score:.3f})>"