# youtube-summarizer/backend/models/rag_models.py
"""Database models for RAG (Retrieval-Augmented Generation) functionality."""
from sqlalchemy import Column, String, Integer, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.types import TypeDecorator, CHAR
import uuid
from datetime import datetime
from backend.models.base import Model
class GUID(TypeDecorator):
    """Platform-independent GUID type for SQLite and PostgreSQL compatibility.

    On PostgreSQL the value is stored in the native UUID column type; on
    every other backend it is persisted as a 32-character hex string in a
    CHAR(32) column. Values read back are always ``uuid.UUID`` instances.
    """
    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        # Pick the concrete column type per backend: native UUID on
        # PostgreSQL, plain CHAR(32) hex everywhere else.
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        # Outbound conversion: normalize Python-side values (uuid.UUID or
        # UUID-parseable string) into the dialect's storage representation.
        if value is None:
            return value
        if dialect.name == 'postgresql':
            return str(value)
        as_uuid = value if isinstance(value, uuid.UUID) else uuid.UUID(value)
        # Zero-padded 32-digit hex, no dashes — matches CHAR(32) width.
        return "%.32x" % as_uuid.int

    def process_result_value(self, value, dialect):
        # Inbound conversion: always hand back a uuid.UUID (or None).
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(value)
class RAGChunk(Model):
    """Text chunks for RAG processing and vector embeddings.

    One row per chunk of source text extracted for a YouTube video
    (transcript segment, summary section, agent analysis, or metadata).
    Vector embeddings and search results reference chunks via the
    ``embeddings`` and ``search_results`` relationships.
    """
    __tablename__ = "rag_chunks"

    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    # Optional link to the summary this chunk was derived from.
    summary_id = Column(GUID, ForeignKey("summaries.id"), nullable=True)
    video_id = Column(String(20), nullable=False)  # YouTube video ID

    # Chunk metadata
    chunk_type = Column(String(50), nullable=False)  # transcript, summary, agent_analysis, metadata
    chunk_index = Column(Integer, nullable=False)  # Order within the source document
    start_timestamp = Column(Float)  # For transcript chunks, start time in seconds
    end_timestamp = Column(Float)  # For transcript chunks, end time in seconds

    # Content
    content = Column(Text, nullable=False)  # The actual text content
    content_hash = Column(String(64))  # SHA-256 hash for deduplication
    word_count = Column(Integer)
    character_count = Column(Integer)

    # Preprocessing metadata
    language = Column(String(10), default="en")  # ISO language code, defaults to English
    cleaned_content = Column(Text)  # Preprocessed content for embedding
    keywords = Column(JSON)  # Extracted keywords
    entities = Column(JSON)  # Named entities

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())
    embedding_created_at = Column(DateTime)  # Set once an embedding has been generated
    last_accessed = Column(DateTime)
    access_count = Column(Integer, default=0)

    # Relationships — fully-qualified string paths resolve the mapped classes.
    # delete-orphan cascade: removing a chunk removes its embeddings too.
    summary = relationship("backend.models.summary.Summary")
    embeddings = relationship("backend.models.rag_models.VectorEmbedding", back_populates="chunk", cascade="all, delete-orphan")
    search_results = relationship("backend.models.rag_models.SemanticSearchResult", back_populates="chunk")

    # Indexes for efficient querying
    __table_args__ = (
        Index('ix_rag_chunks_video_id', 'video_id'),
        Index('ix_rag_chunks_type_video', 'chunk_type', 'video_id'),
        Index('ix_rag_chunks_hash', 'content_hash'),
        Index('ix_rag_chunks_timestamps', 'start_timestamp', 'end_timestamp'),
        {'extend_existing': True}
    )

    def __repr__(self):
        return f"<RAGChunk(id={self.id}, video_id={self.video_id}, type={self.chunk_type})>"
class VectorEmbedding(Model):
    """Vector embeddings for semantic search.

    One row per (chunk, embedding model) pair. The vector itself is stored
    as a JSON array so the schema works on SQLite as well as PostgreSQL.
    """
    __tablename__ = "vector_embeddings"

    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    # CASCADE at the DB level so embeddings are dropped with their chunk.
    chunk_id = Column(GUID, ForeignKey("rag_chunks.id", ondelete="CASCADE"), nullable=False)

    # Embedding metadata
    model_name = Column(String(100), nullable=False)  # e.g., 'sentence-transformers/all-MiniLM-L6-v2'
    model_version = Column(String(50))
    embedding_dimension = Column(Integer, nullable=False)

    # Vector data (stored as JSON array for SQLite compatibility)
    embedding_vector = Column(JSON, nullable=False)

    # Embedding quality metrics
    confidence_score = Column(Float)
    norm = Column(Float)  # L2 norm of the vector

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())
    processing_time_ms = Column(Integer)

    # Relationships
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="embeddings")

    # Indexes for efficient vector operations
    __table_args__ = (
        Index('ix_vector_embeddings_chunk_id', 'chunk_id'),
        Index('ix_vector_embeddings_model', 'model_name', 'model_version'),
        {'extend_existing': True}
    )

    def __repr__(self):
        return f"<VectorEmbedding(id={self.id}, model={self.model_name}, dim={self.embedding_dimension})>"
class SemanticSearchResult(Model):
    """Results from semantic search queries.

    One row per (query, chunk) hit, recording the similarity score, rank,
    and optional user feedback for later relevance analysis.
    """
    __tablename__ = "semantic_search_results"

    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    query_id = Column(String(100), nullable=False)  # Identifier for the search query
    chunk_id = Column(GUID, ForeignKey("rag_chunks.id"), nullable=False)

    # Query metadata
    query_text = Column(Text, nullable=False)
    query_embedding = Column(JSON)  # Optional: store query embedding
    query_type = Column(String(50))  # question, keyword, semantic, hybrid

    # Search results
    similarity_score = Column(Float, nullable=False)  # Cosine similarity or other metric
    rank_position = Column(Integer, nullable=False)  # Position in search results (1-based)
    relevance_score = Column(Float)  # Combined relevance score

    # Context enhancement
    context_window = Column(Text)  # Surrounding text for better context
    highlight_spans = Column(JSON)  # Character ranges to highlight

    # User interaction
    user_id = Column(GUID, ForeignKey("users.id"), nullable=True)
    clicked = Column(Boolean, default=False)
    helpful_rating = Column(Integer)  # 1-5 rating from user

    # Metadata
    search_timestamp = Column(DateTime, server_default=func.now())
    response_time_ms = Column(Integer)
    model_used = Column(String(100))

    # Relationships
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="search_results")
    user = relationship("backend.models.user.User")

    # Indexes for efficient search result retrieval
    __table_args__ = (
        Index('ix_search_results_query_id', 'query_id'),
        Index('ix_search_results_similarity', 'similarity_score'),
        Index('ix_search_results_timestamp', 'search_timestamp'),
        Index('ix_search_results_user', 'user_id', 'search_timestamp'),
        {'extend_existing': True}
    )

    def __repr__(self):
        # similarity_score is None on unflushed instances; applying ":.3f"
        # to None raises TypeError, and repr() must never raise.
        score = (
            f"{self.similarity_score:.3f}"
            if self.similarity_score is not None
            else "None"
        )
        return f"<SemanticSearchResult(id={self.id}, query_id={self.query_id}, score={score})>"