"""Database models for RAG (Retrieval-Augmented Generation) functionality.""" from sqlalchemy import Column, String, Integer, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON from sqlalchemy.orm import relationship from sqlalchemy.sql import func from sqlalchemy.dialects.postgresql import UUID from sqlalchemy.types import TypeDecorator, CHAR import uuid from datetime import datetime from backend.models.base import Model class GUID(TypeDecorator): """Platform-independent GUID type for SQLite and PostgreSQL compatibility.""" impl = CHAR cache_ok = True def load_dialect_impl(self, dialect): if dialect.name == 'postgresql': return dialect.type_descriptor(UUID()) else: return dialect.type_descriptor(CHAR(32)) def process_bind_param(self, value, dialect): if value is None: return value elif dialect.name == 'postgresql': return str(value) else: if not isinstance(value, uuid.UUID): return "%.32x" % uuid.UUID(value).int else: return "%.32x" % value.int def process_result_value(self, value, dialect): if value is None: return value else: if not isinstance(value, uuid.UUID): return uuid.UUID(value) return value class RAGChunk(Model): """Text chunks for RAG processing and vector embeddings.""" __tablename__ = "rag_chunks" id = Column(GUID, primary_key=True, default=uuid.uuid4) summary_id = Column(GUID, ForeignKey("summaries.id"), nullable=True) video_id = Column(String(20), nullable=False) # YouTube video ID # Chunk metadata chunk_type = Column(String(50), nullable=False) # transcript, summary, agent_analysis, metadata chunk_index = Column(Integer, nullable=False) # Order within the source document start_timestamp = Column(Float) # For transcript chunks, start time in seconds end_timestamp = Column(Float) # For transcript chunks, end time in seconds # Content content = Column(Text, nullable=False) # The actual text content content_hash = Column(String(64)) # SHA-256 hash for deduplication word_count = Column(Integer) character_count = Column(Integer) # Preprocessing metadata language = Column(String(10), default="en") cleaned_content = Column(Text) # Preprocessed content for embedding keywords = Column(JSON) # Extracted keywords entities = Column(JSON) # Named entities # Processing metadata created_at = Column(DateTime, server_default=func.now()) embedding_created_at = Column(DateTime) last_accessed = Column(DateTime) access_count = Column(Integer, default=0) # Relationships summary = relationship("backend.models.summary.Summary") embeddings = relationship("backend.models.rag_models.VectorEmbedding", back_populates="chunk", cascade="all, delete-orphan") search_results = relationship("backend.models.rag_models.SemanticSearchResult", back_populates="chunk") # Indexes for efficient querying __table_args__ = ( Index('ix_rag_chunks_video_id', 'video_id'), Index('ix_rag_chunks_type_video', 'chunk_type', 'video_id'), Index('ix_rag_chunks_hash', 'content_hash'), Index('ix_rag_chunks_timestamps', 'start_timestamp', 'end_timestamp'), {'extend_existing': True} ) def __repr__(self): return f"" class VectorEmbedding(Model): """Vector embeddings for semantic search.""" __tablename__ = "vector_embeddings" id = Column(GUID, primary_key=True, default=uuid.uuid4) chunk_id = Column(GUID, ForeignKey("rag_chunks.id", ondelete="CASCADE"), nullable=False) # Embedding metadata model_name = Column(String(100), nullable=False) # e.g., 'sentence-transformers/all-MiniLM-L6-v2' model_version = Column(String(50)) embedding_dimension = Column(Integer, nullable=False) # Vector data (stored as JSON array for SQLite 
    embedding_vector = Column(JSON, nullable=False)

    # Embedding quality metrics
    confidence_score = Column(Float)
    norm = Column(Float)  # L2 norm of the vector

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())
    processing_time_ms = Column(Integer)

    # Relationships
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="embeddings")

    # Indexes for efficient vector operations
    __table_args__ = (
        Index('ix_vector_embeddings_chunk_id', 'chunk_id'),
        Index('ix_vector_embeddings_model', 'model_name', 'model_version'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return (
            f"<VectorEmbedding(id={self.id}, chunk_id={self.chunk_id}, "
            f"model_name={self.model_name}, dimension={self.embedding_dimension})>"
        )


class SemanticSearchResult(Model):
    """Results from semantic search queries."""

    __tablename__ = "semantic_search_results"

    id = Column(GUID, primary_key=True, default=uuid.uuid4)
    query_id = Column(String(100), nullable=False)  # Identifier for the search query
    chunk_id = Column(GUID, ForeignKey("rag_chunks.id"), nullable=False)

    # Query metadata
    query_text = Column(Text, nullable=False)
    query_embedding = Column(JSON)  # Optional: store query embedding
    query_type = Column(String(50))  # question, keyword, semantic, hybrid

    # Search results
    similarity_score = Column(Float, nullable=False)  # Cosine similarity or other metric
    rank_position = Column(Integer, nullable=False)  # Position in search results (1-based)
    relevance_score = Column(Float)  # Combined relevance score

    # Context enhancement
    context_window = Column(Text)  # Surrounding text for better context
    highlight_spans = Column(JSON)  # Character ranges to highlight

    # User interaction
    user_id = Column(GUID, ForeignKey("users.id"), nullable=True)
    clicked = Column(Boolean, default=False)
    helpful_rating = Column(Integer)  # 1-5 rating from user

    # Metadata
    search_timestamp = Column(DateTime, server_default=func.now())
    response_time_ms = Column(Integer)
    model_used = Column(String(100))

    # Relationships
    chunk = relationship("backend.models.rag_models.RAGChunk", back_populates="search_results")
    user = relationship("backend.models.user.User")

    # Indexes for efficient search result retrieval
    __table_args__ = (
        Index('ix_search_results_query_id', 'query_id'),
        Index('ix_search_results_similarity', 'similarity_score'),
        Index('ix_search_results_timestamp', 'search_timestamp'),
        Index('ix_search_results_user', 'user_id', 'search_timestamp'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return (
            f"<SemanticSearchResult(id={self.id}, query_id={self.query_id}, "
            f"chunk_id={self.chunk_id}, rank={self.rank_position}, "
            f"similarity={self.similarity_score})>"
        )
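

# --- Illustrative usage sketch (not part of the schema) ----------------------
# Because embedding_vector is stored as a plain JSON array for SQLite
# compatibility, similarity ranking has to happen in Python rather than in the
# database. The sketch below shows one way that could look; the session factory
# import path and the query vector are assumptions, not part of this module.
if __name__ == "__main__":
    import math

    def cosine_similarity(a, b):
        """Cosine similarity between two equal-length lists of floats."""
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        return dot / (norm_a * norm_b)

    # Hypothetical query vector; in practice it comes from the same embedding
    # model recorded in VectorEmbedding.model_name.
    query_vector = [0.1, 0.2, 0.3]

    # from backend.db.session import SessionLocal  # assumed session factory
    # with SessionLocal() as session:
    #     embeddings = (
    #         session.query(VectorEmbedding)
    #         .filter(VectorEmbedding.model_name == "sentence-transformers/all-MiniLM-L6-v2")
    #         .all()
    #     )
    #     ranked = sorted(
    #         embeddings,
    #         key=lambda e: cosine_similarity(query_vector, e.embedding_vector),
    #         reverse=True,
    #     )
    #     top_chunks = [e.chunk for e in ranked[:5]]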