239 lines
8.5 KiB
Python
239 lines
8.5 KiB
Python
"""Database models for RAG-powered chat functionality."""
|
|
|
|
from sqlalchemy import Column, String, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON, Integer
|
|
from sqlalchemy.orm import relationship
|
|
from sqlalchemy.sql import func
|
|
from sqlalchemy.dialects.postgresql import UUID
|
|
from sqlalchemy.types import TypeDecorator, CHAR
|
|
import uuid
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any
|
|
from enum import Enum
|
|
|
|
from backend.models.base import Model
|
|
|
|
|
|
class GUID(TypeDecorator):
    """Portable GUID column type.

    On PostgreSQL the value is stored with the native ``UUID`` type; on every
    other dialect it is stored in a ``CHAR(32)`` column as 32 hexadecimal
    digits. Values read back are returned as ``uuid.UUID`` objects.
    """

    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        """Pick the concrete storage type for ``dialect``."""
        if dialect.name == 'postgresql':
            backing_type = UUID()
        else:
            backing_type = CHAR(32)
        return dialect.type_descriptor(backing_type)

    def process_bind_param(self, value, dialect):
        """Convert an outgoing Python value into its database form.

        ``None`` passes through untouched. PostgreSQL receives ``str(value)``;
        other dialects receive the 32-digit zero-padded hex of the UUID.
        """
        if value is None:
            return value
        if dialect.name == 'postgresql':
            return str(value)
        # Anything that is not already a UUID is parsed into one first, so
        # both input kinds share a single formatting step.
        as_uuid = value if isinstance(value, uuid.UUID) else uuid.UUID(value)
        return "%.32x" % as_uuid.int

    def process_result_value(self, value, dialect):
        """Convert a value loaded from the database into ``uuid.UUID``.

        ``None`` and values that are already ``uuid.UUID`` are returned as-is.
        """
        if value is None or isinstance(value, uuid.UUID):
            return value
        return uuid.UUID(value)
|
|
|
|
|
|
class MessageType(str, Enum):
    """Kinds of message that can appear in a chat.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``MessageType.USER == "user"``).
    """

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"
|
|
|
|
|
|
class ChatSession(Model):
    """A RAG-powered conversation about a single video.

    Rows live in the ``chat_sessions`` table. A session may reference a user
    and a summary (both foreign keys are nullable) and owns its messages:
    the ``messages`` relationship cascades all operations, including
    delete-orphan.
    """

    __tablename__ = "chat_sessions"

    # --- Identity and ownership -------------------------------------------
    # Primary key: a freshly generated UUID4 rendered as a string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True)
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # --- Descriptive metadata ---------------------------------------------
    title = Column(String(200))  # Auto-generated or user-defined
    description = Column(Text)
    session_config = Column(JSON)  # Model settings, search parameters, etc.

    # --- Running state ----------------------------------------------------
    is_active = Column(Boolean, default=True)
    message_count = Column(Integer, default=0)
    total_processing_time = Column(Float, default=0.0)

    # --- Analytics --------------------------------------------------------
    avg_response_time = Column(Float)
    user_satisfaction = Column(Integer)  # 1-5 rating
    feedback_notes = Column(Text)

    # --- Timestamps -------------------------------------------------------
    # created_at is filled in by the database server at insert time.
    created_at = Column(DateTime, server_default=func.now())
    last_message_at = Column(DateTime)
    ended_at = Column(DateTime)

    # --- Relationships ----------------------------------------------------
    user = relationship("backend.models.user.User")
    summary = relationship("backend.models.summary.Summary")
    messages = relationship(
        "backend.models.chat.ChatMessage",
        back_populates="session",
        cascade="all, delete-orphan",
    )

    # --- Indexes and table options ----------------------------------------
    __table_args__ = (
        Index('ix_chat_sessions_user_id', 'user_id'),
        Index('ix_chat_sessions_video_id', 'video_id'),
        Index('ix_chat_sessions_is_active', 'is_active'),
        {'extend_existing': True},
    )

    def __repr__(self):
        """Return a short debug string with the id, video id and message count."""
        details = f"id={self.id}, video_id={self.video_id}, messages={self.message_count}"
        return f"<ChatSession({details})>"
|
|
|
|
|
|
class ChatMessage(Model):
    """Individual chat message within a session.

    Rows live in the ``chat_messages`` table and belong to exactly one
    ``ChatSession`` through the non-nullable ``session_id`` foreign key, which
    is declared with ``ON DELETE CASCADE``.
    """

    __tablename__ = "chat_messages"

    # Primary key: a freshly generated UUID4 rendered as a string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    session_id = Column(String(36), ForeignKey("chat_sessions.id", ondelete="CASCADE"), nullable=False)

    # Message content
    message_type = Column(String(20), nullable=False)  # user, assistant, system
    content = Column(Text, nullable=False)
    original_query = Column(Text)  # Original user query if this is an assistant response

    # RAG context
    context_chunks = Column(JSON)  # List of chunk IDs used for response
    sources = Column(JSON)  # Array of {chunk_id, timestamp, relevance_score}
    total_sources = Column(Integer, default=0)

    # AI metadata
    model_used = Column(String(100))
    prompt_tokens = Column(Integer)
    completion_tokens = Column(Integer)
    total_tokens = Column(Integer)
    cost_usd = Column(Float)

    # Processing metadata
    processing_time_seconds = Column(Float)
    search_time_seconds = Column(Float)
    generation_time_seconds = Column(Float)

    # User interaction
    user_rating = Column(Integer)  # 1-5 thumbs up/down
    user_feedback = Column(Text)
    is_helpful = Column(Boolean)

    # Timestamps (created_at is filled in by the database server at insert time)
    created_at = Column(DateTime, server_default=func.now())

    # Relationships
    session = relationship("backend.models.chat.ChatSession", back_populates="messages")

    # Indexes
    __table_args__ = (
        Index('ix_chat_messages_session_id', 'session_id'),
        Index('ix_chat_messages_message_type', 'message_type'),
        Index('ix_chat_messages_created_at', 'created_at'),
        {'extend_existing': True},
    )

    def __repr__(self):
        """Return a short debug string with the id, type and session id."""
        return f"<ChatMessage(id={self.id}, type={self.message_type}, session={self.session_id})>"

    @staticmethod
    def _format_hms(total_seconds) -> str:
        """Render a number of seconds as a bracketed ``[HH:MM:SS]`` label."""
        hours = int(total_seconds // 3600)
        minutes = int((total_seconds % 3600) // 60)
        seconds = int(total_seconds % 60)
        return f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"

    @property
    def formatted_sources(self) -> List[Dict[str, Any]]:
        """Format sources with timestamp links.

        Returns:
            One dict per dict-typed entry in ``self.sources`` with the keys
            ``chunk_id``, ``timestamp``, ``timestamp_formatted``,
            ``relevance_score`` (rounded to 3 decimals) and ``youtube_link``.
            Entries that are not dicts are skipped. An empty list is returned
            when ``self.sources`` is empty or ``None``.

        Notes:
            * A timestamp of ``0`` is a valid position (the start of the
              video) and produces a link; only a missing/``None`` timestamp
              yields ``youtube_link=None``.
            * A missing or ``None`` relevance score is reported as ``0.0``.
            * ``youtube_link`` is ``None`` when the message has no session or
              the session has no video id, instead of raising.
        """
        if not self.sources:
            return []

        formatted = []
        for source in self.sources:
            if not isinstance(source, dict):
                continue

            timestamp = source.get('timestamp')

            # The key may be present with an explicit null; round(None) would raise.
            score = source.get('relevance_score')
            if score is None:
                score = 0.0

            # Compare against None rather than truthiness so 0 / 0.0 seconds
            # is still treated as a real timestamp.
            if timestamp is not None:
                time_str = self._format_hms(timestamp)
                # Touch the relationship only when a link is actually needed.
                session = self.session
                video_id = session.video_id if session is not None else None
                if video_id:
                    youtube_link = f"https://youtube.com/watch?v={video_id}&t={int(timestamp)}s"
                else:
                    youtube_link = None
            else:
                time_str = "[00:00:00]"
                youtube_link = None

            formatted.append({
                'chunk_id': source.get('chunk_id'),
                'timestamp': timestamp,
                'timestamp_formatted': time_str,
                'relevance_score': round(score, 3),
                'youtube_link': youtube_link,
            })

        return formatted
|
|
|
|
|
|
class VideoChunk(Model):
    """A piece of video content stored for ChromaDB vector search.

    Rows live in the ``video_chunks`` table. Each chunk records its position
    (``chunk_index``), its kind (``chunk_type``), an optional time span in
    seconds, the text itself and the bookkeeping for its ChromaDB embedding.
    """

    __tablename__ = "video_chunks"

    # --- Identity ---------------------------------------------------------
    # Primary key: a freshly generated UUID4 rendered as a string.
    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # --- Chunk metadata ---------------------------------------------------
    chunk_index = Column(Integer, nullable=False)
    chunk_type = Column(String(50), nullable=False)  # transcript, summary, metadata
    start_timestamp = Column(Float)  # Start time in seconds
    end_timestamp = Column(Float)  # End time in seconds

    # --- Content ----------------------------------------------------------
    content = Column(Text, nullable=False)
    content_length = Column(Integer)
    content_hash = Column(String(64))  # For deduplication

    # --- ChromaDB integration ---------------------------------------------
    chromadb_id = Column(String(100))  # ID in ChromaDB collection
    embedding_model = Column(String(100))  # Model used for embedding
    embedding_created_at = Column(DateTime)

    # --- Processing metadata ----------------------------------------------
    # created_at is set by the database server on insert; updated_at has no
    # insert default and is only populated on UPDATE.
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, onupdate=func.now())

    # --- Relationships ----------------------------------------------------
    summary = relationship("backend.models.summary.Summary")

    # --- Indexes and table options ----------------------------------------
    __table_args__ = (
        Index('ix_video_chunks_video_id', 'video_id'),
        Index('ix_video_chunks_hash', 'content_hash'),
        Index('ix_video_chunks_timestamps', 'start_timestamp', 'end_timestamp'),
        {'extend_existing': True},
    )

    def __repr__(self):
        """Return a short debug string with the id, video id and chunk type."""
        details = f"id={self.id}, video_id={self.video_id}, type={self.chunk_type}"
        return f"<VideoChunk({details})>"

    @property
    def timestamp_range(self) -> str:
        """Return the chunk's time span as ``[HH:MM:SS] - [HH:MM:SS]``.

        When either bound is ``None`` the all-zero range is returned.
        """
        begin = self.start_timestamp
        finish = self.end_timestamp
        if begin is None or finish is None:
            return "[00:00:00] - [00:00:00]"

        def split_hms(total):
            # Break a number of seconds into whole hours, minutes and seconds.
            return int(total // 3600), int((total % 3600) // 60), int(total % 60)

        start_h, start_m, start_s = split_hms(begin)
        end_h, end_m, end_s = split_hms(finish)
        return f"[{start_h:02d}:{start_m:02d}:{start_s:02d}] - [{end_h:02d}:{end_m:02d}:{end_s:02d}]"