# youtube-summarizer/backend/models/chat.py
"""Database models for RAG-powered chat functionality."""
from sqlalchemy import Column, String, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON, Integer
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.types import TypeDecorator, CHAR
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
from enum import Enum
from backend.models.base import Model


class GUID(TypeDecorator):
    """Platform-independent GUID type for SQLite and PostgreSQL compatibility."""

    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        if value is None:
            return value
        elif dialect.name == 'postgresql':
            return str(value)
        else:
            if not isinstance(value, uuid.UUID):
                return "%.32x" % uuid.UUID(value).int
            else:
                return "%.32x" % value.int

    def process_result_value(self, value, dialect):
        if value is None:
            return value
        else:
            if not isinstance(value, uuid.UUID):
                return uuid.UUID(value)
            return value
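
# Usage sketch (illustrative only, not part of this module's schema): a GUID
# column maps to a native UUID on PostgreSQL and to a 32-character hex CHAR on
# SQLite, so the same model definition works against both backends, e.g.
#
#     class ApiToken(Model):          # hypothetical model, for illustration
#         __tablename__ = "api_tokens"
#         id = Column(GUID(), primary_key=True, default=uuid.uuid4)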


class MessageType(str, Enum):
    """Chat message types."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


class ChatSession(Model):
    """Chat session for RAG-powered video conversations."""

    __tablename__ = "chat_sessions"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True)
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # Session metadata
    title = Column(String(200))  # Auto-generated or user-defined
    description = Column(Text)
    session_config = Column(JSON)  # Model settings, search parameters, etc.

    # Session state
    is_active = Column(Boolean, default=True)
    message_count = Column(Integer, default=0)
    total_processing_time = Column(Float, default=0.0)

    # Analytics
    avg_response_time = Column(Float)
    user_satisfaction = Column(Integer)  # 1-5 rating
    feedback_notes = Column(Text)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now())
    last_message_at = Column(DateTime)
    ended_at = Column(DateTime)

    # Relationships
    user = relationship("backend.models.user.User")
    summary = relationship("backend.models.summary.Summary")
    messages = relationship(
        "backend.models.chat.ChatMessage",
        back_populates="session",
        cascade="all, delete-orphan",
    )

    # Indexes
    __table_args__ = (
        Index('ix_chat_sessions_user_id', 'user_id'),
        Index('ix_chat_sessions_video_id', 'video_id'),
        Index('ix_chat_sessions_is_active', 'is_active'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<ChatSession(id={self.id}, video_id={self.video_id}, messages={self.message_count})>"


class ChatMessage(Model):
    """Individual chat message within a session."""

    __tablename__ = "chat_messages"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    session_id = Column(String(36), ForeignKey("chat_sessions.id", ondelete="CASCADE"), nullable=False)

    # Message content
    message_type = Column(String(20), nullable=False)  # user, assistant, system
    content = Column(Text, nullable=False)
    original_query = Column(Text)  # Original user query if this is an assistant response

    # RAG context
    context_chunks = Column(JSON)  # List of chunk IDs used for the response
    sources = Column(JSON)  # Array of {chunk_id, timestamp, relevance_score}
    total_sources = Column(Integer, default=0)

    # AI metadata
    model_used = Column(String(100))
    prompt_tokens = Column(Integer)
    completion_tokens = Column(Integer)
    total_tokens = Column(Integer)
    cost_usd = Column(Float)

    # Processing metadata
    processing_time_seconds = Column(Float)
    search_time_seconds = Column(Float)
    generation_time_seconds = Column(Float)

    # User interaction
    user_rating = Column(Integer)  # 1-5 rating
    user_feedback = Column(Text)
    is_helpful = Column(Boolean)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now())

    # Relationships
    session = relationship("backend.models.chat.ChatSession", back_populates="messages")

    # Indexes
    __table_args__ = (
        Index('ix_chat_messages_session_id', 'session_id'),
        Index('ix_chat_messages_message_type', 'message_type'),
        Index('ix_chat_messages_created_at', 'created_at'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<ChatMessage(id={self.id}, type={self.message_type}, session={self.session_id})>"

    @property
    def formatted_sources(self) -> List[Dict[str, Any]]:
        """Format sources with timestamp links."""
        if not self.sources:
            return []
        formatted = []
        for source in self.sources:
            if isinstance(source, dict):
                chunk_id = source.get('chunk_id')
                timestamp = source.get('timestamp')
                score = source.get('relevance_score', 0.0)
                # Format timestamp as a [HH:MM:SS] link
                if timestamp:
                    hours = int(timestamp // 3600)
                    minutes = int((timestamp % 3600) // 60)
                    seconds = int(timestamp % 60)
                    time_str = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
                else:
                    time_str = "[00:00:00]"
                formatted.append({
                    'chunk_id': chunk_id,
                    'timestamp': timestamp,
                    'timestamp_formatted': time_str,
                    'relevance_score': round(score, 3),
                    'youtube_link': (
                        f"https://youtube.com/watch?v={self.session.video_id}&t={int(timestamp)}s"
                        if timestamp else None
                    ),
                })
        return formatted
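
# Example of the transformation performed by `formatted_sources` (illustrative
# values; the stored `sources` JSON is produced elsewhere in the pipeline):
#
#     sources   = [{"chunk_id": "c1", "timestamp": 125.0, "relevance_score": 0.8731}]
#     formatted = [{"chunk_id": "c1", "timestamp": 125.0,
#                   "timestamp_formatted": "[00:02:05]", "relevance_score": 0.873,
#                   "youtube_link": "https://youtube.com/watch?v=<video_id>&t=125s"}]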


class VideoChunk(Model):
    """Video content chunks for ChromaDB vector storage."""

    __tablename__ = "video_chunks"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # Chunk metadata
    chunk_index = Column(Integer, nullable=False)
    chunk_type = Column(String(50), nullable=False)  # transcript, summary, metadata
    start_timestamp = Column(Float)  # Start time in seconds
    end_timestamp = Column(Float)  # End time in seconds

    # Content
    content = Column(Text, nullable=False)
    content_length = Column(Integer)
    content_hash = Column(String(64))  # For deduplication

    # ChromaDB integration
    chromadb_id = Column(String(100))  # ID in the ChromaDB collection
    embedding_model = Column(String(100))  # Model used for the embedding
    embedding_created_at = Column(DateTime)

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, onupdate=func.now())

    # Relationships
    summary = relationship("backend.models.summary.Summary")

    # Indexes
    __table_args__ = (
        Index('ix_video_chunks_video_id', 'video_id'),
        Index('ix_video_chunks_hash', 'content_hash'),
        Index('ix_video_chunks_timestamps', 'start_timestamp', 'end_timestamp'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<VideoChunk(id={self.id}, video_id={self.video_id}, type={self.chunk_type})>"

    @property
    def timestamp_range(self) -> str:
        """Format the chunk's timestamp range for display."""
        if self.start_timestamp is not None and self.end_timestamp is not None:
            start_h = int(self.start_timestamp // 3600)
            start_m = int((self.start_timestamp % 3600) // 60)
            start_s = int(self.start_timestamp % 60)
            end_h = int(self.end_timestamp // 3600)
            end_m = int((self.end_timestamp % 3600) // 60)
            end_s = int(self.end_timestamp % 60)
            return f"[{start_h:02d}:{start_m:02d}:{start_s:02d}] - [{end_h:02d}:{end_m:02d}:{end_s:02d}]"
        return "[00:00:00] - [00:00:00]"