# youtube-summarizer/backend/models/chat.py
"""Database models for RAG-powered chat functionality."""
from sqlalchemy import Column, String, Text, DateTime, Float, Boolean, ForeignKey, Index, JSON, Integer
from sqlalchemy.orm import relationship
from sqlalchemy.sql import func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.types import TypeDecorator, CHAR
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any
from enum import Enum
from backend.models.base import Model


class GUID(TypeDecorator):
    """Platform-independent GUID type for SQLite and PostgreSQL compatibility."""

    impl = CHAR
    cache_ok = True

    def load_dialect_impl(self, dialect):
        if dialect.name == 'postgresql':
            return dialect.type_descriptor(UUID())
        else:
            return dialect.type_descriptor(CHAR(32))

    def process_bind_param(self, value, dialect):
        if value is None:
            return value
        elif dialect.name == 'postgresql':
            return str(value)
        else:
            if not isinstance(value, uuid.UUID):
                return "%.32x" % uuid.UUID(value).int
            else:
                return "%.32x" % value.int

    def process_result_value(self, value, dialect):
        if value is None:
            return value
        else:
            if not isinstance(value, uuid.UUID):
                return uuid.UUID(value)
            return value
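
# Usage sketch (illustrative only, not part of this module's schema): a GUID
# column maps to a native UUID on PostgreSQL and to a 32-character hex CHAR on
# SQLite, so the same model definition works against both backends, e.g.
#
#     class ApiToken(Model):          # hypothetical model, for illustration
#         __tablename__ = "api_tokens"
#         id = Column(GUID(), primary_key=True, default=uuid.uuid4)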


class MessageType(str, Enum):
    """Chat message types."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


class ChatSession(Model):
    """Chat session for RAG-powered video conversations."""

    __tablename__ = "chat_sessions"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    user_id = Column(String(36), ForeignKey("users.id"), nullable=True)
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # Session metadata
    title = Column(String(200))  # Auto-generated or user-defined
    description = Column(Text)
    session_config = Column(JSON)  # Model settings, search parameters, etc.

    # Session state
    is_active = Column(Boolean, default=True)
    message_count = Column(Integer, default=0)
    total_processing_time = Column(Float, default=0.0)

    # Analytics
    avg_response_time = Column(Float)
    user_satisfaction = Column(Integer)  # 1-5 rating
    feedback_notes = Column(Text)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now())
    last_message_at = Column(DateTime)
    ended_at = Column(DateTime)

    # Relationships
    user = relationship("backend.models.user.User")
    summary = relationship("backend.models.summary.Summary")
    messages = relationship(
        "backend.models.chat.ChatMessage",
        back_populates="session",
        cascade="all, delete-orphan",
    )

    # Indexes
    __table_args__ = (
        Index('ix_chat_sessions_user_id', 'user_id'),
        Index('ix_chat_sessions_video_id', 'video_id'),
        Index('ix_chat_sessions_is_active', 'is_active'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<ChatSession(id={self.id}, video_id={self.video_id}, messages={self.message_count})>"


class ChatMessage(Model):
    """Individual chat message within a session."""

    __tablename__ = "chat_messages"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    session_id = Column(String(36), ForeignKey("chat_sessions.id", ondelete="CASCADE"), nullable=False)

    # Message content
    message_type = Column(String(20), nullable=False)  # user, assistant, system
    content = Column(Text, nullable=False)
    original_query = Column(Text)  # Original user query if this is an assistant response

    # RAG context
    context_chunks = Column(JSON)  # List of chunk IDs used for the response
    sources = Column(JSON)  # Array of {chunk_id, timestamp, relevance_score}
    total_sources = Column(Integer, default=0)

    # AI metadata
    model_used = Column(String(100))
    prompt_tokens = Column(Integer)
    completion_tokens = Column(Integer)
    total_tokens = Column(Integer)
    cost_usd = Column(Float)

    # Processing metadata
    processing_time_seconds = Column(Float)
    search_time_seconds = Column(Float)
    generation_time_seconds = Column(Float)

    # User interaction
    user_rating = Column(Integer)  # 1-5 rating
    user_feedback = Column(Text)
    is_helpful = Column(Boolean)

    # Timestamps
    created_at = Column(DateTime, server_default=func.now())

    # Relationships
    session = relationship("backend.models.chat.ChatSession", back_populates="messages")

    # Indexes
    __table_args__ = (
        Index('ix_chat_messages_session_id', 'session_id'),
        Index('ix_chat_messages_message_type', 'message_type'),
        Index('ix_chat_messages_created_at', 'created_at'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<ChatMessage(id={self.id}, type={self.message_type}, session={self.session_id})>"

    @property
    def formatted_sources(self) -> List[Dict[str, Any]]:
        """Format sources with timestamp links."""
        if not self.sources:
            return []
        formatted = []
        for source in self.sources:
            if isinstance(source, dict):
                chunk_id = source.get('chunk_id')
                timestamp = source.get('timestamp')
                score = source.get('relevance_score', 0.0)
                # Format timestamp as a [HH:MM:SS] link
                if timestamp:
                    hours = int(timestamp // 3600)
                    minutes = int((timestamp % 3600) // 60)
                    seconds = int(timestamp % 60)
                    time_str = f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
                else:
                    time_str = "[00:00:00]"
                formatted.append({
                    'chunk_id': chunk_id,
                    'timestamp': timestamp,
                    'timestamp_formatted': time_str,
                    'relevance_score': round(score, 3),
                    'youtube_link': (
                        f"https://youtube.com/watch?v={self.session.video_id}&t={int(timestamp)}s"
                        if timestamp else None
                    ),
                })
        return formatted
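
# Example of the transformation performed by `formatted_sources` (illustrative
# values; the stored `sources` JSON is produced elsewhere in the pipeline):
#
#     sources   = [{"chunk_id": "c1", "timestamp": 125.0, "relevance_score": 0.8731}]
#     formatted = [{"chunk_id": "c1", "timestamp": 125.0,
#                   "timestamp_formatted": "[00:02:05]", "relevance_score": 0.873,
#                   "youtube_link": "https://youtube.com/watch?v=<video_id>&t=125s"}]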


class VideoChunk(Model):
    """Video content chunks for ChromaDB vector storage."""

    __tablename__ = "video_chunks"

    id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()))
    video_id = Column(String(20), nullable=False)  # YouTube video ID
    summary_id = Column(String(36), ForeignKey("summaries.id"), nullable=True)

    # Chunk metadata
    chunk_index = Column(Integer, nullable=False)
    chunk_type = Column(String(50), nullable=False)  # transcript, summary, metadata
    start_timestamp = Column(Float)  # Start time in seconds
    end_timestamp = Column(Float)  # End time in seconds

    # Content
    content = Column(Text, nullable=False)
    content_length = Column(Integer)
    content_hash = Column(String(64))  # For deduplication

    # ChromaDB integration
    chromadb_id = Column(String(100))  # ID in the ChromaDB collection
    embedding_model = Column(String(100))  # Model used for the embedding
    embedding_created_at = Column(DateTime)

    # Processing metadata
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, onupdate=func.now())

    # Relationships
    summary = relationship("backend.models.summary.Summary")

    # Indexes
    __table_args__ = (
        Index('ix_video_chunks_video_id', 'video_id'),
        Index('ix_video_chunks_hash', 'content_hash'),
        Index('ix_video_chunks_timestamps', 'start_timestamp', 'end_timestamp'),
        {'extend_existing': True},
    )

    def __repr__(self):
        return f"<VideoChunk(id={self.id}, video_id={self.video_id}, type={self.chunk_type})>"

    @property
    def timestamp_range(self) -> str:
        """Format the chunk's timestamp range for display."""
        if self.start_timestamp is not None and self.end_timestamp is not None:
            start_h = int(self.start_timestamp // 3600)
            start_m = int((self.start_timestamp % 3600) // 60)
            start_s = int(self.start_timestamp % 60)
            end_h = int(self.end_timestamp // 3600)
            end_m = int((self.end_timestamp % 3600) // 60)
            end_s = int(self.end_timestamp % 60)
            return f"[{start_h:02d}:{start_m:02d}:{start_s:02d}] - [{end_h:02d}:{end_m:02d}:{end_s:02d}]"
        return "[00:00:00] - [00:00:00]"