# trax/migrations/versions/20241230_add_v2_schema.py

"""Add v2 schema

Revision ID: 20241230_add_v2_schema
Revises: dcdfa10e65bd
Create Date: 2024-12-30 10:00:00.000000

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB

# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration it revises (the "Revises:" entry in the module docstring).
revision = '20241230_add_v2_schema'
down_revision = 'dcdfa10e65bd'
# No branch labels and no dependencies on other revisions are declared.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Apply the v2 schema.

    Steps, in order:

    1. Create ``speaker_profiles`` and index ``name`` and ``user_id``.
    2. Create ``v2_processing_jobs``, index ``status``, ``transcript_id``
       and ``job_type``, then add a foreign key from ``transcript_id`` to
       ``transcription_results.id`` with ``ON DELETE CASCADE``.
    3. Add the nullable v2 columns to ``transcription_results`` and index
       ``pipeline_version``, ``domain_used`` and ``speaker_count``.
    4. Set ``pipeline_version`` to ``'v1'`` on every row where it is NULL.
    """

    def timestamp_column(column_name):
        # Timezone-aware timestamp whose server default is CURRENT_TIMESTAMP.
        return sa.Column(
            column_name,
            sa.TIMESTAMP(timezone=True),
            server_default=sa.text('CURRENT_TIMESTAMP'),
        )

    def create_indexes(table_name, indexed_columns):
        # One single-column index per (index_name, column_name) pair.
        for index_name, column_name in indexed_columns:
            op.create_index(index_name, table_name, [column_name])

    # --- speaker_profiles -------------------------------------------------
    profile_columns = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(255), nullable=False),
        timestamp_column('created_at'),
        timestamp_column('updated_at'),
        sa.Column('characteristics', JSONB, nullable=True),
        sa.Column('embedding', sa.Text(), nullable=True),
        sa.Column('sample_count', sa.Integer(), server_default='0'),
        sa.Column('user_id', sa.Integer(), nullable=True),
    ]
    op.create_table(
        'speaker_profiles',
        *profile_columns,
        sa.PrimaryKeyConstraint('id'),
    )
    create_indexes(
        'speaker_profiles',
        (
            ('ix_speaker_profiles_name', 'name'),
            ('ix_speaker_profiles_user_id', 'user_id'),
        ),
    )

    # --- v2_processing_jobs -----------------------------------------------
    job_columns = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('status', sa.String(50), server_default='pending', nullable=False),
        timestamp_column('created_at'),
        timestamp_column('updated_at'),
        sa.Column('completed_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('transcript_id', sa.UUID(), nullable=True),
        sa.Column('job_type', sa.String(50), nullable=False),
        sa.Column('parameters', JSONB, nullable=True),
        sa.Column('progress', sa.Float(), server_default='0'),
        sa.Column('error_message', sa.Text(), nullable=True),
        sa.Column('result_data', JSONB, nullable=True),
    ]
    op.create_table(
        'v2_processing_jobs',
        *job_columns,
        sa.PrimaryKeyConstraint('id'),
    )
    create_indexes(
        'v2_processing_jobs',
        (
            ('ix_v2_processing_jobs_status', 'status'),
            ('ix_v2_processing_jobs_transcript_id', 'transcript_id'),
            ('ix_v2_processing_jobs_job_type', 'job_type'),
        ),
    )

    # The foreign key is added as a separate step after the table and its
    # indexes exist; deleting a transcription result cascades to its jobs.
    op.create_foreign_key(
        'fk_v2_processing_jobs_transcript_id',
        'v2_processing_jobs',
        'transcription_results',
        ['transcript_id'],
        ['id'],
        ondelete='CASCADE',
    )

    # --- transcription_results: new v2 columns ----------------------------
    # Every added column is nullable, so existing rows need no value.
    v2_columns = (
        ('pipeline_version', sa.String(20)),
        ('enhanced_content', JSONB),
        ('diarization_content', JSONB),
        ('merged_content', JSONB),
        ('domain_used', sa.String(100)),
        ('accuracy_estimate', sa.Float()),
        ('speaker_count', sa.Integer()),
        ('quality_warnings', JSONB),
        ('processing_metadata', JSONB),
    )
    for column_name, column_type in v2_columns:
        op.add_column(
            'transcription_results',
            sa.Column(column_name, column_type, nullable=True),
        )
    create_indexes(
        'transcription_results',
        (
            ('ix_transcription_results_pipeline_version', 'pipeline_version'),
            ('ix_transcription_results_domain_used', 'domain_used'),
            ('ix_transcription_results_speaker_count', 'speaker_count'),
        ),
    )

    # --- backfill ---------------------------------------------------------
    # Rows that existed before this migration have no pipeline version yet;
    # label them 'v1'.
    backfill_v1_sql = """
        UPDATE transcription_results
        SET pipeline_version = 'v1'
        WHERE pipeline_version IS NULL
    """
    op.execute(backfill_v1_sql)


def downgrade() -> None:
    """Revert the v2 schema.

    Undoes ``upgrade()`` in reverse order: drops the v2 indexes and columns
    on ``transcription_results``, then the foreign key, indexes and table
    for ``v2_processing_jobs``, and finally the indexes and table for
    ``speaker_profiles``. Dropping the columns and tables discards whatever
    data they hold.
    """
    results_table = 'transcription_results'
    jobs_table = 'v2_processing_jobs'
    profiles_table = 'speaker_profiles'

    # transcription_results: indexes go first, then the columns they cover.
    for index_name in (
        'ix_transcription_results_speaker_count',
        'ix_transcription_results_domain_used',
        'ix_transcription_results_pipeline_version',
    ):
        op.drop_index(index_name, results_table)

    for column_name in (
        'processing_metadata',
        'quality_warnings',
        'speaker_count',
        'accuracy_estimate',
        'domain_used',
        'merged_content',
        'diarization_content',
        'enhanced_content',
        'pipeline_version',
    ):
        op.drop_column(results_table, column_name)

    # v2_processing_jobs: foreign key, then indexes, then the table itself.
    op.drop_constraint(
        'fk_v2_processing_jobs_transcript_id',
        jobs_table,
        type_='foreignkey',
    )
    for index_name in (
        'ix_v2_processing_jobs_job_type',
        'ix_v2_processing_jobs_transcript_id',
        'ix_v2_processing_jobs_status',
    ):
        op.drop_index(index_name, jobs_table)
    op.drop_table(jobs_table)

    # speaker_profiles: indexes, then the table itself.
    for index_name in (
        'ix_speaker_profiles_user_id',
        'ix_speaker_profiles_name',
    ):
        op.drop_index(index_name, profiles_table)
    op.drop_table(profiles_table)