505 lines
17 KiB
Markdown
505 lines
17 KiB
Markdown
# Database Schema & Migration History
|
|
|
|
This document tracks all database schema changes, migrations, and evolution for the Trax Media Processing Platform.
|
|
|
|
## Database Overview
|
|
|
|
- **Database**: PostgreSQL with JSONB support
|
|
- **ORM**: SQLAlchemy 2.0+ with async support
|
|
- **Migrations**: Alembic for schema versioning
|
|
- **Registry Pattern**: Prevents SQLAlchemy "multiple classes" errors
|
|
|
|
## Current Schema (Version 2.0.0)
|
|
|
|
### Core Tables
|
|
|
|
```sql
|
|
-- YouTube videos metadata
|
|
CREATE TABLE youtube_videos (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
youtube_id VARCHAR(20) UNIQUE NOT NULL,
|
|
title TEXT NOT NULL,
|
|
channel TEXT NOT NULL,
|
|
description TEXT,
|
|
duration_seconds INTEGER NOT NULL,
|
|
url TEXT NOT NULL,
|
|
metadata_extracted_at TIMESTAMP DEFAULT NOW(),
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
-- Media files (downloaded from YouTube or local)
|
|
CREATE TABLE media_files (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
youtube_video_id UUID REFERENCES youtube_videos(id),
|
|
file_path TEXT NOT NULL,
|
|
file_name TEXT NOT NULL,
|
|
file_size BIGINT NOT NULL,
|
|
duration_seconds FLOAT,
|
|
format_info JSONB,
|
|
download_status VARCHAR(20) DEFAULT 'pending', -- pending, downloading, completed, failed
|
|
download_error TEXT,
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
-- Transcription results (Enhanced for v2)
|
|
CREATE TABLE transcription_results (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
media_file_id UUID REFERENCES media_files(id),
|
|
version VARCHAR(10) NOT NULL DEFAULT 'v1', -- v1, v2, v3, v4
|
|
text_content TEXT,
|
|
enhanced_content TEXT,
|
|
segments JSONB, -- Whisper segments with timestamps
|
|
raw_content JSONB, -- Complete Whisper API response
|
|
accuracy FLOAT,
|
|
processing_time_seconds FLOAT,
|
|
quality_warnings TEXT[],
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW(),
|
|
|
|
-- V2-specific columns (nullable for backward compatibility)
|
|
pipeline_version VARCHAR(20), -- v1, v2, v3, v4
|
|
enhanced_content_v2 JSONB, -- Enhanced transcription content
|
|
diarization_content JSONB, -- Speaker diarization data
|
|
merged_content JSONB, -- Merged content from multiple sources
|
|
domain_used VARCHAR(100), -- Domain-specific processing
|
|
accuracy_estimate FLOAT, -- Estimated accuracy for v2
|
|
speaker_count INTEGER, -- Number of speakers detected
|
|
quality_warnings_v2 JSONB, -- Quality warnings and issues
|
|
processing_metadata JSONB -- Additional processing metadata
|
|
);
|
|
|
|
-- Speaker profiles (NEW in v2)
|
|
CREATE TABLE speaker_profiles (
|
|
id SERIAL PRIMARY KEY,
|
|
name VARCHAR(255) NOT NULL,
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
characteristics JSONB, -- Voice characteristics
|
|
embedding TEXT, -- Speaker embedding (base64 encoded)
|
|
sample_count INTEGER DEFAULT 0, -- Number of samples for this speaker
|
|
user_id INTEGER -- Associated user (optional)
|
|
);
|
|
|
|
-- V2 Processing jobs (NEW in v2)
|
|
CREATE TABLE v2_processing_jobs (
|
|
id SERIAL PRIMARY KEY,
|
|
status VARCHAR(50) NOT NULL DEFAULT 'pending',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
completed_at TIMESTAMP WITH TIME ZONE,
|
|
transcript_id UUID REFERENCES transcription_results(id) ON DELETE CASCADE,
|
|
job_type VARCHAR(50) NOT NULL, -- enhancement, diarization, etc.
|
|
parameters JSONB, -- Job parameters
|
|
progress FLOAT DEFAULT 0, -- Progress percentage (0.0 to 1.0)
|
|
error_message TEXT, -- Error message if failed
|
|
result_data JSONB -- Job result data
|
|
);
|
|
|
|
-- Batch processing jobs
|
|
CREATE TABLE batch_jobs (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
name TEXT NOT NULL,
|
|
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending, running, completed, failed, paused
|
|
total_files INTEGER NOT NULL,
|
|
processed_files INTEGER DEFAULT 0,
|
|
failed_files INTEGER DEFAULT 0,
|
|
worker_count INTEGER DEFAULT 8,
|
|
memory_limit_mb INTEGER DEFAULT 2048,
|
|
cpu_limit_percent INTEGER DEFAULT 90,
|
|
start_time TIMESTAMP,
|
|
end_time TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
-- Batch processing items
|
|
CREATE TABLE batch_items (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
batch_job_id UUID REFERENCES batch_jobs(id),
|
|
task_type VARCHAR(20) NOT NULL, -- transcribe, enhance, youtube, download, preprocess
|
|
task_data JSONB NOT NULL, -- Task-specific data
|
|
priority INTEGER DEFAULT 0,
|
|
status VARCHAR(20) NOT NULL DEFAULT 'pending', -- pending, running, completed, failed, retrying
|
|
retry_count INTEGER DEFAULT 0,
|
|
max_retries INTEGER DEFAULT 3,
|
|
error_message TEXT,
|
|
result_data JSONB, -- Task result data
|
|
processing_time_seconds FLOAT,
|
|
started_at TIMESTAMP,
|
|
completed_at TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
-- Audio processing metadata
|
|
CREATE TABLE audio_processing_metadata (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
media_file_id UUID REFERENCES media_files(id),
|
|
original_format VARCHAR(20),
|
|
original_sample_rate INTEGER,
|
|
original_channels INTEGER,
|
|
processed_format VARCHAR(20) DEFAULT 'wav',
|
|
processed_sample_rate INTEGER DEFAULT 16000,
|
|
processed_channels INTEGER DEFAULT 1,
|
|
preprocessing_time_seconds FLOAT,
|
|
chunk_count INTEGER,
|
|
created_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
-- Export history
|
|
CREATE TABLE exports (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
transcript_id UUID REFERENCES transcripts(id),
|
|
format VARCHAR(10) NOT NULL, -- 'json', 'txt', 'srt'
|
|
file_path TEXT NOT NULL,
|
|
file_size BIGINT,
|
|
created_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
```
|
|
|
|
## Schema Evolution Timeline
|
|
|
|
### Version 2.0.0 - V2 Schema Migration (2024-12-30)
|
|
**Status**: ✅ COMPLETED
|
|
|
|
**New Features**:
|
|
- Speaker profiles for speaker diarization and identification
|
|
- V2 processing jobs for individual transcript processing
|
|
- Enhanced transcription results with v2-specific columns
|
|
- Backward compatibility layer for v1 clients
|
|
- Comprehensive data migration utilities
|
|
|
|
**Schema Changes**:
|
|
```sql
|
|
-- New tables for v2 features
|
|
CREATE TABLE speaker_profiles (
|
|
id SERIAL PRIMARY KEY,
|
|
name VARCHAR(255) NOT NULL,
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
characteristics JSONB,
|
|
embedding TEXT,
|
|
sample_count INTEGER DEFAULT 0,
|
|
user_id INTEGER
|
|
);
|
|
|
|
CREATE TABLE v2_processing_jobs (
|
|
id SERIAL PRIMARY KEY,
|
|
status VARCHAR(50) NOT NULL DEFAULT 'pending',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
completed_at TIMESTAMP WITH TIME ZONE,
|
|
transcript_id UUID REFERENCES transcription_results(id) ON DELETE CASCADE,
|
|
job_type VARCHAR(50) NOT NULL,
|
|
parameters JSONB,
|
|
progress FLOAT DEFAULT 0,
|
|
error_message TEXT,
|
|
result_data JSONB
|
|
);
|
|
|
|
-- Enhanced transcription_results table with v2 columns
|
|
ALTER TABLE transcription_results
|
|
ADD COLUMN pipeline_version VARCHAR(20),
|
|
ADD COLUMN enhanced_content_v2 JSONB,
|
|
ADD COLUMN diarization_content JSONB,
|
|
ADD COLUMN merged_content JSONB,
|
|
ADD COLUMN domain_used VARCHAR(100),
|
|
ADD COLUMN accuracy_estimate FLOAT,
|
|
ADD COLUMN speaker_count INTEGER,
|
|
ADD COLUMN quality_warnings_v2 JSONB,
|
|
ADD COLUMN processing_metadata JSONB;
|
|
```
|
|
|
|
**Key Improvements**:
|
|
- **Backward Compatibility**: All v2 columns are nullable, allowing v1 clients to continue working
|
|
- **Speaker Diarization Support**: Speaker profiles and diarization content storage
|
|
- **Enhanced Processing**: Individual processing jobs with progress tracking
|
|
- **Domain-Specific Processing**: Support for different processing domains
|
|
- **Quality Metrics**: Enhanced accuracy estimation and quality warnings
|
|
- **Metadata Tracking**: Comprehensive processing metadata storage
|
|
|
|
### Version 0.2.0 - Batch Processing System (2024-12-30)
|
|
**Status**: ✅ COMPLETED
|
|
|
|
**New Features**:
|
|
- Batch processing jobs and items tracking
|
|
- Task type support (transcribe, enhance, youtube, download, preprocess)
|
|
- Priority-based task processing
|
|
- Retry mechanism with configurable limits
|
|
- Processing time and error tracking
|
|
- Resource monitoring integration
|
|
|
|
**Schema Changes**:
|
|
```sql
|
|
-- Added batch processing tables
|
|
CREATE TABLE batch_jobs (...);
|
|
CREATE TABLE batch_items (...);
|
|
|
|
-- Enhanced existing tables
|
|
ALTER TABLE batch_items ADD COLUMN task_type VARCHAR(20) NOT NULL;
|
|
ALTER TABLE batch_items ADD COLUMN task_data JSONB NOT NULL;
|
|
ALTER TABLE batch_items ADD COLUMN priority INTEGER DEFAULT 0;
|
|
ALTER TABLE batch_items ADD COLUMN retry_count INTEGER DEFAULT 0;
|
|
ALTER TABLE batch_items ADD COLUMN max_retries INTEGER DEFAULT 3;
|
|
ALTER TABLE batch_items ADD COLUMN result_data JSONB;
|
|
```
|
|
|
|
### Version 0.1.1 - AI Enhancement Service (2024-12-25)
|
|
**Status**: ✅ COMPLETED
|
|
|
|
**New Features**:
|
|
- Enhanced content storage in transcripts table
|
|
- Quality validation and accuracy tracking
|
|
- Processing time and quality warnings
|
|
- Caching support for enhancement results
|
|
|
|
**Schema Changes**:
|
|
```sql
|
|
-- Enhanced transcripts table
|
|
ALTER TABLE transcripts ADD COLUMN enhanced_content TEXT;
|
|
ALTER TABLE transcripts ADD COLUMN accuracy FLOAT;
|
|
ALTER TABLE transcripts ADD COLUMN quality_warnings TEXT[];
|
|
```
|
|
|
|
### Version 0.1.0 - Core Services (2024-12-19)
|
|
**Status**: ✅ COMPLETED
|
|
|
|
**Initial Schema**:
|
|
- YouTube videos metadata extraction
|
|
- Media files download and storage
|
|
- Basic transcription with Whisper API
|
|
- Audio processing metadata tracking
|
|
- Export functionality
|
|
|
|
**Core Tables Created**:
|
|
- `youtube_videos` - YouTube metadata storage
|
|
- `media_files` - Downloaded media file tracking
|
|
- `transcripts` - Transcription results storage
|
|
- `audio_processing_metadata` - Audio processing details
|
|
- `exports` - Export history tracking
|
|
|
|
## Migration History
|
|
|
|
### Migration 004 - V2 Schema Migration (2024-12-30)
|
|
**Status**: ✅ COMPLETED
|
|
|
|
**Implementation Notes**:
|
|
- **Test Database Setup**: Required explicit creation of `trax_test` database
|
|
- **Foreign Key Dependencies**: Tests must create records in dependency order (media_files → transcription_jobs → transcription_results)
|
|
- **Schema Validation**: All 15 schema tests pass, validating v2 structure
|
|
- **Backward Compatibility**: v1 clients can continue working with v2 schema
|
|
- **Helper Methods**: Created `_create_test_transcript()` helper for test data creation
|
|
|
|
**Lessons for Next Time**:
|
|
1. **Test Database Isolation**: Always create separate test database before running schema tests
|
|
2. **Dependency Order**: When testing foreign key relationships, create parent records first
|
|
3. **Schema Matching**: Ensure test expectations match actual database schema (column types, nullability)
|
|
4. **Helper Functions**: Create reusable test helpers for complex data setup
|
|
5. **Migration Testing**: Test both upgrade and downgrade paths for migrations
|
|
|
|
```sql
|
|
-- Create speaker_profiles table
|
|
CREATE TABLE speaker_profiles (
|
|
id SERIAL PRIMARY KEY,
|
|
name VARCHAR(255) NOT NULL,
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
characteristics JSONB,
|
|
embedding TEXT,
|
|
sample_count INTEGER DEFAULT 0,
|
|
user_id INTEGER
|
|
);
|
|
|
|
-- Create v2_processing_jobs table
|
|
CREATE TABLE v2_processing_jobs (
|
|
id SERIAL PRIMARY KEY,
|
|
status VARCHAR(50) NOT NULL DEFAULT 'pending',
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
|
completed_at TIMESTAMP WITH TIME ZONE,
|
|
transcript_id UUID REFERENCES transcription_results(id) ON DELETE CASCADE,
|
|
job_type VARCHAR(50) NOT NULL,
|
|
parameters JSONB,
|
|
progress FLOAT DEFAULT 0,
|
|
error_message TEXT,
|
|
result_data JSONB
|
|
);
|
|
|
|
-- Add v2 columns to transcription_results table
|
|
ALTER TABLE transcription_results
|
|
ADD COLUMN pipeline_version VARCHAR(20),
|
|
ADD COLUMN enhanced_content_v2 JSONB,
|
|
ADD COLUMN diarization_content JSONB,
|
|
ADD COLUMN merged_content JSONB,
|
|
ADD COLUMN domain_used VARCHAR(100),
|
|
ADD COLUMN accuracy_estimate FLOAT,
|
|
ADD COLUMN speaker_count INTEGER,
|
|
ADD COLUMN quality_warnings_v2 JSONB,
|
|
ADD COLUMN processing_metadata JSONB;
|
|
|
|
-- Create indexes for performance
|
|
CREATE INDEX ix_speaker_profiles_name ON speaker_profiles(name);
|
|
CREATE INDEX ix_speaker_profiles_user_id ON speaker_profiles(user_id);
|
|
CREATE INDEX ix_v2_processing_jobs_status ON v2_processing_jobs(status);
|
|
CREATE INDEX ix_v2_processing_jobs_transcript_id ON v2_processing_jobs(transcript_id);
|
|
CREATE INDEX ix_v2_processing_jobs_job_type ON v2_processing_jobs(job_type);
|
|
CREATE INDEX ix_transcription_results_pipeline_version ON transcription_results(pipeline_version);
|
|
CREATE INDEX ix_transcription_results_domain_used ON transcription_results(domain_used);
|
|
CREATE INDEX ix_transcription_results_speaker_count ON transcription_results(speaker_count);
|
|
|
|
-- Update existing transcripts to v1
|
|
UPDATE transcription_results SET pipeline_version = 'v1' WHERE pipeline_version IS NULL;
|
|
```
|
|
|
|
### Migration 003 - Batch Processing Support (2024-12-30)
|
|
```sql
|
|
-- Add batch processing tables
|
|
CREATE TABLE batch_jobs (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
name TEXT NOT NULL,
|
|
status VARCHAR(20) NOT NULL DEFAULT 'pending',
|
|
total_files INTEGER NOT NULL,
|
|
processed_files INTEGER DEFAULT 0,
|
|
failed_files INTEGER DEFAULT 0,
|
|
worker_count INTEGER DEFAULT 8,
|
|
memory_limit_mb INTEGER DEFAULT 2048,
|
|
cpu_limit_percent INTEGER DEFAULT 90,
|
|
start_time TIMESTAMP,
|
|
end_time TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
|
|
CREATE TABLE batch_items (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
batch_job_id UUID REFERENCES batch_jobs(id),
|
|
task_type VARCHAR(20) NOT NULL,
|
|
task_data JSONB NOT NULL,
|
|
priority INTEGER DEFAULT 0,
|
|
status VARCHAR(20) NOT NULL DEFAULT 'pending',
|
|
retry_count INTEGER DEFAULT 0,
|
|
max_retries INTEGER DEFAULT 3,
|
|
error_message TEXT,
|
|
result_data JSONB,
|
|
processing_time_seconds FLOAT,
|
|
started_at TIMESTAMP,
|
|
completed_at TIMESTAMP,
|
|
created_at TIMESTAMP DEFAULT NOW(),
|
|
updated_at TIMESTAMP DEFAULT NOW()
|
|
);
|
|
```
|
|
|
|
### Migration 002 - AI Enhancement Support (2024-12-25)
|
|
```sql
|
|
-- Add enhancement support to transcripts
|
|
ALTER TABLE transcripts ADD COLUMN enhanced_content TEXT;
|
|
ALTER TABLE transcripts ADD COLUMN accuracy FLOAT;
|
|
ALTER TABLE transcripts ADD COLUMN quality_warnings TEXT[];
|
|
```
|
|
|
|
### Migration 001 - Initial Schema (2024-12-19)
|
|
```sql
|
|
-- Create core tables
|
|
CREATE TABLE youtube_videos (...);
|
|
CREATE TABLE media_files (...);
|
|
CREATE TABLE transcripts (...);
|
|
CREATE TABLE audio_processing_metadata (...);
|
|
CREATE TABLE exports (...);
|
|
```
|
|
|
|
## Data Relationships
|
|
|
|
### Core Relationships
|
|
```
|
|
youtube_videos (1) ←→ (many) media_files
|
|
media_files (1) ←→ (many) transcription_results
|
|
transcription_results (1) ←→ (many) exports
|
|
transcription_results (1) ←→ (many) v2_processing_jobs
|
|
batch_jobs (1) ←→ (many) batch_items
|
|
media_files (1) ←→ (1) audio_processing_metadata
|
|
```
|
|
|
|
### V2 Relationships
|
|
```
|
|
speaker_profiles (many) ←→ (1) users (future)
|
|
transcription_results (1) ←→ (many) v2_processing_jobs
|
|
transcription_results (1) ←→ (many) speaker_profiles (via diarization_content)
|
|
```
|
|
|
|
### Task Processing Flow
|
|
```
|
|
batch_jobs → batch_items → task processing → results
|
|
v2_processing_jobs → individual processing → enhanced results
|
|
```
|
|
|
|
## Indexes and Performance
|
|
|
|
### Primary Indexes
|
|
- All tables have UUID primary keys with `gen_random_uuid()` default
|
|
- Foreign key relationships are properly indexed
|
|
- JSONB columns support efficient querying
|
|
|
|
### V2 Performance Optimizations
|
|
- Speaker profile name and user_id indexes for quick lookups
|
|
- Processing job status and type indexes for job management
|
|
- Transcript pipeline version and domain indexes for filtering
|
|
- Speaker count index for analytics queries
|
|
|
|
### Performance Guidelines
|
|
- Connection pooling with appropriate timeouts
|
|
- Async/await pattern throughout data access layer
|
|
- Efficient JSONB queries for flexible data storage
|
|
- Proper indexing for common query patterns
|
|
|
|
## Data Validation
|
|
|
|
### Constraints
|
|
- All timestamps use UTC timezone
|
|
- File paths are validated and normalized
|
|
- JSONB data is validated before storage
|
|
- Status values are constrained to valid options
|
|
|
|
### V2 Business Rules
|
|
- Speaker profiles must have unique names per user
|
|
- Processing jobs must have valid status transitions
|
|
- V2 transcripts maintain backward compatibility with v1
|
|
- Speaker count must be non-negative
|
|
- Progress values must be between 0.0 and 1.0
|
|
|
|
## Backup and Recovery
|
|
|
|
### Backup Strategy
|
|
- Regular PostgreSQL backups
|
|
- JSONB data integrity checks
|
|
- Migration rollback procedures
|
|
- Data export capabilities
|
|
|
|
### V2 Recovery Procedures
|
|
- Point-in-time recovery support
|
|
- Migration rollback scripts with data preservation
|
|
- Data validation and repair tools
|
|
- Integrity check procedures
|
|
- Backward compatibility verification
|
|
|
|
## Future Schema Plans
|
|
|
|
### Version 2.1.0 - Advanced Speaker Features
|
|
- Speaker clustering and identification
|
|
- Voice fingerprinting
|
|
- Speaker confidence scoring
|
|
- Multi-language speaker support
|
|
|
|
### Version 2.2.0 - Enhanced Processing
|
|
- Real-time processing capabilities
|
|
- Advanced quality metrics
|
|
- Processing pipeline optimization
|
|
- Performance monitoring
|
|
|
|
### Version 3.0.0 - Enterprise Features
|
|
- Multi-tenant support
|
|
- Advanced analytics and reporting
|
|
- API rate limiting and quotas
|
|
- Enterprise integration features
|