#!/usr/bin/env python3
"""Migrate existing file-based summaries to database storage."""
import json
import sys
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, List, Optional

# Add parent directory to path so `backend.*` resolves when this script is
# run directly (e.g. `python scripts/migrate_summaries.py`).
sys.path.append(str(Path(__file__).parent.parent))

from backend.services.summary_storage import SummaryStorageService
from backend.services.database_storage_service import database_storage_service

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def _parse_created_at(file_data: Dict[str, Any]) -> datetime:
    """Extract a creation timestamp from file data, defaulting to now (UTC).

    'generated_at' takes precedence over 'file_created_at'; whichever key is
    present first is used, and a malformed or None value falls back to the
    current time rather than failing the migration.
    """
    for key in ('generated_at', 'file_created_at'):
        if key in file_data:
            try:
                return datetime.fromisoformat(file_data[key])
            except (ValueError, TypeError):
                # Malformed/None timestamp -- best effort, use "now".
                return datetime.utcnow()
    return datetime.utcnow()


def convert_file_summary_to_db_format(file_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert file-based summary format to database format.

    Args:
        file_data: Summary data from JSON file.

    Returns:
        Dictionary compatible with the Summary model, or None when the record
        lacks the video_id required to store it.
    """
    # Map file format to database format.  Each .get() chain accepts both the
    # old and new key names used by different versions of the file writer.
    db_data = {
        'video_id': file_data.get('video_id'),
        'video_url': file_data.get('video_url', file_data.get('url')),
        'video_title': file_data.get('title', file_data.get('video_title')),
        'channel_name': file_data.get('channel', file_data.get('channel_name')),
        'duration': file_data.get('duration'),
        'view_count': file_data.get('view_count'),
        'transcript': file_data.get('transcript'),
        'summary': file_data.get('summary', file_data.get('summary_text')),
        'key_points': file_data.get('key_points', []),
        'main_themes': file_data.get('main_themes', []),
        'model_used': file_data.get('model', file_data.get('model_used', 'unknown')),
        'processing_time': file_data.get('processing_time_seconds', file_data.get('processing_time')),
        'quality_score': file_data.get('quality_score'),
        'summary_length': file_data.get('summary_length', 'standard'),
        'focus_areas': file_data.get('focus_areas', []),
        'source': 'migrated_from_file',  # Mark as migrated
        'job_id': file_data.get('job_id'),
    }

    # Handle timestamps (generated_at preferred, then file_created_at).
    db_data['created_at'] = _parse_created_at(file_data)

    # Drop None values so model-level defaults apply in the database.
    db_data = {k: v for k, v in db_data.items() if v is not None}

    # video_id is mandatory -- without it the row cannot be keyed.
    if not db_data.get('video_id'):
        logger.warning(f"Missing video_id in summary data: {file_data}")
        return None

    if not db_data.get('video_url'):
        # video_id is guaranteed present here (checked above), so the URL can
        # always be reconstructed.  (The original also had an unreachable
        # "missing both" branch, removed.)
        db_data['video_url'] = f"https://youtube.com/watch?v={db_data['video_id']}"
        logger.info(f"Generated video_url from video_id: {db_data['video_url']}")

    return db_data


def migrate_summaries():
    """Migrate all file-based summaries to database."""
    logger.info("Starting migration of file-based summaries to database")

    # Initialize file storage service
    storage_service = SummaryStorageService()

    # Get all videos with summaries
    video_ids = storage_service.get_videos_with_summaries()
    logger.info(f"Found {len(video_ids)} videos with summaries")

    if not video_ids:
        logger.info("No file-based summaries found to migrate")
        return

    migrated_count = 0
    skipped_count = 0
    error_count = 0

    for video_id in video_ids:
        logger.info(f"Migrating summaries for video: {video_id}")

        # Get all summaries for this video
        summaries = storage_service.list_summaries(video_id)

        for summary_data in summaries:
            try:
                # Check if already migrated: look up existing rows for this video.
                existing = database_storage_service.get_summary_by_video(
                    summary_data.get('video_id', video_id)
                )

                # Skip when the database already holds a summary with the same
                # text and model.  Use the SAME key fallbacks as the converter
                # ('summary'->'summary_text', 'model'->'model_used') so rows
                # migrated from files using the alternate keys are recognised;
                # the previous check only read 'summary'/'model' and therefore
                # re-migrated such files on every run.
                file_summary_text = summary_data.get('summary', summary_data.get('summary_text'))
                file_model = summary_data.get('model', summary_data.get('model_used'))
                if existing and any(
                    row.summary == file_summary_text and row.model_used == file_model
                    for row in existing
                ):
                    logger.info(f"  Skipping duplicate summary for {video_id}")
                    skipped_count += 1
                    continue

                # Convert to database format
                db_data = convert_file_summary_to_db_format(summary_data)
                if db_data is None:
                    logger.warning(f"  Skipping invalid summary data for {video_id}")
                    skipped_count += 1
                    continue

                # Save to database
                saved_summary = database_storage_service.save_summary_from_dict(db_data)
                logger.info(f"  ✓ Migrated summary {saved_summary.id[:8]} for {video_id}")
                migrated_count += 1

            except Exception as e:
                # Keep going: one bad record must not abort the whole migration.
                logger.error(f"  ✗ Failed to migrate summary for {video_id}: {e}")
                error_count += 1

    logger.info("\nMigration complete:")
    logger.info(f"  Migrated: {migrated_count}")
    logger.info(f"  Skipped: {skipped_count}")
    logger.info(f"  Errors: {error_count}")

    # Show database stats (best effort -- stats failure is not a migration failure).
    try:
        stats = database_storage_service.get_summary_stats()
        logger.info("\nDatabase now contains:")
        logger.info(f"  Total summaries: {stats['total_summaries']}")
        logger.info(f"  Model distribution: {stats['model_distribution']}")
        logger.info(f"  Source distribution: {stats['source_distribution']}")
    except Exception as e:
        logger.error(f"Failed to get database stats: {e}")


def verify_migration():
    """Verify the migration by comparing counts and sampling data."""
    logger.info("\nVerifying migration...")

    try:
        # Get file storage stats
        storage_service = SummaryStorageService()
        file_stats = storage_service.get_summary_stats()

        # Get database stats
        db_stats = database_storage_service.get_summary_stats()

        logger.info(f"File storage: {file_stats['total_summaries']} summaries")
        logger.info(f"Database: {db_stats['total_summaries']} summaries")

        # Sample a few summaries to verify data integrity (first 3 videos).
        video_ids = storage_service.get_videos_with_summaries()[:3]

        for video_id in video_ids:
            file_summaries = storage_service.list_summaries(video_id)
            db_summaries = database_storage_service.get_summary_by_video(video_id)

            logger.info(f"\nVideo {video_id}:")
            logger.info(f"  File: {len(file_summaries)} summaries")
            logger.info(f"  DB: {len(db_summaries)} summaries")

            if file_summaries and db_summaries:
                file_summary = file_summaries[0]
                db_summary = db_summaries[0]
                logger.info(f"  Title match: {file_summary.get('title') == db_summary.video_title}")
                logger.info(f"  Model match: {file_summary.get('model') == db_summary.model_used}")

    except Exception as e:
        logger.error(f"Verification failed: {e}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Migrate file-based summaries to database")
    parser.add_argument("--verify", action="store_true", help="Verify migration after completion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be migrated without doing it")
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - No changes will be made")
        storage_service = SummaryStorageService()
        video_ids = storage_service.get_videos_with_summaries()
        total_summaries = 0
        for video_id in video_ids:
            summaries = storage_service.list_summaries(video_id)
            total_summaries += len(summaries)
            logger.info(f"Would migrate {len(summaries)} summaries for video {video_id}")
        logger.info(f"Total summaries that would be migrated: {total_summaries}")
    else:
        # Run migration
        migrate_summaries()
        if args.verify:
            verify_migration()

    logger.info("\nMigration script completed!")
    logger.info("Summary data is now available in both frontend and CLI interfaces.")