#!/usr/bin/env python3
"""Migrate existing file-based summaries to database storage."""
import sys
import logging
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, Optional

# Add the repository root (parent of scripts/) to the path so the
# backend package is importable
sys.path.append(str(Path(__file__).parent.parent))

from backend.services.summary_storage import SummaryStorageService
from backend.services.database_storage_service import database_storage_service

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def convert_file_summary_to_db_format(file_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert file-based summary format to database format.

    Args:
        file_data: Summary data loaded from a JSON file

    Returns:
        Dictionary compatible with the Summary model, or None if required
        fields are missing.
    """
    # Map file format to database format; older files use alternate key
    # names (e.g. 'url', 'title', 'model'), so fall back accordingly
    db_data = {
        'video_id': file_data.get('video_id'),
        'video_url': file_data.get('video_url', file_data.get('url')),
        'video_title': file_data.get('title', file_data.get('video_title')),
        'channel_name': file_data.get('channel', file_data.get('channel_name')),
        'duration': file_data.get('duration'),
        'view_count': file_data.get('view_count'),
        'transcript': file_data.get('transcript'),
        'summary': file_data.get('summary', file_data.get('summary_text')),
        'key_points': file_data.get('key_points', []),
        'main_themes': file_data.get('main_themes', []),
        'model_used': file_data.get('model', file_data.get('model_used', 'unknown')),
        'processing_time': file_data.get('processing_time_seconds', file_data.get('processing_time')),
        'quality_score': file_data.get('quality_score'),
        'summary_length': file_data.get('summary_length', 'standard'),
        'focus_areas': file_data.get('focus_areas', []),
        'source': 'migrated_from_file',  # Mark as migrated
        'job_id': file_data.get('job_id'),
    }

    # Handle timestamps: prefer 'generated_at', fall back to
    # 'file_created_at', and default to the current time
    if 'generated_at' in file_data:
        try:
            db_data['created_at'] = datetime.fromisoformat(file_data['generated_at'])
        except (ValueError, TypeError):
            db_data['created_at'] = datetime.utcnow()
    elif 'file_created_at' in file_data:
        try:
            db_data['created_at'] = datetime.fromisoformat(file_data['file_created_at'])
        except (ValueError, TypeError):
            db_data['created_at'] = datetime.utcnow()
    else:
        db_data['created_at'] = datetime.utcnow()
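
    # Note: on Python < 3.11, datetime.fromisoformat() rejects a trailing
    # 'Z' (UTC) suffix; such values raise ValueError above and fall back to
    # the current time rather than the original timestamp.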

    # Clean up None values so the model's defaults apply
    db_data = {k: v for k, v in db_data.items() if v is not None}

    # Ensure required fields are present
    if not db_data.get('video_id'):
        logger.warning(f"Missing video_id in summary data: {file_data}")
        return None
    if not db_data.get('video_url'):
        # video_id is guaranteed non-empty here, so a watch URL can be built
        db_data['video_url'] = f"https://youtube.com/watch?v={db_data['video_id']}"
        logger.info(f"Generated video_url from video_id: {db_data['video_url']}")

    return db_data
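
# Illustrative example of the mapping above (hypothetical sample values,
# output abridged):
#
#   convert_file_summary_to_db_format({
#       "video_id": "dQw4w9WgXcQ",
#       "title": "Demo video",
#       "model": "gpt-4o-mini",
#       "summary": "A short demo.",
#       "generated_at": "2024-01-15T10:30:00",
#   })
#   # -> {'video_id': 'dQw4w9WgXcQ', 'video_title': 'Demo video',
#   #     'model_used': 'gpt-4o-mini', 'summary': 'A short demo.',
#   #     'video_url': 'https://youtube.com/watch?v=dQw4w9WgXcQ',
#   #     'source': 'migrated_from_file',
#   #     'created_at': datetime(2024, 1, 15, 10, 30), ...}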


def migrate_summaries():
    """Migrate all file-based summaries to the database."""
    logger.info("Starting migration of file-based summaries to database")

    # Initialize file storage service
    storage_service = SummaryStorageService()

    # Get all videos with summaries
    video_ids = storage_service.get_videos_with_summaries()
    logger.info(f"Found {len(video_ids)} videos with summaries")

    if not video_ids:
        logger.info("No file-based summaries found to migrate")
        return

    migrated_count = 0
    skipped_count = 0
    error_count = 0

    for video_id in video_ids:
        logger.info(f"Migrating summaries for video: {video_id}")

        # Get all summaries for this video
        summaries = storage_service.list_summaries(video_id)

        for summary_data in summaries:
            try:
                # Look up existing database summaries for this video
                existing = database_storage_service.get_summary_by_video(
                    summary_data.get('video_id', video_id)
                )

                # Skip if the database already has a summary for this video
                # with the same content
                if existing:
                    should_skip = False
                    for existing_summary in existing:
                        # Match on summary text plus model, mirroring the
                        # fallback key names handled by the converter
                        if (existing_summary.summary == summary_data.get('summary') and
                                existing_summary.model_used == summary_data.get('model', summary_data.get('model_used'))):
                            logger.info(f"  Skipping duplicate summary for {video_id}")
                            should_skip = True
                            break
                    if should_skip:
                        skipped_count += 1
                        continue

                # Convert to database format
                db_data = convert_file_summary_to_db_format(summary_data)
                if db_data is None:
                    logger.warning(f"  Skipping invalid summary data for {video_id}")
                    skipped_count += 1
                    continue

                # Save to database
                saved_summary = database_storage_service.save_summary_from_dict(db_data)
                logger.info(f"  ✓ Migrated summary {saved_summary.id[:8]} for {video_id}")
                migrated_count += 1
            except Exception as e:
                logger.error(f"  ✗ Failed to migrate summary for {video_id}: {e}")
                error_count += 1

    logger.info("\nMigration complete:")
    logger.info(f"  Migrated: {migrated_count}")
    logger.info(f"  Skipped: {skipped_count}")
    logger.info(f"  Errors: {error_count}")

    # Show database stats
    try:
        stats = database_storage_service.get_summary_stats()
        logger.info("\nDatabase now contains:")
        logger.info(f"  Total summaries: {stats['total_summaries']}")
        logger.info(f"  Model distribution: {stats['model_distribution']}")
        logger.info(f"  Source distribution: {stats['source_distribution']}")
    except Exception as e:
        logger.error(f"Failed to get database stats: {e}")


def verify_migration():
    """Verify the migration by comparing counts and sampling data."""
    logger.info("\nVerifying migration...")

    try:
        # Get file storage stats
        storage_service = SummaryStorageService()
        file_stats = storage_service.get_summary_stats()

        # Get database stats
        db_stats = database_storage_service.get_summary_stats()

        logger.info(f"File storage: {file_stats['total_summaries']} summaries")
        logger.info(f"Database: {db_stats['total_summaries']} summaries")

        # Sample the first three videos to spot-check data integrity
        video_ids = storage_service.get_videos_with_summaries()[:3]
        for video_id in video_ids:
            file_summaries = storage_service.list_summaries(video_id)
            db_summaries = database_storage_service.get_summary_by_video(video_id)

            logger.info(f"\nVideo {video_id}:")
            logger.info(f"  File: {len(file_summaries)} summaries")
            logger.info(f"  DB: {len(db_summaries)} summaries")

            if file_summaries and db_summaries:
                file_summary = file_summaries[0]
                db_summary = db_summaries[0]
                logger.info(f"  Title match: {file_summary.get('title') == db_summary.video_title}")
                logger.info(f"  Model match: {file_summary.get('model') == db_summary.model_used}")
    except Exception as e:
        logger.error(f"Verification failed: {e}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Migrate file-based summaries to database")
    parser.add_argument("--verify", action="store_true", help="Verify migration after completion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be migrated without doing it")
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY RUN MODE - No changes will be made")
        storage_service = SummaryStorageService()
        video_ids = storage_service.get_videos_with_summaries()
        total_summaries = 0
        for video_id in video_ids:
            summaries = storage_service.list_summaries(video_id)
            total_summaries += len(summaries)
            logger.info(f"Would migrate {len(summaries)} summaries for video {video_id}")
        logger.info(f"Total summaries that would be migrated: {total_summaries}")
    else:
        # Run migration
        migrate_summaries()
        if args.verify:
            verify_migration()
        logger.info("\nMigration script completed!")
        logger.info("Summary data is now available in both frontend and CLI interfaces.")