#!/usr/bin/env python3
"""Migrate existing file-based summaries to database storage."""

import sys
import logging
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional

# Add the parent directory to sys.path so the backend package is importable
# when this script is run directly
sys.path.append(str(Path(__file__).parent.parent))

from backend.services.summary_storage import SummaryStorageService
from backend.services.database_storage_service import database_storage_service

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def convert_file_summary_to_db_format(file_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert a file-based summary record to the database format.

    Args:
        file_data: Summary data loaded from a JSON file

    Returns:
        Dictionary compatible with the Summary model, or None if the
        record lacks the fields required to identify the video.
    """
    # Map file format to database format
    db_data = {
        'video_id': file_data.get('video_id'),
        'video_url': file_data.get('video_url', file_data.get('url')),
        'video_title': file_data.get('title', file_data.get('video_title')),
        'channel_name': file_data.get('channel', file_data.get('channel_name')),
        'duration': file_data.get('duration'),
        'view_count': file_data.get('view_count'),
        'transcript': file_data.get('transcript'),
        'summary': file_data.get('summary', file_data.get('summary_text')),
        'key_points': file_data.get('key_points', []),
        'main_themes': file_data.get('main_themes', []),
        'model_used': file_data.get('model', file_data.get('model_used', 'unknown')),
        'processing_time': file_data.get('processing_time_seconds', file_data.get('processing_time')),
        'quality_score': file_data.get('quality_score'),
        'summary_length': file_data.get('summary_length', 'standard'),
        'focus_areas': file_data.get('focus_areas', []),
        'source': 'migrated_from_file',  # Mark as migrated
        'job_id': file_data.get('job_id'),
    }
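    # NOTE: the paired .get() fallbacks above accept two sets of field names
    # (e.g. 'video_url'/'url', 'title'/'video_title', 'model'/'model_used'),
    # so summary files written with either layout convert cleanly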

    # Handle timestamps: prefer 'generated_at', fall back to 'file_created_at',
    # and default to the current time when the field is absent or unparseable
    db_data['created_at'] = datetime.utcnow()
    for timestamp_field in ('generated_at', 'file_created_at'):
        if timestamp_field in file_data:
            try:
                db_data['created_at'] = datetime.fromisoformat(file_data[timestamp_field])
            except (ValueError, TypeError):
                pass
            break
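    # datetime.fromisoformat() accepts ISO-8601 strings such as
    # "2024-05-01T12:30:00"; records with any other timestamp format are
    # stamped with the migration time rather than rejected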

    # Clean up None values and ensure required fields
    db_data = {k: v for k, v in db_data.items() if v is not None}

    # Ensure required fields have defaults
    if not db_data.get('video_id'):
        logger.warning(f"Missing video_id in summary data: {file_data}")
        return None

    if not db_data.get('video_url'):
        # Generate a YouTube URL from video_id (guaranteed present by the
        # check above)
        db_data['video_url'] = f"https://youtube.com/watch?v={db_data['video_id']}"
        logger.info(f"Generated video_url from video_id: {db_data['video_url']}")

    return db_data
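
# Illustrative input/output for the converter (field values are made up):
#   {'video_id': 'abc123', 'url': 'https://youtu.be/abc123', 'title': 'Demo',
#    'model': 'gpt-4o-mini', 'generated_at': '2024-05-01T12:30:00'}
# becomes
#   {'video_id': 'abc123', 'video_url': 'https://youtu.be/abc123',
#    'video_title': 'Demo', 'model_used': 'gpt-4o-mini',
#    'created_at': datetime(2024, 5, 1, 12, 30), 'source': 'migrated_from_file',
#    'key_points': [], 'main_themes': [], 'summary_length': 'standard',
#    'focus_areas': []}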


def migrate_summaries():
    """Migrate all file-based summaries to the database.

    Data is copied, not moved: the source JSON files are left in place.
    """
    logger.info("Starting migration of file-based summaries to database")

    # Initialize file storage service
    storage_service = SummaryStorageService()

    # Get all videos with summaries
    video_ids = storage_service.get_videos_with_summaries()
    logger.info(f"Found {len(video_ids)} videos with summaries")

    if not video_ids:
        logger.info("No file-based summaries found to migrate")
        return

    migrated_count = 0
    skipped_count = 0
    error_count = 0

    for video_id in video_ids:
        logger.info(f"Migrating summaries for video: {video_id}")

        # Get all summaries for this video
        summaries = storage_service.list_summaries(video_id)

        for summary_data in summaries:
            try:
                # Look up existing database summaries for this video
                existing = database_storage_service.get_summary_by_video(
                    summary_data.get('video_id', video_id)
                )

                # Skip if the database already holds a summary with the same
                # text and model; this makes re-running the script idempotent
                if existing:
                    should_skip = False
                    for existing_summary in existing:
                        if (existing_summary.summary == summary_data.get('summary') and
                                existing_summary.model_used == summary_data.get('model', summary_data.get('model_used'))):
                            logger.info(f"  Skipping duplicate summary for {video_id}")
                            should_skip = True
                            break
                    if should_skip:
                        skipped_count += 1
                        continue

                # Convert to database format
                db_data = convert_file_summary_to_db_format(summary_data)

                if db_data is None:
                    logger.warning(f"  Skipping invalid summary data for {video_id}")
                    skipped_count += 1
                    continue

                # Save to database
                saved_summary = database_storage_service.save_summary_from_dict(db_data)
                logger.info(f"  ✓ Migrated summary {saved_summary.id[:8]} for {video_id}")
                migrated_count += 1

            except Exception as e:
                logger.error(f"  ✗ Failed to migrate summary for {video_id}: {e}")
                error_count += 1

    logger.info("\nMigration complete:")
    logger.info(f"  Migrated: {migrated_count}")
    logger.info(f"  Skipped: {skipped_count}")
    logger.info(f"  Errors: {error_count}")

    # Show database stats
    try:
        stats = database_storage_service.get_summary_stats()
        logger.info("\nDatabase now contains:")
        logger.info(f"  Total summaries: {stats['total_summaries']}")
        logger.info(f"  Model distribution: {stats['model_distribution']}")
        logger.info(f"  Source distribution: {stats['source_distribution']}")
    except Exception as e:
        logger.error(f"Failed to get database stats: {e}")


def verify_migration():
    """Verify the migration by comparing counts and sampling data."""
    logger.info("\nVerifying migration...")

    try:
        # Get file storage stats
        storage_service = SummaryStorageService()
        file_stats = storage_service.get_summary_stats()

        # Get database stats
        db_stats = database_storage_service.get_summary_stats()

        logger.info(f"File storage: {file_stats['total_summaries']} summaries")
        logger.info(f"Database: {db_stats['total_summaries']} summaries")

        # Sample a few summaries to verify data integrity
        video_ids = storage_service.get_videos_with_summaries()[:3]  # Check first 3

        for video_id in video_ids:
            file_summaries = storage_service.list_summaries(video_id)
            db_summaries = database_storage_service.get_summary_by_video(video_id)

            logger.info(f"\nVideo {video_id}:")
            logger.info(f"  File: {len(file_summaries)} summaries")
            logger.info(f"  DB: {len(db_summaries)} summaries")

            if file_summaries and db_summaries:
                file_summary = file_summaries[0]
                db_summary = db_summaries[0]

                logger.info(f"  Title match: {file_summary.get('title') == db_summary.video_title}")
                logger.info(f"  Model match: {file_summary.get('model') == db_summary.model_used}")

    except Exception as e:
        logger.error(f"Verification failed: {e}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Migrate file-based summaries to database")
    parser.add_argument("--verify", action="store_true", help="Verify migration after completion")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be migrated without doing it")

    args = parser.parse_args()
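
    # Typical invocations (the script filename below is illustrative; adjust
    # to wherever this file lives in the repo):
    #   python migrate_summaries_to_db.py --dry-run
    #   python migrate_summaries_to_db.py --verify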

    if args.dry_run:
        logger.info("DRY RUN MODE - No changes will be made")
        storage_service = SummaryStorageService()
        video_ids = storage_service.get_videos_with_summaries()

        total_summaries = 0
        for video_id in video_ids:
            summaries = storage_service.list_summaries(video_id)
            total_summaries += len(summaries)
            logger.info(f"Would migrate {len(summaries)} summaries for video {video_id}")

        logger.info(f"Total summaries that would be migrated: {total_summaries}")
    else:
        # Run migration
        migrate_summaries()

        if args.verify:
            verify_migration()

        logger.info("\nMigration script completed!")
        logger.info("Summary data is now available in both frontend and CLI interfaces.")