# youtube-summarizer/backend/services/database_storage_service.py
#
# 299 lines
# 11 KiB
# Python

"""Unified database storage service for summaries."""
import json
import logging
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional, Any

from sqlalchemy import create_engine, desc, func
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.exc import SQLAlchemyError

from backend.core.config import settings
from backend.core.database_registry import registry
from backend.models import Summary
from backend.models.pipeline import PipelineResult
logger = logging.getLogger(__name__)
class DatabaseStorageService:
    """Unified storage service for summaries using SQLite database.

    Every public method opens its own short-lived session and closes it
    before returning, so the ``Summary`` instances handed back to callers
    are no longer attached to a session.
    """

    def __init__(self):
        """Initialize database connection."""
        self.engine = create_engine(settings.DATABASE_URL)
        registry.create_all_tables(self.engine)
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
        # Log the URL with any password masked so credentials never reach the logs.
        logger.info(
            "DatabaseStorageService initialized with database: %s",
            self.engine.url.render_as_string(hide_password=True),
        )

    @staticmethod
    def _utcnow() -> datetime:
        """Return the current UTC time as a naive datetime.

        Replacement for ``datetime.utcnow()`` (deprecated since Python 3.12).
        The tzinfo is stripped so stored timestamps keep the same naive-UTC
        representation as before.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _escape_like(term: str) -> str:
        """Escape LIKE wildcards in *term* using backslash as escape char."""
        # The escape character itself must be escaped first.
        return (
            term.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )

    def get_session(self) -> Session:
        """Get a database session.

        Returns:
            A new session bound to this service's engine. The caller is
            responsible for closing it (e.g. by using it in a ``with`` block).
        """
        return self.SessionLocal()

    def save_summary_from_pipeline(self, pipeline_result: PipelineResult) -> Summary:
        """Save pipeline result to database.

        Args:
            pipeline_result: Completed pipeline result

        Returns:
            Saved Summary model instance

        Raises:
            SQLAlchemyError: If the database operation fails (after rollback).
            Exception: Any other error raised while building or saving the
                summary is logged, rolled back and re-raised.
        """
        with self.get_session() as session:
            try:
                # Extract data from pipeline result
                summary_content = ""
                key_points = []
                main_themes = []
                raw_summary = pipeline_result.summary
                if raw_summary:
                    if isinstance(raw_summary, dict):
                        summary_content = raw_summary.get('content', '')
                        key_points = raw_summary.get('key_points', [])
                        main_themes = raw_summary.get('main_themes', [])
                    else:
                        # Non-dict summaries are stored as plain text.
                        summary_content = str(raw_summary)

                # Extract quality score
                quality_score = None
                if pipeline_result.quality_metrics:
                    quality_score = pipeline_result.quality_metrics.overall_score

                # Missing metadata/config fall back to None / defaults below.
                metadata = pipeline_result.metadata or {}
                config = pipeline_result.config

                # Create Summary instance
                summary = Summary(
                    video_id=pipeline_result.video_id,
                    video_url=pipeline_result.video_url,
                    video_title=metadata.get('title'),
                    channel_name=metadata.get('channel'),
                    video_duration=metadata.get('duration_seconds'),
                    transcript=pipeline_result.transcript,
                    summary=summary_content,
                    key_points=key_points,
                    main_themes=main_themes,
                    model_used=pipeline_result.model_used or 'deepseek',
                    processing_time=pipeline_result.processing_time,
                    quality_score=quality_score,
                    summary_length=config.summary_length if config else 'standard',
                    focus_areas=config.focus_areas if config else [],
                    source='frontend',  # Mark as created via frontend/API
                    job_id=pipeline_result.job_id,
                    created_at=self._utcnow(),
                )
                session.add(summary)
                session.commit()
                # Reload attributes expired by commit so they stay readable
                # after the session is closed.
                session.refresh(summary)
                logger.info(f"Saved summary {summary.id} for video {summary.video_id}")
                return summary
            except SQLAlchemyError as e:
                logger.error(f"Database error saving summary: {e}")
                session.rollback()
                raise
            except Exception as e:
                logger.error(f"Error saving summary: {e}")
                session.rollback()
                raise

    def save_summary_from_dict(self, summary_data: Dict[str, Any]) -> Summary:
        """Save summary from dictionary (for CLI compatibility).

        The input dictionary is not modified; defaults and decoded list
        fields are applied to a shallow copy.

        Args:
            summary_data: Dictionary containing summary data

        Returns:
            Saved Summary model instance

        Raises:
            SQLAlchemyError: If the database operation fails (after rollback).
        """
        with self.get_session() as session:
            try:
                # Work on a copy so the caller's dict is left untouched.
                data = dict(summary_data)

                # Ensure required fields have defaults
                data.setdefault('source', 'cli')
                data.setdefault('created_at', self._utcnow())

                # Handle list fields that might be JSON-encoded strings
                for field in ('key_points', 'main_themes', 'focus_areas'):
                    value = data.get(field)
                    if isinstance(value, str):
                        try:
                            data[field] = json.loads(value)
                        except json.JSONDecodeError:
                            # Undecodable text is stored as an empty list.
                            data[field] = []

                summary = Summary(**data)
                session.add(summary)
                session.commit()
                session.refresh(summary)
                logger.info(f"Saved summary {summary.id} from dict")
                return summary
            except SQLAlchemyError as e:
                logger.error(f"Database error saving summary from dict: {e}")
                session.rollback()
                raise

    def get_summary(self, summary_id: str) -> Optional[Summary]:
        """Get a specific summary by ID.

        Args:
            summary_id: UUID of the summary

        Returns:
            Summary instance or None if not found
        """
        with self.get_session() as session:
            return session.query(Summary).filter_by(id=summary_id).first()

    def get_summary_by_video(self, video_id: str) -> List[Summary]:
        """Get all summaries for a specific video ID.

        Args:
            video_id: YouTube video ID

        Returns:
            List of Summary instances, newest first
        """
        with self.get_session() as session:
            return (
                session.query(Summary)
                .filter_by(video_id=video_id)
                .order_by(desc(Summary.created_at))
                .all()
            )

    def list_summaries(
        self,
        limit: int = 10,
        skip: int = 0,
        model: Optional[str] = None,
        source: Optional[str] = None,
        user_id: Optional[str] = None
    ) -> List[Summary]:
        """List summaries with optional filtering.

        Args:
            limit: Maximum number of results
            skip: Number of results to skip
            model: Filter by AI model used
            source: Filter by source (frontend/cli/api)
            user_id: Filter by user ID

        Returns:
            List of Summary instances, newest first
        """
        with self.get_session() as session:
            query = session.query(Summary)

            # Apply filters (falsy values mean "no filter")
            if model:
                query = query.filter_by(model_used=model)
            if source:
                query = query.filter_by(source=source)
            if user_id:
                query = query.filter_by(user_id=user_id)

            # Order by creation date (newest first) and apply pagination
            return query.order_by(desc(Summary.created_at)).offset(skip).limit(limit).all()

    def search_summaries(
        self,
        query: str,
        limit: int = 10
    ) -> List[Summary]:
        """Search summaries by title or content.

        The query is matched as a literal, case-insensitive substring:
        ``%`` and ``_`` in the query are escaped rather than treated as
        LIKE wildcards.

        Args:
            query: Search query string
            limit: Maximum number of results

        Returns:
            List of matching Summary instances, newest first
        """
        with self.get_session() as session:
            search_pattern = f"%{self._escape_like(query)}%"
            return (
                session.query(Summary)
                .filter(
                    Summary.video_title.ilike(search_pattern, escape="\\")
                    | Summary.summary.ilike(search_pattern, escape="\\")
                )
                # Explicit ordering makes the LIMIT deterministic and matches
                # the other listing methods.
                .order_by(desc(Summary.created_at))
                .limit(limit)
                .all()
            )

    def get_summary_stats(self) -> Dict[str, Any]:
        """Get statistics about stored summaries.

        Returns:
            Dictionary with the keys ``total_summaries``,
            ``recent_summaries_7d``, ``model_distribution``,
            ``source_distribution``, ``average_quality_score`` and
            ``average_processing_time``. The two averages are rounded to
            two decimals and are None when the database returns no average.
        """
        with self.get_session() as session:
            total_count = session.query(Summary).count()

            # Model distribution
            model_stats = session.query(
                Summary.model_used,
                func.count(Summary.id)
            ).group_by(Summary.model_used).all()

            # Source distribution
            source_stats = session.query(
                Summary.source,
                func.count(Summary.id)
            ).group_by(Summary.source).all()

            # Recent activity (last 7 days)
            recent_date = self._utcnow() - timedelta(days=7)
            recent_count = session.query(Summary).filter(
                Summary.created_at >= recent_date
            ).count()

            # Average scores
            avg_quality = session.query(func.avg(Summary.quality_score)).scalar()
            avg_processing_time = session.query(func.avg(Summary.processing_time)).scalar()

            return {
                "total_summaries": total_count,
                "recent_summaries_7d": recent_count,
                "model_distribution": dict(model_stats),
                "source_distribution": dict(source_stats),
                # Compare against None explicitly: an average of 0 is a real
                # value and must not be reported as "no data".
                "average_quality_score": (
                    round(avg_quality, 2) if avg_quality is not None else None
                ),
                "average_processing_time": (
                    round(avg_processing_time, 2)
                    if avg_processing_time is not None
                    else None
                ),
            }

    def update_summary(self, summary_id: str, updates: Dict[str, Any]) -> Optional[Summary]:
        """Update an existing summary.

        Keys that do not name an existing attribute, or that start with an
        underscore, are ignored and reported in a warning log entry.

        Args:
            summary_id: UUID of the summary to update
            updates: Dictionary of fields to update

        Returns:
            Updated Summary instance or None if not found
        """
        with self.get_session() as session:
            summary = session.query(Summary).filter_by(id=summary_id).first()
            if summary:
                ignored_keys = []
                for key, value in updates.items():
                    # Private/dunder names are never applied, so callers cannot
                    # overwrite ORM internals (e.g. ``_sa_instance_state``).
                    if key.startswith("_") or not hasattr(summary, key):
                        ignored_keys.append(key)
                        continue
                    setattr(summary, key, value)
                if ignored_keys:
                    logger.warning(
                        "Ignored unknown or protected fields while updating summary %s: %s",
                        summary_id,
                        ignored_keys,
                    )
                summary.updated_at = self._utcnow()
                session.commit()
                session.refresh(summary)
                logger.info(f"Updated summary {summary_id}")
            return summary

    def delete_summary(self, summary_id: str) -> bool:
        """Delete a summary from database.

        Args:
            summary_id: UUID of the summary to delete

        Returns:
            True if deleted, False if not found
        """
        with self.get_session() as session:
            summary = session.query(Summary).filter_by(id=summary_id).first()
            if summary:
                session.delete(summary)
                session.commit()
                logger.info(f"Deleted summary {summary_id}")
                return True
            return False
# Global instance for easy access.
# NOTE: this is created at import time, so importing this module runs
# DatabaseStorageService.__init__, which builds the engine from
# settings.DATABASE_URL and calls registry.create_all_tables on it.
database_storage_service = DatabaseStorageService()