youtube-summarizer/backend/services/summary_storage.py

"""Service for managing file-based summary storage."""
import json
import os
from pathlib import Path
from typing import List, Dict, Optional, Any
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
class SummaryStorageService:
"""Service for managing summary files in the file system."""
def __init__(self, base_storage_path: str = "video_storage/summaries"):
self.base_path = Path(base_storage_path)
self.base_path.mkdir(parents=True, exist_ok=True)
    def get_video_summary_dir(self, video_id: str) -> Path:
        """Get the directory path for a video's summaries."""
        return self.base_path / video_id
    def list_summaries(self, video_id: str) -> List[Dict[str, Any]]:
        """List all summaries for a given video ID."""
        video_dir = self.get_video_summary_dir(video_id)
        if not video_dir.exists():
            return []

        summaries = []
        # Find all JSON summary files
        summary_files = list(video_dir.glob("summary_*.json"))
        for summary_file in sorted(summary_files):
            try:
                with open(summary_file, 'r', encoding='utf-8') as f:
                    summary_data = json.load(f)

                # Add file metadata
                file_stat = summary_file.stat()
                summary_data.update({
                    "file_path": str(summary_file.relative_to(self.base_path)),
                    "file_size_bytes": file_stat.st_size,
                    "file_created_at": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                    "file_modified_at": datetime.fromtimestamp(file_stat.st_mtime).isoformat()
                })
                summaries.append(summary_data)
            except (json.JSONDecodeError, KeyError, OSError) as e:
                logger.warning(f"Failed to load summary file {summary_file}: {e}")
                continue

        # Sort by generated_at timestamp, most recent first
        summaries.sort(
            key=lambda x: x.get('generated_at', '1970-01-01T00:00:00'),
            reverse=True
        )
        return summaries
    def get_summary(self, video_id: str, timestamp: str) -> Optional[Dict[str, Any]]:
        """Get a specific summary by video ID and timestamp."""
        video_dir = self.get_video_summary_dir(video_id)

        # Try to find the summary file by timestamp
        summary_file = video_dir / f"summary_{timestamp}.json"
        if not summary_file.exists():
            # If exact timestamp not found, try to find by partial match
            matching_files = list(video_dir.glob(f"summary_*{timestamp}*.json"))
            if not matching_files:
                return None
            summary_file = matching_files[0]

        try:
            with open(summary_file, 'r', encoding='utf-8') as f:
                summary_data = json.load(f)

            # Add file metadata
            file_stat = summary_file.stat()
            summary_data.update({
                "file_path": str(summary_file.relative_to(self.base_path)),
                "file_size_bytes": file_stat.st_size,
                "file_created_at": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                "file_modified_at": datetime.fromtimestamp(file_stat.st_mtime).isoformat()
            })
            return summary_data
        except (json.JSONDecodeError, KeyError, OSError) as e:
            logger.error(f"Failed to load summary file {summary_file}: {e}")
            return None
    def save_summary(
        self,
        video_id: str,
        summary_data: Dict[str, Any],
        timestamp: Optional[str] = None
    ) -> str:
        """Save a summary to the file system and return its path relative to the storage root."""
        video_dir = self.get_video_summary_dir(video_id)
        video_dir.mkdir(parents=True, exist_ok=True)

        # Generate timestamp if not provided
        if not timestamp:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        summary_file = video_dir / f"summary_{timestamp}.json"

        # Ensure video_id and generated_at are set
        summary_data["video_id"] = video_id
        if "generated_at" not in summary_data:
            summary_data["generated_at"] = datetime.now().isoformat()

        try:
            with open(summary_file, 'w', encoding='utf-8') as f:
                json.dump(summary_data, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved summary for video {video_id} to {summary_file}")
            return str(summary_file.relative_to(self.base_path))
        except OSError as e:
            logger.error(f"Failed to save summary file {summary_file}: {e}")
            raise
    def delete_summary(self, video_id: str, timestamp: str) -> bool:
        """Delete a specific summary file. Returns True if the file was removed."""
        video_dir = self.get_video_summary_dir(video_id)
        summary_file = video_dir / f"summary_{timestamp}.json"

        try:
            if summary_file.exists():
                summary_file.unlink()
                logger.info(f"Deleted summary file {summary_file}")

                # Clean up directory if empty
                if video_dir.exists() and not any(video_dir.iterdir()):
                    video_dir.rmdir()
                    logger.info(f"Removed empty directory {video_dir}")
                return True
            else:
                logger.warning(f"Summary file {summary_file} not found")
                return False
        except OSError as e:
            logger.error(f"Failed to delete summary file {summary_file}: {e}")
            return False
    def get_videos_with_summaries(self) -> List[str]:
        """Get list of video IDs that have summaries."""
        if not self.base_path.exists():
            return []

        video_ids = []
        for video_dir in self.base_path.iterdir():
            if video_dir.is_dir():
                # Check if directory has any summary files
                summary_files = list(video_dir.glob("summary_*.json"))
                if summary_files:
                    video_ids.append(video_dir.name)
        return sorted(video_ids)
    def get_summary_stats(self) -> Dict[str, Any]:
        """Get statistics about stored summaries."""
        video_ids = self.get_videos_with_summaries()
        total_summaries = 0
        total_size_bytes = 0
        model_counts = {}

        for video_id in video_ids:
            summaries = self.list_summaries(video_id)
            total_summaries += len(summaries)
            for summary in summaries:
                total_size_bytes += summary.get("file_size_bytes", 0)
                model = summary.get("model", "unknown")
                model_counts[model] = model_counts.get(model, 0) + 1

        return {
            "total_videos_with_summaries": len(video_ids),
            "total_summaries": total_summaries,
            "total_size_bytes": total_size_bytes,
            "total_size_mb": round(total_size_bytes / (1024 * 1024), 2),
            "model_distribution": model_counts,
            "video_ids": video_ids
        }

# Global instance
storage_service = SummaryStorageService()
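

# Illustrative usage sketch (not part of the service itself): the video ID and
# payload below are placeholder values chosen for demonstration; only methods
# defined on SummaryStorageService above are used.
if __name__ == "__main__":
    # Save a summary under a made-up video ID; the returned path is relative
    # to the storage root configured in SummaryStorageService.__init__.
    example_payload = {
        "model": "example-model",
        "summary_text": "Placeholder summary text.",
    }
    saved_path = storage_service.save_summary("example_video_id", example_payload)
    print(f"Saved summary to: {saved_path}")

    # List stored summaries for that video, newest first, then print stats
    # aggregated across every video that has summaries on disk.
    for entry in storage_service.list_summaries("example_video_id"):
        print(entry["generated_at"], entry["file_path"])
    print(storage_service.get_summary_stats())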