# youtube-summarizer/backend/services/export_service.py
"""
Export Service for YouTube Summarizer
Handles export of summaries to multiple formats with customization options
"""
import asyncio
import importlib
import json
import os
import re
import shutil
import tempfile
import uuid
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import aiofiles
class ExportFormat(Enum):
"""Supported export formats"""
MARKDOWN = "markdown"
PDF = "pdf"
PLAIN_TEXT = "text"
JSON = "json"
HTML = "html"
class ExportStatus(Enum):
"""Export job status"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
@dataclass
class ExportRequest:
"""Single export request"""
summary_id: str
format: ExportFormat
template: Optional[str] = None
include_metadata: bool = True
custom_branding: Optional[Dict[str, Any]] = None
@dataclass
class BulkExportRequest:
"""Bulk export request for multiple summaries"""
summary_ids: List[str]
formats: List[ExportFormat]
template: Optional[str] = None
include_metadata: bool = True
organize_by: str = "format" # "format", "date", "video"
custom_branding: Optional[Dict[str, Any]] = None
@dataclass
class ExportResult:
"""Export operation result"""
export_id: str
status: ExportStatus
format: ExportFormat
file_path: Optional[str] = None
file_size_bytes: Optional[int] = None
download_url: Optional[str] = None
error: Optional[str] = None
created_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
class BaseExporter(ABC):
"""Base class for format-specific exporters"""
@abstractmethod
async def export(
self,
summary_data: Dict[str, Any],
template: Optional[str] = None,
branding: Optional[Dict[str, Any]] = None
) -> str:
"""Export summary to specific format and return file path"""
pass
@abstractmethod
def get_file_extension(self) -> str:
"""Get file extension for this export format"""
pass
def _prepare_summary_data(self, summary_data: Dict[str, Any]) -> Dict[str, Any]:
"""Prepare and enrich summary data for export"""
return {
**summary_data,
"export_metadata": {
"exported_at": datetime.utcnow().isoformat(),
"exporter_version": "1.0",
"youtube_summarizer_version": "2.0"
}
}
class ExportService:
"""Main service for handling summary exports"""
def __init__(self, export_dir: str = "/tmp/youtube_summarizer_exports"):
self.export_dir = Path(export_dir)
self.export_dir.mkdir(parents=True, exist_ok=True)
# Initialize format-specific exporters (will be imported later)
self.exporters: Dict[ExportFormat, BaseExporter] = {}
self._initialize_exporters()
# Track active exports
self.active_exports: Dict[str, ExportResult] = {}
def _initialize_exporters(self):
"""Initialize all available exporters"""
try:
from .exporters.markdown_exporter import MarkdownExporter
self.exporters[ExportFormat.MARKDOWN] = MarkdownExporter()
except ImportError:
pass
try:
from .exporters.pdf_exporter import PDFExporter
self.exporters[ExportFormat.PDF] = PDFExporter()
except ImportError:
pass
try:
from .exporters.text_exporter import PlainTextExporter
self.exporters[ExportFormat.PLAIN_TEXT] = PlainTextExporter()
except ImportError:
pass
try:
from .exporters.json_exporter import JSONExporter
self.exporters[ExportFormat.JSON] = JSONExporter()
except ImportError:
pass
try:
from .exporters.html_exporter import HTMLExporter
self.exporters[ExportFormat.HTML] = HTMLExporter()
except ImportError:
pass
async def export_summary(
self,
summary_data: Dict[str, Any],
request: ExportRequest
) -> ExportResult:
"""Export single summary"""
export_id = str(uuid.uuid4())
result = ExportResult(
export_id=export_id,
status=ExportStatus.PENDING,
format=request.format,
created_at=datetime.utcnow()
)
self.active_exports[export_id] = result
try:
result.status = ExportStatus.PROCESSING
# Check if exporter is available
if request.format not in self.exporters:
raise ValueError(f"Exporter for format {request.format.value} not available")
# Get appropriate exporter
exporter = self.exporters[request.format]
# Export the summary
file_path = await exporter.export(
summary_data=summary_data,
template=request.template,
branding=request.custom_branding
)
# Update result
result.file_path = file_path
result.file_size_bytes = os.path.getsize(file_path)
result.download_url = f"/api/export/download/{export_id}"
result.status = ExportStatus.COMPLETED
result.completed_at = datetime.utcnow()
except Exception as e:
result.status = ExportStatus.FAILED
result.error = str(e)
result.completed_at = datetime.utcnow()
return result
async def bulk_export_summaries(
self,
summaries_data: List[Dict[str, Any]],
request: BulkExportRequest
) -> ExportResult:
"""Export multiple summaries with organization"""
export_id = str(uuid.uuid4())
result = ExportResult(
export_id=export_id,
status=ExportStatus.PENDING,
format=ExportFormat.JSON, # Bulk exports are archives
created_at=datetime.utcnow()
)
self.active_exports[export_id] = result
try:
result.status = ExportStatus.PROCESSING
# Create temporary directory for bulk export
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
# Export each summary in requested formats
for summary_data in summaries_data:
await self._export_summary_to_bulk(
summary_data, request, temp_path
)
# Create ZIP archive
archive_path = self.export_dir / f"bulk_export_{export_id}.zip"
await self._create_archive(temp_path, archive_path)
result.file_path = str(archive_path)
result.file_size_bytes = os.path.getsize(archive_path)
result.download_url = f"/api/export/download/{export_id}"
result.status = ExportStatus.COMPLETED
result.completed_at = datetime.utcnow()
except Exception as e:
result.status = ExportStatus.FAILED
result.error = str(e)
result.completed_at = datetime.utcnow()
return result
async def _export_summary_to_bulk(
self,
summary_data: Dict[str, Any],
request: BulkExportRequest,
output_dir: Path
):
"""Export single summary to bulk export directory"""
video_title = summary_data.get("video_metadata", {}).get("title", "Unknown")
safe_title = self._sanitize_filename(video_title)
for format in request.formats:
if format not in self.exporters:
continue
exporter = self.exporters[format]
# Determine output path based on organization preference
if request.organize_by == "format":
format_dir = output_dir / format.value
format_dir.mkdir(exist_ok=True)
output_path = format_dir / f"{safe_title}.{exporter.get_file_extension()}"
elif request.organize_by == "date":
date_str = summary_data.get("created_at", "unknown")[:10] # YYYY-MM-DD
date_dir = output_dir / date_str
date_dir.mkdir(exist_ok=True)
output_path = date_dir / f"{safe_title}.{exporter.get_file_extension()}"
else: # organize by video
video_dir = output_dir / safe_title
video_dir.mkdir(exist_ok=True)
output_path = video_dir / f"{safe_title}.{exporter.get_file_extension()}"
# Export to specific format
temp_file = await exporter.export(
summary_data=summary_data,
template=request.template,
branding=request.custom_branding
)
# Move to organized location
import shutil
shutil.move(temp_file, str(output_path))
async def _create_archive(self, source_dir: Path, archive_path: Path):
"""Create ZIP archive from directory"""
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in source_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(source_dir)
zipf.write(file_path, arcname)
def _sanitize_filename(self, filename: str) -> str:
"""Sanitize filename for filesystem compatibility"""
import re
import string
# First, remove control characters and null bytes
# Create a translation table that removes control characters
control_chars = ''.join(chr(i) for i in range(32))
control_chars += '\x7f' # DEL character
translator = str.maketrans('', '', control_chars)
filename = filename.translate(translator)
# Replace invalid filesystem characters with underscores
sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
# Limit length and strip whitespace
return sanitized[:100].strip()
def get_export_status(self, export_id: str) -> Optional[ExportResult]:
"""Get export status by ID"""
return self.active_exports.get(export_id)
async def cleanup_old_exports(self, max_age_hours: int = 24):
"""Clean up old export files"""
cutoff_time = datetime.utcnow().timestamp() - (max_age_hours * 3600)
for export_id, result in list(self.active_exports.items()):
if result.created_at and result.created_at.timestamp() < cutoff_time:
# Remove file if exists
if result.file_path and os.path.exists(result.file_path):
os.remove(result.file_path)
# Remove from active exports
del self.active_exports[export_id]