"""
|
|
Export Service for YouTube Summarizer
|
|
Handles export of summaries to multiple formats with customization options
|
|
"""
|
|
|
|
import asyncio
import json
import os
import shutil
import tempfile
import uuid
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Any, Union

import aiofiles
|
|
|
|
|
|
class ExportFormat(Enum):
    """Enumeration of the output formats a summary can be exported to."""

    MARKDOWN = "markdown"    # markdown document
    PDF = "pdf"              # portable document format
    PLAIN_TEXT = "text"      # unformatted plain text
    JSON = "json"            # machine-readable structure
    HTML = "html"            # standalone web page
|
|
|
|
class ExportStatus(Enum):
    """Lifecycle states of an export job."""

    PENDING = "pending"        # job created, not yet started
    PROCESSING = "processing"  # exporter currently running
    COMPLETED = "completed"    # output file written successfully
    FAILED = "failed"          # export raised an error
|
|
|
|
@dataclass
class ExportRequest:
    """Parameters for exporting one summary to a single format."""

    summary_id: str                    # id of the summary to export
    format: ExportFormat               # target output format
    template: Optional[str] = None     # optional template name for the exporter
    include_metadata: bool = True      # whether to attach export metadata
    # Caller-supplied branding options, passed through to the exporter.
    custom_branding: Optional[Dict[str, Any]] = None
|
|
|
|
@dataclass
class BulkExportRequest:
    """Parameters for exporting several summaries in one job."""

    summary_ids: List[str]             # summaries to include in the archive
    formats: List[ExportFormat]        # every format to produce per summary
    template: Optional[str] = None     # optional template passed to each exporter
    include_metadata: bool = True      # whether to attach export metadata
    # Archive layout: "format", "date", or anything else = by video title.
    organize_by: str = "format"
    # Caller-supplied branding options, passed through to the exporters.
    custom_branding: Optional[Dict[str, Any]] = None
|
|
|
|
@dataclass
class ExportResult:
    """Status and output of an export job (single or bulk)."""

    export_id: str                           # unique job id (uuid4 string)
    status: ExportStatus                     # current lifecycle state
    format: ExportFormat                     # requested format (JSON placeholder for bulk archives)
    file_path: Optional[str] = None          # path of the finished file, once completed
    file_size_bytes: Optional[int] = None    # size of the finished file
    download_url: Optional[str] = None       # API route for downloading the file
    error: Optional[str] = None              # failure message when status is FAILED
    created_at: Optional[datetime] = None    # when the job was created
    completed_at: Optional[datetime] = None  # when the job finished (success or failure)
|
|
|
|
class BaseExporter(ABC):
    """Abstract interface implemented by each format-specific exporter."""

    @abstractmethod
    async def export(
        self,
        summary_data: Dict[str, Any],
        template: Optional[str] = None,
        branding: Optional[Dict[str, Any]] = None
    ) -> str:
        """Render *summary_data* to this exporter's format.

        Returns the path of the file that was written.
        """

    @abstractmethod
    def get_file_extension(self) -> str:
        """Return the filename extension used by this exporter's format."""

    def _prepare_summary_data(self, summary_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of *summary_data* enriched with export metadata.

        The input mapping is not mutated; an ``export_metadata`` entry with
        the export timestamp and version info is added to the copy.
        """
        enriched = dict(summary_data)
        enriched["export_metadata"] = {
            "exported_at": datetime.utcnow().isoformat(),
            "exporter_version": "1.0",
            "youtube_summarizer_version": "2.0",
        }
        return enriched
|
|
|
|
class ExportService:
    """Main service for handling summary exports.

    Responsibilities:
      * dispatch single exports to the registered format-specific exporter
      * run bulk exports, organizing files and packaging them into a ZIP
      * track job status in memory and clean up expired export files
    """

    def __init__(self, export_dir: str = "/tmp/youtube_summarizer_exports"):
        """Create the service and ensure *export_dir* exists.

        Args:
            export_dir: Directory where finished export files are stored.
        """
        self.export_dir = Path(export_dir)
        self.export_dir.mkdir(parents=True, exist_ok=True)

        # Format -> exporter instance; only formats whose module imports
        # successfully are registered (optional dependencies may be missing).
        self.exporters: Dict[ExportFormat, BaseExporter] = {}
        self._initialize_exporters()

        # export_id -> result for jobs started by this process.
        # In-memory only: state is lost on restart.
        self.active_exports: Dict[str, ExportResult] = {}

    def _initialize_exporters(self):
        """Register every exporter whose module can be imported.

        Import errors are deliberately swallowed so that a missing optional
        dependency (e.g. a PDF backend) only disables that one format
        instead of breaking the whole service.
        """
        try:
            from .exporters.markdown_exporter import MarkdownExporter
            self.exporters[ExportFormat.MARKDOWN] = MarkdownExporter()
        except ImportError:
            pass

        try:
            from .exporters.pdf_exporter import PDFExporter
            self.exporters[ExportFormat.PDF] = PDFExporter()
        except ImportError:
            pass

        try:
            from .exporters.text_exporter import PlainTextExporter
            self.exporters[ExportFormat.PLAIN_TEXT] = PlainTextExporter()
        except ImportError:
            pass

        try:
            from .exporters.json_exporter import JSONExporter
            self.exporters[ExportFormat.JSON] = JSONExporter()
        except ImportError:
            pass

        try:
            from .exporters.html_exporter import HTMLExporter
            self.exporters[ExportFormat.HTML] = HTMLExporter()
        except ImportError:
            pass

    async def export_summary(
        self,
        summary_data: Dict[str, Any],
        request: ExportRequest
    ) -> ExportResult:
        """Export a single summary.

        Args:
            summary_data: The summary content and metadata to export.
            request: Format, template, and branding options.

        Returns:
            An ExportResult with COMPLETED status and file details on
            success, or FAILED status with the error message. Exceptions
            are captured into the result rather than raised.
        """
        export_id = str(uuid.uuid4())

        result = ExportResult(
            export_id=export_id,
            status=ExportStatus.PENDING,
            format=request.format,
            created_at=datetime.utcnow()
        )

        # Register before processing so the job is queryable immediately.
        self.active_exports[export_id] = result

        try:
            result.status = ExportStatus.PROCESSING

            # Fail fast if no exporter was registered for this format.
            if request.format not in self.exporters:
                raise ValueError(f"Exporter for format {request.format.value} not available")

            exporter = self.exporters[request.format]

            # Render the summary; the exporter returns the output file path.
            file_path = await exporter.export(
                summary_data=summary_data,
                template=request.template,
                branding=request.custom_branding
            )

            result.file_path = file_path
            result.file_size_bytes = os.path.getsize(file_path)
            result.download_url = f"/api/export/download/{export_id}"
            result.status = ExportStatus.COMPLETED
            result.completed_at = datetime.utcnow()

        except Exception as e:
            # Record the failure on the job instead of propagating, so
            # callers can poll status uniformly.
            result.status = ExportStatus.FAILED
            result.error = str(e)
            result.completed_at = datetime.utcnow()

        return result

    async def bulk_export_summaries(
        self,
        summaries_data: List[Dict[str, Any]],
        request: BulkExportRequest
    ) -> ExportResult:
        """Export multiple summaries and package them into one ZIP archive.

        Each summary is exported in every requested format, organized per
        ``request.organize_by``, and the whole tree is zipped into
        ``self.export_dir``.

        Returns:
            An ExportResult pointing at the archive, or FAILED with the
            error message. Exceptions are captured into the result.
        """
        export_id = str(uuid.uuid4())

        result = ExportResult(
            export_id=export_id,
            status=ExportStatus.PENDING,
            # Bulk exports are delivered as ZIP archives; the format field
            # is a placeholder since ExportFormat has no archive member.
            format=ExportFormat.JSON,
            created_at=datetime.utcnow()
        )

        self.active_exports[export_id] = result

        try:
            result.status = ExportStatus.PROCESSING

            # Stage everything in a temp dir that is removed automatically.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                for summary_data in summaries_data:
                    await self._export_summary_to_bulk(
                        summary_data, request, temp_path
                    )

                archive_path = self.export_dir / f"bulk_export_{export_id}.zip"
                await self._create_archive(temp_path, archive_path)

                result.file_path = str(archive_path)
                result.file_size_bytes = os.path.getsize(archive_path)
                result.download_url = f"/api/export/download/{export_id}"
                result.status = ExportStatus.COMPLETED
                result.completed_at = datetime.utcnow()

        except Exception as e:
            result.status = ExportStatus.FAILED
            result.error = str(e)
            result.completed_at = datetime.utcnow()

        return result

    async def _export_summary_to_bulk(
        self,
        summary_data: Dict[str, Any],
        request: BulkExportRequest,
        output_dir: Path
    ):
        """Export one summary into *output_dir* for each requested format.

        Files are placed in subdirectories according to
        ``request.organize_by`` ("format", "date", or anything else = by
        video title). Formats with no registered exporter are skipped.
        """
        video_title = summary_data.get("video_metadata", {}).get("title", "Unknown")
        safe_title = self._sanitize_filename(video_title)

        # NOTE: renamed loop variable from `format` (shadowed the builtin).
        for fmt in request.formats:
            if fmt not in self.exporters:
                # Skip silently so one missing backend does not fail the job.
                continue

            exporter = self.exporters[fmt]
            extension = exporter.get_file_extension()

            # Pick the sub-directory according to the organization preference.
            if request.organize_by == "format":
                target_dir = output_dir / fmt.value
            elif request.organize_by == "date":
                # str() guards against a non-string created_at; [:10] keeps
                # the YYYY-MM-DD prefix of an ISO timestamp.
                target_dir = output_dir / str(summary_data.get("created_at", "unknown"))[:10]
            else:  # organize by video title
                target_dir = output_dir / safe_title
            target_dir.mkdir(exist_ok=True)

            # Avoid silently overwriting when two summaries sanitize to the
            # same title: append a numeric suffix until the name is free.
            output_path = target_dir / f"{safe_title}.{extension}"
            suffix = 1
            while output_path.exists():
                output_path = target_dir / f"{safe_title}_{suffix}.{extension}"
                suffix += 1

            temp_file = await exporter.export(
                summary_data=summary_data,
                template=request.template,
                branding=request.custom_branding
            )

            # Move the exporter's temp output into its organized location.
            # (shutil is imported at module level, not per loop iteration.)
            shutil.move(temp_file, str(output_path))

    async def _create_archive(self, source_dir: Path, archive_path: Path):
        """Create a ZIP archive at *archive_path* from *source_dir*.

        Paths inside the archive are relative to *source_dir* so the
        organization structure is preserved.
        """
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in source_dir.rglob('*'):
                if file_path.is_file():
                    arcname = file_path.relative_to(source_dir)
                    zipf.write(file_path, arcname)

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize *filename* for cross-platform filesystem use.

        Removes ASCII control characters, replaces characters that are
        invalid on common filesystems with underscores, caps the length at
        100, and falls back to "untitled" when nothing usable remains.
        """
        import re

        # Strip control characters (0x00-0x1f) and DEL (0x7f) in one pass.
        control_chars = ''.join(chr(i) for i in range(32)) + '\x7f'
        filename = filename.translate(str.maketrans('', '', control_chars))

        # Replace characters invalid on Windows/POSIX filesystems.
        sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)

        sanitized = sanitized[:100].strip()
        # Guard against an empty result, which would create an unusable path.
        return sanitized or "untitled"

    def get_export_status(self, export_id: str) -> Optional[ExportResult]:
        """Return the tracked result for *export_id*, or None if unknown."""
        return self.active_exports.get(export_id)

    async def cleanup_old_exports(self, max_age_hours: int = 24):
        """Delete export files older than *max_age_hours* and forget them.

        NOTE(review): created_at comes from datetime.utcnow() (naive), so
        .timestamp() interprets it in local time; the cutoff is computed
        the same way, so the age comparison is self-consistent.
        """
        cutoff_time = datetime.utcnow().timestamp() - (max_age_hours * 3600)

        # Snapshot items() because entries are deleted while iterating.
        for export_id, result in list(self.active_exports.items()):
            if result.created_at and result.created_at.timestamp() < cutoff_time:
                # Remove the on-disk file if it still exists.
                if result.file_path and os.path.exists(result.file_path):
                    os.remove(result.file_path)

                del self.active_exports[export_id]