"""
Export Service for YouTube Summarizer
Handles export of summaries to multiple formats with customization options
"""

import asyncio
import json
import logging
import os
import re
import shutil
import tempfile
import uuid
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

try:
    # No code visible in this module uses aiofiles; the import is kept for
    # compatibility but guarded so that a missing optional package cannot
    # prevent the export service from being imported.
    import aiofiles
except ImportError:  # pragma: no cover - depends on the environment
    aiofiles = None

logger = logging.getLogger(__name__)

# Translation table that deletes ASCII control characters (0-31) and DEL.
_CONTROL_CHAR_TABLE = str.maketrans("", "", "".join(map(chr, range(32))) + "\x7f")
# Characters replaced with "_" when building file names.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')
_MAX_FILENAME_LENGTH = 100


def _utcnow() -> datetime:
    """Return the current UTC time as a naive datetime.

    ``datetime.utcnow()`` is deprecated; this produces the same naive-UTC
    value so stored timestamps keep their existing shape.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _as_naive_utc(moment: datetime) -> datetime:
    """Normalize *moment* to naive UTC so it can be compared with ``_utcnow()``.

    Naive values are returned unchanged (they are treated as already UTC).
    """
    if moment.tzinfo is None:
        return moment
    return moment.astimezone(timezone.utc).replace(tzinfo=None)


class ExportFormat(Enum):
    """Supported export formats"""
    MARKDOWN = "markdown"
    PDF = "pdf"
    PLAIN_TEXT = "text"
    JSON = "json"
    HTML = "html"


class ExportStatus(Enum):
    """Export job status"""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class ExportRequest:
    """Single export request"""
    summary_id: str
    format: ExportFormat
    template: Optional[str] = None
    include_metadata: bool = True
    custom_branding: Optional[Dict[str, Any]] = None


@dataclass
class BulkExportRequest:
    """Bulk export request for multiple summaries"""
    summary_ids: List[str]
    formats: List[ExportFormat]
    template: Optional[str] = None
    include_metadata: bool = True
    organize_by: str = "format"  # "format", "date", "video"
    custom_branding: Optional[Dict[str, Any]] = None


@dataclass
class ExportResult:
    """Export operation result"""
    export_id: str
    status: ExportStatus
    format: ExportFormat
    file_path: Optional[str] = None
    file_size_bytes: Optional[int] = None
    download_url: Optional[str] = None
    error: Optional[str] = None
    created_at: Optional[datetime] = None  # naive UTC when set by ExportService
    completed_at: Optional[datetime] = None  # naive UTC when set by ExportService


class BaseExporter(ABC):
    """Base class for format-specific exporters"""

    @abstractmethod
    async def export(
        self,
        summary_data: Dict[str, Any],
        template: Optional[str] = None,
        branding: Optional[Dict[str, Any]] = None
    ) -> str:
        """Export summary to specific format and return file path"""
        pass

    @abstractmethod
    def get_file_extension(self) -> str:
        """Get file extension for this export format"""
        pass

    def _prepare_summary_data(self, summary_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of *summary_data* with an ``export_metadata`` entry added.

        The input mapping is not modified. An existing ``export_metadata`` key
        in the input is replaced by the freshly generated one.
        """
        return {
            **summary_data,
            "export_metadata": {
                "exported_at": _utcnow().isoformat(),
                "exporter_version": "1.0",
                "youtube_summarizer_version": "2.0"
            }
        }


class ExportService:
    """Main service for handling summary exports"""

    def __init__(self, export_dir: str = "/tmp/youtube_summarizer_exports"):
        """Create the export directory and register the available exporters.

        Args:
            export_dir: Directory where bulk export archives are written. It is
                created (including parents) if it does not exist.
        """
        self.export_dir = Path(export_dir)
        self.export_dir.mkdir(parents=True, exist_ok=True)

        # Format-specific exporters, filled in by _initialize_exporters()
        self.exporters: Dict[ExportFormat, BaseExporter] = {}
        self._initialize_exporters()

        # Track active exports by export id
        self.active_exports: Dict[str, ExportResult] = {}

    def _initialize_exporters(self):
        """Register every exporter whose module can be imported.

        Exporters are optional: a failing import leaves that format
        unregistered instead of breaking the service. The failure is logged so
        a missing format can be diagnosed.
        """
        try:
            from .exporters.markdown_exporter import MarkdownExporter
            self.exporters[ExportFormat.MARKDOWN] = MarkdownExporter()
        except ImportError as exc:
            self._note_unavailable(ExportFormat.MARKDOWN, exc)

        try:
            from .exporters.pdf_exporter import PDFExporter
            self.exporters[ExportFormat.PDF] = PDFExporter()
        except ImportError as exc:
            self._note_unavailable(ExportFormat.PDF, exc)

        try:
            from .exporters.text_exporter import PlainTextExporter
            self.exporters[ExportFormat.PLAIN_TEXT] = PlainTextExporter()
        except ImportError as exc:
            self._note_unavailable(ExportFormat.PLAIN_TEXT, exc)

        try:
            from .exporters.json_exporter import JSONExporter
            self.exporters[ExportFormat.JSON] = JSONExporter()
        except ImportError as exc:
            self._note_unavailable(ExportFormat.JSON, exc)

        try:
            from .exporters.html_exporter import HTMLExporter
            self.exporters[ExportFormat.HTML] = HTMLExporter()
        except ImportError as exc:
            self._note_unavailable(ExportFormat.HTML, exc)

    @staticmethod
    def _note_unavailable(export_format: ExportFormat, exc: ImportError) -> None:
        """Log that the exporter for *export_format* could not be loaded."""
        logger.info("Exporter for %s is unavailable: %s", export_format.value, exc)

    def _register_export(self, export_format: ExportFormat) -> ExportResult:
        """Create a PENDING result with a fresh id and track it in active_exports."""
        result = ExportResult(
            export_id=str(uuid.uuid4()),
            status=ExportStatus.PENDING,
            format=export_format,
            created_at=_utcnow()
        )
        self.active_exports[result.export_id] = result
        return result

    @staticmethod
    def _mark_completed(result: ExportResult, file_path: Union[str, Path]) -> None:
        """Record the produced file on *result* and flag it COMPLETED.

        The size is read first, so a missing file raises before any field of
        *result* is touched.
        """
        size = os.path.getsize(file_path)
        result.file_path = str(file_path)
        result.file_size_bytes = size
        result.download_url = f"/api/export/download/{result.export_id}"
        result.status = ExportStatus.COMPLETED
        result.completed_at = _utcnow()

    @staticmethod
    def _mark_failed(result: ExportResult, exc: Exception) -> None:
        """Record *exc* on *result*, flag it FAILED and log the failure."""
        logger.warning("Export %s failed: %s", result.export_id, exc, exc_info=exc)
        result.status = ExportStatus.FAILED
        result.error = str(exc)
        result.completed_at = _utcnow()

    async def export_summary(
        self,
        summary_data: Dict[str, Any],
        request: ExportRequest
    ) -> ExportResult:
        """Export single summary

        Args:
            summary_data: Summary payload handed to the exporter.
            request: Requested format, template and branding.

        Returns:
            The tracked ExportResult. Failures (including an unavailable
            format) are reported through ``status``/``error`` rather than
            raised.
        """
        result = self._register_export(request.format)

        try:
            result.status = ExportStatus.PROCESSING

            exporter = self.exporters.get(request.format)
            if exporter is None:
                raise ValueError(f"Exporter for format {request.format.value} not available")

            file_path = await exporter.export(
                summary_data=summary_data,
                template=request.template,
                branding=request.custom_branding
            )
            self._mark_completed(result, file_path)

        except Exception as exc:  # service boundary: report instead of raising
            self._mark_failed(result, exc)

        return result

    async def bulk_export_summaries(
        self,
        summaries_data: List[Dict[str, Any]],
        request: BulkExportRequest
    ) -> ExportResult:
        """Export multiple summaries with organization

        Every summary is exported in each requested format that has a
        registered exporter, arranged according to ``request.organize_by``,
        and packed into ``bulk_export_<id>.zip`` inside ``export_dir``.

        Args:
            summaries_data: Summary payloads to export.
            request: Formats, organization mode, template and branding.

        Returns:
            The tracked ExportResult. Its ``format`` is ExportFormat.JSON
            (kept for compatibility) although the file is a ZIP archive.
            Failures are reported through ``status``/``error``; that includes
            the case where none of the requested formats has an exporter,
            which previously produced an empty archive marked COMPLETED.
        """
        result = self._register_export(ExportFormat.JSON)  # Bulk exports are archives

        try:
            result.status = ExportStatus.PROCESSING

            if not any(fmt in self.exporters for fmt in request.formats):
                raise ValueError("No exporter available for any of the requested formats")

            # Stage the individual files in a throwaway directory
            with tempfile.TemporaryDirectory() as temp_dir:
                staging_dir = Path(temp_dir)

                for summary_data in summaries_data:
                    await self._export_summary_to_bulk(summary_data, request, staging_dir)

                archive_path = self.export_dir / f"bulk_export_{result.export_id}.zip"
                await self._create_archive(staging_dir, archive_path)

            self._mark_completed(result, archive_path)

        except Exception as exc:  # service boundary: report instead of raising
            self._mark_failed(result, exc)

        return result

    async def _export_summary_to_bulk(
        self,
        summary_data: Dict[str, Any],
        request: BulkExportRequest,
        output_dir: Path
    ):
        """Export single summary to bulk export directory

        Formats without a registered exporter are skipped. The file is placed
        in a sub-folder of *output_dir* named after the format value, the first
        ten characters of ``created_at``, or the video title, depending on
        ``request.organize_by``.

        Titles and dates come from the summary payload, so both are reduced to
        safe single path components; names that would collide inside a folder
        get a numeric suffix instead of overwriting the earlier file.
        """
        metadata = summary_data.get("video_metadata") or {}
        safe_title = self._safe_path_component(metadata.get("title"), "Unknown")

        for export_format in request.formats:
            exporter = self.exporters.get(export_format)
            if exporter is None:
                continue

            # Determine the folder based on organization preference
            if request.organize_by == "format":
                folder_name = export_format.value
            elif request.organize_by == "date":
                created_at = summary_data.get("created_at")
                # str() copes with ISO strings as well as date/datetime objects
                date_text = str(created_at)[:10] if created_at else "unknown"  # YYYY-MM-DD
                folder_name = self._safe_path_component(date_text, "unknown")
            else:  # organize by video
                folder_name = safe_title

            target_dir = output_dir / folder_name
            target_dir.mkdir(parents=True, exist_ok=True)

            temp_file = await exporter.export(
                summary_data=summary_data,
                template=request.template,
                branding=request.custom_branding
            )

            # Move to organized location without clobbering an earlier export
            target = self._unique_path(target_dir, safe_title, exporter.get_file_extension())
            shutil.move(str(temp_file), str(target))

    def _safe_path_component(self, value: Any, fallback: str) -> str:
        """Turn *value* into a name that is safe as a single path component.

        On top of _sanitize_filename this strips leading/trailing dots and
        spaces, so "." and ".." cannot be used to leave the target directory,
        and substitutes *fallback* when nothing usable remains (or *value* is
        None).
        """
        text = fallback if value is None else str(value)
        cleaned = self._sanitize_filename(text).strip(". ")
        return cleaned or fallback

    @staticmethod
    def _unique_path(directory: Path, stem: str, extension: str) -> Path:
        """Return ``directory/stem.extension``, or the first free ``stem_N`` variant."""
        candidate = directory / f"{stem}.{extension}"
        counter = 2
        while candidate.exists():
            candidate = directory / f"{stem}_{counter}.{extension}"
            counter += 1
        return candidate

    async def _create_archive(self, source_dir: Path, archive_path: Path):
        """Create ZIP archive from directory

        Every regular file below *source_dir* is stored under its path relative
        to *source_dir*; directories themselves are not added as entries.
        Entries are written in sorted order so the archive layout is
        deterministic.
        """
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in sorted(source_dir.rglob('*')):
                if file_path.is_file():
                    arcname = file_path.relative_to(source_dir).as_posix()
                    zipf.write(file_path, arcname)

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename for filesystem compatibility

        Control characters (0-31 and DEL) are removed, each of ``<>:"/\\|?*``
        becomes an underscore, and the result is cut to 100 characters and
        stripped of surrounding whitespace. The result may be empty.
        """
        without_controls = filename.translate(_CONTROL_CHAR_TABLE)
        replaced = _INVALID_FILENAME_CHARS.sub('_', without_controls)
        return replaced[:_MAX_FILENAME_LENGTH].strip()

    def get_export_status(self, export_id: str) -> Optional[ExportResult]:
        """Get export status by ID (None when the id is not tracked)"""
        return self.active_exports.get(export_id)

    async def cleanup_old_exports(self, max_age_hours: int = 24):
        """Clean up old export files

        Tracked exports created more than *max_age_hours* ago have their file
        deleted (when one is recorded) and are dropped from ``active_exports``.
        Results without ``created_at`` are left alone.
        """
        # Compare datetimes directly: calling .timestamp() on a naive UTC value
        # would interpret it as local time.
        cutoff = _utcnow() - timedelta(hours=max_age_hours)

        # Iterate over a snapshot because entries are removed along the way
        for export_id, result in list(self.active_exports.items()):
            if result.created_at is None or _as_naive_utc(result.created_at) >= cutoff:
                continue

            if result.file_path:
                try:
                    os.remove(result.file_path)
                except FileNotFoundError:
                    # Already gone (removed manually or by another cleanup run)
                    pass

            self.active_exports.pop(export_id, None)