# youtube-summarizer/backend/services/export_service.py
"""
Export Service for YouTube Summarizer
Handles export of summaries to multiple formats with customization options
"""
import asyncio
import importlib
import json
import os
import re
import shutil
import tempfile
import uuid
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import aiofiles
class ExportFormat(Enum):
"""Supported export formats"""
MARKDOWN = "markdown"
PDF = "pdf"
PLAIN_TEXT = "text"
JSON = "json"
HTML = "html"
class ExportStatus(Enum):
"""Export job status"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
@dataclass
class ExportRequest:
"""Single export request"""
summary_id: str
format: ExportFormat
template: Optional[str] = None
include_metadata: bool = True
custom_branding: Optional[Dict[str, Any]] = None
@dataclass
class BulkExportRequest:
"""Bulk export request for multiple summaries"""
summary_ids: List[str]
formats: List[ExportFormat]
template: Optional[str] = None
include_metadata: bool = True
organize_by: str = "format" # "format", "date", "video"
custom_branding: Optional[Dict[str, Any]] = None
@dataclass
class ExportResult:
"""Export operation result"""
export_id: str
status: ExportStatus
format: ExportFormat
file_path: Optional[str] = None
file_size_bytes: Optional[int] = None
download_url: Optional[str] = None
error: Optional[str] = None
created_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
class BaseExporter(ABC):
"""Base class for format-specific exporters"""
@abstractmethod
async def export(
self,
summary_data: Dict[str, Any],
template: Optional[str] = None,
branding: Optional[Dict[str, Any]] = None
) -> str:
"""Export summary to specific format and return file path"""
pass
@abstractmethod
def get_file_extension(self) -> str:
"""Get file extension for this export format"""
pass
def _prepare_summary_data(self, summary_data: Dict[str, Any]) -> Dict[str, Any]:
"""Prepare and enrich summary data for export"""
return {
**summary_data,
"export_metadata": {
"exported_at": datetime.utcnow().isoformat(),
"exporter_version": "1.0",
"youtube_summarizer_version": "2.0"
}
}
class ExportService:
"""Main service for handling summary exports"""
def __init__(self, export_dir: str = "/tmp/youtube_summarizer_exports"):
self.export_dir = Path(export_dir)
self.export_dir.mkdir(parents=True, exist_ok=True)
# Initialize format-specific exporters (will be imported later)
self.exporters: Dict[ExportFormat, BaseExporter] = {}
self._initialize_exporters()
# Track active exports
self.active_exports: Dict[str, ExportResult] = {}
def _initialize_exporters(self):
"""Initialize all available exporters"""
try:
from .exporters.markdown_exporter import MarkdownExporter
self.exporters[ExportFormat.MARKDOWN] = MarkdownExporter()
except ImportError:
pass
try:
from .exporters.pdf_exporter import PDFExporter
self.exporters[ExportFormat.PDF] = PDFExporter()
except ImportError:
pass
try:
from .exporters.text_exporter import PlainTextExporter
self.exporters[ExportFormat.PLAIN_TEXT] = PlainTextExporter()
except ImportError:
pass
try:
from .exporters.json_exporter import JSONExporter
self.exporters[ExportFormat.JSON] = JSONExporter()
except ImportError:
pass
try:
from .exporters.html_exporter import HTMLExporter
self.exporters[ExportFormat.HTML] = HTMLExporter()
except ImportError:
pass
async def export_summary(
self,
summary_data: Dict[str, Any],
request: ExportRequest
) -> ExportResult:
"""Export single summary"""
export_id = str(uuid.uuid4())
result = ExportResult(
export_id=export_id,
status=ExportStatus.PENDING,
format=request.format,
created_at=datetime.utcnow()
)
self.active_exports[export_id] = result
try:
result.status = ExportStatus.PROCESSING
# Check if exporter is available
if request.format not in self.exporters:
raise ValueError(f"Exporter for format {request.format.value} not available")
# Get appropriate exporter
exporter = self.exporters[request.format]
# Export the summary
file_path = await exporter.export(
summary_data=summary_data,
template=request.template,
branding=request.custom_branding
)
# Update result
result.file_path = file_path
result.file_size_bytes = os.path.getsize(file_path)
result.download_url = f"/api/export/download/{export_id}"
result.status = ExportStatus.COMPLETED
result.completed_at = datetime.utcnow()
except Exception as e:
result.status = ExportStatus.FAILED
result.error = str(e)
result.completed_at = datetime.utcnow()
return result
async def bulk_export_summaries(
self,
summaries_data: List[Dict[str, Any]],
request: BulkExportRequest
) -> ExportResult:
"""Export multiple summaries with organization"""
export_id = str(uuid.uuid4())
result = ExportResult(
export_id=export_id,
status=ExportStatus.PENDING,
format=ExportFormat.JSON, # Bulk exports are archives
created_at=datetime.utcnow()
)
self.active_exports[export_id] = result
try:
result.status = ExportStatus.PROCESSING
# Create temporary directory for bulk export
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
# Export each summary in requested formats
for summary_data in summaries_data:
await self._export_summary_to_bulk(
summary_data, request, temp_path
)
# Create ZIP archive
archive_path = self.export_dir / f"bulk_export_{export_id}.zip"
await self._create_archive(temp_path, archive_path)
result.file_path = str(archive_path)
result.file_size_bytes = os.path.getsize(archive_path)
result.download_url = f"/api/export/download/{export_id}"
result.status = ExportStatus.COMPLETED
result.completed_at = datetime.utcnow()
except Exception as e:
result.status = ExportStatus.FAILED
result.error = str(e)
result.completed_at = datetime.utcnow()
return result
async def _export_summary_to_bulk(
self,
summary_data: Dict[str, Any],
request: BulkExportRequest,
output_dir: Path
):
"""Export single summary to bulk export directory"""
video_title = summary_data.get("video_metadata", {}).get("title", "Unknown")
safe_title = self._sanitize_filename(video_title)
for format in request.formats:
if format not in self.exporters:
continue
exporter = self.exporters[format]
# Determine output path based on organization preference
if request.organize_by == "format":
format_dir = output_dir / format.value
format_dir.mkdir(exist_ok=True)
output_path = format_dir / f"{safe_title}.{exporter.get_file_extension()}"
elif request.organize_by == "date":
date_str = summary_data.get("created_at", "unknown")[:10] # YYYY-MM-DD
date_dir = output_dir / date_str
date_dir.mkdir(exist_ok=True)
output_path = date_dir / f"{safe_title}.{exporter.get_file_extension()}"
else: # organize by video
video_dir = output_dir / safe_title
video_dir.mkdir(exist_ok=True)
output_path = video_dir / f"{safe_title}.{exporter.get_file_extension()}"
# Export to specific format
temp_file = await exporter.export(
summary_data=summary_data,
template=request.template,
branding=request.custom_branding
)
# Move to organized location
import shutil
shutil.move(temp_file, str(output_path))
async def _create_archive(self, source_dir: Path, archive_path: Path):
"""Create ZIP archive from directory"""
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file_path in source_dir.rglob('*'):
if file_path.is_file():
arcname = file_path.relative_to(source_dir)
zipf.write(file_path, arcname)
def _sanitize_filename(self, filename: str) -> str:
"""Sanitize filename for filesystem compatibility"""
import re
import string
# First, remove control characters and null bytes
# Create a translation table that removes control characters
control_chars = ''.join(chr(i) for i in range(32))
control_chars += '\x7f' # DEL character
translator = str.maketrans('', '', control_chars)
filename = filename.translate(translator)
# Replace invalid filesystem characters with underscores
sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
# Limit length and strip whitespace
return sanitized[:100].strip()
def get_export_status(self, export_id: str) -> Optional[ExportResult]:
"""Get export status by ID"""
return self.active_exports.get(export_id)
async def cleanup_old_exports(self, max_age_hours: int = 24):
"""Clean up old export files"""
cutoff_time = datetime.utcnow().timestamp() - (max_age_hours * 3600)
for export_id, result in list(self.active_exports.items()):
if result.created_at and result.created_at.timestamp() < cutoff_time:
# Remove file if exists
if result.file_path and os.path.exists(result.file_path):
os.remove(result.file_path)
# Remove from active exports
del self.active_exports[export_id]