# trax/tests/test_export_service.py

"""Unit tests for export functionality.

Tests cover JSON, TXT, SRT, and Markdown export formats with various scenarios
including error handling, file naming, and batch operations.
"""

import json
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, List
from unittest.mock import AsyncMock, patch

import pytest

from src.services.export_service import (
    ExportService,
    ExportFormat,
    ExportError,
    format_timestamp,
    format_duration,
    convert_to_srt,
    convert_to_markdown,
)


class TestExportService:
    """Exercise ``ExportService`` against each supported output format.

    Covers single-transcript export (JSON, TXT, SRT, Markdown), default
    output-path generation, error reporting, batch export, large and
    non-ASCII input, and creation of a missing output directory.
    """

    @pytest.fixture
    def export_service(self):
        """Return a fresh ``ExportService`` instance for each test."""
        return ExportService()

    @pytest.fixture
    def sample_transcript(self) -> Dict[str, Any]:
        """Return a two-segment, two-speaker transcript used by most tests."""
        return {
            "id": "test-123",
            "title": "Sample Podcast Episode",
            "media_file_id": "media-456",
            "pipeline_version": "v1",
            "content": {
                "text": "Hello world. This is a test transcript.",
                "language": "en",
                "duration": 120.5
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 2.5,
                    "text": "Hello world.",
                    "confidence": 0.95,
                    "speaker": "Speaker 1"
                },
                {
                    "start": 2.5,
                    "end": 5.0,
                    "text": "This is a test transcript.",
                    "confidence": 0.92,
                    "speaker": "Speaker 2"
                }
            ],
            "confidence_scores": [0.95, 0.92],
            "speaker_info": {
                "speakers": ["Speaker 1", "Speaker 2"],
                "speaker_count": 2
            },
            "accuracy": 0.935,
            "word_count": 8,
            "processing_time": 15.2,
            "model_used": "whisper-1",
            "model_config": {"temperature": 0.0},
            "created_at": "2024-01-15T10:30:00Z",
            "updated_at": "2024-01-15T10:30:00Z"
        }

    @pytest.fixture
    def sample_media_file(self) -> Dict[str, Any]:
        """Return the media-file record referenced by ``sample_transcript``."""
        return {
            "id": "media-456",
            "filename": "sample_podcast_episode.mp3",
            "local_path": "/path/to/sample_podcast_episode.mp3",
            "duration": 120.5,
            "file_size": 1024000
        }

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_export_json_format(self, export_service, sample_transcript, temp_export_dir):
        """JSON export writes a file whose key fields round-trip unchanged."""
        output_path = temp_export_dir / "test_export.json"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".json"

        # Parse the written file and compare the fields we care about.
        exported_data = json.loads(result_path.read_text(encoding="utf-8"))
        for key in ("id", "title", "segments", "content"):
            assert exported_data[key] == sample_transcript[key]

    async def test_export_txt_format(self, export_service, sample_transcript, temp_export_dir):
        """TXT export writes only the plain transcript text."""
        output_path = temp_export_dir / "test_export.txt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".txt"

        content = result_path.read_text(encoding="utf-8")
        expected_text = "Hello world. This is a test transcript."
        assert content.strip() == expected_text

    async def test_export_srt_format(self, export_service, sample_transcript, temp_export_dir):
        """SRT export numbers each segment and emits its time range and text."""
        output_path = temp_export_dir / "test_export.srt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.SRT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".srt"

        expected_lines = [
            "1",
            "00:00:00,000 --> 00:00:02,500",
            "Hello world.",
            "",
            "2",
            "00:00:02,500 --> 00:00:05,000",
            "This is a test transcript."
        ]

        content = result_path.read_text(encoding="utf-8")
        # Trailing newlines are not significant for this comparison.
        actual_lines = content.rstrip("\n").split("\n")
        assert actual_lines == expected_lines

    async def test_export_markdown_format(self, export_service, sample_transcript, temp_export_dir):
        """Markdown export contains the title, section headers and segments."""
        output_path = temp_export_dir / "test_export.md"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.MARKDOWN,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".md"

        content = result_path.read_text(encoding="utf-8")

        required_fragments = (
            "# Sample Podcast Episode",
            "## Metadata",
            "## Content",
            "### Speaker: Speaker 1",
            "### Speaker: Speaker 2",
            "**[00:00]** Hello world.",
            "**[00:02]** This is a test transcript.",
        )
        for fragment in required_fragments:
            assert fragment in content

    async def test_export_with_default_path(
        self, export_service, sample_transcript, sample_media_file, temp_export_dir
    ):
        """Without ``output_path`` the file is named after the media file."""
        # The media-file lookup is patched so no real lookup is attempted.
        with patch.object(export_service, '_get_media_file', return_value=sample_media_file):
            export_service.export_dir = temp_export_dir

            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
            )

            assert result_path.exists()
            assert result_path.name == "sample_podcast_episode.json"
            assert result_path.parent == temp_export_dir

    async def test_export_unsupported_format(self, export_service, sample_transcript, temp_export_dir):
        """An unknown format value raises ``ExportError``."""
        output_path = temp_export_dir / "test_export.xyz"

        with pytest.raises(ExportError, match="Unsupported export format"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format="xyz",
                output_path=output_path,
            )

    async def test_export_file_system_error(self, export_service, sample_transcript, temp_export_dir):
        """A file-system failure while writing surfaces as ``ExportError``."""
        # Put the target underneath a regular file. Neither creating that
        # "directory" nor opening a file inside it can succeed for any user,
        # so the test does not depend on the privileges of the test runner
        # (a hard-coded "/root/..." path is writable when running as root).
        blocker = temp_export_dir / "not_a_directory"
        blocker.write_text("occupied", encoding="utf-8")
        invalid_path = blocker / "test.json"

        with pytest.raises(ExportError, match="Export error"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
                output_path=invalid_path,
            )

    async def test_batch_export(self, export_service, sample_transcript, temp_export_dir):
        """Batch export returns one existing ``.json`` path per transcript."""
        # The same transcript object is submitted three times.
        transcripts = [sample_transcript] * 3
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        assert len(results) == 3
        assert all(result.exists() for result in results)
        assert all(result.suffix == ".json" for result in results)

    async def test_batch_export_with_errors(self, export_service, sample_transcript, temp_export_dir):
        """A failing item yields ``None`` in its slot without aborting the batch."""
        invalid_transcript = {"invalid": "data"}
        transcripts = [sample_transcript, invalid_transcript, sample_transcript]
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        # Results stay aligned with the input order: success, failure, success.
        assert len(results) == 3
        assert results[0] is not None
        assert results[1] is None  # Invalid transcript
        assert results[2] is not None

    async def test_export_with_large_transcript(self, export_service, temp_export_dir):
        """A 100-segment transcript exports to a file larger than 10 000 bytes."""
        large_transcript = {
            "id": "large-test",
            "title": "Large Transcript",
            "content": {"text": "Large content " * 1000},
            "segments": [
                {
                    "start": i * 10.0,
                    "end": (i + 1) * 10.0,
                    "text": f"Segment {i} " * 50,
                    "confidence": 0.9,
                    "speaker": f"Speaker {i % 3 + 1}"
                }
                for i in range(100)  # 100 segments
            ],
            "created_at": "2024-01-15T10:30:00Z"
        }

        output_path = temp_export_dir / "large_export.json"

        result_path = await export_service.export_transcript(
            transcript=large_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.stat().st_size > 10000  # Should be substantial size

    async def test_export_character_encoding(self, export_service, temp_export_dir):
        """Non-ASCII text is readable as UTF-8 after a TXT export."""
        transcript_with_unicode = {
            "id": "unicode-test",
            "title": "Unicode Test: 你好世界",
            "content": {"text": "Hello 你好世界 with unicode: ñáéíóú"},
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Hello 你好世界 with unicode: ñáéíóú",
                    "confidence": 0.95,
                    "speaker": "Speaker 1"
                }
            ],
            "created_at": "2024-01-15T10:30:00Z"
        }

        output_path = temp_export_dir / "unicode_export.txt"

        result_path = await export_service.export_transcript(
            transcript=transcript_with_unicode,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        content = result_path.read_text(encoding="utf-8")

        assert "你好世界" in content
        assert "ñáéíóú" in content

    async def test_export_directory_creation(self, export_service, sample_transcript, temp_export_dir):
        """Exporting into a missing directory creates that directory."""
        new_export_dir = temp_export_dir / "new_export_dir"
        output_path = new_export_dir / "test.json"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert new_export_dir.exists()
        assert result_path.exists()


class TestExportUtilities:
    """Checks for the module-level formatting and conversion helpers."""

    def test_format_timestamp(self):
        """``format_timestamp`` renders seconds as ``HH:MM:SS,mmm`` for SRT."""
        cases = [
            (0.0, "00:00:00,000"),
            (61.5, "00:01:01,500"),
            (3661.123, "01:01:01,123"),
            (7325.789, "02:02:05,789"),
        ]
        for seconds, expected in cases:
            assert format_timestamp(seconds) == expected

    def test_format_duration(self):
        """``format_duration`` output for sub-hour and multi-hour samples.

        The sampled sub-hour values render as ``MM:SS`` and the sampled
        values above one hour render as ``HH:MM:SS``.
        """
        cases = [
            (0.0, "00:00"),
            (61.5, "01:01"),
            (3661.123, "01:01:01"),
            (7325.789, "02:02:05"),
        ]
        for seconds, expected in cases:
            assert format_duration(seconds) == expected

    def test_convert_to_srt(self):
        """``convert_to_srt`` emits numbered cues separated by a blank line."""
        first_cue = {"start": 0.0, "end": 2.5, "text": "Hello world."}
        second_cue = {"start": 2.5, "end": 5.0, "text": "This is a test."}
        transcript = {"segments": [first_cue, second_cue]}

        expected = (
            "1\n00:00:00,000 --> 00:00:02,500\nHello world.\n\n"
            "2\n00:00:02,500 --> 00:00:05,000\nThis is a test.\n"
        )

        assert convert_to_srt(transcript) == expected

    def test_convert_to_markdown(self):
        """Markdown output has title, section headers, speakers and segments."""
        transcript = {
            "title": "Test Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 120.5},
            "segments": [
                {"start": 0.0, "end": 2.5, "text": "Hello world.", "speaker": "Speaker 1"},
                {"start": 2.5, "end": 5.0, "text": "This is a test.", "speaker": "Speaker 2"}
            ]
        }

        md_content = convert_to_markdown(transcript)

        required_fragments = (
            "# Test Transcript",
            "## Metadata",
            "## Content",
            "### Speaker: Speaker 1",
            "### Speaker: Speaker 2",
            "**[00:00]** Hello world.",
            "**[00:02]** This is a test.",
        )
        for fragment in required_fragments:
            assert fragment in md_content

    def test_convert_to_markdown_no_speakers(self):
        """Segments without a ``speaker`` key produce no speaker headings."""
        transcript = {
            "title": "Test Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 120.5},
            "segments": [
                {"start": 0.0, "end": 2.5, "text": "Hello world."},
                {"start": 2.5, "end": 5.0, "text": "This is a test."}
            ]
        }

        md_content = convert_to_markdown(transcript)

        # No speaker heading may appear, but the segment lines must.
        assert "### Speaker:" not in md_content
        for fragment in ("**[00:00]** Hello world.", "**[00:02]** This is a test."):
            assert fragment in md_content

    def test_convert_to_markdown_empty_segments(self):
        """An empty segment list still yields the title and section headers."""
        transcript = {
            "title": "Empty Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 0.0},
            "segments": []
        }

        md_content = convert_to_markdown(transcript)

        for header in ("# Empty Transcript", "## Metadata", "## Content"):
            assert header in md_content
        # With no segments there must be no timestamped segment line.
        assert "**[00:00]**" not in md_content


class TestExportServiceIntegration:
    """Integration tests that write each export format to a real directory."""

    @pytest.fixture
    def export_service(self):
        """Return a plain ``ExportService``; nothing is mocked in this class."""
        return ExportService()

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_full_export_workflow(self, export_service, temp_export_dir):
        """Export one transcript to four formats and compare the file sizes."""
        transcript = {
            "id": "workflow-test",
            "title": "Full Workflow Test",
            "content": {"text": "Complete workflow test content."},
            "segments": [
                {"start": 0.0, "end": 3.0, "text": "Complete workflow test content.", "speaker": "Speaker 1"}
            ],
            "created_at": "2024-01-15T10:30:00Z"
        }

        formats = [ExportFormat.JSON, ExportFormat.TXT, ExportFormat.SRT, ExportFormat.MARKDOWN]
        results = []

        # ``export_format`` rather than ``format`` so the builtin is not shadowed.
        for export_format in formats:
            output_path = temp_export_dir / f"workflow_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )
            results.append(result)

        # Verify all exports succeeded
        assert len(results) == 4
        assert all(result.exists() for result in results)

        # Key the sizes by format so the comparisons do not rely on list order.
        sizes = {
            export_format: result.stat().st_size
            for export_format, result in zip(formats, results)
        }
        txt_size = sizes[ExportFormat.TXT]

        assert sizes[ExportFormat.JSON] > txt_size      # JSON has more metadata
        assert sizes[ExportFormat.MARKDOWN] > txt_size  # Markdown has formatting
        assert sizes[ExportFormat.SRT] > txt_size       # SRT has timestamps

    async def test_export_with_real_audio_metadata(self, export_service, temp_export_dir):
        """Export a podcast-like transcript in every ``ExportFormat`` member."""
        transcript = {
            "id": "real-audio-test",
            "title": "Tech Podcast Episode 42: AI and Machine Learning",
            "media_file_id": "audio-123",
            "content": {
                "text": "Welcome to Tech Podcast Episode 42. Today we're discussing AI and machine learning.",
                "language": "en",
                "duration": 3600.0  # 1 hour
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Welcome to Tech Podcast Episode 42.",
                    "confidence": 0.98,
                    "speaker": "Host"
                },
                {
                    "start": 5.0,
                    "end": 10.0,
                    "text": "Today we're discussing AI and machine learning.",
                    "confidence": 0.95,
                    "speaker": "Host"
                }
            ],
            "accuracy": 0.965,
            "word_count": 12,
            "processing_time": 45.2,
            "model_used": "whisper-1",
            "created_at": "2024-01-15T10:30:00Z"
        }

        # Iterate the enum itself so every declared format is exported.
        for export_format in ExportFormat:
            output_path = temp_export_dir / f"real_audio_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )

            assert result.exists()

            content = result.read_text(encoding="utf-8")

            # Format-specific content checks; other members are only
            # required to produce an existing, UTF-8 readable file.
            if export_format == ExportFormat.JSON:
                data = json.loads(content)
                assert data["title"] == transcript["title"]
                assert data["segments"] == transcript["segments"]
            elif export_format == ExportFormat.TXT:
                assert "Welcome to Tech Podcast Episode 42" in content
            elif export_format == ExportFormat.SRT:
                assert "00:00:00,000 --> 00:00:05,000" in content
            elif export_format == ExportFormat.MARKDOWN:
                assert "# Tech Podcast Episode 42: AI and Machine Learning" in content
                assert "### Speaker: Host" in content