"""Unit tests for export functionality.

Tests cover JSON, TXT, SRT, and Markdown export formats with various
scenarios including error handling, file naming, and batch operations.

NOTE(review): the ``async def`` tests below carry no explicit asyncio marker,
so they presumably rely on an async pytest plugin configured elsewhere (for
example pytest-asyncio in auto mode) -- confirm against the project's pytest
configuration.
"""

import json
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, List
from unittest.mock import AsyncMock, patch

import pytest

from src.services.export_service import (
    ExportService,
    ExportFormat,
    ExportError,
    format_timestamp,
    format_duration,
    convert_to_srt,
    convert_to_markdown,
)


@pytest.fixture
def export_service():
    """Return a fresh ``ExportService`` built with its default constructor."""
    return ExportService()


@pytest.fixture
def temp_export_dir():
    """Yield a temporary directory (as a ``Path``) that is removed afterwards."""
    with tempfile.TemporaryDirectory() as temp_dir:
        yield Path(temp_dir)


class TestExportService:
    """Test cases for ExportService."""

    @pytest.fixture
    def sample_transcript(self) -> Dict[str, Any]:
        """Sample transcript data for testing.

        Two segments with distinct speakers; the concatenated segment texts
        equal ``content["text"]``.
        """
        return {
            "id": "test-123",
            "title": "Sample Podcast Episode",
            "media_file_id": "media-456",
            "pipeline_version": "v1",
            "content": {
                "text": "Hello world. This is a test transcript.",
                "language": "en",
                "duration": 120.5,
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 2.5,
                    "text": "Hello world.",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                },
                {
                    "start": 2.5,
                    "end": 5.0,
                    "text": "This is a test transcript.",
                    "confidence": 0.92,
                    "speaker": "Speaker 2",
                },
            ],
            "confidence_scores": [0.95, 0.92],
            "speaker_info": {
                "speakers": ["Speaker 1", "Speaker 2"],
                "speaker_count": 2,
            },
            "accuracy": 0.935,
            "word_count": 8,
            "processing_time": 15.2,
            "model_used": "whisper-1",
            "model_config": {"temperature": 0.0},
            "created_at": "2024-01-15T10:30:00Z",
            "updated_at": "2024-01-15T10:30:00Z",
        }

    @pytest.fixture
    def sample_media_file(self) -> Dict[str, Any]:
        """Sample media file data for testing."""
        return {
            "id": "media-456",
            "filename": "sample_podcast_episode.mp3",
            "local_path": "/path/to/sample_podcast_episode.mp3",
            "duration": 120.5,
            "file_size": 1024000,
        }

    async def test_export_json_format(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test JSON export with full transcript data."""
        output_path = temp_export_dir / "test_export.json"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".json"

        # Verify JSON content round-trips the key transcript fields.
        exported_data = json.loads(result_path.read_text(encoding="utf-8"))
        assert exported_data["id"] == sample_transcript["id"]
        assert exported_data["title"] == sample_transcript["title"]
        assert exported_data["segments"] == sample_transcript["segments"]
        assert exported_data["content"] == sample_transcript["content"]

    async def test_export_txt_format(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test TXT export with plain text content."""
        output_path = temp_export_dir / "test_export.txt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".txt"

        # Verify text content (surrounding whitespace is not significant).
        content = result_path.read_text(encoding="utf-8")
        expected_text = "Hello world. This is a test transcript."
        assert content.strip() == expected_text

    async def test_export_srt_format(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test SRT export with timestamps."""
        output_path = temp_export_dir / "test_export.srt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.SRT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".srt"

        # Verify SRT content
        content = result_path.read_text(encoding="utf-8")
        expected_lines = [
            "1",
            "00:00:00,000 --> 00:00:02,500",
            "Hello world.",
            "",
            "2",
            "00:00:02,500 --> 00:00:05,000",
            "This is a test transcript.",
        ]

        actual_lines = content.split("\n")
        # Remove trailing empty lines so the number of final newlines the
        # exporter writes does not affect the comparison.
        while actual_lines and actual_lines[-1] == "":
            actual_lines.pop()

        assert actual_lines == expected_lines

    async def test_export_markdown_format(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test Markdown export with formatting."""
        output_path = temp_export_dir / "test_export.md"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.MARKDOWN,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".md"

        # Verify Markdown content: check for required sections.
        content = result_path.read_text(encoding="utf-8")
        assert "# Sample Podcast Episode" in content
        assert "## Metadata" in content
        assert "## Content" in content
        assert "### Speaker: Speaker 1" in content
        assert "### Speaker: Speaker 2" in content
        assert "**[00:00]** Hello world." in content
        assert "**[00:02]** This is a test transcript." in content

    async def test_export_with_default_path(
        self, export_service, sample_transcript, sample_media_file, temp_export_dir
    ):
        """Test export with auto-generated default path.

        No ``output_path`` is passed; the expected file name is the media
        file's name with the export format's extension, inside ``export_dir``.
        """
        with patch.object(
            export_service, '_get_media_file', return_value=sample_media_file
        ):
            export_service.export_dir = temp_export_dir

            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
            )

            assert result_path.exists()
            assert result_path.name == "sample_podcast_episode.json"
            assert result_path.parent == temp_export_dir

    async def test_export_unsupported_format(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test export with unsupported format raises error."""
        output_path = temp_export_dir / "test_export.xyz"

        with pytest.raises(ExportError, match="Unsupported export format"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format="xyz",
                output_path=output_path,
            )

    async def test_export_file_system_error(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test export with file system error handling.

        The output path is routed *through a regular file*, so neither
        creating the parent directory nor opening the target can succeed.
        Unlike a hard-coded protected location such as ``/root/...``, this
        fails the same way regardless of the user running the tests
        (including root) and of the operating system.
        """
        blocker = temp_export_dir / "not_a_directory"
        blocker.write_text("placeholder", encoding="utf-8")
        invalid_path = blocker / "invalid_path" / "test.json"

        with pytest.raises(ExportError, match="Export error"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
                output_path=invalid_path,
            )

    async def test_batch_export(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test batch export functionality."""
        transcripts = [sample_transcript] * 3
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        assert len(results) == 3
        assert all(result.exists() for result in results)
        assert all(result.suffix == ".json" for result in results)

    async def test_batch_export_with_errors(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test batch export with some failures."""
        # Create one invalid transcript, placed between two valid ones.
        invalid_transcript = {"invalid": "data"}
        transcripts = [sample_transcript, invalid_transcript, sample_transcript]
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        # Should have 2 successful exports and 1 None for failure,
        # positionally aligned with the input list.
        assert len(results) == 3
        assert results[0] is not None
        assert results[1] is None  # Invalid transcript
        assert results[2] is not None

    async def test_export_with_large_transcript(
        self, export_service, temp_export_dir
    ):
        """Test export with very large transcript."""
        # Create large transcript with many segments
        large_transcript = {
            "id": "large-test",
            "title": "Large Transcript",
            "content": {"text": "Large content " * 1000},
            "segments": [
                {
                    "start": i * 10.0,
                    "end": (i + 1) * 10.0,
                    "text": f"Segment {i} " * 50,
                    "confidence": 0.9,
                    "speaker": f"Speaker {i % 3 + 1}",
                }
                for i in range(100)  # 100 segments
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        output_path = temp_export_dir / "large_export.json"

        result_path = await export_service.export_transcript(
            transcript=large_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.stat().st_size > 10000  # Should be substantial size

    async def test_export_character_encoding(
        self, export_service, temp_export_dir
    ):
        """Test export preserves character encoding."""
        transcript_with_unicode = {
            "id": "unicode-test",
            "title": "Unicode Test: 你好世界",
            "content": {"text": "Hello 你好世界 with unicode: ñáéíóú"},
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Hello 你好世界 with unicode: ñáéíóú",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                }
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        output_path = temp_export_dir / "unicode_export.txt"

        result_path = await export_service.export_transcript(
            transcript=transcript_with_unicode,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        # Verify encoding is preserved when the file is decoded as UTF-8.
        content = result_path.read_text(encoding="utf-8")
        assert "你好世界" in content
        assert "ñáéíóú" in content

    async def test_export_directory_creation(
        self, export_service, sample_transcript, temp_export_dir
    ):
        """Test export creates directory if it doesn't exist."""
        new_export_dir = temp_export_dir / "new_export_dir"
        output_path = new_export_dir / "test.json"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert new_export_dir.exists()
        assert result_path.exists()


class TestExportUtilities:
    """Test cases for export utility functions."""

    @pytest.mark.parametrize(
        ("seconds", "expected"),
        [
            (0.0, "00:00:00,000"),
            (61.5, "00:01:01,500"),
            (3661.123, "01:01:01,123"),
            (7325.789, "02:02:05,789"),
        ],
    )
    def test_format_timestamp(self, seconds, expected):
        """Test timestamp formatting for SRT."""
        assert format_timestamp(seconds) == expected

    @pytest.mark.parametrize(
        ("seconds", "expected"),
        [
            (0.0, "00:00"),
            (61.5, "01:01"),
            (3661.123, "01:01:01"),
            (7325.789, "02:02:05"),
        ],
    )
    def test_format_duration(self, seconds, expected):
        """Test duration formatting for Markdown."""
        assert format_duration(seconds) == expected

    def test_convert_to_srt(self):
        """Test SRT conversion."""
        transcript = {
            "segments": [
                {"start": 0.0, "end": 2.5, "text": "Hello world."},
                {"start": 2.5, "end": 5.0, "text": "This is a test."},
            ]
        }

        srt_content = convert_to_srt(transcript)

        expected = (
            "1\n00:00:00,000 --> 00:00:02,500\nHello world.\n\n"
            "2\n00:00:02,500 --> 00:00:05,000\nThis is a test.\n"
        )
        assert srt_content == expected

    def test_convert_to_markdown(self):
        """Test Markdown conversion."""
        transcript = {
            "title": "Test Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 120.5},
            "segments": [
                {
                    "start": 0.0,
                    "end": 2.5,
                    "text": "Hello world.",
                    "speaker": "Speaker 1",
                },
                {
                    "start": 2.5,
                    "end": 5.0,
                    "text": "This is a test.",
                    "speaker": "Speaker 2",
                },
            ],
        }

        md_content = convert_to_markdown(transcript)

        # Check required sections
        assert "# Test Transcript" in md_content
        assert "## Metadata" in md_content
        assert "## Content" in md_content
        assert "### Speaker: Speaker 1" in md_content
        assert "### Speaker: Speaker 2" in md_content
        assert "**[00:00]** Hello world." in md_content
        assert "**[00:02]** This is a test." in md_content

    def test_convert_to_markdown_no_speakers(self):
        """Test Markdown conversion without speaker information."""
        transcript = {
            "title": "Test Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 120.5},
            "segments": [
                {"start": 0.0, "end": 2.5, "text": "Hello world."},
                {"start": 2.5, "end": 5.0, "text": "This is a test."},
            ],
        }

        md_content = convert_to_markdown(transcript)

        # Should not have speaker sections
        assert "### Speaker:" not in md_content
        assert "**[00:00]** Hello world." in md_content
        assert "**[00:02]** This is a test." in md_content

    def test_convert_to_markdown_empty_segments(self):
        """Test Markdown conversion with empty segments."""
        transcript = {
            "title": "Empty Transcript",
            "created_at": "2024-01-15T10:30:00Z",
            "content": {"duration": 0.0},
            "segments": [],
        }

        md_content = convert_to_markdown(transcript)

        assert "# Empty Transcript" in md_content
        assert "## Metadata" in md_content
        assert "## Content" in md_content
        # Should not have any segment content
        assert "**[00:00]**" not in md_content


class TestExportServiceIntegration:
    """Integration tests for ExportService."""

    async def test_full_export_workflow(self, export_service, temp_export_dir):
        """Test complete export workflow with all formats."""
        transcript = {
            "id": "workflow-test",
            "title": "Full Workflow Test",
            "content": {"text": "Complete workflow test content."},
            "segments": [
                {
                    "start": 0.0,
                    "end": 3.0,
                    "text": "Complete workflow test content.",
                    "speaker": "Speaker 1",
                }
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        formats = [
            ExportFormat.JSON,
            ExportFormat.TXT,
            ExportFormat.SRT,
            ExportFormat.MARKDOWN,
        ]
        results = []

        # ``export_format`` rather than ``format`` to avoid shadowing the
        # builtin; the keyword argument name belongs to the service API.
        for export_format in formats:
            output_path = temp_export_dir / f"workflow_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )
            results.append(result)

        # Verify all exports succeeded
        assert len(results) == 4
        assert all(result.exists() for result in results)

        # Verify file sizes are appropriate (order matches ``formats``).
        json_size, txt_size, srt_size, md_size = (
            result.stat().st_size for result in results
        )

        assert json_size > txt_size  # JSON has more metadata
        assert md_size > txt_size  # Markdown has formatting
        assert srt_size > txt_size  # SRT has timestamps

    async def test_export_with_real_audio_metadata(
        self, export_service, temp_export_dir
    ):
        """Test export with realistic audio metadata."""
        transcript = {
            "id": "real-audio-test",
            "title": "Tech Podcast Episode 42: AI and Machine Learning",
            "media_file_id": "audio-123",
            "content": {
                "text": "Welcome to Tech Podcast Episode 42. Today we're discussing AI and machine learning.",
                "language": "en",
                "duration": 3600.0,  # 1 hour
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Welcome to Tech Podcast Episode 42.",
                    "confidence": 0.98,
                    "speaker": "Host",
                },
                {
                    "start": 5.0,
                    "end": 10.0,
                    "text": "Today we're discussing AI and machine learning.",
                    "confidence": 0.95,
                    "speaker": "Host",
                },
            ],
            "accuracy": 0.965,
            "word_count": 12,
            "processing_time": 45.2,
            "model_used": "whisper-1",
            "created_at": "2024-01-15T10:30:00Z",
        }

        # Test all formats the enum defines; formats without a dedicated
        # branch below are only checked for existence and UTF-8 readability.
        for export_format in ExportFormat:
            output_path = temp_export_dir / f"real_audio_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )

            assert result.exists()

            # Verify content is appropriate for format
            content = result.read_text(encoding="utf-8")

            if export_format == ExportFormat.JSON:
                data = json.loads(content)
                assert data["title"] == transcript["title"]
                assert data["segments"] == transcript["segments"]
            elif export_format == ExportFormat.TXT:
                assert "Welcome to Tech Podcast Episode 42" in content
            elif export_format == ExportFormat.SRT:
                assert "00:00:00,000 --> 00:00:05,000" in content
            elif export_format == ExportFormat.MARKDOWN:
                assert "# Tech Podcast Episode 42: AI and Machine Learning" in content
                assert "### Speaker: Host" in content