# trax/tests/test_export_service.py
#
# 553 lines
# 20 KiB
# Python

"""Unit tests for export functionality.
Tests cover JSON, TXT, SRT, and Markdown export formats with various scenarios
including error handling, file naming, and batch operations.
"""
import json
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Any, List
from unittest.mock import AsyncMock, patch
import pytest
from src.services.export_service import (
ExportService,
ExportFormat,
ExportError,
format_timestamp,
format_duration,
convert_to_srt,
convert_to_markdown,
)
@pytest.mark.asyncio
class TestExportService:
    """Test cases for ExportService.

    Covers single-transcript export in each format, auto-generated output
    paths, error reporting, batch export, large and non-ASCII payloads, and
    creation of a missing output directory.

    Every test in this class is a coroutine, so the class carries the asyncio
    marker.
    NOTE(review): this relies on the pytest-asyncio plugin; the marker is
    redundant but harmless if the project sets ``asyncio_mode = auto``.
    """

    @pytest.fixture
    def export_service(self):
        """Create ExportService instance for testing."""
        return ExportService()

    @pytest.fixture
    def sample_transcript(self) -> Dict[str, Any]:
        """Sample transcript data for testing.

        Two segments with distinct speakers, plus content text, metadata and
        timestamps, so that every export format has something to render.
        """
        return {
            "id": "test-123",
            "title": "Sample Podcast Episode",
            "media_file_id": "media-456",
            "pipeline_version": "v1",
            "content": {
                "text": "Hello world. This is a test transcript.",
                "language": "en",
                "duration": 120.5,
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 2.5,
                    "text": "Hello world.",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                },
                {
                    "start": 2.5,
                    "end": 5.0,
                    "text": "This is a test transcript.",
                    "confidence": 0.92,
                    "speaker": "Speaker 2",
                },
            ],
            "confidence_scores": [0.95, 0.92],
            "speaker_info": {
                "speakers": ["Speaker 1", "Speaker 2"],
                "speaker_count": 2,
            },
            "accuracy": 0.935,
            "word_count": 8,
            "processing_time": 15.2,
            "model_used": "whisper-1",
            "model_config": {"temperature": 0.0},
            "created_at": "2024-01-15T10:30:00Z",
            "updated_at": "2024-01-15T10:30:00Z",
        }

    @pytest.fixture
    def sample_media_file(self) -> Dict[str, Any]:
        """Sample media file data for testing."""
        return {
            "id": "media-456",
            "filename": "sample_podcast_episode.mp3",
            "local_path": "/path/to/sample_podcast_episode.mp3",
            "duration": 120.5,
            "file_size": 1024000,
        }

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory as a Path; it is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_export_json_format(self, export_service, sample_transcript, temp_export_dir):
        """Test JSON export with full transcript data."""
        output_path = temp_export_dir / "test_export.json"
        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )
        assert result_path.exists()
        assert result_path.suffix == ".json"

        # The exported file must load back to the same key fields.
        exported_data = json.loads(result_path.read_text(encoding="utf-8"))
        assert exported_data["id"] == sample_transcript["id"]
        assert exported_data["title"] == sample_transcript["title"]
        assert exported_data["segments"] == sample_transcript["segments"]
        assert exported_data["content"] == sample_transcript["content"]

    async def test_export_txt_format(self, export_service, sample_transcript, temp_export_dir):
        """Test TXT export with plain text content."""
        output_path = temp_export_dir / "test_export.txt"
        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.TXT,
            output_path=output_path,
        )
        assert result_path.exists()
        assert result_path.suffix == ".txt"

        # Surrounding whitespace is ignored; only the transcript text matters.
        content = result_path.read_text(encoding="utf-8")
        expected_text = "Hello world. This is a test transcript."
        assert content.strip() == expected_text

    async def test_export_srt_format(self, export_service, sample_transcript, temp_export_dir):
        """Test SRT export with timestamps."""
        output_path = temp_export_dir / "test_export.srt"
        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.SRT,
            output_path=output_path,
        )
        assert result_path.exists()
        assert result_path.suffix == ".srt"

        content = result_path.read_text(encoding="utf-8")
        expected_lines = [
            "1",
            "00:00:00,000 --> 00:00:02,500",
            "Hello world.",
            "",
            "2",
            "00:00:02,500 --> 00:00:05,000",
            "This is a test transcript.",
        ]
        actual_lines = content.split("\n")
        # Remove trailing empty lines so the number of final newlines is irrelevant.
        while actual_lines and actual_lines[-1] == "":
            actual_lines.pop()
        assert actual_lines == expected_lines

    async def test_export_markdown_format(self, export_service, sample_transcript, temp_export_dir):
        """Test Markdown export with formatting."""
        output_path = temp_export_dir / "test_export.md"
        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.MARKDOWN,
            output_path=output_path,
        )
        assert result_path.exists()
        assert result_path.suffix == ".md"

        content = result_path.read_text(encoding="utf-8")
        # Check for required sections
        assert "# Sample Podcast Episode" in content
        assert "## Metadata" in content
        assert "## Content" in content
        assert "### Speaker: Speaker 1" in content
        assert "### Speaker: Speaker 2" in content
        assert "**[00:00]** Hello world." in content
        assert "**[00:02]** This is a test transcript." in content

    async def test_export_with_default_path(self, export_service, sample_transcript, sample_media_file):
        """Test export with auto-generated default path.

        No ``output_path`` is passed; the file is expected to land in
        ``export_service.export_dir`` under the name
        ``sample_podcast_episode.json``.
        NOTE(review): presumably the name is derived from the patched media
        file's filename; confirm against the service.
        """
        with patch.object(
            export_service, "_get_media_file", return_value=sample_media_file
        ), tempfile.TemporaryDirectory() as temp_dir:
            export_service.export_dir = Path(temp_dir)
            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
            )
            assert result_path.exists()
            assert result_path.name == "sample_podcast_episode.json"
            assert result_path.parent == Path(temp_dir)

    async def test_export_unsupported_format(self, export_service, sample_transcript, temp_export_dir):
        """Test export with unsupported format raises error."""
        output_path = temp_export_dir / "test_export.xyz"
        with pytest.raises(ExportError, match="Unsupported export format"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format="xyz",
                output_path=output_path,
            )

    async def test_export_file_system_error(self, export_service, sample_transcript, temp_export_dir):
        """Test export with file system error handling.

        The target's parent is a regular file, so neither creating the parent
        directory nor opening the target can succeed. Unlike a path under
        ``/root``, this fails regardless of the user running the tests
        (including root in CI containers) and of the operating system.
        NOTE(review): assumes the service wraps any ``OSError`` in
        ``ExportError``, not only ``PermissionError``; confirm.
        """
        blocker = temp_export_dir / "not_a_directory"
        blocker.write_text("occupied", encoding="utf-8")
        invalid_path = blocker / "test.json"
        with pytest.raises(ExportError, match="Export error"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
                output_path=invalid_path,
            )

    async def test_batch_export(self, export_service, sample_transcript, temp_export_dir):
        """Test batch export functionality."""
        transcripts = [sample_transcript] * 3
        output_dir = temp_export_dir / "batch_export"
        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )
        assert len(results) == 3
        assert all(result.exists() for result in results)
        assert all(result.suffix == ".json" for result in results)

    async def test_batch_export_with_errors(self, export_service, sample_transcript, temp_export_dir):
        """Test batch export with some failures."""
        # The middle entry lacks every expected transcript key.
        invalid_transcript = {"invalid": "data"}
        transcripts = [sample_transcript, invalid_transcript, sample_transcript]
        output_dir = temp_export_dir / "batch_export"
        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )
        # Results stay aligned with the input: a failure is reported as None
        # in its slot rather than being dropped.
        assert len(results) == 3
        assert results[0] is not None
        assert results[1] is None  # Invalid transcript
        assert results[2] is not None

    async def test_export_with_large_transcript(self, export_service, temp_export_dir):
        """Test export with very large transcript."""
        large_transcript = {
            "id": "large-test",
            "title": "Large Transcript",
            "content": {"text": "Large content " * 1000},
            "segments": [
                {
                    "start": i * 10.0,
                    "end": (i + 1) * 10.0,
                    "text": f"Segment {i} " * 50,
                    "confidence": 0.9,
                    "speaker": f"Speaker {i % 3 + 1}",
                }
                for i in range(100)  # 100 segments
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }
        output_path = temp_export_dir / "large_export.json"
        result_path = await export_service.export_transcript(
            transcript=large_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )
        assert result_path.exists()
        assert result_path.stat().st_size > 10000  # Should be substantial size

    async def test_export_character_encoding(self, export_service, temp_export_dir):
        """Test export preserves character encoding."""
        transcript_with_unicode = {
            "id": "unicode-test",
            "title": "Unicode Test: 你好世界",
            "content": {"text": "Hello 你好世界 with unicode: ñáéíóú"},
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Hello 你好世界 with unicode: ñáéíóú",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                }
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }
        output_path = temp_export_dir / "unicode_export.txt"
        result_path = await export_service.export_transcript(
            transcript=transcript_with_unicode,
            format=ExportFormat.TXT,
            output_path=output_path,
        )
        # Reading back as UTF-8 must yield the original non-ASCII characters.
        content = result_path.read_text(encoding="utf-8")
        assert "你好世界" in content
        assert "ñáéíóú" in content

    async def test_export_directory_creation(self, export_service, sample_transcript):
        """Test export creates directory if it doesn't exist."""
        with tempfile.TemporaryDirectory() as temp_dir:
            new_export_dir = Path(temp_dir) / "new_export_dir"
            output_path = new_export_dir / "test.json"
            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
                output_path=output_path,
            )
            assert new_export_dir.exists()
            assert result_path.exists()
class TestExportUtilities:
    """Checks for the module-level formatting and conversion helpers."""

    def test_format_timestamp(self):
        """SRT timestamps render as HH:MM:SS,mmm for the sampled values."""
        cases = (
            (0.0, "00:00:00,000"),
            (61.5, "00:01:01,500"),
            (3661.123, "01:01:01,123"),
            (7325.789, "02:02:05,789"),
        )
        for seconds, expected in cases:
            assert format_timestamp(seconds) == expected

    def test_format_duration(self):
        """Durations render as MM:SS below one hour and HH:MM:SS from one hour up."""
        cases = (
            (0.0, "00:00"),
            (61.5, "01:01"),
            (3661.123, "01:01:01"),
            (7325.789, "02:02:05"),
        )
        for seconds, expected in cases:
            assert format_duration(seconds) == expected

    def test_convert_to_srt(self):
        """Two segments become two numbered SRT cues separated by a blank line."""
        segments = [
            {"start": 0.0, "end": 2.5, "text": "Hello world."},
            {"start": 2.5, "end": 5.0, "text": "This is a test."},
        ]
        expected = (
            "1\n00:00:00,000 --> 00:00:02,500\nHello world.\n\n"
            "2\n00:00:02,500 --> 00:00:05,000\nThis is a test.\n"
        )
        assert convert_to_srt({"segments": segments}) == expected

    def test_convert_to_markdown(self):
        """Markdown output carries the title, both sections, speakers and cues."""
        md_content = convert_to_markdown(
            {
                "title": "Test Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 120.5},
                "segments": [
                    {"start": 0.0, "end": 2.5, "text": "Hello world.", "speaker": "Speaker 1"},
                    {"start": 2.5, "end": 5.0, "text": "This is a test.", "speaker": "Speaker 2"},
                ],
            }
        )
        required_fragments = (
            "# Test Transcript",
            "## Metadata",
            "## Content",
            "### Speaker: Speaker 1",
            "### Speaker: Speaker 2",
            "**[00:00]** Hello world.",
            "**[00:02]** This is a test.",
        )
        for fragment in required_fragments:
            assert fragment in md_content

    def test_convert_to_markdown_no_speakers(self):
        """Segments without a speaker key produce cues but no speaker headings."""
        md_content = convert_to_markdown(
            {
                "title": "Test Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 120.5},
                "segments": [
                    {"start": 0.0, "end": 2.5, "text": "Hello world."},
                    {"start": 2.5, "end": 5.0, "text": "This is a test."},
                ],
            }
        )
        # No speaker heading may appear anywhere in the output.
        assert "### Speaker:" not in md_content
        for cue in ("**[00:00]** Hello world.", "**[00:02]** This is a test."):
            assert cue in md_content

    def test_convert_to_markdown_empty_segments(self):
        """An empty segment list still yields the headings, with no cue lines."""
        md_content = convert_to_markdown(
            {
                "title": "Empty Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 0.0},
                "segments": [],
            }
        )
        for heading in ("# Empty Transcript", "## Metadata", "## Content"):
            assert heading in md_content
        # With no segments there is nothing to timestamp.
        assert "**[00:00]**" not in md_content
@pytest.mark.asyncio
class TestExportServiceIntegration:
    """Integration tests for ExportService.

    Each test writes real files into a temporary directory and inspects what
    was written.

    Both tests are coroutines, so the class carries the asyncio marker.
    NOTE(review): this relies on the pytest-asyncio plugin; the marker is
    redundant but harmless if the project sets ``asyncio_mode = auto``.
    """

    @pytest.fixture
    def export_service(self):
        """Create a default-constructed ExportService.

        NOTE(review): the original docstring mentioned mocked dependencies,
        but nothing is mocked here.
        """
        return ExportService()

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory as a Path; it is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_full_export_workflow(self, export_service, temp_export_dir):
        """Test complete export workflow with all formats."""
        transcript = {
            "id": "workflow-test",
            "title": "Full Workflow Test",
            "content": {"text": "Complete workflow test content."},
            "segments": [
                {"start": 0.0, "end": 3.0, "text": "Complete workflow test content.", "speaker": "Speaker 1"}
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }
        formats = [ExportFormat.JSON, ExportFormat.TXT, ExportFormat.SRT, ExportFormat.MARKDOWN]
        results = []
        # ``export_format`` rather than ``format`` so the builtin is not shadowed;
        # the ``format=`` keyword below is the service's own parameter name.
        for export_format in formats:
            output_path = temp_export_dir / f"workflow_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )
            results.append(result)

        # Verify all exports succeeded
        assert len(results) == 4
        assert all(result.exists() for result in results)

        # Results are in the same order as ``formats``.
        json_path, txt_path, srt_path, md_path = results
        json_size = json_path.stat().st_size
        txt_size = txt_path.stat().st_size
        srt_size = srt_path.stat().st_size
        md_size = md_path.stat().st_size
        # Plain text is the bare content, so every richer format must be larger.
        assert json_size > txt_size  # JSON has more metadata
        assert md_size > txt_size  # Markdown has formatting
        assert srt_size > txt_size  # SRT has timestamps

    async def test_export_with_real_audio_metadata(self, export_service, temp_export_dir):
        """Test export with realistic audio metadata."""
        transcript = {
            "id": "real-audio-test",
            "title": "Tech Podcast Episode 42: AI and Machine Learning",
            "media_file_id": "audio-123",
            "content": {
                "text": "Welcome to Tech Podcast Episode 42. Today we're discussing AI and machine learning.",
                "language": "en",
                "duration": 3600.0,  # 1 hour
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Welcome to Tech Podcast Episode 42.",
                    "confidence": 0.98,
                    "speaker": "Host",
                },
                {
                    "start": 5.0,
                    "end": 10.0,
                    "text": "Today we're discussing AI and machine learning.",
                    "confidence": 0.95,
                    "speaker": "Host",
                },
            ],
            "accuracy": 0.965,
            "word_count": 12,
            "processing_time": 45.2,
            "model_used": "whisper-1",
            "created_at": "2024-01-15T10:30:00Z",
        }
        # Iterate every member of the enum; only the four formats handled below
        # get a content check, any other member is only checked for existence.
        for export_format in ExportFormat:
            output_path = temp_export_dir / f"real_audio_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )
            assert result.exists()

            # Verify content is appropriate for format
            content = result.read_text(encoding="utf-8")
            if export_format == ExportFormat.JSON:
                data = json.loads(content)
                assert data["title"] == transcript["title"]
                assert data["segments"] == transcript["segments"]
            elif export_format == ExportFormat.TXT:
                assert "Welcome to Tech Podcast Episode 42" in content
            elif export_format == ExportFormat.SRT:
                assert "00:00:00,000 --> 00:00:05,000" in content
            elif export_format == ExportFormat.MARKDOWN:
                assert "# Tech Podcast Episode 42: AI and Machine Learning" in content
                assert "### Speaker: Host" in content