553 lines
20 KiB
Python
553 lines
20 KiB
Python
"""Unit tests for export functionality.

Tests cover JSON, TXT, SRT, and Markdown export formats with various scenarios
including error handling, file naming, and batch operations.
"""
|
|
|
|
import json
|
|
import tempfile
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
|
|
from src.services.export_service import (
|
|
ExportService,
|
|
ExportFormat,
|
|
ExportError,
|
|
format_timestamp,
|
|
format_duration,
|
|
convert_to_srt,
|
|
convert_to_markdown,
|
|
)
|
|
|
|
|
|
class TestExportService:
    """Test cases for ExportService.

    Covers single-transcript export in every format, default path
    generation, error reporting, batch export, large and non-ASCII
    payloads, and creation of missing output directories.

    NOTE(review): the coroutine tests carry no explicit asyncio marker;
    presumably the project's pytest configuration enables an async plugin
    in auto mode -- confirm.
    """

    @pytest.fixture
    def export_service(self):
        """Create ExportService instance for testing."""
        return ExportService()

    @pytest.fixture
    def sample_transcript(self) -> Dict[str, Any]:
        """Sample transcript with two segments spoken by two speakers."""
        return {
            "id": "test-123",
            "title": "Sample Podcast Episode",
            "media_file_id": "media-456",
            "pipeline_version": "v1",
            "content": {
                "text": "Hello world. This is a test transcript.",
                "language": "en",
                "duration": 120.5,
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 2.5,
                    "text": "Hello world.",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                },
                {
                    "start": 2.5,
                    "end": 5.0,
                    "text": "This is a test transcript.",
                    "confidence": 0.92,
                    "speaker": "Speaker 2",
                },
            ],
            "confidence_scores": [0.95, 0.92],
            "speaker_info": {
                "speakers": ["Speaker 1", "Speaker 2"],
                "speaker_count": 2,
            },
            "accuracy": 0.935,
            "word_count": 8,
            "processing_time": 15.2,
            "model_used": "whisper-1",
            "model_config": {"temperature": 0.0},
            "created_at": "2024-01-15T10:30:00Z",
            "updated_at": "2024-01-15T10:30:00Z",
        }

    @pytest.fixture
    def sample_media_file(self) -> Dict[str, Any]:
        """Sample media file record matching ``sample_transcript``'s media id."""
        return {
            "id": "media-456",
            "filename": "sample_podcast_episode.mp3",
            "local_path": "/path/to/sample_podcast_episode.mp3",
            "duration": 120.5,
            "file_size": 1024000,
        }

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_export_json_format(self, export_service, sample_transcript, temp_export_dir):
        """Test JSON export with full transcript data."""
        output_path = temp_export_dir / "test_export.json"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".json"

        # The exported document must round-trip the key transcript fields.
        exported_data = json.loads(result_path.read_text(encoding="utf-8"))

        assert exported_data["id"] == sample_transcript["id"]
        assert exported_data["title"] == sample_transcript["title"]
        assert exported_data["segments"] == sample_transcript["segments"]
        assert exported_data["content"] == sample_transcript["content"]

    async def test_export_txt_format(self, export_service, sample_transcript, temp_export_dir):
        """Test TXT export with plain text content."""
        output_path = temp_export_dir / "test_export.txt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".txt"

        # Surrounding whitespace is ignored; only the text itself is compared.
        content = result_path.read_text(encoding="utf-8")
        expected_text = "Hello world. This is a test transcript."
        assert content.strip() == expected_text

    async def test_export_srt_format(self, export_service, sample_transcript, temp_export_dir):
        """Test SRT export with timestamps."""
        output_path = temp_export_dir / "test_export.srt"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.SRT,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".srt"

        content = result_path.read_text(encoding="utf-8")

        expected_lines = [
            "1",
            "00:00:00,000 --> 00:00:02,500",
            "Hello world.",
            "",
            "2",
            "00:00:02,500 --> 00:00:05,000",
            "This is a test transcript.",
        ]

        actual_lines = content.split("\n")
        # Remove trailing empty lines so the comparison does not depend on
        # how many newlines terminate the file.
        while actual_lines and actual_lines[-1] == "":
            actual_lines.pop()
        assert actual_lines == expected_lines

    async def test_export_markdown_format(self, export_service, sample_transcript, temp_export_dir):
        """Test Markdown export with formatting."""
        output_path = temp_export_dir / "test_export.md"

        result_path = await export_service.export_transcript(
            transcript=sample_transcript,
            format=ExportFormat.MARKDOWN,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.suffix == ".md"

        content = result_path.read_text(encoding="utf-8")

        # Check for required sections
        assert "# Sample Podcast Episode" in content
        assert "## Metadata" in content
        assert "## Content" in content
        assert "### Speaker: Speaker 1" in content
        assert "### Speaker: Speaker 2" in content
        assert "**[00:00]** Hello world." in content
        assert "**[00:02]** This is a test transcript." in content

    async def test_export_with_default_path(self, export_service, sample_transcript, sample_media_file):
        """Test export with auto-generated default path.

        With no ``output_path`` given, the file is expected to land in
        ``export_dir`` and be named after the media file's stem.
        """
        with patch.object(
            export_service, '_get_media_file', return_value=sample_media_file
        ), tempfile.TemporaryDirectory() as temp_dir:
            export_service.export_dir = Path(temp_dir)

            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
            )

            # Assertions stay inside the context: the directory is deleted on exit.
            assert result_path.exists()
            assert result_path.name == "sample_podcast_episode.json"
            assert result_path.parent == Path(temp_dir)

    async def test_export_unsupported_format(self, export_service, sample_transcript, temp_export_dir):
        """Test export with unsupported format raises error."""
        output_path = temp_export_dir / "test_export.xyz"

        with pytest.raises(ExportError, match="Unsupported export format"):
            await export_service.export_transcript(
                transcript=sample_transcript,
                format="xyz",
                output_path=output_path,
            )

    async def test_export_file_system_error(self, export_service, sample_transcript):
        """Test export with file system error handling.

        The failure is provoked by putting a regular file where a directory
        is required. Unlike a path under ``/root``, this fails for every
        user (including root inside CI containers) and on every platform,
        and it leaves nothing behind outside the temporary directory.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            blocker = Path(temp_dir) / "not_a_directory"
            blocker.write_text("placeholder", encoding="utf-8")
            # Neither creating the parent directory nor opening the target
            # can succeed, because the parent is an existing regular file.
            invalid_path = blocker / "test.json"

            # NOTE(review): assumes ExportService reports any OSError as
            # ExportError("Export error ..."), as the permission-based
            # version of this test implied -- confirm against the service.
            with pytest.raises(ExportError, match="Export error"):
                await export_service.export_transcript(
                    transcript=sample_transcript,
                    format=ExportFormat.JSON,
                    output_path=invalid_path,
                )

    async def test_batch_export(self, export_service, sample_transcript, temp_export_dir):
        """Test batch export functionality."""
        transcripts = [sample_transcript] * 3
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        assert len(results) == 3
        assert all(result.exists() for result in results)
        assert all(result.suffix == ".json" for result in results)

    async def test_batch_export_with_errors(self, export_service, sample_transcript, temp_export_dir):
        """Test batch export with some failures."""
        # Create one invalid transcript
        invalid_transcript = {"invalid": "data"}
        transcripts = [sample_transcript, invalid_transcript, sample_transcript]
        output_dir = temp_export_dir / "batch_export"

        results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=output_dir,
        )

        # Should have 2 successful exports and 1 None for failure
        assert len(results) == 3
        assert results[0] is not None
        assert results[1] is None  # Invalid transcript
        assert results[2] is not None

    async def test_export_with_large_transcript(self, export_service, temp_export_dir):
        """Test export with very large transcript."""
        # Create large transcript with many segments
        large_transcript = {
            "id": "large-test",
            "title": "Large Transcript",
            "content": {"text": "Large content " * 1000},
            "segments": [
                {
                    "start": i * 10.0,
                    "end": (i + 1) * 10.0,
                    "text": f"Segment {i} " * 50,
                    "confidence": 0.9,
                    "speaker": f"Speaker {i % 3 + 1}",
                }
                for i in range(100)  # 100 segments
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        output_path = temp_export_dir / "large_export.json"

        result_path = await export_service.export_transcript(
            transcript=large_transcript,
            format=ExportFormat.JSON,
            output_path=output_path,
        )

        assert result_path.exists()
        assert result_path.stat().st_size > 10000  # Should be substantial size

    async def test_export_character_encoding(self, export_service, temp_export_dir):
        """Test export preserves character encoding."""
        transcript_with_unicode = {
            "id": "unicode-test",
            "title": "Unicode Test: 你好世界",
            "content": {"text": "Hello 你好世界 with unicode: ñáéíóú"},
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Hello 你好世界 with unicode: ñáéíóú",
                    "confidence": 0.95,
                    "speaker": "Speaker 1",
                }
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        output_path = temp_export_dir / "unicode_export.txt"

        result_path = await export_service.export_transcript(
            transcript=transcript_with_unicode,
            format=ExportFormat.TXT,
            output_path=output_path,
        )

        # Reading back as UTF-8 must yield the original non-ASCII text.
        content = result_path.read_text(encoding="utf-8")

        assert "你好世界" in content
        assert "ñáéíóú" in content

    async def test_export_directory_creation(self, export_service, sample_transcript):
        """Test export creates directory if it doesn't exist."""
        with tempfile.TemporaryDirectory() as temp_dir:
            new_export_dir = Path(temp_dir) / "new_export_dir"
            output_path = new_export_dir / "test.json"

            result_path = await export_service.export_transcript(
                transcript=sample_transcript,
                format=ExportFormat.JSON,
                output_path=output_path,
            )

            # Checked inside the context: the directory is deleted on exit.
            assert new_export_dir.exists()
            assert result_path.exists()
|
|
|
|
|
|
class TestExportUtilities:
    """Checks for the module-level formatting and conversion helpers."""

    def test_format_timestamp(self):
        """SRT timestamps are expected in ``HH:MM:SS,mmm`` form."""
        expectations = [
            (0.0, "00:00:00,000"),
            (61.5, "00:01:01,500"),
            (3661.123, "01:01:01,123"),
            (7325.789, "02:02:05,789"),
        ]
        for seconds, rendered in expectations:
            assert format_timestamp(seconds) == rendered

    def test_format_duration(self):
        """Durations under an hour are expected as ``MM:SS``, longer ones as ``HH:MM:SS``."""
        expectations = [
            (0.0, "00:00"),
            (61.5, "01:01"),
            (3661.123, "01:01:01"),
            (7325.789, "02:02:05"),
        ]
        for seconds, rendered in expectations:
            assert format_duration(seconds) == rendered

    def test_convert_to_srt(self):
        """Two segments become two numbered SRT cues separated by a blank line."""
        first_cue = {"start": 0.0, "end": 2.5, "text": "Hello world."}
        second_cue = {"start": 2.5, "end": 5.0, "text": "This is a test."}

        rendered = convert_to_srt({"segments": [first_cue, second_cue]})

        assert rendered == (
            "1\n00:00:00,000 --> 00:00:02,500\nHello world.\n\n"
            "2\n00:00:02,500 --> 00:00:05,000\nThis is a test.\n"
        )

    def test_convert_to_markdown(self):
        """Markdown output carries the title, both sections and per-speaker headings."""
        segments = [
            {"start": 0.0, "end": 2.5, "text": "Hello world.", "speaker": "Speaker 1"},
            {"start": 2.5, "end": 5.0, "text": "This is a test.", "speaker": "Speaker 2"},
        ]
        document = convert_to_markdown(
            {
                "title": "Test Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 120.5},
                "segments": segments,
            }
        )

        # Every fragment below has to appear somewhere in the rendered document.
        required_fragments = (
            "# Test Transcript",
            "## Metadata",
            "## Content",
            "### Speaker: Speaker 1",
            "### Speaker: Speaker 2",
            "**[00:00]** Hello world.",
            "**[00:02]** This is a test.",
        )
        for fragment in required_fragments:
            assert fragment in document

    def test_convert_to_markdown_no_speakers(self):
        """Segments without a ``speaker`` key must not produce speaker headings."""
        segments = [
            {"start": 0.0, "end": 2.5, "text": "Hello world."},
            {"start": 2.5, "end": 5.0, "text": "This is a test."},
        ]
        document = convert_to_markdown(
            {
                "title": "Test Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 120.5},
                "segments": segments,
            }
        )

        # No heading is expected, but the timestamped lines still are.
        assert "### Speaker:" not in document
        for fragment in ("**[00:00]** Hello world.", "**[00:02]** This is a test."):
            assert fragment in document

    def test_convert_to_markdown_empty_segments(self):
        """An empty segment list still yields the title and section headings."""
        document = convert_to_markdown(
            {
                "title": "Empty Transcript",
                "created_at": "2024-01-15T10:30:00Z",
                "content": {"duration": 0.0},
                "segments": [],
            }
        )

        for heading in ("# Empty Transcript", "## Metadata", "## Content"):
            assert heading in document
        # With nothing to render there must be no timestamped line at all.
        assert "**[00:00]**" not in document
|
|
|
|
|
|
class TestExportServiceIntegration:
    """Integration tests for ExportService.

    Each test exports one transcript in several formats and inspects the
    files that were written.

    NOTE(review): the coroutine tests carry no explicit asyncio marker;
    presumably the project's pytest configuration enables an async plugin
    in auto mode -- confirm.
    """

    @pytest.fixture
    def export_service(self):
        """Create a plain ExportService instance (nothing is mocked here)."""
        return ExportService()

    @pytest.fixture
    def temp_export_dir(self):
        """Yield a temporary directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield Path(temp_dir)

    async def test_full_export_workflow(self, export_service, temp_export_dir):
        """Test complete export workflow with all formats."""
        transcript = {
            "id": "workflow-test",
            "title": "Full Workflow Test",
            "content": {"text": "Complete workflow test content."},
            "segments": [
                {"start": 0.0, "end": 3.0, "text": "Complete workflow test content.", "speaker": "Speaker 1"}
            ],
            "created_at": "2024-01-15T10:30:00Z",
        }

        formats = [ExportFormat.JSON, ExportFormat.TXT, ExportFormat.SRT, ExportFormat.MARKDOWN]
        results = []

        # The loop variable is named export_format so that the builtin
        # ``format`` is not shadowed; the service keyword is still ``format=``.
        for export_format in formats:
            output_path = temp_export_dir / f"workflow_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )
            results.append(result)

        # Verify all exports succeeded
        assert len(results) == 4
        assert all(result.exists() for result in results)

        # Results are in the same order as ``formats``.
        json_size = results[0].stat().st_size
        txt_size = results[1].stat().st_size
        srt_size = results[2].stat().st_size
        md_size = results[3].stat().st_size

        # Plain text is the baseline; every richer format must be larger.
        assert json_size > txt_size  # JSON has more metadata
        assert md_size > txt_size  # Markdown has formatting
        assert srt_size > txt_size  # SRT has timestamps

    async def test_export_with_real_audio_metadata(self, export_service, temp_export_dir):
        """Test export with realistic audio metadata."""
        transcript = {
            "id": "real-audio-test",
            "title": "Tech Podcast Episode 42: AI and Machine Learning",
            "media_file_id": "audio-123",
            "content": {
                "text": "Welcome to Tech Podcast Episode 42. Today we're discussing AI and machine learning.",
                "language": "en",
                "duration": 3600.0,  # 1 hour
            },
            "segments": [
                {
                    "start": 0.0,
                    "end": 5.0,
                    "text": "Welcome to Tech Podcast Episode 42.",
                    "confidence": 0.98,
                    "speaker": "Host",
                },
                {
                    "start": 5.0,
                    "end": 10.0,
                    "text": "Today we're discussing AI and machine learning.",
                    "confidence": 0.95,
                    "speaker": "Host",
                },
            ],
            "accuracy": 0.965,
            "word_count": 12,
            "processing_time": 45.2,
            "model_used": "whisper-1",
            "created_at": "2024-01-15T10:30:00Z",
        }

        # Every member of ExportFormat is exported; content is only checked
        # for the four formats handled in the branches below.
        for export_format in ExportFormat:
            output_path = temp_export_dir / f"real_audio_test.{export_format.value}"
            result = await export_service.export_transcript(
                transcript=transcript,
                format=export_format,
                output_path=output_path,
            )

            assert result.exists()

            # Verify content is appropriate for format
            content = result.read_text(encoding="utf-8")

            if export_format == ExportFormat.JSON:
                data = json.loads(content)
                assert data["title"] == transcript["title"]
                assert data["segments"] == transcript["segments"]
            elif export_format == ExportFormat.TXT:
                assert "Welcome to Tech Podcast Episode 42" in content
            elif export_format == ExportFormat.SRT:
                assert "00:00:00,000 --> 00:00:05,000" in content
            elif export_format == ExportFormat.MARKDOWN:
                assert "# Tech Podcast Episode 42: AI and Machine Learning" in content
                assert "### Speaker: Host" in content
|