# youtube-summarizer/backend/tests/unit/test_summary_pipeline.py
"""Unit tests for SummaryPipeline orchestration service."""
import pytest
import asyncio
from unittest.mock import Mock, AsyncMock, patch
from datetime import datetime, timedelta
from backend.services.summary_pipeline import SummaryPipeline
from backend.models.pipeline import (
PipelineStage, PipelineConfig, PipelineResult, ContentAnalysis
)
from backend.services.ai_service import SummaryResult
@pytest.fixture
def mock_video_service():
    """Provide a Mock standing in for VideoService.

    ``extract_video_id`` resolves to a fixed id and ``get_video_metadata``
    to a canned metadata payload; both are awaitable (AsyncMock).
    """
    canned_metadata = {
        "title": "Test Video",
        "description": "Test description",
        "duration": "PT10M30S",
        "category": "Education",
        "tags": ["test", "tutorial"],
        "language": "en",
    }
    svc = Mock()
    svc.extract_video_id = AsyncMock(return_value="test_video_id")
    svc.get_video_metadata = AsyncMock(return_value=canned_metadata)
    return svc
@pytest.fixture
def mock_transcript_service():
    """Provide a Mock TranscriptService whose extract_transcript yields a canned transcript."""
    transcript_text = (
        "This is a test transcript with educational content. "
        "We will learn about important concepts and examples."
    )
    svc = Mock()
    # Configure the return value directly on construction instead of assigning
    # ``.return_value`` afterwards — equivalent behavior, tighter setup.
    svc.extract_transcript = AsyncMock(return_value=Mock(transcript=transcript_text))
    return svc
@pytest.fixture
def mock_ai_service():
    """Provide a Mock AnthropicSummarizer that always returns one fixed SummaryResult."""
    canned_result = SummaryResult(
        summary="Test summary of the video content",
        key_points=["Point 1", "Point 2", "Point 3"],
        main_themes=["Theme 1", "Theme 2"],
        actionable_insights=["Insight 1"],
        confidence_score=0.85,
        processing_metadata={"tokens_used": 1000},
        cost_data={"total_cost": 0.01},
    )
    svc = Mock()
    svc.generate_summary = AsyncMock(return_value=canned_result)
    return svc
@pytest.fixture
def mock_cache_manager():
    """Provide a Mock CacheManager: every lookup misses, every write succeeds."""
    cache = Mock()
    # Lookups come back empty so the pipeline takes the non-cached path.
    for lookup_name in ("get_cached_pipeline_result", "get_cached_video_metadata"):
        setattr(cache, lookup_name, AsyncMock(return_value=None))
    # Writes report success.
    for writer_name in ("cache_pipeline_result", "cache_video_metadata"):
        setattr(cache, writer_name, AsyncMock(return_value=True))
    return cache
@pytest.fixture
def mock_notification_service():
    """Provide a Mock NotificationService; every send_* call reports success."""
    svc = Mock()
    for channel in (
        "send_completion_notification",
        "send_error_notification",
        "send_progress_notification",
    ):
        setattr(svc, channel, AsyncMock(return_value=True))
    return svc
@pytest.fixture
def pipeline(mock_video_service, mock_transcript_service, mock_ai_service,
             mock_cache_manager, mock_notification_service):
    """Assemble a SummaryPipeline wired entirely to mocked collaborators."""
    dependencies = {
        "video_service": mock_video_service,
        "transcript_service": mock_transcript_service,
        "ai_service": mock_ai_service,
        "cache_manager": mock_cache_manager,
        "notification_service": mock_notification_service,
    }
    return SummaryPipeline(**dependencies)
class TestSummaryPipeline:
    """Test suite for SummaryPipeline class.

    Several tests need to wait for a background job to finish; that polling
    loop was previously copy-pasted in three places and is now centralized in
    :meth:`_wait_until_done`.
    """

    # Stages from which a job will never progress further.
    _TERMINAL_STAGES = (PipelineStage.COMPLETED, PipelineStage.FAILED)

    async def _wait_until_done(self, pipeline, job_id, max_wait=5.0, poll=0.1):
        """Poll ``pipeline.active_jobs`` until *job_id* reaches a terminal stage.

        Returns the last observed PipelineResult, which may still be
        non-terminal if *max_wait* seconds elapse first, or None if the job
        was never tracked. Waiting for ANY terminal stage (not just
        COMPLETED) means an unexpected failure surfaces immediately instead
        of burning the whole timeout.
        """
        waited = 0.0
        result = pipeline.active_jobs.get(job_id)
        while waited < max_wait:
            result = pipeline.active_jobs.get(job_id)
            if result and result.status in self._TERMINAL_STAGES:
                break
            await asyncio.sleep(poll)
            waited += poll
        return result

    @pytest.mark.asyncio
    async def test_process_video_initialization(self, pipeline):
        """Starting a job returns a non-empty, tracked id in INITIALIZED state."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(summary_length="standard")
        job_id = await pipeline.process_video(video_url, config)
        # Verify job ID is generated
        assert job_id is not None
        assert len(job_id) > 0
        # Verify job is tracked
        assert job_id in pipeline.active_jobs
        # Verify initial state
        result = pipeline.active_jobs[job_id]
        assert result.job_id == job_id
        assert result.video_url == video_url
        assert result.status == PipelineStage.INITIALIZED
        assert result.started_at is not None

    @pytest.mark.asyncio
    async def test_process_video_with_progress_callback(self, pipeline):
        """A supplied progress callback is registered against the job id."""
        video_url = "https://youtube.com/watch?v=test123"
        progress_updates = []

        async def progress_callback(job_id, progress):
            progress_updates.append((job_id, progress))

        job_id = await pipeline.process_video(
            video_url,
            progress_callback=progress_callback
        )
        # Allow some processing time so registration has happened.
        await asyncio.sleep(0.1)
        # Verify callback is registered (delivery of updates is covered by
        # test_notification_integration; here we only check registration).
        assert job_id in pipeline.progress_callbacks
        assert len(pipeline.progress_callbacks[job_id]) == 1

    @pytest.mark.asyncio
    async def test_successful_pipeline_execution(self, pipeline):
        """A fully mocked run completes and touches each collaborator once."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(
            summary_length="standard",
            enable_notifications=True
        )
        # Start processing and wait for a terminal stage.
        job_id = await pipeline.process_video(video_url, config)
        await self._wait_until_done(pipeline, job_id)
        # Verify completion
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.COMPLETED
        assert result.video_id == "test_video_id"
        assert result.summary is not None
        assert result.key_points is not None
        assert result.main_themes is not None
        assert result.quality_score is not None
        assert result.completed_at is not None
        # Verify services were called
        pipeline.video_service.extract_video_id.assert_called_once()
        pipeline.transcript_service.extract_transcript.assert_called_once()
        pipeline.ai_service.generate_summary.assert_called_once()
        pipeline.cache_manager.cache_pipeline_result.assert_called_once()
        pipeline.notification_service.send_completion_notification.assert_called_once()

    @pytest.mark.asyncio
    async def test_pipeline_error_handling(self, pipeline):
        """A failing video service drives the job to FAILED with error details."""
        # Make video service fail
        pipeline.video_service.extract_video_id.side_effect = Exception("Video not found")
        video_url = "https://youtube.com/watch?v=invalid"
        config = PipelineConfig(max_retries=1)
        job_id = await pipeline.process_video(video_url, config)
        await self._wait_until_done(pipeline, job_id)
        # Verify failure handling
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.FAILED
        assert result.error is not None
        assert "Video not found" in result.error["message"]
        assert result.retry_count <= config.max_retries

    @pytest.mark.asyncio
    async def test_content_analysis(self, pipeline):
        """Content analysis classifies a technical transcript and finds indicators."""
        transcript = """
        This is a technical tutorial about programming concepts.
        We will learn about algorithms, functions, and databases.
        The implementation details are complex but important to understand.
        """
        metadata = {
            "title": "Programming Tutorial",
            "category": "Education",
            "tags": ["programming", "tutorial", "technical"],
            "language": "en"
        }
        analysis = await pipeline._analyze_content_characteristics(transcript, metadata)
        assert isinstance(analysis, ContentAnalysis)
        assert analysis.content_type == "technical"
        assert analysis.word_count > 0
        assert analysis.language == "en"
        # Indicators are apparently stored in singular form ("algorithm",
        # not "algorithms") even though the transcript uses plurals.
        assert len(analysis.technical_indicators) > 0
        assert "algorithm" in analysis.technical_indicators
        assert "function" in analysis.technical_indicators

    def test_config_optimization(self, pipeline):
        """Technical content adds focus areas and relaxes the quality threshold."""
        base_config = PipelineConfig(
            summary_length="standard",
            focus_areas=[],
            quality_threshold=0.7
        )
        # Test technical content optimization
        technical_analysis = ContentAnalysis(
            transcript_length=5000,
            word_count=1000,
            estimated_reading_time=4.0,
            complexity_score=0.8,
            content_type="technical",
            language="en",
            technical_indicators=["algorithm", "function", "code"],
            educational_indicators=[],
            entertainment_indicators=[]
        )
        optimized_config = pipeline._optimize_config_for_content(
            base_config, technical_analysis
        )
        assert optimized_config.summary_length == "standard"  # Not changed for 1000 words
        assert "technical concepts" in optimized_config.focus_areas
        assert optimized_config.quality_threshold < base_config.quality_threshold  # Lowered due to complexity

    @pytest.mark.asyncio
    async def test_quality_validation(self, pipeline):
        """A well-formed summary scores in (0.5, 1.0] under quality validation."""
        result = PipelineResult(
            job_id="test",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            summary="This is a good summary with appropriate length and detail.",
            key_points=["Point 1", "Point 2", "Point 3", "Point 4"],
            main_themes=["Theme 1", "Theme 2"],
            actionable_insights=["Insight 1"],
            confidence_score=0.9
        )
        analysis = ContentAnalysis(
            transcript_length=10000,
            word_count=2000,
            estimated_reading_time=8.0,
            complexity_score=0.5,
            content_type="general",
            language="en",
            technical_indicators=[],
            educational_indicators=[],
            entertainment_indicators=[]
        )
        quality_score = await pipeline._validate_summary_quality(result, analysis)
        assert 0.0 <= quality_score <= 1.0
        assert quality_score > 0.5  # Should be reasonably high for good summary

    @pytest.mark.asyncio
    async def test_pipeline_cancellation(self, pipeline):
        """Cancelling an active job marks it CANCELLED with a completion time."""
        video_url = "https://youtube.com/watch?v=test123"
        # Start processing
        job_id = await pipeline.process_video(video_url)
        # Verify job is active
        assert job_id in pipeline.active_jobs
        # Cancel the job
        success = await pipeline.cancel_pipeline(job_id)
        assert success is True
        # Verify job is cancelled
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.CANCELLED
        assert result.completed_at is not None

    @pytest.mark.asyncio
    async def test_get_pipeline_result(self, pipeline):
        """get_pipeline_result returns None for unknown ids, the result otherwise."""
        video_url = "https://youtube.com/watch?v=test123"
        # Test non-existent job
        result = await pipeline.get_pipeline_result("non_existent")
        assert result is None
        # Test active job
        job_id = await pipeline.process_video(video_url)
        result = await pipeline.get_pipeline_result(job_id)
        assert result is not None
        assert result.job_id == job_id

    def test_iso_duration_parsing(self, pipeline):
        """ISO 8601 durations parse to seconds; malformed input yields 0."""
        # Test various duration formats
        assert pipeline._parse_iso_duration("PT10M30S") == 630  # 10:30
        assert pipeline._parse_iso_duration("PT1H5M") == 3900  # 1:05:00
        assert pipeline._parse_iso_duration("PT45S") == 45  # 0:45
        assert pipeline._parse_iso_duration("PT2H") == 7200  # 2:00:00
        assert pipeline._parse_iso_duration("invalid") == 0  # Invalid format

    @pytest.mark.asyncio
    async def test_cache_integration(self, pipeline):
        """A pre-populated cache entry is consulted during processing."""
        # Setup cache to return existing result
        cached_result = {
            "job_id": "cached_job",
            "video_url": "https://youtube.com/watch?v=cached",
            "video_id": "cached_id",
            "status": PipelineStage.COMPLETED.value,
            "summary": "Cached summary",
            "key_points": ["Cached point 1", "Cached point 2"],
            "quality_score": 0.8,
            "completed_at": datetime.utcnow().isoformat()
        }
        pipeline.cache_manager.get_cached_pipeline_result.return_value = cached_result
        video_url = "https://youtube.com/watch?v=cached"
        job_id = await pipeline.process_video(video_url)
        # Wait for cache restoration
        await asyncio.sleep(0.2)
        # Verify cache was checked
        pipeline.cache_manager.get_cached_pipeline_result.assert_called()

    @pytest.mark.asyncio
    async def test_notification_integration(self, pipeline):
        """Progress and completion notifications fire when enabled."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(enable_notifications=True)
        job_id = await pipeline.process_video(video_url, config)
        # Wait for any terminal stage; a failed run surfaces immediately
        # instead of spending the full timeout waiting only for COMPLETED.
        await self._wait_until_done(pipeline, job_id)
        # Verify notifications were sent
        pipeline.notification_service.send_progress_notification.assert_called()
        pipeline.notification_service.send_completion_notification.assert_called_once()

    @pytest.mark.asyncio
    async def test_cleanup_completed_jobs(self, pipeline):
        """Cleanup removes jobs older than the cutoff and keeps recent ones."""
        # Create some old completed jobs
        old_time = datetime.utcnow() - timedelta(hours=25)
        pipeline.active_jobs["old_job"] = PipelineResult(
            job_id="old_job",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            completed_at=old_time
        )
        pipeline.active_jobs["recent_job"] = PipelineResult(
            job_id="recent_job",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            completed_at=datetime.utcnow() - timedelta(hours=1)
        )
        # Cleanup jobs older than 24 hours
        await pipeline.cleanup_completed_jobs(max_age_hours=24)
        # Verify old job was removed but recent job remains
        assert "old_job" not in pipeline.active_jobs
        assert "recent_job" in pipeline.active_jobs

    def test_get_active_jobs(self, pipeline):
        """get_active_jobs reflects the tracked job ids."""
        # Initially no jobs
        assert pipeline.get_active_jobs() == []
        # Add some jobs manually for testing
        pipeline.active_jobs["job1"] = Mock()
        pipeline.active_jobs["job2"] = Mock()
        active_jobs = pipeline.get_active_jobs()
        assert len(active_jobs) == 2
        assert "job1" in active_jobs
        assert "job2" in active_jobs
class TestPipelineEdgeCases:
"""Test edge cases and error conditions."""
@pytest.mark.asyncio
async def test_empty_transcript_handling(self, pipeline):
"""Test handling of empty transcript."""
pipeline.transcript_service.extract_transcript.return_value = Mock(
transcript=""
)
video_url = "https://youtube.com/watch?v=empty"
job_id = await pipeline.process_video(video_url)
# Wait for processing
await asyncio.sleep(0.2)
# Should handle empty transcript gracefully
result = pipeline.active_jobs.get(job_id)
assert result is not None
@pytest.mark.asyncio
async def test_invalid_video_url_handling(self, pipeline):
"""Test handling of invalid video URLs."""
pipeline.video_service.extract_video_id.side_effect = ValueError("Invalid URL")
video_url = "https://not-youtube.com/watch?v=invalid"
config = PipelineConfig(max_retries=0) # No retries for faster test
job_id = await pipeline.process_video(video_url, config)
# Wait for failure
await asyncio.sleep(0.2)
result = pipeline.active_jobs.get(job_id)
assert result.status == PipelineStage.FAILED
assert "Invalid URL" in result.error["message"]
    @pytest.mark.asyncio
    async def test_ai_service_failure_with_retry(self, pipeline):
        """Test AI service failure with retry mechanism.

        The AI service is stubbed to raise once and then return a canned
        result, so a successful run implies at least one retry occurred.
        """
        # Make AI service fail initially, then succeed
        call_count = 0
        async def failing_generate_summary(*args, **kwargs):
            # First invocation simulates a transient API error; subsequent
            # ones return a fixed SummaryResult.
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise Exception("API rate limit exceeded")
            return SummaryResult(
                summary="Retry successful",
                key_points=["Point 1"],
                main_themes=["Theme 1"],
                actionable_insights=[],
                confidence_score=0.7,
                processing_metadata={},
                cost_data={}
            )
        pipeline.ai_service.generate_summary.side_effect = failing_generate_summary
        video_url = "https://youtube.com/watch?v=retry_test"
        config = PipelineConfig(max_retries=1)
        job_id = await pipeline.process_video(video_url, config)
        # Wait for retry and completion
        # (longer 10 s budget than other tests to accommodate retry back-off)
        max_wait = 10.0
        waited = 0.0
        while waited < max_wait:
            result = pipeline.active_jobs.get(job_id)
            if result and result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED]:
                break
            await asyncio.sleep(0.1)
            waited += 0.1
        result = pipeline.active_jobs[job_id]
        # Should eventually succeed after retry
        # NOTE(review): these assertions only run when the job COMPLETED, so
        # on FAILED the test passes vacuously — consider asserting a terminal
        # expectation unconditionally (this may already be handled by code
        # below this point in the file; confirm before changing).
        if result.status == PipelineStage.COMPLETED:
            assert result.retry_count > 0
            assert call_count > 1