# youtube-summarizer/backend/tests/unit/test_summary_pipeline.py
"""Unit tests for SummaryPipeline orchestration service."""
import pytest
import asyncio
from unittest.mock import Mock, AsyncMock, patch
from datetime import datetime, timedelta
from backend.services.summary_pipeline import SummaryPipeline
from backend.models.pipeline import (
PipelineStage, PipelineConfig, PipelineResult, ContentAnalysis
)
from backend.services.ai_service import SummaryResult
@pytest.fixture
def mock_video_service():
    """Provide a Mock standing in for VideoService.

    ``extract_video_id`` resolves to a fixed id and ``get_video_metadata``
    to a canned metadata payload; both are awaitable (AsyncMock).
    """
    canned_metadata = {
        "title": "Test Video",
        "description": "Test description",
        "duration": "PT10M30S",
        "category": "Education",
        "tags": ["test", "tutorial"],
        "language": "en",
    }
    svc = Mock()
    svc.extract_video_id = AsyncMock(return_value="test_video_id")
    svc.get_video_metadata = AsyncMock(return_value=canned_metadata)
    return svc
@pytest.fixture
def mock_transcript_service():
    """Provide a Mock TranscriptService whose extract_transcript yields a canned transcript."""
    transcript_text = (
        "This is a test transcript with educational content. "
        "We will learn about important concepts and examples."
    )
    svc = Mock()
    # Configure the return value directly on construction instead of assigning
    # ``.return_value`` afterwards — equivalent behavior, tighter setup.
    svc.extract_transcript = AsyncMock(return_value=Mock(transcript=transcript_text))
    return svc
@pytest.fixture
def mock_ai_service():
    """Provide a Mock AnthropicSummarizer that always returns one fixed SummaryResult."""
    canned_result = SummaryResult(
        summary="Test summary of the video content",
        key_points=["Point 1", "Point 2", "Point 3"],
        main_themes=["Theme 1", "Theme 2"],
        actionable_insights=["Insight 1"],
        confidence_score=0.85,
        processing_metadata={"tokens_used": 1000},
        cost_data={"total_cost": 0.01},
    )
    svc = Mock()
    svc.generate_summary = AsyncMock(return_value=canned_result)
    return svc
@pytest.fixture
def mock_cache_manager():
    """Provide a Mock CacheManager: every lookup misses, every write succeeds."""
    cache = Mock()
    # Lookups come back empty so the pipeline takes the non-cached path.
    for lookup_name in ("get_cached_pipeline_result", "get_cached_video_metadata"):
        setattr(cache, lookup_name, AsyncMock(return_value=None))
    # Writes report success.
    for writer_name in ("cache_pipeline_result", "cache_video_metadata"):
        setattr(cache, writer_name, AsyncMock(return_value=True))
    return cache
@pytest.fixture
def mock_notification_service():
    """Provide a Mock NotificationService; every send_* call reports success."""
    svc = Mock()
    for channel in (
        "send_completion_notification",
        "send_error_notification",
        "send_progress_notification",
    ):
        setattr(svc, channel, AsyncMock(return_value=True))
    return svc
@pytest.fixture
def pipeline(mock_video_service, mock_transcript_service, mock_ai_service,
             mock_cache_manager, mock_notification_service):
    """Assemble a SummaryPipeline wired entirely to mocked collaborators."""
    dependencies = {
        "video_service": mock_video_service,
        "transcript_service": mock_transcript_service,
        "ai_service": mock_ai_service,
        "cache_manager": mock_cache_manager,
        "notification_service": mock_notification_service,
    }
    return SummaryPipeline(**dependencies)
class TestSummaryPipeline:
    """Test suite for SummaryPipeline class.

    Several tests need to wait for a background job to finish; that polling
    loop was previously copy-pasted in three places and is now centralized in
    :meth:`_wait_until_done`.
    """

    # Stages from which a job will never progress further.
    _TERMINAL_STAGES = (PipelineStage.COMPLETED, PipelineStage.FAILED)

    async def _wait_until_done(self, pipeline, job_id, max_wait=5.0, poll=0.1):
        """Poll ``pipeline.active_jobs`` until *job_id* reaches a terminal stage.

        Returns the last observed PipelineResult, which may still be
        non-terminal if *max_wait* seconds elapse first, or None if the job
        was never tracked. Waiting for ANY terminal stage (not just
        COMPLETED) means an unexpected failure surfaces immediately instead
        of burning the whole timeout.
        """
        waited = 0.0
        result = pipeline.active_jobs.get(job_id)
        while waited < max_wait:
            result = pipeline.active_jobs.get(job_id)
            if result and result.status in self._TERMINAL_STAGES:
                break
            await asyncio.sleep(poll)
            waited += poll
        return result

    @pytest.mark.asyncio
    async def test_process_video_initialization(self, pipeline):
        """Starting a job returns a non-empty, tracked id in INITIALIZED state."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(summary_length="standard")
        job_id = await pipeline.process_video(video_url, config)
        # Verify job ID is generated
        assert job_id is not None
        assert len(job_id) > 0
        # Verify job is tracked
        assert job_id in pipeline.active_jobs
        # Verify initial state
        result = pipeline.active_jobs[job_id]
        assert result.job_id == job_id
        assert result.video_url == video_url
        assert result.status == PipelineStage.INITIALIZED
        assert result.started_at is not None

    @pytest.mark.asyncio
    async def test_process_video_with_progress_callback(self, pipeline):
        """A supplied progress callback is registered against the job id."""
        video_url = "https://youtube.com/watch?v=test123"
        progress_updates = []

        async def progress_callback(job_id, progress):
            progress_updates.append((job_id, progress))

        job_id = await pipeline.process_video(
            video_url,
            progress_callback=progress_callback
        )
        # Allow some processing time so registration has happened.
        await asyncio.sleep(0.1)
        # Verify callback is registered (delivery of updates is covered by
        # test_notification_integration; here we only check registration).
        assert job_id in pipeline.progress_callbacks
        assert len(pipeline.progress_callbacks[job_id]) == 1

    @pytest.mark.asyncio
    async def test_successful_pipeline_execution(self, pipeline):
        """A fully mocked run completes and touches each collaborator once."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(
            summary_length="standard",
            enable_notifications=True
        )
        # Start processing and wait for a terminal stage.
        job_id = await pipeline.process_video(video_url, config)
        await self._wait_until_done(pipeline, job_id)
        # Verify completion
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.COMPLETED
        assert result.video_id == "test_video_id"
        assert result.summary is not None
        assert result.key_points is not None
        assert result.main_themes is not None
        assert result.quality_score is not None
        assert result.completed_at is not None
        # Verify services were called
        pipeline.video_service.extract_video_id.assert_called_once()
        pipeline.transcript_service.extract_transcript.assert_called_once()
        pipeline.ai_service.generate_summary.assert_called_once()
        pipeline.cache_manager.cache_pipeline_result.assert_called_once()
        pipeline.notification_service.send_completion_notification.assert_called_once()

    @pytest.mark.asyncio
    async def test_pipeline_error_handling(self, pipeline):
        """A failing video service drives the job to FAILED with error details."""
        # Make video service fail
        pipeline.video_service.extract_video_id.side_effect = Exception("Video not found")
        video_url = "https://youtube.com/watch?v=invalid"
        config = PipelineConfig(max_retries=1)
        job_id = await pipeline.process_video(video_url, config)
        await self._wait_until_done(pipeline, job_id)
        # Verify failure handling
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.FAILED
        assert result.error is not None
        assert "Video not found" in result.error["message"]
        assert result.retry_count <= config.max_retries

    @pytest.mark.asyncio
    async def test_content_analysis(self, pipeline):
        """Content analysis classifies a technical transcript and finds indicators."""
        transcript = """
        This is a technical tutorial about programming concepts.
        We will learn about algorithms, functions, and databases.
        The implementation details are complex but important to understand.
        """
        metadata = {
            "title": "Programming Tutorial",
            "category": "Education",
            "tags": ["programming", "tutorial", "technical"],
            "language": "en"
        }
        analysis = await pipeline._analyze_content_characteristics(transcript, metadata)
        assert isinstance(analysis, ContentAnalysis)
        assert analysis.content_type == "technical"
        assert analysis.word_count > 0
        assert analysis.language == "en"
        # Indicators are apparently stored in singular form ("algorithm",
        # not "algorithms") even though the transcript uses plurals.
        assert len(analysis.technical_indicators) > 0
        assert "algorithm" in analysis.technical_indicators
        assert "function" in analysis.technical_indicators

    def test_config_optimization(self, pipeline):
        """Technical content adds focus areas and relaxes the quality threshold."""
        base_config = PipelineConfig(
            summary_length="standard",
            focus_areas=[],
            quality_threshold=0.7
        )
        # Test technical content optimization
        technical_analysis = ContentAnalysis(
            transcript_length=5000,
            word_count=1000,
            estimated_reading_time=4.0,
            complexity_score=0.8,
            content_type="technical",
            language="en",
            technical_indicators=["algorithm", "function", "code"],
            educational_indicators=[],
            entertainment_indicators=[]
        )
        optimized_config = pipeline._optimize_config_for_content(
            base_config, technical_analysis
        )
        assert optimized_config.summary_length == "standard"  # Not changed for 1000 words
        assert "technical concepts" in optimized_config.focus_areas
        assert optimized_config.quality_threshold < base_config.quality_threshold  # Lowered due to complexity

    @pytest.mark.asyncio
    async def test_quality_validation(self, pipeline):
        """A well-formed summary scores in (0.5, 1.0] under quality validation."""
        result = PipelineResult(
            job_id="test",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            summary="This is a good summary with appropriate length and detail.",
            key_points=["Point 1", "Point 2", "Point 3", "Point 4"],
            main_themes=["Theme 1", "Theme 2"],
            actionable_insights=["Insight 1"],
            confidence_score=0.9
        )
        analysis = ContentAnalysis(
            transcript_length=10000,
            word_count=2000,
            estimated_reading_time=8.0,
            complexity_score=0.5,
            content_type="general",
            language="en",
            technical_indicators=[],
            educational_indicators=[],
            entertainment_indicators=[]
        )
        quality_score = await pipeline._validate_summary_quality(result, analysis)
        assert 0.0 <= quality_score <= 1.0
        assert quality_score > 0.5  # Should be reasonably high for good summary

    @pytest.mark.asyncio
    async def test_pipeline_cancellation(self, pipeline):
        """Cancelling an active job marks it CANCELLED with a completion time."""
        video_url = "https://youtube.com/watch?v=test123"
        # Start processing
        job_id = await pipeline.process_video(video_url)
        # Verify job is active
        assert job_id in pipeline.active_jobs
        # Cancel the job
        success = await pipeline.cancel_pipeline(job_id)
        assert success is True
        # Verify job is cancelled
        result = pipeline.active_jobs[job_id]
        assert result.status == PipelineStage.CANCELLED
        assert result.completed_at is not None

    @pytest.mark.asyncio
    async def test_get_pipeline_result(self, pipeline):
        """get_pipeline_result returns None for unknown ids, the result otherwise."""
        video_url = "https://youtube.com/watch?v=test123"
        # Test non-existent job
        result = await pipeline.get_pipeline_result("non_existent")
        assert result is None
        # Test active job
        job_id = await pipeline.process_video(video_url)
        result = await pipeline.get_pipeline_result(job_id)
        assert result is not None
        assert result.job_id == job_id

    def test_iso_duration_parsing(self, pipeline):
        """ISO 8601 durations parse to seconds; malformed input yields 0."""
        # Test various duration formats
        assert pipeline._parse_iso_duration("PT10M30S") == 630  # 10:30
        assert pipeline._parse_iso_duration("PT1H5M") == 3900  # 1:05:00
        assert pipeline._parse_iso_duration("PT45S") == 45  # 0:45
        assert pipeline._parse_iso_duration("PT2H") == 7200  # 2:00:00
        assert pipeline._parse_iso_duration("invalid") == 0  # Invalid format

    @pytest.mark.asyncio
    async def test_cache_integration(self, pipeline):
        """A pre-populated cache entry is consulted during processing."""
        # Setup cache to return existing result
        cached_result = {
            "job_id": "cached_job",
            "video_url": "https://youtube.com/watch?v=cached",
            "video_id": "cached_id",
            "status": PipelineStage.COMPLETED.value,
            "summary": "Cached summary",
            "key_points": ["Cached point 1", "Cached point 2"],
            "quality_score": 0.8,
            "completed_at": datetime.utcnow().isoformat()
        }
        pipeline.cache_manager.get_cached_pipeline_result.return_value = cached_result
        video_url = "https://youtube.com/watch?v=cached"
        job_id = await pipeline.process_video(video_url)
        # Wait for cache restoration
        await asyncio.sleep(0.2)
        # Verify cache was checked
        pipeline.cache_manager.get_cached_pipeline_result.assert_called()

    @pytest.mark.asyncio
    async def test_notification_integration(self, pipeline):
        """Progress and completion notifications fire when enabled."""
        video_url = "https://youtube.com/watch?v=test123"
        config = PipelineConfig(enable_notifications=True)
        job_id = await pipeline.process_video(video_url, config)
        # Wait for any terminal stage; a failed run surfaces immediately
        # instead of spending the full timeout waiting only for COMPLETED.
        await self._wait_until_done(pipeline, job_id)
        # Verify notifications were sent
        pipeline.notification_service.send_progress_notification.assert_called()
        pipeline.notification_service.send_completion_notification.assert_called_once()

    @pytest.mark.asyncio
    async def test_cleanup_completed_jobs(self, pipeline):
        """Cleanup removes jobs older than the cutoff and keeps recent ones."""
        # Create some old completed jobs
        old_time = datetime.utcnow() - timedelta(hours=25)
        pipeline.active_jobs["old_job"] = PipelineResult(
            job_id="old_job",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            completed_at=old_time
        )
        pipeline.active_jobs["recent_job"] = PipelineResult(
            job_id="recent_job",
            video_url="test",
            video_id="test",
            status=PipelineStage.COMPLETED,
            completed_at=datetime.utcnow() - timedelta(hours=1)
        )
        # Cleanup jobs older than 24 hours
        await pipeline.cleanup_completed_jobs(max_age_hours=24)
        # Verify old job was removed but recent job remains
        assert "old_job" not in pipeline.active_jobs
        assert "recent_job" in pipeline.active_jobs

    def test_get_active_jobs(self, pipeline):
        """get_active_jobs reflects the tracked job ids."""
        # Initially no jobs
        assert pipeline.get_active_jobs() == []
        # Add some jobs manually for testing
        pipeline.active_jobs["job1"] = Mock()
        pipeline.active_jobs["job2"] = Mock()
        active_jobs = pipeline.get_active_jobs()
        assert len(active_jobs) == 2
        assert "job1" in active_jobs
        assert "job2" in active_jobs
class TestPipelineEdgeCases:
"""Test edge cases and error conditions."""
@pytest.mark.asyncio
async def test_empty_transcript_handling(self, pipeline):
"""Test handling of empty transcript."""
pipeline.transcript_service.extract_transcript.return_value = Mock(
transcript=""
)
video_url = "https://youtube.com/watch?v=empty"
job_id = await pipeline.process_video(video_url)
# Wait for processing
await asyncio.sleep(0.2)
# Should handle empty transcript gracefully
result = pipeline.active_jobs.get(job_id)
assert result is not None
@pytest.mark.asyncio
async def test_invalid_video_url_handling(self, pipeline):
"""Test handling of invalid video URLs."""
pipeline.video_service.extract_video_id.side_effect = ValueError("Invalid URL")
video_url = "https://not-youtube.com/watch?v=invalid"
config = PipelineConfig(max_retries=0) # No retries for faster test
job_id = await pipeline.process_video(video_url, config)
# Wait for failure
await asyncio.sleep(0.2)
result = pipeline.active_jobs.get(job_id)
assert result.status == PipelineStage.FAILED
assert "Invalid URL" in result.error["message"]
    @pytest.mark.asyncio
    async def test_ai_service_failure_with_retry(self, pipeline):
        """Test AI service failure with retry mechanism.

        The AI service is stubbed to raise once and then return a canned
        result, so a successful run implies at least one retry occurred.
        """
        # Make AI service fail initially, then succeed
        call_count = 0
        async def failing_generate_summary(*args, **kwargs):
            # First invocation simulates a transient API error; subsequent
            # ones return a fixed SummaryResult.
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise Exception("API rate limit exceeded")
            return SummaryResult(
                summary="Retry successful",
                key_points=["Point 1"],
                main_themes=["Theme 1"],
                actionable_insights=[],
                confidence_score=0.7,
                processing_metadata={},
                cost_data={}
            )
        pipeline.ai_service.generate_summary.side_effect = failing_generate_summary
        video_url = "https://youtube.com/watch?v=retry_test"
        config = PipelineConfig(max_retries=1)
        job_id = await pipeline.process_video(video_url, config)
        # Wait for retry and completion
        # (longer 10 s budget than other tests to accommodate retry back-off)
        max_wait = 10.0
        waited = 0.0
        while waited < max_wait:
            result = pipeline.active_jobs.get(job_id)
            if result and result.status in [PipelineStage.COMPLETED, PipelineStage.FAILED]:
                break
            await asyncio.sleep(0.1)
            waited += 0.1
        result = pipeline.active_jobs[job_id]
        # Should eventually succeed after retry
        # NOTE(review): these assertions only run when the job COMPLETED, so
        # on FAILED the test passes vacuously — consider asserting a terminal
        # expectation unconditionally (this may already be handled by code
        # below this point in the file; confirm before changing).
        if result.status == PipelineStage.COMPLETED:
            assert result.retry_count > 0
            assert call_count > 1