# youtube-summarizer/backend/tests/unit/test_transcript_service.py

import pytest
import asyncio
import json
from unittest.mock import Mock, patch, AsyncMock
from backend.services.transcript_service import (
    TranscriptService,
    TranscriptNotAvailableError,
    CaptionsNotAvailableError,
    AudioTranscriptionError
)
from backend.services.mock_cache import MockCacheClient
from backend.models.transcript import ExtractionMethod


class TestTranscriptService:
    """Unit tests for ``TranscriptService``.

    The tests cover:

    * serving a transcript straight from the cache,
    * the extraction fallback chain exercised through the private hooks
      ``_extract_youtube_transcript`` -> ``_extract_auto_captions`` ->
      ``_transcribe_audio``,
    * the failure result produced when every hook raises,
    * ``extract_metadata`` on a literal transcript,
    * a second identical request being answered from the cache.

    Both fixtures use pytest's default function scope, so each test gets its
    own ``MockCacheClient`` and its own service instance.
    """

    @pytest.fixture
    def cache_client(self):
        """Return a new ``MockCacheClient`` for the requesting test."""
        return MockCacheClient()

    @pytest.fixture
    def transcript_service(self, cache_client):
        """Return a ``TranscriptService`` built on the ``cache_client`` fixture."""
        return TranscriptService(cache_client)

    @pytest.mark.asyncio
    async def test_extract_transcript_from_cache(self, transcript_service, cache_client):
        """A pre-seeded cache entry is returned and flagged ``from_cache``."""
        # Seed the cache under the key used for video "test123" / language "en".
        cached_data = {
            "video_id": "test123",
            "transcript": "Cached transcript content",
            "method": "youtube_api",
            "success": True,
            "metadata": {
                "word_count": 3,
                "language": "en",
                "extraction_method": "youtube_api",
                "processing_time_seconds": 0.5,
                "estimated_reading_time": 1,
                "has_timestamps": True
            }
        }
        await cache_client.set("transcript:test123:en", cached_data)

        result = await transcript_service.extract_transcript("test123", "en")

        assert result.success is True
        assert result.from_cache is True
        assert result.transcript == "Cached transcript content"
        assert result.method == ExtractionMethod.YOUTUBE_API

    @pytest.mark.asyncio
    async def test_extract_transcript_youtube_api_success(self, transcript_service):
        """Extraction succeeds and reports the YouTube API method."""
        with patch.object(transcript_service, '_extract_youtube_transcript') as mock_extract:
            mock_extract.return_value = "YouTube API transcript"
            result = await transcript_service.extract_transcript("dQw4w9WgXcQ", "en")

        assert result.success is True
        assert result.method == ExtractionMethod.YOUTUBE_API
        # NOTE(review): the second alternative accepts text that does not come
        # from the patched hook (originally annotated "Mock data"), so this
        # assertion passes even if the patch is never exercised - confirm
        # whether that leniency is still wanted.
        assert "YouTube API transcript" in result.transcript or \
               "comprehensive tutorial" in result.transcript

    @pytest.mark.asyncio
    async def test_extract_transcript_fallback_to_captions(self, transcript_service):
        """Auto-captions are used when the YouTube transcript hook raises."""
        with patch.object(transcript_service, '_extract_youtube_transcript') as mock_yt, \
                patch.object(transcript_service, '_extract_auto_captions') as mock_captions:
            mock_yt.side_effect = TranscriptNotAvailableError("Not available")
            mock_captions.return_value = "Auto-caption transcript"

            result = await transcript_service.extract_transcript("test123", "en")

        assert result.success is True
        assert result.method == ExtractionMethod.AUTO_CAPTIONS
        assert "Auto-caption transcript" in result.transcript

    @pytest.mark.asyncio
    async def test_extract_transcript_fallback_to_whisper(self, transcript_service):
        """Whisper transcription is used when both earlier hooks raise."""
        with patch.object(transcript_service, '_extract_youtube_transcript') as mock_yt, \
                patch.object(transcript_service, '_extract_auto_captions') as mock_captions, \
                patch.object(transcript_service, '_transcribe_audio') as mock_whisper:
            mock_yt.side_effect = TranscriptNotAvailableError("Not available")
            mock_captions.side_effect = CaptionsNotAvailableError("No captions")
            mock_whisper.return_value = "Whisper transcript"

            result = await transcript_service.extract_transcript("test123", "en")

        assert result.success is True
        assert result.method == ExtractionMethod.WHISPER_AUDIO
        assert "Whisper transcript" in result.transcript

    @pytest.mark.asyncio
    async def test_extract_transcript_all_methods_fail(self, transcript_service):
        """A failure result (not an exception) is returned when every hook raises."""
        with patch.object(transcript_service, '_extract_youtube_transcript') as mock_yt, \
                patch.object(transcript_service, '_extract_auto_captions') as mock_captions, \
                patch.object(transcript_service, '_transcribe_audio') as mock_whisper:
            mock_yt.side_effect = TranscriptNotAvailableError("Not available")
            mock_captions.side_effect = CaptionsNotAvailableError("No captions")
            mock_whisper.side_effect = AudioTranscriptionError("Audio failed")

            result = await transcript_service.extract_transcript("test123", "en")

        assert result.success is False
        assert result.method == ExtractionMethod.FAILED
        assert result.transcript is None
        assert result.error is not None
        assert "attempted_methods" in result.error["details"]

    def test_extract_metadata(self, transcript_service):
        """``extract_metadata`` reports word, character and line counts.

        This is a plain synchronous test: ``extract_metadata`` is called
        without ``await``, so no event loop is needed.
        """
        transcript = "This is a test transcript with multiple words for testing."
        metadata = transcript_service.extract_metadata(transcript)

        assert metadata["word_count"] == 10
        assert metadata["character_count"] == len(transcript)
        assert metadata["line_count"] == 1
        assert metadata["estimated_reading_time_seconds"] > 0

    @pytest.mark.asyncio
    async def test_cache_result(self, transcript_service, cache_client):
        """A second identical request is answered from the cache."""
        # Start from an empty cache.
        # NOTE(review): clear_all() is invoked without ``await`` although
        # cache_client.set() is awaited elsewhere in this class - confirm that
        # clear_all() is synchronous.
        cache_client.clear_all()

        # No hooks are patched here, so this uses whatever the service
        # produces by default for "test123".
        first = await transcript_service.extract_transcript("test123", "en")
        assert first.from_cache is False

        second = await transcript_service.extract_transcript("test123", "en")
        assert second.from_cache is True
        assert second.transcript == first.transcript