# youtube-summarizer/backend/tests/integration/test_transcript_api.py

import pytest
from fastapi.testclient import TestClient
import sys
from pathlib import Path
import asyncio

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))

from backend.main import app


class TestTranscriptAPI:
    """Integration checks for the ``/api/transcripts`` routes served by ``app``."""

    @pytest.fixture
    def client(self):
        """Provide a ``TestClient`` bound to the application under test."""
        test_client = TestClient(app)
        return test_client

    def test_get_transcript_success(self, client):
        """A plain GET for a video id answers 200 with the core transcript fields."""
        resp = client.get("/api/transcripts/dQw4w9WgXcQ")
        assert resp.status_code == 200

        payload = resp.json()
        accepted_methods = ("youtube_api", "auto_captions", "whisper_audio", "mock")

        assert payload["video_id"] == "dQw4w9WgXcQ"
        assert "transcript" in payload
        assert payload["extraction_method"] in accepted_methods
        assert payload["word_count"] > 0
        assert "processing_time_seconds" in payload

    def test_get_transcript_with_language(self, client):
        """The ``language_preference`` query value is echoed back as ``language``."""
        resp = client.get("/api/transcripts/test123?language_preference=es")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["language"] == "es"
        assert payload["video_id"] == "test123"

    def test_async_transcript_extraction(self, client):
        """Starting an extraction job yields a job id whose status can be queried."""
        # Kick off the background extraction.
        request_body = {
            "video_id": "test123",
            "language_preference": "en",
            "include_metadata": True,
        }
        start_resp = client.post("/api/transcripts/extract", json=request_body)
        assert start_resp.status_code == 200

        started = start_resp.json()
        assert "job_id" in started
        assert started["status"] == "processing"
        assert started["message"] == "Transcript extraction started"

        # Look the job up again by the id that was just handed out.
        job_id = started["job_id"]
        poll_resp = client.get(f"/api/transcripts/jobs/{job_id}")
        assert poll_resp.status_code == 200

        job = poll_resp.json()
        assert job["job_id"] == job_id
        assert job["status"] in ("pending", "processing", "completed", "failed")
        assert "progress_percentage" in job

    def test_job_status_not_found(self, client):
        """Querying an unknown job id answers 404 with a "not found" detail."""
        resp = client.get("/api/transcripts/jobs/non-existent-job-id")
        assert resp.status_code == 404

        detail = resp.json()["detail"]
        assert "not found" in detail.lower()

    def test_chunk_transcript(self, client):
        """The chunk endpoint returns a list of chunks with the expected fields."""
        resp = client.post("/api/transcripts/dQw4w9WgXcQ/chunk?max_tokens=100")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["video_id"] == "dQw4w9WgXcQ"
        for key in ("total_chunks", "chunks"):
            assert key in payload

        chunks = payload["chunks"]
        assert isinstance(chunks, list)

        # An empty chunk list is tolerated; there is nothing further to inspect.
        if not chunks:
            return

        first_chunk = chunks[0]
        for key in ("chunk_index", "text", "token_count"):
            assert key in first_chunk

    def test_cache_stats(self, client):
        """The cache stats endpoint answers 200 and reports its three counters."""
        resp = client.get("/api/transcripts/cache/stats")
        assert resp.status_code == 200

        stats = resp.json()
        for key in ("total_keys", "active_keys", "cache_size_bytes"):
            assert key in stats

    def test_transcript_metadata_included(self, client):
        """With ``include_metadata=true`` a non-empty transcript carries metadata."""
        resp = client.get("/api/transcripts/test123?include_metadata=true")
        assert resp.status_code == 200

        payload = resp.json()

        # Metadata is only checked when a transcript actually came back.
        if not payload["transcript"]:
            return

        assert "metadata" in payload
        meta = payload["metadata"]
        for key in ("word_count", "language", "extraction_method"):
            assert key in meta

    def test_transcript_metadata_excluded(self, client):
        """With ``include_metadata=false`` the endpoint still returns core fields."""
        resp = client.get("/api/transcripts/test123?include_metadata=false")
        assert resp.status_code == 200

        payload = resp.json()

        # Only the core fields are checked here; nothing is asserted about the
        # "metadata" key itself, so this test passes as long as the endpoint works.
        for key in ("video_id", "extraction_method"):
            assert key in payload