"""Endpoint tests for the ``/api/transcripts`` routes, driven through ``TestClient``."""

import asyncio
import sys
from pathlib import Path

import pytest
from fastapi.testclient import TestClient

# The ``backend`` package is located relative to this file: the fourth
# ``.parent`` of this file's path goes to the front of sys.path before the
# application import below.
_IMPORT_ROOT = Path(__file__).parent.parent.parent.parent
sys.path.insert(0, str(_IMPORT_ROOT))

from backend.main import app  # noqa: E402  (has to follow the sys.path tweak)


class TestTranscriptAPI:
    """HTTP-level checks for transcript retrieval, jobs, chunking and cache stats."""

    @pytest.fixture
    def client(self):
        """Build a ``TestClient`` wrapping the application under test."""
        test_client = TestClient(app)
        return test_client

    def test_get_transcript_success(self, client):
        """A plain GET for a video id answers 200 with the core transcript fields."""
        resp = client.get("/api/transcripts/dQw4w9WgXcQ")
        assert resp.status_code == 200

        payload = resp.json()
        accepted_methods = ("youtube_api", "auto_captions", "whisper_audio", "mock")
        assert payload["video_id"] == "dQw4w9WgXcQ"
        assert "transcript" in payload
        assert payload["extraction_method"] in accepted_methods
        assert payload["word_count"] > 0
        assert "processing_time_seconds" in payload

    def test_get_transcript_with_language(self, client):
        """The ``language_preference`` query value is echoed back as ``language``."""
        resp = client.get("/api/transcripts/test123?language_preference=es")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["language"] == "es"
        assert payload["video_id"] == "test123"

    def test_async_transcript_extraction(self, client):
        """Posting an extraction request yields a job id whose status can be polled."""
        # Kick off the extraction job.
        job_request = {
            "video_id": "test123",
            "language_preference": "en",
            "include_metadata": True,
        }
        start_resp = client.post("/api/transcripts/extract", json=job_request)
        assert start_resp.status_code == 200

        started = start_resp.json()
        assert "job_id" in started
        assert started["status"] == "processing"
        assert started["message"] == "Transcript extraction started"

        job_id = started["job_id"]

        # Poll the status endpoint for the job that was just created.
        poll_resp = client.get(f"/api/transcripts/jobs/{job_id}")
        assert poll_resp.status_code == 200

        job_state = poll_resp.json()
        allowed_states = ("pending", "processing", "completed", "failed")
        assert job_state["job_id"] == job_id
        assert job_state["status"] in allowed_states
        assert "progress_percentage" in job_state

    def test_job_status_not_found(self, client):
        """An unknown job id answers 404 with a "not found" detail message."""
        resp = client.get("/api/transcripts/jobs/non-existent-job-id")
        assert resp.status_code == 404

        detail_text = resp.json()["detail"]
        assert "not found" in detail_text.lower()

    def test_chunk_transcript(self, client):
        """The chunk endpoint returns a list of chunks with per-chunk fields."""
        resp = client.post("/api/transcripts/dQw4w9WgXcQ/chunk?max_tokens=100")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["video_id"] == "dQw4w9WgXcQ"
        for top_level_key in ("total_chunks", "chunks"):
            assert top_level_key in payload
        assert isinstance(payload["chunks"], list)

        # An empty chunk list is accepted; there is nothing further to inspect.
        if not payload["chunks"]:
            return

        first_chunk = payload["chunks"][0]
        for chunk_key in ("chunk_index", "text", "token_count"):
            assert chunk_key in first_chunk

    def test_cache_stats(self, client):
        """The cache statistics endpoint exposes its three counters."""
        resp = client.get("/api/transcripts/cache/stats")
        assert resp.status_code == 200

        stats = resp.json()
        for stat_key in ("total_keys", "active_keys", "cache_size_bytes"):
            assert stat_key in stats

    def test_transcript_metadata_included(self, client):
        """With ``include_metadata=true`` a non-empty transcript carries metadata."""
        resp = client.get("/api/transcripts/test123?include_metadata=true")
        assert resp.status_code == 200

        payload = resp.json()
        # Metadata is only checked when a transcript was actually extracted.
        if not payload["transcript"]:
            return

        assert "metadata" in payload
        meta = payload["metadata"]
        for meta_key in ("word_count", "language", "extraction_method"):
            assert meta_key in meta

    def test_transcript_metadata_excluded(self, client):
        """With ``include_metadata=false`` the endpoint still answers normally."""
        resp = client.get("/api/transcripts/test123?include_metadata=false")
        assert resp.status_code == 200

        payload = resp.json()
        # Metadata might still be None even if not requested;
        # the important thing is that the endpoint works.
        for required_key in ("video_id", "extraction_method"):
            assert required_key in payload