# Source: youtube-summarizer/backend/tests/integration/test_transcript_api.py
# (original listing: 129 lines, 4.6 KiB, Python)

import pytest
from fastapi.testclient import TestClient
import sys
from pathlib import Path
import asyncio
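# Add the project root (four levels up from this file, i.e. the directory
# that contains `backend/`) to sys.path so `backend.main` can be imported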
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from backend.main import app
class TestTranscriptAPI:
    """Integration tests for the ``/api/transcripts`` endpoints.

    Each test sends requests to the ``app`` imported from ``backend.main``
    through the ``client`` fixture and checks the HTTP status code and the
    fields of the JSON payload that comes back.
    """

    @pytest.fixture
    def client(self):
        """Return a ``TestClient`` wrapping the application under test."""
        test_client = TestClient(app)
        return test_client

    def test_get_transcript_success(self, client):
        """GET for a video id answers 200 with the expected payload fields."""
        resp = client.get("/api/transcripts/dQw4w9WgXcQ")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["video_id"] == "dQw4w9WgXcQ"
        assert "transcript" in payload
        accepted_methods = ["youtube_api", "auto_captions", "whisper_audio", "mock"]
        assert payload["extraction_method"] in accepted_methods
        assert payload["word_count"] > 0
        assert "processing_time_seconds" in payload

    def test_get_transcript_with_language(self, client):
        """The ``language_preference`` query value is echoed as ``language``."""
        resp = client.get("/api/transcripts/test123?language_preference=es")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["language"] == "es"
        assert payload["video_id"] == "test123"

    def test_async_transcript_extraction(self, client):
        """Starting an extraction job yields a job id whose status can be read."""
        # Kick off the extraction job.
        request_body = {
            "video_id": "test123",
            "language_preference": "en",
            "include_metadata": True,
        }
        start_resp = client.post("/api/transcripts/extract", json=request_body)
        assert start_resp.status_code == 200

        started = start_resp.json()
        assert "job_id" in started
        assert started["status"] == "processing"
        assert started["message"] == "Transcript extraction started"
        job_id = started["job_id"]

        # Look the job up again through the status endpoint.
        poll_resp = client.get(f"/api/transcripts/jobs/{job_id}")
        assert poll_resp.status_code == 200

        job = poll_resp.json()
        assert job["job_id"] == job_id
        allowed_states = ["pending", "processing", "completed", "failed"]
        assert job["status"] in allowed_states
        assert "progress_percentage" in job

    def test_job_status_not_found(self, client):
        """An unknown job id answers 404 with a "not found" detail message."""
        resp = client.get("/api/transcripts/jobs/non-existent-job-id")
        assert resp.status_code == 404

        detail = resp.json()["detail"]
        assert "not found" in detail.lower()

    def test_chunk_transcript(self, client):
        """The chunk endpoint returns a list of chunks with the expected keys."""
        resp = client.post("/api/transcripts/dQw4w9WgXcQ/chunk?max_tokens=100")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["video_id"] == "dQw4w9WgXcQ"
        for top_level_key in ("total_chunks", "chunks"):
            assert top_level_key in payload

        chunks = payload["chunks"]
        assert isinstance(chunks, list)
        if not chunks:
            # Nothing to inspect when no chunks came back.
            return

        first_chunk = chunks[0]
        for chunk_key in ("chunk_index", "text", "token_count"):
            assert chunk_key in first_chunk

    def test_cache_stats(self, client):
        """The cache stats endpoint answers 200 and reports its counters."""
        resp = client.get("/api/transcripts/cache/stats")
        assert resp.status_code == 200

        stats = resp.json()
        for stat_key in ("total_keys", "active_keys", "cache_size_bytes"):
            assert stat_key in stats

    def test_transcript_metadata_included(self, client):
        """With ``include_metadata=true`` a non-empty transcript carries metadata."""
        resp = client.get("/api/transcripts/test123?include_metadata=true")
        assert resp.status_code == 200

        payload = resp.json()
        if not payload["transcript"]:
            # Metadata is only checked when a transcript was actually extracted.
            return

        assert "metadata" in payload
        metadata = payload["metadata"]
        for meta_key in ("word_count", "language", "extraction_method"):
            assert meta_key in metadata

    def test_transcript_metadata_excluded(self, client):
        """With ``include_metadata=false`` the endpoint still answers normally."""
        resp = client.get("/api/transcripts/test123?include_metadata=false")
        assert resp.status_code == 200

        payload = resp.json()
        # The metadata field itself is deliberately not asserted on (it may
        # simply be None); this test only confirms that the endpoint works.
        for required_key in ("video_id", "extraction_method"):
            assert required_key in payload