# youtube-summarizer/backend/tests/unit/test_transcript_downloader.py
#
# File-viewer metadata carried over from the original listing:
# 347 lines, 15 KiB, Python

"""
Unit tests for transcript-only downloader
"""
import pytest
from unittest.mock import Mock, AsyncMock, patch, MagicMock
from pathlib import Path
from backend.models.video_download import (
DownloadMethod,
DownloadPreferences,
VideoDownloadResult,
DownloadStatus,
VideoMetadata,
TranscriptData,
DownloaderException
)
from backend.services.video_downloaders.transcript_downloader import TranscriptOnlyDownloader
class TestTranscriptOnlyDownloader:
    """Test transcript-only downloader functionality.

    Every test patches ``YouTubeTranscriptApi`` and/or ``build`` as
    attributes of the downloader module, so the tests run against mocks
    rather than the real objects behind those names.
    """

    # Patch targets live in one place so a module move touches a single line.
    _DOWNLOADER_MODULE = 'backend.services.video_downloaders.transcript_downloader'
    _TRANSCRIPT_API_TARGET = _DOWNLOADER_MODULE + '.YouTubeTranscriptApi'
    _YOUTUBE_BUILD_TARGET = _DOWNLOADER_MODULE + '.build'

    @staticmethod
    def _stub_transcript_api(api_class_mock, segments):
        """Wire a patched transcript API class to serve *segments*.

        The patched class, when called, returns a fresh ``Mock`` whose
        ``get_transcript`` returns *segments*. That instance mock is
        returned so callers can configure it further.
        """
        api_instance = Mock()
        api_instance.get_transcript.return_value = segments
        api_class_mock.return_value = api_instance
        return api_instance

    @staticmethod
    def _stub_youtube_service(build_mock, video_response):
        """Wire a patched ``build`` so ``videos().list().execute()``
        returns *video_response*; return the service mock."""
        service = Mock()
        service.videos.return_value.list.return_value.execute.return_value = video_response
        build_mock.return_value = service
        return service

    @pytest.fixture
    def mock_config(self, tmp_path):
        """Mock configuration for testing (output goes to pytest's tmp_path)"""
        return {
            'youtube_api_key': 'test_api_key',
            'output_dir': str(tmp_path),
            'timeout': 30,
        }

    @pytest.fixture
    def downloader(self, mock_config):
        """Create downloader instance for testing"""
        return TranscriptOnlyDownloader(config=mock_config)

    def test_initialization(self, downloader, mock_config):
        """Test downloader initialization"""
        assert downloader.method == DownloadMethod.TRANSCRIPT_ONLY
        assert downloader.youtube_api_key == mock_config['youtube_api_key']
        assert downloader.output_dir == Path(mock_config['output_dir'])

    def test_initialization_no_api_key(self, tmp_path):
        """Test initialization without API key"""
        config = {'output_dir': str(tmp_path)}
        downloader = TranscriptOnlyDownloader(config=config)

        assert downloader.youtube_api_key is None

    def test_capabilities(self, downloader):
        """Test downloader capabilities"""
        assert downloader.supports_audio_only() is False
        assert downloader.supports_quality_selection() is False
        assert downloader.get_supported_formats() == ["json", "txt"]

    @patch(_TRANSCRIPT_API_TARGET)
    @pytest.mark.asyncio
    async def test_successful_transcript_download(self, mock_transcript_api, downloader):
        """Test successful transcript extraction"""
        # Mock transcript API response
        self._stub_transcript_api(mock_transcript_api, [
            {'text': 'Hello world', 'start': 0.0, 'duration': 2.0},
            {'text': 'This is a test', 'start': 2.0, 'duration': 3.0},
            {'text': 'Video transcript', 'start': 5.0, 'duration': 2.5},
        ])

        url = "https://youtube.com/watch?v=test123"
        preferences = DownloadPreferences()
        result = await downloader.download_video(url, preferences)

        # A transcript-only result is PARTIAL and carries no media paths.
        assert result.status == DownloadStatus.PARTIAL
        assert result.video_id == "test123"
        assert result.method == DownloadMethod.TRANSCRIPT_ONLY
        assert result.is_partial is True
        assert result.video_path is None
        assert result.audio_path is None

        # Check transcript data
        assert result.transcript is not None
        assert result.transcript.text == "Hello world This is a test Video transcript"
        assert result.transcript.language == 'en'
        assert result.transcript.is_auto_generated is False
        assert len(result.transcript.segments) == 3
        assert result.transcript.source == "youtube-transcript-api"

    @patch(_TRANSCRIPT_API_TARGET)
    @patch(_YOUTUBE_BUILD_TARGET)
    @pytest.mark.asyncio
    async def test_with_metadata_extraction(self, mock_build, mock_transcript_api, downloader):
        """Test transcript download with metadata extraction"""
        # Mock YouTube API
        self._stub_youtube_service(mock_build, {
            'items': [{
                'id': 'test123',
                'snippet': {
                    'title': 'Test Video',
                    'description': 'Test description',
                    'publishedAt': '2024-01-01T00:00:00Z',
                    'channelTitle': 'Test Channel',
                    'tags': ['test', 'video'],
                    'defaultLanguage': 'en',
                    'thumbnails': {
                        'high': {'url': 'http://example.com/thumb.jpg'}
                    },
                },
                'contentDetails': {
                    'duration': 'PT4M30S'  # 4 minutes 30 seconds
                },
                'statistics': {
                    'viewCount': '1000000'
                },
                'status': {
                    'privacyStatus': 'public'
                },
            }]
        })

        # Mock transcript API
        self._stub_transcript_api(mock_transcript_api, [
            {'text': 'Test transcript', 'start': 0.0, 'duration': 2.0}
        ])

        url = "https://youtube.com/watch?v=test123"
        preferences = DownloadPreferences()
        result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.PARTIAL
        assert result.metadata is not None
        assert result.metadata.title == "Test Video"
        assert result.metadata.description == "Test description"
        assert result.metadata.duration_seconds == 270  # 4m30s
        assert result.metadata.view_count == 1000000
        assert result.metadata.uploader == "Test Channel"

    @patch(_TRANSCRIPT_API_TARGET)
    @pytest.mark.asyncio
    async def test_transcript_unavailable(self, mock_transcript_api, downloader):
        """Test handling when transcript is unavailable"""
        # Calling the patched API class itself raises.
        mock_transcript_api.side_effect = Exception("No transcript available")

        url = "https://youtube.com/watch?v=notranscript123"
        preferences = DownloadPreferences()
        result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.FAILED
        assert "No transcript available" in result.error_message
        assert result.transcript is None

    @patch(_TRANSCRIPT_API_TARGET)
    @pytest.mark.asyncio
    async def test_auto_generated_transcript(self, mock_transcript_api, downloader):
        """Test handling of auto-generated transcripts"""
        # Mock transcript API to return auto-generated transcript
        api_instance = self._stub_transcript_api(mock_transcript_api, [
            {'text': 'Auto generated text', 'start': 0.0, 'duration': 2.0}
        ])

        # Mock list_transcripts to show it's auto-generated
        generated_entry = Mock()
        generated_entry.is_generated = True
        generated_entry.language_code = 'en'
        api_instance.list_transcripts.return_value = [generated_entry]

        url = "https://youtube.com/watch?v=auto123"
        preferences = DownloadPreferences()
        result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.PARTIAL
        assert result.transcript.is_auto_generated is True

    @patch(_YOUTUBE_BUILD_TARGET)
    @pytest.mark.asyncio
    async def test_metadata_only_extraction(self, mock_build, downloader):
        """Test metadata-only extraction without transcript"""
        # Mock YouTube API for metadata
        self._stub_youtube_service(mock_build, {
            'items': [{
                'id': 'test123',
                'snippet': {
                    'title': 'Metadata Only Video',
                    'description': 'Just metadata',
                    'publishedAt': '2024-01-01T00:00:00Z',
                    'channelTitle': 'Test Channel',
                },
                'contentDetails': {
                    'duration': 'PT2M15S'
                },
                'statistics': {
                    'viewCount': '500'
                },
            }]
        })

        metadata = await downloader.get_video_metadata("test123")

        assert metadata is not None
        assert metadata.video_id == "test123"
        assert metadata.title == "Metadata Only Video"
        assert metadata.duration_seconds == 135  # 2m15s
        assert metadata.view_count == 500

    @pytest.mark.asyncio
    async def test_get_transcript_direct(self, downloader):
        """Test direct transcript extraction"""
        with patch(self._TRANSCRIPT_API_TARGET) as mock_api:
            self._stub_transcript_api(mock_api, [
                {'text': 'Direct transcript', 'start': 0.0, 'duration': 2.0}
            ])
            transcript = await downloader.get_transcript("test123")

        assert transcript is not None
        assert transcript.text == "Direct transcript"
        assert len(transcript.segments) == 1

    @pytest.mark.asyncio
    async def test_connection_test_success(self, downloader):
        """Test successful connection test"""
        with patch(self._TRANSCRIPT_API_TARGET):
            result = await downloader.test_connection()

        assert result is True

    @pytest.mark.asyncio
    async def test_connection_test_failure(self, downloader):
        """Test failed connection test"""
        with patch(self._TRANSCRIPT_API_TARGET, side_effect=Exception("Connection failed")):
            result = await downloader.test_connection()

        assert result is False

    def test_parse_duration(self, downloader):
        """Test ISO 8601 duration parsing"""
        test_cases = [
            ("PT1M30S", 90),        # 1 minute 30 seconds
            ("PT2H15M", 8100),      # 2 hours 15 minutes
            ("PT45S", 45),          # 45 seconds
            ("PT1H", 3600),         # 1 hour
            ("PT10M", 600),         # 10 minutes
            ("P1DT2H3M4S", 93784),  # 1 day 2 hours 3 minutes 4 seconds
            ("", 0),                # Empty string
            ("invalid", 0),         # Invalid format
        ]

        for duration_str, expected_seconds in test_cases:
            result = downloader._parse_duration(duration_str)
            assert result == expected_seconds, f"Failed for {duration_str}: expected {expected_seconds}, got {result}"

    @pytest.mark.asyncio
    async def test_language_preference(self, downloader):
        """Test transcript download when several languages are listed"""
        with patch(self._TRANSCRIPT_API_TARGET) as mock_api:
            api_instance = Mock()

            # Mock English transcript
            english_transcript = Mock()
            english_transcript.language_code = 'en'
            english_transcript.fetch.return_value = [
                {'text': 'English transcript', 'start': 0.0, 'duration': 2.0}
            ]

            # Mock Spanish transcript
            spanish_transcript = Mock()
            spanish_transcript.language_code = 'es'
            spanish_transcript.fetch.return_value = [
                {'text': 'Spanish transcript', 'start': 0.0, 'duration': 2.0}
            ]

            # Mock transcript list with multiple languages.
            # side_effect builds a fresh iterator on every iteration; a
            # return_value=iter([...]) would be exhausted after the first
            # pass and look empty on any later one.
            transcript_listing = Mock()
            transcript_listing.__iter__ = Mock(
                side_effect=lambda: iter([english_transcript, spanish_transcript])
            )
            transcript_listing.find_transcript.return_value = spanish_transcript

            api_instance.list_transcripts.return_value = transcript_listing
            mock_api.return_value = api_instance

            # NOTE: no language is actually requested here; default
            # preferences are used. This test assumes language preference
            # would be implemented; currently the downloader uses its
            # default language preference.
            preferences = DownloadPreferences()
            url = "https://youtube.com/watch?v=multilang123"
            result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.PARTIAL
        # The actual language returned depends on the implementation
        assert result.transcript is not None

    @patch(_YOUTUBE_BUILD_TARGET)
    @pytest.mark.asyncio
    async def test_api_quota_exceeded(self, mock_build, downloader):
        """Test handling of YouTube API quota exceeded"""
        quota_limited_service = Mock()
        quota_limited_service.videos.return_value.list.return_value.execute.side_effect = Exception("Quota exceeded")
        mock_build.return_value = quota_limited_service

        # Should still work without metadata if transcript is available
        with patch(self._TRANSCRIPT_API_TARGET) as mock_transcript_api:
            self._stub_transcript_api(mock_transcript_api, [
                {'text': 'Transcript without metadata', 'start': 0.0, 'duration': 2.0}
            ])

            url = "https://youtube.com/watch?v=quota123"
            preferences = DownloadPreferences()
            result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.PARTIAL
        assert result.transcript is not None
        assert result.metadata is None  # Metadata extraction failed due to quota

    @pytest.mark.asyncio
    async def test_invalid_video_id(self, downloader):
        """Test handling of invalid video ID"""
        with patch(self._TRANSCRIPT_API_TARGET) as mock_api:
            mock_api.side_effect = Exception("Video not found")

            url = "https://youtube.com/watch?v=invalidid123"
            preferences = DownloadPreferences()
            result = await downloader.download_video(url, preferences)

        assert result.status == DownloadStatus.FAILED
        assert "Video not found" in result.error_message