# youtube-summarizer/backend/services/video_service.py
#
# 156 lines
# 5.3 KiB
# Python

import re
import logging
from typing import Optional, List, Dict, Any
from backend.core.exceptions import (
UserInputError,
ValidationError,
UnsupportedFormatError,
ErrorCode
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class VideoService:
    """Parse and validate YouTube video URLs.

    Recognises ``watch`` URLs, ``youtu.be`` short links and ``embed`` URLs,
    each with an optional ``http(s)://`` scheme and an optional ``www.`` or
    ``m.`` subdomain. Playlist URLs are detected and rejected, and canonical
    watch URLs can be built from a video ID.
    """

    def __init__(self):
        # Human-readable examples; echoed back in error details so callers
        # can show users what a valid URL looks like.
        self.supported_formats = [
            "https://youtube.com/watch?v=VIDEO_ID",
            "https://youtu.be/VIDEO_ID",
            "https://youtube.com/embed/VIDEO_ID",
            "https://m.youtube.com/watch?v=VIDEO_ID",
            "https://www.youtube.com/watch?v=VIDEO_ID"
        ]

        # Every pattern is anchored at the start of the (stripped) input so
        # that look-alike hosts ("notyoutube.com") or a YouTube URL embedded
        # in another site's query string are not mistaken for YouTube URLs.
        host = r'^(?:https?://)?(?:www\.|m\.)?'

        # Patterns capture a video ID of any length; the exact 11-character
        # format is enforced afterwards by _validate_video_id so that a
        # malformed ID gets its own, more specific error.
        self.url_patterns = [
            # "v" may appear anywhere in the query string. Both quantifiers
            # around the prefix are lazy so the FIRST "v=" parameter wins.
            host + r'youtube\.com/watch\?(?:[^#\s]*?&)??v=([a-zA-Z0-9_-]+)',
            host + r'youtu\.be/([a-zA-Z0-9_-]+)',
            host + r'youtube\.com/embed/([a-zA-Z0-9_-]+)',
        ]

        self.playlist_patterns = [
            host + r'youtube\.com/playlist\?(?:.*&)?list=',
            # "list" anywhere in a watch URL's query string, including first.
            host + r'youtube\.com/watch\?(?:.*[?&])?list=',
            # Playlist embeds use the literal path segment "videoseries",
            # which is 11 valid characters and would otherwise be returned
            # as if it were a video ID.
            host + r'youtube\.com/embed/videoseries(?:[?#/]|$)',
        ]

    def extract_video_id(self, url: str) -> str:
        """
        Extract YouTube video ID from various URL formats.

        Surrounding whitespace is ignored. Playlist URLs are rejected before
        any attempt is made to extract a video ID.

        Args:
            url: YouTube URL to parse

        Returns:
            str: 11-character video ID

        Raises:
            ValidationError: If the URL is empty, is not a recognised YouTube
                video URL, or carries a video ID that is not exactly 11
                valid characters
            UnsupportedFormatError: If URL is a playlist URL
        """
        if not url:
            raise ValidationError(
                message="URL cannot be empty",
                details={"url": url, "supported_formats": self.supported_formats}
            )

        url = url.strip()

        # Check for playlist URLs first: they can also contain a "v="
        # parameter and would otherwise be treated as single videos.
        if self._is_playlist_url(url):
            logger.info("Playlist URL detected: %s", url)
            raise UnsupportedFormatError(
                message="Playlist URLs are not yet supported. Please provide a single video URL.",
                details={
                    "url": url,
                    "detected_type": "playlist",
                    "suggestion": "You can copy the URL of a specific video from the playlist instead",
                    "supported_formats": self.supported_formats
                }
            )

        video_id = self._find_video_id(url)

        # Any URL (YouTube or not) that matches none of the patterns is a
        # validation error.
        if video_id is None:
            raise ValidationError(
                message="Invalid YouTube URL format",
                details={
                    "url": url,
                    "supported_formats": self.supported_formats,
                    "suggestion": "Please provide a valid YouTube video URL with a video ID"
                }
            )

        if not self._validate_video_id(video_id):
            raise ValidationError(
                message=f"Invalid video ID format: {video_id}",
                details={
                    "url": url,
                    "video_id": video_id,
                    "expected_format": "11 characters containing letters, numbers, underscores, or hyphens"
                }
            )

        logger.info("Successfully extracted video ID: %s from URL: %s", video_id, url)
        return video_id

    def _find_video_id(self, url: str) -> Optional[str]:
        """
        Return the raw video ID captured by the first matching URL pattern.

        The result is not length-checked; see _validate_video_id.

        Args:
            url: Whitespace-stripped URL to scan

        Returns:
            Optional[str]: Captured ID, or None if no pattern matches
        """
        for pattern in self.url_patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    def _validate_video_id(self, video_id: str) -> bool:
        """
        Validate that video ID is exactly 11 characters with valid format.

        Args:
            video_id: Video ID to validate

        Returns:
            bool: True if valid, False otherwise
        """
        # YouTube video IDs contain letters, numbers, underscores, and hyphens
        return bool(video_id) and re.fullmatch(r'[a-zA-Z0-9_-]{11}', video_id) is not None

    def _is_playlist_url(self, url: str) -> bool:
        """
        Check if URL is a playlist URL.

        Args:
            url: URL to check

        Returns:
            bool: True if playlist URL, False otherwise
        """
        return any(re.search(pattern, url) for pattern in self.playlist_patterns)

    def normalize_url(self, video_id: str) -> str:
        """
        Create a normalized YouTube URL from a video ID.

        The ID is inserted as-is; it is not validated here.

        Args:
            video_id: YouTube video ID

        Returns:
            str: Normalized YouTube URL
        """
        return f"https://youtube.com/watch?v={video_id}"

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported URL formats.

        Returns:
            List[str]: Copy of the supported format examples, so callers
                cannot mutate the service's own list
        """
        return self.supported_formats.copy()