import logging
import re
from typing import Optional, List, Dict, Any

from backend.core.exceptions import (
    UserInputError,
    ValidationError,
    UnsupportedFormatError,
    ErrorCode,
)

logger = logging.getLogger(__name__)


class VideoService:
    """Extract and validate YouTube video IDs from user-supplied URLs.

    The service recognises the URL shapes listed in ``supported_formats``,
    rejects playlist URLs with ``UnsupportedFormatError`` and reports every
    other problem as ``ValidationError``.
    """

    def __init__(self):
        # Human-readable examples returned to callers in error details.
        self.supported_formats = [
            "https://youtube.com/watch?v=VIDEO_ID",
            "https://youtu.be/VIDEO_ID",
            "https://youtube.com/embed/VIDEO_ID",
            "https://m.youtube.com/watch?v=VIDEO_ID",
            "https://www.youtube.com/watch?v=VIDEO_ID",
        ]

        # The ID group deliberately accepts any length; the exact
        # 11-character rule is enforced later by ``_validate_video_id`` so
        # that a wrong-length ID yields a specific error message.
        video_id = r'([a-zA-Z0-9_-]+)'
        # Lazily skip query parameters that precede ``v=`` (for example
        # ``watch?feature=share&v=ID``) while still preferring the first
        # ``v=`` parameter in the query string.
        watch_query = r'watch\?(?:[^&#\s]*&)*?v=' + video_id

        self.url_patterns = [
            r'(?:https?://)?(?:www\.)?youtube\.com/' + watch_query,
            r'(?:https?://)?(?:www\.)?youtu\.be/' + video_id,
            r'(?:https?://)?(?:www\.)?youtube\.com/embed/' + video_id,
            r'(?:https?://)?(?:m\.)?youtube\.com/' + watch_query,
            r'(?:https?://)?youtube\.com/' + watch_query,
        ]

        self.playlist_patterns = [
            r'(?:https?://)?(?:www\.)?youtube\.com/playlist\?list=',
            # ``list`` appearing after another query parameter.
            r'(?:https?://)?(?:www\.)?youtube\.com/watch\?.*&list=',
            # ``list`` as the first query parameter. The previous pattern
            # here required a second literal ``?`` and therefore never
            # matched a real URL.
            r'(?:https?://)?(?:www\.)?youtube\.com/watch\?list=',
        ]

    def extract_video_id(self, url: str) -> str:
        """
        Extract YouTube video ID from various URL formats.

        Args:
            url: YouTube URL to parse. Surrounding whitespace is ignored.

        Returns:
            str: 11-character video ID

        Raises:
            ValidationError: If the URL is empty, is not a string, does not
                match a supported format, or carries a malformed video ID
            UnsupportedFormatError: If URL is a playlist URL
        """
        # Strip before the emptiness check so whitespace-only input is
        # reported as empty rather than as an unrecognised format.
        cleaned = url.strip() if isinstance(url, str) else url
        if not cleaned:
            raise ValidationError(
                message="URL cannot be empty",
                details={"url": url, "supported_formats": self.supported_formats},
            )
        if not isinstance(cleaned, str):
            # Without this guard a non-string would escape as AttributeError
            # or TypeError instead of the documented ValidationError.
            raise ValidationError(
                message="URL must be a string",
                details={
                    "url": repr(url),
                    "supported_formats": self.supported_formats,
                },
            )
        url = cleaned

        # Playlists are checked first: a watch URL carrying a ``list``
        # parameter would otherwise be accepted as a single video.
        if self._is_playlist_url(url):
            logger.info("Playlist URL detected: %s", url)
            raise UnsupportedFormatError(
                message="Playlist URLs are not yet supported. Please provide a single video URL.",
                details={
                    "url": url,
                    "detected_type": "playlist",
                    "suggestion": "You can copy the URL of a specific video from the playlist instead",
                    "supported_formats": self.supported_formats,
                },
            )

        video_id = self._match_video_id(url)
        if video_id is None:
            # For any URL (YouTube or not) that doesn't match our patterns,
            # it's a validation error.
            raise ValidationError(
                message="Invalid YouTube URL format",
                details={
                    "url": url,
                    "supported_formats": self.supported_formats,
                    "suggestion": "Please provide a valid YouTube video URL with a video ID",
                },
            )

        if not self._validate_video_id(video_id):
            raise ValidationError(
                message=f"Invalid video ID format: {video_id}",
                details={
                    "url": url,
                    "video_id": video_id,
                    "expected_format": "11 characters containing letters, numbers, underscores, or hyphens",
                },
            )

        logger.info("Successfully extracted video ID: %s from URL: %s", video_id, url)
        return video_id

    def _match_video_id(self, url: str) -> Optional[str]:
        """Return the ID captured by the first matching URL pattern, or None."""
        for pattern in self.url_patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    def _validate_video_id(self, video_id: str) -> bool:
        """
        Validate that video ID is exactly 11 characters with valid format.

        Args:
            video_id: Video ID to validate

        Returns:
            bool: True if valid, False otherwise
        """
        if not isinstance(video_id, str):
            return False
        # YouTube video IDs contain letters, numbers, underscores, and hyphens.
        # ``fullmatch`` enforces both the alphabet and the exact length.
        return re.fullmatch(r'[a-zA-Z0-9_-]{11}', video_id) is not None

    def _is_playlist_url(self, url: str) -> bool:
        """
        Check if URL is a playlist URL.

        Args:
            url: URL to check

        Returns:
            bool: True if playlist URL, False otherwise
        """
        return any(re.search(pattern, url) for pattern in self.playlist_patterns)

    def normalize_url(self, video_id: str) -> str:
        """
        Create a normalized YouTube URL from a video ID.

        Args:
            video_id: YouTube video ID

        Returns:
            str: Normalized YouTube URL
        """
        return f"https://youtube.com/watch?v={video_id}"

    def get_supported_formats(self) -> List[str]:
        """
        Get list of supported URL formats.

        Returns:
            List[str]: A copy of the supported format examples, so callers
                cannot mutate the service's own list
        """
        return self.supported_formats.copy()