""" Storage Manager for video file organization and cleanup. Handles directory management, file operations, and storage optimization. """ import os import shutil import json from pathlib import Path from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple import logging logger = logging.getLogger(__name__) class StorageManager: """Manages storage for downloaded videos and audio files.""" def __init__(self, base_dir: str = "data/youtube-videos"): """ Initialize storage manager. Args: base_dir: Base directory for all storage operations """ self.base_dir = Path(base_dir) self.ensure_directory_structure() def ensure_directory_structure(self): """Ensure all required directories exist.""" directories = [ self.base_dir, self.base_dir / "videos", self.base_dir / "audio", self.base_dir / "metadata", self.base_dir / "thumbnails", self.base_dir / "temp" ] for directory in directories: directory.mkdir(parents=True, exist_ok=True) logger.debug(f"Ensured directory exists: {directory}") def get_video_directory(self, video_id: str) -> Path: """ Get or create directory for a specific video. Args: video_id: YouTube video ID Returns: Path to video directory """ video_dir = self.base_dir / "videos" / video_id video_dir.mkdir(parents=True, exist_ok=True) return video_dir def calculate_directory_size(self, directory: Path) -> int: """ Calculate total size of all files in a directory. Args: directory: Path to directory Returns: Total size in bytes """ total_size = 0 if not directory.exists(): return 0 for path in directory.rglob('*'): if path.is_file(): try: total_size += path.stat().st_size except (OSError, PermissionError): logger.warning(f"Could not access file: {path}") return total_size def get_storage_usage(self) -> Dict[str, int]: """ Get storage usage by category. Returns: Dictionary with storage usage by type """ return { 'videos': self.calculate_directory_size(self.base_dir / "videos"), 'audio': self.calculate_directory_size(self.base_dir / "audio"), 'metadata': self.calculate_directory_size(self.base_dir / "metadata"), 'thumbnails': self.calculate_directory_size(self.base_dir / "thumbnails"), 'temp': self.calculate_directory_size(self.base_dir / "temp"), 'total': self.calculate_directory_size(self.base_dir) } def find_old_files(self, days: int = 30) -> List[Path]: """ Find files older than specified days. Args: days: Age threshold in days Returns: List of file paths older than threshold """ old_files = [] threshold = datetime.now() - timedelta(days=days) for category in ['videos', 'audio', 'thumbnails']: directory = self.base_dir / category if directory.exists(): for path in directory.rglob('*'): if path.is_file(): try: mtime = datetime.fromtimestamp(path.stat().st_mtime) if mtime < threshold: old_files.append(path) except (OSError, PermissionError): logger.warning(f"Could not check file age: {path}") return old_files def cleanup_temp_files(self) -> int: """ Clean up temporary files. Returns: Number of bytes freed """ temp_dir = self.base_dir / "temp" bytes_freed = 0 if temp_dir.exists(): bytes_freed = self.calculate_directory_size(temp_dir) try: shutil.rmtree(temp_dir) temp_dir.mkdir(exist_ok=True) logger.info(f"Cleaned temp directory, freed {bytes_freed / (1024*1024):.2f} MB") except Exception as e: logger.error(f"Error cleaning temp directory: {e}") return 0 return bytes_freed def cleanup_orphaned_files(self, cache: Dict) -> int: """ Remove files not referenced in cache. Args: cache: Video cache dictionary Returns: Number of bytes freed """ bytes_freed = 0 # Get all referenced files from cache referenced_files = set() for video_hash, info in cache.items(): if 'video_path' in info: referenced_files.add(Path(info['video_path'])) if 'audio_path' in info: referenced_files.add(Path(info['audio_path'])) # Add metadata files video_id = info.get('video_id') if video_id: referenced_files.add(self.base_dir / "videos" / f"{video_id}.info.json") referenced_files.add(self.base_dir / "thumbnails" / f"{video_id}.jpg") # Find and remove orphaned files for category in ['videos', 'audio', 'metadata', 'thumbnails']: directory = self.base_dir / category if directory.exists(): for path in directory.glob('*'): if path.is_file() and path not in referenced_files: try: size = path.stat().st_size path.unlink() bytes_freed += size logger.info(f"Removed orphaned file: {path}") except Exception as e: logger.error(f"Error removing orphaned file {path}: {e}") return bytes_freed def move_file_safe(self, source: Path, destination: Path) -> bool: """ Safely move a file with error handling. Args: source: Source file path destination: Destination file path Returns: True if successful, False otherwise """ try: destination.parent.mkdir(parents=True, exist_ok=True) shutil.move(str(source), str(destination)) logger.debug(f"Moved {source} to {destination}") return True except Exception as e: logger.error(f"Error moving file from {source} to {destination}: {e}") return False def copy_file_safe(self, source: Path, destination: Path) -> bool: """ Safely copy a file with error handling. Args: source: Source file path destination: Destination file path Returns: True if successful, False otherwise """ try: destination.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(str(source), str(destination)) logger.debug(f"Copied {source} to {destination}") return True except Exception as e: logger.error(f"Error copying file from {source} to {destination}: {e}") return False def get_file_info(self, file_path: Path) -> Optional[Dict]: """ Get detailed information about a file. Args: file_path: Path to the file Returns: File information dictionary or None if file doesn't exist """ if not file_path.exists(): return None try: stat = file_path.stat() return { 'path': str(file_path), 'name': file_path.name, 'size_bytes': stat.st_size, 'size_mb': stat.st_size / (1024 * 1024), 'created': datetime.fromtimestamp(stat.st_ctime).isoformat(), 'modified': datetime.fromtimestamp(stat.st_mtime).isoformat(), 'accessed': datetime.fromtimestamp(stat.st_atime).isoformat(), 'extension': file_path.suffix, 'parent': str(file_path.parent) } except Exception as e: logger.error(f"Error getting file info for {file_path}: {e}") return None def archive_video(self, video_id: str, archive_dir: str = "archive") -> bool: """ Archive a video and its associated files. Args: video_id: YouTube video ID archive_dir: Archive directory name Returns: True if successful, False otherwise """ archive_path = self.base_dir / archive_dir / video_id archive_path.mkdir(parents=True, exist_ok=True) files_to_archive = [ (self.base_dir / "videos" / f"{video_id}.mp4", archive_path / f"{video_id}.mp4"), (self.base_dir / "audio" / f"{video_id}.mp3", archive_path / f"{video_id}.mp3"), (self.base_dir / "metadata" / f"{video_id}.json", archive_path / f"{video_id}.json"), (self.base_dir / "thumbnails" / f"{video_id}.jpg", archive_path / f"{video_id}.jpg"), ] success = True for source, dest in files_to_archive: if source.exists(): if not self.move_file_safe(source, dest): success = False if success: logger.info(f"Archived video {video_id} to {archive_path}") return success def restore_from_archive(self, video_id: str, archive_dir: str = "archive") -> bool: """ Restore a video from archive. Args: video_id: YouTube video ID archive_dir: Archive directory name Returns: True if successful, False otherwise """ archive_path = self.base_dir / archive_dir / video_id if not archive_path.exists(): logger.error(f"Archive not found for video {video_id}") return False files_to_restore = [ (archive_path / f"{video_id}.mp4", self.base_dir / "videos" / f"{video_id}.mp4"), (archive_path / f"{video_id}.mp3", self.base_dir / "audio" / f"{video_id}.mp3"), (archive_path / f"{video_id}.json", self.base_dir / "metadata" / f"{video_id}.json"), (archive_path / f"{video_id}.jpg", self.base_dir / "thumbnails" / f"{video_id}.jpg"), ] success = True for source, dest in files_to_restore: if source.exists(): if not self.move_file_safe(source, dest): success = False if success: # Remove empty archive directory try: archive_path.rmdir() except OSError: pass # Directory not empty, that's okay logger.info(f"Restored video {video_id} from archive") return success def get_disk_usage(self) -> Dict: """ Get disk usage statistics for the storage directory. Returns: Disk usage information """ try: stat = shutil.disk_usage(self.base_dir) return { 'total_bytes': stat.total, 'total_gb': stat.total / (1024 ** 3), 'used_bytes': stat.used, 'used_gb': stat.used / (1024 ** 3), 'free_bytes': stat.free, 'free_gb': stat.free / (1024 ** 3), 'percent_used': (stat.used / stat.total * 100) if stat.total > 0 else 0 } except Exception as e: logger.error(f"Error getting disk usage: {e}") return { 'error': str(e), 'total_bytes': 0, 'used_bytes': 0, 'free_bytes': 0 }