youtube-summarizer/backend/services/storage_manager.py

358 lines
12 KiB
Python

"""
Storage Manager for video file organization and cleanup.
Handles directory management, file operations, and storage optimization.
"""
import os
import shutil
import json
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class StorageManager:
"""Manages storage for downloaded videos and audio files."""
def __init__(self, base_dir: str = "data/youtube-videos"):
    """
    Set up the manager rooted at ``base_dir``.

    Args:
        base_dir: Root directory under which all managed files are kept
    """
    root = Path(base_dir)
    self.base_dir = root
    # Make sure the required directories exist before the instance is used.
    self.ensure_directory_structure()
def ensure_directory_structure(self):
    """Create the base directory and each standard sub-folder if missing."""
    root = self.base_dir
    subfolders = ("videos", "audio", "metadata", "thumbnails", "temp")
    # The root is handled first, followed by every category folder beneath it.
    for directory in (root, *(root / name for name in subfolders)):
        directory.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Ensured directory exists: {directory}")
def get_video_directory(self, video_id: str) -> Path:
    """
    Return the per-video folder under ``videos``, creating it when absent.

    Args:
        video_id: YouTube video ID, used as the folder name
    Returns:
        Path of the video directory
    """
    target = self.base_dir.joinpath("videos", video_id)
    target.mkdir(parents=True, exist_ok=True)
    return target
def calculate_directory_size(self, directory: Path) -> int:
    """
    Sum the sizes of every file found beneath ``directory``.

    The walk is recursive. A file whose size cannot be read is skipped
    with a warning instead of aborting the whole calculation.

    Args:
        directory: Directory to measure
    Returns:
        Combined size in bytes; 0 when the directory does not exist
    """
    if not directory.exists():
        return 0

    byte_count = 0
    for path in directory.rglob('*'):
        if not path.is_file():
            continue
        try:
            byte_count += path.stat().st_size
        except (OSError, PermissionError):
            logger.warning(f"Could not access file: {path}")
    return byte_count
def get_storage_usage(self) -> Dict[str, int]:
    """
    Report how many bytes each storage category occupies.

    Returns:
        Mapping of category name to size in bytes, plus a 'total' entry
        measured on the base directory itself
    """
    usage = {}
    for category in ("videos", "audio", "metadata", "thumbnails", "temp"):
        usage[category] = self.calculate_directory_size(self.base_dir / category)
    # 'total' is a separate measurement of the whole base directory, not the
    # sum of the categories above.
    usage['total'] = self.calculate_directory_size(self.base_dir)
    return usage
def find_old_files(self, days: int = 30) -> List[Path]:
    """
    Collect media files whose modification time is past an age limit.

    Only the videos, audio and thumbnails folders are searched, each one
    recursively. Files whose age cannot be read are skipped with a warning.

    Args:
        days: Minimum age, in days, for a file to be reported
    Returns:
        Paths of the files last modified more than ``days`` days ago
    """
    threshold = datetime.now() - timedelta(days=days)
    old_files = []
    for category in ('videos', 'audio', 'thumbnails'):
        directory = self.base_dir / category
        if not directory.exists():
            continue
        for path in directory.rglob('*'):
            if not path.is_file():
                continue
            try:
                modified_at = datetime.fromtimestamp(path.stat().st_mtime)
            except (OSError, PermissionError):
                logger.warning(f"Could not check file age: {path}")
                continue
            if modified_at < threshold:
                old_files.append(path)
    return old_files
def cleanup_temp_files(self) -> int:
    """
    Empty the temp folder by deleting it and creating it again.

    Returns:
        Number of bytes freed; 0 if the folder is absent or cleanup failed
    """
    temp_dir = self.base_dir / "temp"
    if not temp_dir.exists():
        return 0

    # Measure before deleting - afterwards there is nothing left to count.
    bytes_freed = self.calculate_directory_size(temp_dir)
    try:
        shutil.rmtree(temp_dir)
        temp_dir.mkdir(exist_ok=True)
        logger.info(f"Cleaned temp directory, freed {bytes_freed / (1024*1024):.2f} MB")
    except Exception as e:
        logger.error(f"Error cleaning temp directory: {e}")
        return 0
    return bytes_freed
def cleanup_orphaned_files(self, cache: Dict) -> int:
    """
    Remove files not referenced in cache.

    Only files sitting directly inside the videos, audio, metadata and
    thumbnails folders are examined; sub-directories and their contents
    are left alone.

    A file counts as referenced when a cache entry names it through
    'video_path' or 'audio_path', or when it is one of the per-video
    companion files derived from the entry's 'video_id':
    ``videos/<id>.info.json``, ``metadata/<id>.json`` and
    ``thumbnails/<id>.jpg``.

    Args:
        cache: Video cache dictionary; each value is a dict that may hold
            'video_path', 'audio_path' and 'video_id'
    Returns:
        Number of bytes freed
    """
    bytes_freed = 0

    # Paths are resolved on both sides of the comparison so that relative vs.
    # absolute spellings (or '..' segments) of the same file still match.
    # Comparing raw Path objects would delete live files whenever the cache
    # and base_dir spell the location differently.
    referenced_files = set()
    for info in cache.values():
        for key in ('video_path', 'audio_path'):
            recorded = info.get(key)
            if recorded:  # skip missing / None / empty entries
                referenced_files.add(Path(recorded).resolve())

        video_id = info.get('video_id')
        if video_id:
            companions = (
                self.base_dir / "videos" / f"{video_id}.info.json",
                # Same metadata location that archive_video() uses.
                self.base_dir / "metadata" / f"{video_id}.json",
                self.base_dir / "thumbnails" / f"{video_id}.jpg",
            )
            referenced_files.update(p.resolve() for p in companions)

    # Find and remove orphaned files
    for category in ['videos', 'audio', 'metadata', 'thumbnails']:
        directory = self.base_dir / category
        if not directory.exists():
            continue
        for path in directory.glob('*'):
            if not path.is_file() or path.resolve() in referenced_files:
                continue
            try:
                size = path.stat().st_size
                path.unlink()
                bytes_freed += size
                logger.info(f"Removed orphaned file: {path}")
            except Exception as e:
                # Best effort: one undeletable file must not stop the sweep.
                logger.error(f"Error removing orphaned file {path}: {e}")
    return bytes_freed
def move_file_safe(self, source: Path, destination: Path) -> bool:
    """
    Move ``source`` to ``destination``, reporting failure instead of raising.

    The destination's parent folder is created first if it does not exist.
    Any error is logged and turned into a False result.

    Args:
        source: File to move
        destination: Path the file should end up at
    Returns:
        True when the move succeeded, False otherwise
    """
    try:
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(source), str(destination))
        logger.debug(f"Moved {source} to {destination}")
    except Exception as e:
        logger.error(f"Error moving file from {source} to {destination}: {e}")
        return False
    return True
def copy_file_safe(self, source: Path, destination: Path) -> bool:
    """
    Copy ``source`` to ``destination``, reporting failure instead of raising.

    The destination's parent folder is created first if it does not exist.
    The copy is made with ``shutil.copy2``. Any error is logged and turned
    into a False result.

    Args:
        source: File to copy
        destination: Path of the copy
    Returns:
        True when the copy succeeded, False otherwise
    """
    try:
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(str(source), str(destination))
        logger.debug(f"Copied {source} to {destination}")
    except Exception as e:
        logger.error(f"Error copying file from {source} to {destination}: {e}")
        return False
    return True
def get_file_info(self, file_path: Path) -> Optional[Dict]:
    """
    Get detailed information about a file.

    Args:
        file_path: Path to the file
    Returns:
        Dictionary with 'path', 'name', 'size_bytes', 'size_mb', 'created',
        'modified', 'accessed' (ISO-8601 timestamps), 'extension' and
        'parent'; None if the path does not exist or cannot be inspected
    """
    if not file_path.exists():
        return None
    try:
        stat = file_path.stat()
        # st_ctime is the last metadata-change time on Unix, not the creation
        # time. Prefer the real birth time where the platform exposes it and
        # fall back to st_ctime elsewhere.
        created_ts = getattr(stat, 'st_birthtime', stat.st_ctime)
        return {
            'path': str(file_path),
            'name': file_path.name,
            'size_bytes': stat.st_size,
            'size_mb': stat.st_size / (1024 * 1024),
            'created': datetime.fromtimestamp(created_ts).isoformat(),
            'modified': datetime.fromtimestamp(stat.st_mtime).isoformat(),
            'accessed': datetime.fromtimestamp(stat.st_atime).isoformat(),
            'extension': file_path.suffix,
            'parent': str(file_path.parent)
        }
    except Exception as e:
        # The file may vanish or become unreadable between exists() and stat().
        logger.error(f"Error getting file info for {file_path}: {e}")
        return None
def archive_video(self, video_id: str, archive_dir: str = "archive") -> bool:
    """
    Move a video's files into its own folder inside the archive.

    The video (.mp4), audio (.mp3), metadata (.json) and thumbnail (.jpg)
    files are moved when present; missing ones are skipped. The archive
    folder is created even if there is nothing to move.

    Args:
        video_id: YouTube video ID
        archive_dir: Name of the archive folder under the base directory
    Returns:
        True if every existing file was moved, False if any move failed
    """
    archive_path = self.base_dir / archive_dir / video_id
    archive_path.mkdir(parents=True, exist_ok=True)

    # (storage folder, file extension) for each artefact, in move order.
    layout = (
        ("videos", "mp4"),
        ("audio", "mp3"),
        ("metadata", "json"),
        ("thumbnails", "jpg"),
    )
    success = True
    for folder, extension in layout:
        filename = f"{video_id}.{extension}"
        source = self.base_dir / folder / filename
        if source.exists() and not self.move_file_safe(source, archive_path / filename):
            success = False

    if success:
        logger.info(f"Archived video {video_id} to {archive_path}")
    return success
def restore_from_archive(self, video_id: str, archive_dir: str = "archive") -> bool:
    """
    Move a video's files from the archive back to their storage folders.

    The .mp4, .mp3, .json and .jpg files go back to videos, audio, metadata
    and thumbnails respectively; files absent from the archive are skipped.
    When everything moved cleanly the archive folder is removed if it is
    empty.

    Args:
        video_id: YouTube video ID
        archive_dir: Name of the archive folder under the base directory
    Returns:
        True if every archived file was moved back, False if the archive
        folder is missing or any move failed
    """
    archive_path = self.base_dir / archive_dir / video_id
    if not archive_path.exists():
        logger.error(f"Archive not found for video {video_id}")
        return False

    # (file extension, destination folder) for each artefact, in move order.
    layout = (
        ("mp4", "videos"),
        ("mp3", "audio"),
        ("json", "metadata"),
        ("jpg", "thumbnails"),
    )
    success = True
    for extension, folder in layout:
        filename = f"{video_id}.{extension}"
        source = archive_path / filename
        if source.exists() and not self.move_file_safe(source, self.base_dir / folder / filename):
            success = False

    if not success:
        return False

    try:
        archive_path.rmdir()
    except OSError:
        # Leftover files keep the folder in place; that is acceptable.
        pass
    logger.info(f"Restored video {video_id} from archive")
    return True
def get_disk_usage(self) -> Dict:
    """
    Get disk usage statistics for the storage directory.

    Returns:
        Dictionary with 'total_bytes', 'total_gb', 'used_bytes', 'used_gb',
        'free_bytes', 'free_gb' and 'percent_used'. If the disk cannot be
        queried, the same keys are returned with zero values plus an
        'error' entry holding the failure message.
    """
    try:
        stat = shutil.disk_usage(self.base_dir)
    except Exception as e:
        logger.error(f"Error getting disk usage: {e}")
        # Return every key of the success shape so callers can index the
        # result without checking for 'error' first.
        return {
            'error': str(e),
            'total_bytes': 0,
            'total_gb': 0.0,
            'used_bytes': 0,
            'used_gb': 0.0,
            'free_bytes': 0,
            'free_gb': 0.0,
            'percent_used': 0
        }

    gib = 1024 ** 3
    return {
        'total_bytes': stat.total,
        'total_gb': stat.total / gib,
        'used_bytes': stat.used,
        'used_gb': stat.used / gib,
        'free_bytes': stat.free,
        'free_gb': stat.free / gib,
        # Guard against a zero-sized filesystem report.
        'percent_used': (stat.used / stat.total * 100) if stat.total > 0 else 0
    }