youtube-summarizer/backend/services/job_history_service.py


"""
Job History Service for managing persistent storage-based job tracking.
Leverages existing video_storage directory structure.
"""
import os
import json
import asyncio
from pathlib import Path
from typing import List, Optional, Dict, Any
from datetime import datetime
import logging
from urllib.parse import urlparse
from backend.models.job_history import (
    JobMetadata, JobHistoryIndex, JobStatus, ProcessingStatus,
    VideoInfo, ProcessingDetails, JobFiles, JobMetrics,
    JobHistoryQuery, JobHistoryResponse, JobDetailResponse
)
from backend.config.video_download_config import VideoDownloadConfig

logger = logging.getLogger(__name__)


class JobHistoryService:
"""Service for managing job history based on persistent storage."""
def __init__(self, config: Optional[VideoDownloadConfig] = None):
self.config = config or VideoDownloadConfig()
self.config.ensure_directories()
self.storage_dirs = self.config.get_storage_dirs()
# Jobs metadata directory
self.jobs_dir = self.storage_dirs["base"] / "jobs"
self.jobs_dir.mkdir(exist_ok=True)
# Master index file
self.index_file = self.jobs_dir / "index.json"
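
    # On-disk layout this service expects (inferred from the paths used below;
    # the actual directory names come from VideoDownloadConfig.get_storage_dirs()):
    #   <base>/jobs/index.json               - master index (JobHistoryIndex)
    #   <base>/jobs/<video_id>.json          - per-job metadata (JobMetadata)
    #   <base>/audio/<video_id>.mp3          - downloaded audio
    #   <base>/audio/<video_id>_metadata.json
    #   <base>/transcripts/<video_id>.txt    - plain-text transcript
    #   <base>/transcripts/<video_id>.json   - transcript segments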

    async def initialize_index(self) -> None:
        """Initialize or rebuild the job history index from existing files."""
        logger.info("Initializing job history index from existing storage")
        jobs = await self._discover_existing_jobs()

        # Create master index
        index = JobHistoryIndex(
            total_jobs=len(jobs),
            last_updated=datetime.utcnow(),
            jobs=list(jobs.keys()),
            oldest_job=min(job.processing.created_at for job in jobs.values()) if jobs else None,
            newest_job=max(job.processing.created_at for job in jobs.values()) if jobs else None,
            total_storage_mb=self._calculate_total_storage(jobs)
        )

        # Save index
        await self._save_index(index)

        # Save individual job metadata files
        for video_id, job_metadata in jobs.items():
            await self._save_job_metadata(video_id, job_metadata)

        logger.info(f"Job history index initialized with {len(jobs)} jobs")

    async def _discover_existing_jobs(self) -> Dict[str, JobMetadata]:
        """Discover existing jobs from storage directories."""
        jobs: Dict[str, JobMetadata] = {}

        # Scan audio directory for video IDs
        audio_dir = self.storage_dirs["audio"]
        if audio_dir.exists():
            for audio_file in audio_dir.glob("*.mp3"):
                video_id = audio_file.stem
                if "_metadata" in video_id:
                    continue  # Skip metadata files
                logger.debug(f"Discovered job from audio file: {video_id}")
                job_metadata = await self._create_job_metadata_from_files(video_id)
                if job_metadata:
                    jobs[video_id] = job_metadata

        return jobs

    async def _create_job_metadata_from_files(self, video_id: str) -> Optional[JobMetadata]:
        """Create job metadata from existing files for a video ID."""
        try:
            files = JobFiles()
            metadata = JobMetrics()
            processing = ProcessingDetails(
                created_at=datetime.utcnow(),
                last_processed_at=datetime.utcnow()
            )

            # Check for audio file and metadata
            audio_file = self.storage_dirs["audio"] / f"{video_id}.mp3"
            audio_metadata_file = self.storage_dirs["audio"] / f"{video_id}_metadata.json"
            if audio_file.exists():
                files.audio = str(audio_file.relative_to(self.storage_dirs["base"]))
                metadata.file_size_mb = audio_file.stat().st_size / (1024 * 1024)

                # Load audio metadata if available
                if audio_metadata_file.exists():
                    files.audio_metadata = str(audio_metadata_file.relative_to(self.storage_dirs["base"]))
                    audio_meta = json.loads(audio_metadata_file.read_text())
                    metadata.audio_duration_seconds = audio_meta.get("duration_seconds")
                    processing.created_at = datetime.fromisoformat(
                        audio_meta.get("download_date", datetime.utcnow().isoformat())
                    )

            # Check for transcript files
            transcript_file = self.storage_dirs["transcripts"] / f"{video_id}.txt"
            transcript_json_file = self.storage_dirs["transcripts"] / f"{video_id}.json"
            if transcript_file.exists():
                files.transcript = str(transcript_file.relative_to(self.storage_dirs["base"]))

                # Count words in transcript
                transcript_content = transcript_file.read_text(encoding='utf-8')
                metadata.word_count = len(transcript_content.split())
                processing.transcript["status"] = ProcessingStatus.COMPLETED
                processing.transcript["method"] = "whisper"

            if transcript_json_file.exists():
                files.transcript_json = str(transcript_json_file.relative_to(self.storage_dirs["base"]))

                # Count segments
                transcript_data = json.loads(transcript_json_file.read_text())
                metadata.segment_count = len(transcript_data) if isinstance(transcript_data, list) else 0

            # Create video info (extract from available metadata or use defaults)
            video_info = VideoInfo(
                title=self._extract_title_from_metadata(video_id, audio_metadata_file),
                url=f"https://www.youtube.com/watch?v={video_id}",
                video_id=video_id,
                duration=int(metadata.audio_duration_seconds) if metadata.audio_duration_seconds else None
            )

            # Determine overall job status
            status = JobStatus.COMPLETED if files.transcript or files.audio else JobStatus.FAILED

            return JobMetadata(
                id=video_id,
                status=status,
                video_info=video_info,
                processing=processing,
                files=files,
                metadata=metadata
            )
        except Exception as e:
            logger.error(f"Error creating job metadata for {video_id}: {e}")
            return None

    def _extract_title_from_metadata(self, video_id: str, metadata_file: Path) -> str:
        """Extract video title from metadata or generate a default."""
        try:
            if metadata_file.exists():
                metadata = json.loads(metadata_file.read_text())
                # Try to extract title from metadata (if available in future)
                return f"Video {video_id}"  # Fallback for now
            return f"Video {video_id}"
        except Exception:
            return f"Video {video_id}"

    def _calculate_total_storage(self, jobs: Dict[str, JobMetadata]) -> float:
        """Calculate total storage used by all jobs in MB."""
        total_mb = 0.0
        for job in jobs.values():
            if job.metadata.file_size_mb:
                total_mb += job.metadata.file_size_mb
        return total_mb

    async def _save_index(self, index: JobHistoryIndex) -> None:
        """Save the master index to disk."""
        index_data = index.dict()
        with open(self.index_file, 'w') as f:
            json.dump(index_data, f, indent=2, default=str)

    async def _load_index(self) -> Optional[JobHistoryIndex]:
        """Load the master index from disk."""
        try:
            if self.index_file.exists():
                with open(self.index_file, 'r') as f:
                    data = json.load(f)
                return JobHistoryIndex(**data)
        except Exception as e:
            logger.error(f"Error loading index: {e}")
        return None

    async def _save_job_metadata(self, video_id: str, job_metadata: JobMetadata) -> None:
        """Save individual job metadata to disk."""
        job_file = self.jobs_dir / f"{video_id}.json"
        job_data = job_metadata.dict()
        with open(job_file, 'w') as f:
            json.dump(job_data, f, indent=2, default=str)

    async def _load_job_metadata(self, video_id: str) -> Optional[JobMetadata]:
        """Load individual job metadata from disk."""
        try:
            job_file = self.jobs_dir / f"{video_id}.json"
            if job_file.exists():
                with open(job_file, 'r') as f:
                    data = json.load(f)
                return JobMetadata(**data)
        except Exception as e:
            logger.error(f"Error loading job metadata for {video_id}: {e}")
        return None

    async def get_job_history(self, query: JobHistoryQuery) -> JobHistoryResponse:
        """Get paginated job history with filtering and sorting."""
        # Load index
        index = await self._load_index()
        if not index:
            return JobHistoryResponse(
                jobs=[], total=0, page=query.page, page_size=query.page_size,
                total_pages=0, has_next=False, has_previous=False
            )

        # Load all job metadata
        jobs = []
        for video_id in index.jobs:
            job_metadata = await self._load_job_metadata(video_id)
            if job_metadata:
                jobs.append(job_metadata)

        # Apply filters
        filtered_jobs = self._apply_filters(jobs, query)

        # Apply sorting
        sorted_jobs = self._apply_sorting(filtered_jobs, query)

        # Apply pagination
        total = len(sorted_jobs)
        start_idx = (query.page - 1) * query.page_size
        end_idx = start_idx + query.page_size
        paginated_jobs = sorted_jobs[start_idx:end_idx]
        total_pages = (total + query.page_size - 1) // query.page_size

        return JobHistoryResponse(
            jobs=paginated_jobs,
            total=total,
            page=query.page,
            page_size=query.page_size,
            total_pages=total_pages,
            has_next=query.page < total_pages,
            has_previous=query.page > 1
        )

    def _apply_filters(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply filters to job list."""
        filtered = jobs

        # Search filter
        if query.search:
            search_lower = query.search.lower()
            filtered = [
                job for job in filtered
                if search_lower in job.video_info.title.lower()
                or search_lower in job.video_info.video_id.lower()
                or (job.video_info.channel and search_lower in job.video_info.channel.lower())
            ]

        # Status filter
        if query.status_filter:
            filtered = [job for job in filtered if job.status in query.status_filter]

        # Date filters
        if query.date_from:
            filtered = [job for job in filtered if job.processing.created_at >= query.date_from]
        if query.date_to:
            filtered = [job for job in filtered if job.processing.created_at <= query.date_to]

        # Starred filter
        if query.starred_only:
            filtered = [job for job in filtered if job.is_starred]

        # Tags filter
        if query.tags:
            filtered = [job for job in filtered if any(tag in job.tags for tag in query.tags)]

        return filtered

    def _apply_sorting(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply sorting to job list."""
        reverse = query.sort_order == "desc"
        if query.sort_by == "created_at":
            return sorted(jobs, key=lambda x: x.processing.created_at, reverse=reverse)
        elif query.sort_by == "title":
            return sorted(jobs, key=lambda x: x.video_info.title, reverse=reverse)
        elif query.sort_by == "duration":
            return sorted(jobs, key=lambda x: x.video_info.duration or 0, reverse=reverse)
        elif query.sort_by == "word_count":
            return sorted(jobs, key=lambda x: x.metadata.word_count or 0, reverse=reverse)
        elif query.sort_by == "processing_time":
            return sorted(jobs, key=lambda x: x.metadata.processing_time_seconds or 0, reverse=reverse)
        return jobs

    async def get_job_detail(self, video_id: str) -> Optional[JobDetailResponse]:
        """Get detailed information for a specific job."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Load file contents
        transcript_content = None
        transcript_segments = None
        summary_content = None
        file_exists = {}

        # Load transcript content
        if job_metadata.files.transcript:
            transcript_path = self.storage_dirs["base"] / job_metadata.files.transcript
            if transcript_path.exists():
                transcript_content = transcript_path.read_text(encoding='utf-8')
                file_exists["transcript"] = True
            else:
                file_exists["transcript"] = False

        # Load transcript segments
        if job_metadata.files.transcript_json:
            json_path = self.storage_dirs["base"] / job_metadata.files.transcript_json
            if json_path.exists():
                transcript_segments = json.loads(json_path.read_text())
                file_exists["transcript_json"] = True
            else:
                file_exists["transcript_json"] = False

        # Check other files
        for file_key, file_path in {
            "audio": job_metadata.files.audio,
            "audio_metadata": job_metadata.files.audio_metadata,
            "summary": job_metadata.files.summary
        }.items():
            if file_path:
                full_path = self.storage_dirs["base"] / file_path
                file_exists[file_key] = full_path.exists()

        # Update access tracking
        job_metadata.last_accessed = datetime.utcnow()
        job_metadata.access_count += 1
        await self._save_job_metadata(video_id, job_metadata)

        return JobDetailResponse(
            job=job_metadata,
            transcript_content=transcript_content,
            transcript_segments=transcript_segments,
            summary_content=summary_content,
            file_exists=file_exists
        )

    async def update_job(self, video_id: str, **updates) -> Optional[JobMetadata]:
        """Update job metadata."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Apply updates
        for key, value in updates.items():
            if hasattr(job_metadata, key):
                setattr(job_metadata, key, value)

        job_metadata.processing.last_processed_at = datetime.utcnow()
        await self._save_job_metadata(video_id, job_metadata)
        return job_metadata

    async def delete_job(self, video_id: str, delete_files: bool = False) -> bool:
        """Delete a job and optionally its associated files."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return False

        # Delete files if requested
        if delete_files:
            for file_path in [
                job_metadata.files.audio,
                job_metadata.files.audio_metadata,
                job_metadata.files.transcript,
                job_metadata.files.transcript_json,
                job_metadata.files.summary
            ]:
                if file_path:
                    full_path = self.storage_dirs["base"] / file_path
                    if full_path.exists():
                        full_path.unlink()
                        logger.info(f"Deleted file: {full_path}")

        # Delete job metadata file
        job_file = self.jobs_dir / f"{video_id}.json"
        if job_file.exists():
            job_file.unlink()

        # Update index
        index = await self._load_index()
        if index and video_id in index.jobs:
            index.jobs.remove(video_id)
            index.total_jobs -= 1
            index.last_updated = datetime.utcnow()
            await self._save_index(index)

        logger.info(f"Deleted job: {video_id}")
        return True
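
    # Minimal usage sketch (illustrative only, not part of the service; assumes
    # JobHistoryQuery accepts the page/page_size keyword arguments referenced above):
    #
    #   async def main():
    #       service = JobHistoryService()
    #       await service.initialize_index()
    #       response = await service.get_job_history(JobHistoryQuery(page=1, page_size=20))
    #       for job in response.jobs:
    #           print(job.id, job.status, job.video_info.title)
    #
    #   asyncio.run(main())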