"""
|
|
Job History Service for managing persistent storage-based job tracking.
|
|
Leverages existing video_storage directory structure.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any
|
|
from datetime import datetime
|
|
import logging
|
|
from urllib.parse import urlparse
|
|
|
|
from backend.models.job_history import (
|
|
JobMetadata, JobHistoryIndex, JobStatus, ProcessingStatus,
|
|
VideoInfo, ProcessingDetails, JobFiles, JobMetrics,
|
|
JobHistoryQuery, JobHistoryResponse, JobDetailResponse
|
|
)
|
|
from backend.config.video_download_config import VideoDownloadConfig
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|

class JobHistoryService:
    """Service for managing job history based on persistent storage."""

    def __init__(self, config: Optional[VideoDownloadConfig] = None):
        self.config = config or VideoDownloadConfig()
        self.config.ensure_directories()
        self.storage_dirs = self.config.get_storage_dirs()

        # Jobs metadata directory
        self.jobs_dir = self.storage_dirs["base"] / "jobs"
        self.jobs_dir.mkdir(exist_ok=True)

        # Master index file
        self.index_file = self.jobs_dir / "index.json"

    async def initialize_index(self) -> None:
        """Initialize or rebuild the job history index from existing files."""
        logger.info("Initializing job history index from existing storage")

        jobs = await self._discover_existing_jobs()

        # Create master index
        index = JobHistoryIndex(
            total_jobs=len(jobs),
            last_updated=datetime.utcnow(),
            jobs=list(jobs.keys()),
            oldest_job=min(job.processing.created_at for job in jobs.values()) if jobs else None,
            newest_job=max(job.processing.created_at for job in jobs.values()) if jobs else None,
            total_storage_mb=self._calculate_total_storage(jobs)
        )

        # Save index
        await self._save_index(index)

        # Save individual job metadata files
        for video_id, job_metadata in jobs.items():
            await self._save_job_metadata(video_id, job_metadata)

        logger.info(f"Job history index initialized with {len(jobs)} jobs")

    async def _discover_existing_jobs(self) -> Dict[str, JobMetadata]:
        """Discover existing jobs from storage directories."""
        jobs: Dict[str, JobMetadata] = {}

        # Scan audio directory for video IDs
        audio_dir = self.storage_dirs["audio"]
        if audio_dir.exists():
            for audio_file in audio_dir.glob("*.mp3"):
                video_id = audio_file.stem
                if "_metadata" in video_id:
                    continue  # Skip metadata files

                logger.debug(f"Discovered job from audio file: {video_id}")
                job_metadata = await self._create_job_metadata_from_files(video_id)
                if job_metadata:
                    jobs[video_id] = job_metadata

        return jobs

    async def _create_job_metadata_from_files(self, video_id: str) -> Optional[JobMetadata]:
        """Create job metadata from existing files for a video ID."""
        try:
            files = JobFiles()
            metadata = JobMetrics()
            processing = ProcessingDetails(
                created_at=datetime.utcnow(),
                last_processed_at=datetime.utcnow()
            )

            # Check for audio file and metadata
            audio_file = self.storage_dirs["audio"] / f"{video_id}.mp3"
            audio_metadata_file = self.storage_dirs["audio"] / f"{video_id}_metadata.json"

            if audio_file.exists():
                files.audio = str(audio_file.relative_to(self.storage_dirs["base"]))
                metadata.file_size_mb = audio_file.stat().st_size / (1024 * 1024)

            # Load audio metadata if available
            if audio_metadata_file.exists():
                files.audio_metadata = str(audio_metadata_file.relative_to(self.storage_dirs["base"]))
                audio_meta = json.loads(audio_metadata_file.read_text())
                metadata.audio_duration_seconds = audio_meta.get("duration_seconds")
                processing.created_at = datetime.fromisoformat(audio_meta.get("download_date", datetime.utcnow().isoformat()))

            # Check for transcript files
            transcript_file = self.storage_dirs["transcripts"] / f"{video_id}.txt"
            transcript_json_file = self.storage_dirs["transcripts"] / f"{video_id}.json"

            if transcript_file.exists():
                files.transcript = str(transcript_file.relative_to(self.storage_dirs["base"]))
                # Count words in transcript
                transcript_content = transcript_file.read_text(encoding='utf-8')
                metadata.word_count = len(transcript_content.split())

                processing.transcript["status"] = ProcessingStatus.COMPLETED
                processing.transcript["method"] = "whisper"

            if transcript_json_file.exists():
                files.transcript_json = str(transcript_json_file.relative_to(self.storage_dirs["base"]))
                # Count segments
                transcript_data = json.loads(transcript_json_file.read_text())
                metadata.segment_count = len(transcript_data) if isinstance(transcript_data, list) else 0

            # Create video info (extract from available metadata or use defaults)
            video_info = VideoInfo(
                title=self._extract_title_from_metadata(video_id, audio_metadata_file),
                url=f"https://www.youtube.com/watch?v={video_id}",
                video_id=video_id,
                duration=int(metadata.audio_duration_seconds) if metadata.audio_duration_seconds else None
            )

            # Determine overall job status
            status = JobStatus.COMPLETED if files.transcript or files.audio else JobStatus.FAILED

            return JobMetadata(
                id=video_id,
                status=status,
                video_info=video_info,
                processing=processing,
                files=files,
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"Error creating job metadata for {video_id}: {e}")
            return None

    def _extract_title_from_metadata(self, video_id: str, metadata_file: Path) -> str:
        """Extract video title from metadata or generate a default."""
        try:
            if metadata_file.exists():
                metadata = json.loads(metadata_file.read_text())
                # Use the title if the metadata provides one; otherwise fall back
                return metadata.get("title") or f"Video {video_id}"
            return f"Video {video_id}"
        except Exception:
            return f"Video {video_id}"

    def _calculate_total_storage(self, jobs: Dict[str, JobMetadata]) -> float:
        """Calculate total storage used by all jobs in MB."""
        total_mb = 0.0
        for job in jobs.values():
            if job.metadata.file_size_mb:
                total_mb += job.metadata.file_size_mb
        return total_mb

    async def _save_index(self, index: JobHistoryIndex) -> None:
        """Save the master index to disk."""
        index_data = index.dict()
        with open(self.index_file, 'w') as f:
            json.dump(index_data, f, indent=2, default=str)

    async def _load_index(self) -> Optional[JobHistoryIndex]:
        """Load the master index from disk."""
        try:
            if self.index_file.exists():
                with open(self.index_file, 'r') as f:
                    data = json.load(f)
                return JobHistoryIndex(**data)
        except Exception as e:
            logger.error(f"Error loading index: {e}")
        return None

    async def _save_job_metadata(self, video_id: str, job_metadata: JobMetadata) -> None:
        """Save individual job metadata to disk."""
        job_file = self.jobs_dir / f"{video_id}.json"
        job_data = job_metadata.dict()
        with open(job_file, 'w') as f:
            json.dump(job_data, f, indent=2, default=str)

    async def _load_job_metadata(self, video_id: str) -> Optional[JobMetadata]:
        """Load individual job metadata from disk."""
        try:
            job_file = self.jobs_dir / f"{video_id}.json"
            if job_file.exists():
                with open(job_file, 'r') as f:
                    data = json.load(f)
                return JobMetadata(**data)
        except Exception as e:
            logger.error(f"Error loading job metadata for {video_id}: {e}")
        return None

    async def get_job_history(self, query: JobHistoryQuery) -> JobHistoryResponse:
        """Get paginated job history with filtering and sorting."""
        # Load index
        index = await self._load_index()
        if not index:
            return JobHistoryResponse(
                jobs=[], total=0, page=query.page, page_size=query.page_size,
                total_pages=0, has_next=False, has_previous=False
            )

        # Load all job metadata
        jobs = []
        for video_id in index.jobs:
            job_metadata = await self._load_job_metadata(video_id)
            if job_metadata:
                jobs.append(job_metadata)

        # Apply filters
        filtered_jobs = self._apply_filters(jobs, query)

        # Apply sorting
        sorted_jobs = self._apply_sorting(filtered_jobs, query)

        # Apply pagination
        total = len(sorted_jobs)
        start_idx = (query.page - 1) * query.page_size
        end_idx = start_idx + query.page_size
        paginated_jobs = sorted_jobs[start_idx:end_idx]

        total_pages = (total + query.page_size - 1) // query.page_size

        return JobHistoryResponse(
            jobs=paginated_jobs,
            total=total,
            page=query.page,
            page_size=query.page_size,
            total_pages=total_pages,
            has_next=query.page < total_pages,
            has_previous=query.page > 1
        )

    def _apply_filters(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply filters to job list."""
        filtered = jobs

        # Search filter
        if query.search:
            search_lower = query.search.lower()
            filtered = [
                job for job in filtered
                if search_lower in job.video_info.title.lower()
                or search_lower in job.video_info.video_id.lower()
                or (job.video_info.channel and search_lower in job.video_info.channel.lower())
            ]

        # Status filter
        if query.status_filter:
            filtered = [job for job in filtered if job.status in query.status_filter]

        # Date filters
        if query.date_from:
            filtered = [job for job in filtered if job.processing.created_at >= query.date_from]

        if query.date_to:
            filtered = [job for job in filtered if job.processing.created_at <= query.date_to]

        # Starred filter
        if query.starred_only:
            filtered = [job for job in filtered if job.is_starred]

        # Tags filter
        if query.tags:
            filtered = [job for job in filtered if any(tag in job.tags for tag in query.tags)]

        return filtered

    def _apply_sorting(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply sorting to job list."""
        reverse = query.sort_order == "desc"

        if query.sort_by == "created_at":
            return sorted(jobs, key=lambda x: x.processing.created_at, reverse=reverse)
        elif query.sort_by == "title":
            return sorted(jobs, key=lambda x: x.video_info.title, reverse=reverse)
        elif query.sort_by == "duration":
            return sorted(jobs, key=lambda x: x.video_info.duration or 0, reverse=reverse)
        elif query.sort_by == "word_count":
            return sorted(jobs, key=lambda x: x.metadata.word_count or 0, reverse=reverse)
        elif query.sort_by == "processing_time":
            return sorted(jobs, key=lambda x: x.metadata.processing_time_seconds or 0, reverse=reverse)

        return jobs

    async def get_job_detail(self, video_id: str) -> Optional[JobDetailResponse]:
        """Get detailed information for a specific job."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Load file contents
        transcript_content = None
        transcript_segments = None
        summary_content = None
        file_exists = {}

        # Load transcript content
        if job_metadata.files.transcript:
            transcript_path = self.storage_dirs["base"] / job_metadata.files.transcript
            if transcript_path.exists():
                transcript_content = transcript_path.read_text(encoding='utf-8')
                file_exists["transcript"] = True
            else:
                file_exists["transcript"] = False

        # Load transcript segments
        if job_metadata.files.transcript_json:
            json_path = self.storage_dirs["base"] / job_metadata.files.transcript_json
            if json_path.exists():
                transcript_segments = json.loads(json_path.read_text())
                file_exists["transcript_json"] = True
            else:
                file_exists["transcript_json"] = False

        # Check other files
        for file_key, file_path in {
            "audio": job_metadata.files.audio,
            "audio_metadata": job_metadata.files.audio_metadata,
            "summary": job_metadata.files.summary
        }.items():
            if file_path:
                full_path = self.storage_dirs["base"] / file_path
                file_exists[file_key] = full_path.exists()

        # Update access tracking
        job_metadata.last_accessed = datetime.utcnow()
        job_metadata.access_count += 1
        await self._save_job_metadata(video_id, job_metadata)

        return JobDetailResponse(
            job=job_metadata,
            transcript_content=transcript_content,
            transcript_segments=transcript_segments,
            summary_content=summary_content,
            file_exists=file_exists
        )

    async def update_job(self, video_id: str, **updates) -> Optional[JobMetadata]:
        """Update job metadata."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Apply updates
        for key, value in updates.items():
            if hasattr(job_metadata, key):
                setattr(job_metadata, key, value)

        job_metadata.processing.last_processed_at = datetime.utcnow()
        await self._save_job_metadata(video_id, job_metadata)

        return job_metadata

    async def delete_job(self, video_id: str, delete_files: bool = False) -> bool:
        """Delete a job and optionally its associated files."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return False

        # Delete files if requested
        if delete_files:
            for file_path in [
                job_metadata.files.audio,
                job_metadata.files.audio_metadata,
                job_metadata.files.transcript,
                job_metadata.files.transcript_json,
                job_metadata.files.summary
            ]:
                if file_path:
                    full_path = self.storage_dirs["base"] / file_path
                    if full_path.exists():
                        full_path.unlink()
                        logger.info(f"Deleted file: {full_path}")

        # Delete job metadata file
        job_file = self.jobs_dir / f"{video_id}.json"
        if job_file.exists():
            job_file.unlink()

        # Update index
        index = await self._load_index()
        if index and video_id in index.jobs:
            index.jobs.remove(video_id)
            index.total_jobs -= 1
            index.last_updated = datetime.utcnow()
            await self._save_index(index)

        logger.info(f"Deleted job: {video_id}")
        return True
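

# Minimal usage sketch, not part of the service API: it assumes JobHistoryQuery is a
# pydantic-style model whose page/page_size/sort_by/sort_order fields accept keyword
# arguments (with its other filters defaulting to "no filtering"), and that
# VideoDownloadConfig's defaults point at the existing video_storage directory.
if __name__ == "__main__":
    async def _demo() -> None:
        service = JobHistoryService()
        # Scan existing audio/transcript files and build jobs/index.json
        await service.initialize_index()

        # Page through the newest jobs first
        query = JobHistoryQuery(page=1, page_size=10, sort_by="created_at", sort_order="desc")
        history = await service.get_job_history(query)
        for job in history.jobs:
            print(job.video_info.video_id, job.status, job.video_info.title)

    asyncio.run(_demo())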