""" Job History Service for managing persistent storage-based job tracking. Leverages existing video_storage directory structure. """ import os import json import asyncio from pathlib import Path from typing import List, Optional, Dict, Any from datetime import datetime import logging from urllib.parse import urlparse from backend.models.job_history import ( JobMetadata, JobHistoryIndex, JobStatus, ProcessingStatus, VideoInfo, ProcessingDetails, JobFiles, JobMetrics, JobHistoryQuery, JobHistoryResponse, JobDetailResponse ) from backend.config.video_download_config import VideoDownloadConfig logger = logging.getLogger(__name__) class JobHistoryService: """Service for managing job history based on persistent storage.""" def __init__(self, config: Optional[VideoDownloadConfig] = None): self.config = config or VideoDownloadConfig() self.config.ensure_directories() self.storage_dirs = self.config.get_storage_dirs() # Jobs metadata directory self.jobs_dir = self.storage_dirs["base"] / "jobs" self.jobs_dir.mkdir(exist_ok=True) # Master index file self.index_file = self.jobs_dir / "index.json" async def initialize_index(self) -> None: """Initialize or rebuild the job history index from existing files.""" logger.info("Initializing job history index from existing storage") jobs = await self._discover_existing_jobs() # Create master index index = JobHistoryIndex( total_jobs=len(jobs), last_updated=datetime.utcnow(), jobs=list(jobs.keys()), oldest_job=min(job.processing.created_at for job in jobs.values()) if jobs else None, newest_job=max(job.processing.created_at for job in jobs.values()) if jobs else None, total_storage_mb=self._calculate_total_storage(jobs) ) # Save index await self._save_index(index) # Save individual job metadata files for video_id, job_metadata in jobs.items(): await self._save_job_metadata(video_id, job_metadata) logger.info(f"Job history index initialized with {len(jobs)} jobs") async def _discover_existing_jobs(self) -> Dict[str, JobMetadata]: """Discover existing jobs from storage directories.""" jobs: Dict[str, JobMetadata] = {} # Scan audio directory for video IDs audio_dir = self.storage_dirs["audio"] if audio_dir.exists(): for audio_file in audio_dir.glob("*.mp3"): video_id = audio_file.stem if "_metadata" in video_id: continue # Skip metadata files logger.debug(f"Discovered job from audio file: {video_id}") job_metadata = await self._create_job_metadata_from_files(video_id) if job_metadata: jobs[video_id] = job_metadata return jobs async def _create_job_metadata_from_files(self, video_id: str) -> Optional[JobMetadata]: """Create job metadata from existing files for a video ID.""" try: files = JobFiles() metadata = JobMetrics() processing = ProcessingDetails( created_at=datetime.utcnow(), last_processed_at=datetime.utcnow() ) # Check for audio file and metadata audio_file = self.storage_dirs["audio"] / f"{video_id}.mp3" audio_metadata_file = self.storage_dirs["audio"] / f"{video_id}_metadata.json" if audio_file.exists(): files.audio = str(audio_file.relative_to(self.storage_dirs["base"])) metadata.file_size_mb = audio_file.stat().st_size / (1024 * 1024) # Load audio metadata if available if audio_metadata_file.exists(): files.audio_metadata = str(audio_metadata_file.relative_to(self.storage_dirs["base"])) audio_meta = json.loads(audio_metadata_file.read_text()) metadata.audio_duration_seconds = audio_meta.get("duration_seconds") processing.created_at = datetime.fromisoformat(audio_meta.get("download_date", datetime.utcnow().isoformat())) # Check for transcript files 
            transcript_file = self.storage_dirs["transcripts"] / f"{video_id}.txt"
            transcript_json_file = self.storage_dirs["transcripts"] / f"{video_id}.json"

            if transcript_file.exists():
                files.transcript = str(transcript_file.relative_to(self.storage_dirs["base"]))

                # Count words in transcript
                transcript_content = transcript_file.read_text(encoding='utf-8')
                metadata.word_count = len(transcript_content.split())

                processing.transcript["status"] = ProcessingStatus.COMPLETED
                processing.transcript["method"] = "whisper"

            if transcript_json_file.exists():
                files.transcript_json = str(transcript_json_file.relative_to(self.storage_dirs["base"]))

                # Count segments
                transcript_data = json.loads(transcript_json_file.read_text())
                metadata.segment_count = len(transcript_data) if isinstance(transcript_data, list) else 0

            # Create video info (extract from available metadata or use defaults)
            video_info = VideoInfo(
                title=self._extract_title_from_metadata(video_id, audio_metadata_file),
                url=f"https://www.youtube.com/watch?v={video_id}",
                video_id=video_id,
                duration=int(metadata.audio_duration_seconds) if metadata.audio_duration_seconds else None
            )

            # Determine overall job status
            status = JobStatus.COMPLETED if files.transcript or files.audio else JobStatus.FAILED

            return JobMetadata(
                id=video_id,
                status=status,
                video_info=video_info,
                processing=processing,
                files=files,
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"Error creating job metadata for {video_id}: {e}")
            return None

    def _extract_title_from_metadata(self, video_id: str, metadata_file: Path) -> str:
        """Extract video title from metadata or generate a default."""
        try:
            if metadata_file.exists():
                metadata = json.loads(metadata_file.read_text())
                # Try to extract title from metadata (if available in future)
                return f"Video {video_id}"
            # Fallback for now
            return f"Video {video_id}"
        except Exception:
            return f"Video {video_id}"

    def _calculate_total_storage(self, jobs: Dict[str, JobMetadata]) -> float:
        """Calculate total storage used by all jobs in MB."""
        total_mb = 0.0
        for job in jobs.values():
            if job.metadata.file_size_mb:
                total_mb += job.metadata.file_size_mb
        return total_mb

    async def _save_index(self, index: JobHistoryIndex) -> None:
        """Save the master index to disk."""
        index_data = index.dict()
        with open(self.index_file, 'w') as f:
            json.dump(index_data, f, indent=2, default=str)

    async def _load_index(self) -> Optional[JobHistoryIndex]:
        """Load the master index from disk."""
        try:
            if self.index_file.exists():
                with open(self.index_file, 'r') as f:
                    data = json.load(f)
                return JobHistoryIndex(**data)
        except Exception as e:
            logger.error(f"Error loading index: {e}")
        return None

    async def _save_job_metadata(self, video_id: str, job_metadata: JobMetadata) -> None:
        """Save individual job metadata to disk."""
        job_file = self.jobs_dir / f"{video_id}.json"
        job_data = job_metadata.dict()
        with open(job_file, 'w') as f:
            json.dump(job_data, f, indent=2, default=str)

    async def _load_job_metadata(self, video_id: str) -> Optional[JobMetadata]:
        """Load individual job metadata from disk."""
        try:
            job_file = self.jobs_dir / f"{video_id}.json"
            if job_file.exists():
                with open(job_file, 'r') as f:
                    data = json.load(f)
                return JobMetadata(**data)
        except Exception as e:
            logger.error(f"Error loading job metadata for {video_id}: {e}")
        return None

    async def get_job_history(self, query: JobHistoryQuery) -> JobHistoryResponse:
        """Get paginated job history with filtering and sorting."""
        # Load index
        index = await self._load_index()
        if not index:
            return JobHistoryResponse(
                jobs=[],
                total=0,
                page=query.page,
                page_size=query.page_size,
                total_pages=0,
                has_next=False,
                has_previous=False
            )

        # Load all job metadata
        jobs = []
        for video_id in index.jobs:
            job_metadata = await self._load_job_metadata(video_id)
            if job_metadata:
                jobs.append(job_metadata)

        # Apply filters
        filtered_jobs = self._apply_filters(jobs, query)

        # Apply sorting
        sorted_jobs = self._apply_sorting(filtered_jobs, query)

        # Apply pagination
        total = len(sorted_jobs)
        start_idx = (query.page - 1) * query.page_size
        end_idx = start_idx + query.page_size
        paginated_jobs = sorted_jobs[start_idx:end_idx]
        total_pages = (total + query.page_size - 1) // query.page_size

        return JobHistoryResponse(
            jobs=paginated_jobs,
            total=total,
            page=query.page,
            page_size=query.page_size,
            total_pages=total_pages,
            has_next=query.page < total_pages,
            has_previous=query.page > 1
        )

    def _apply_filters(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply filters to job list."""
        filtered = jobs

        # Search filter
        if query.search:
            search_lower = query.search.lower()
            filtered = [
                job for job in filtered
                if search_lower in job.video_info.title.lower()
                or search_lower in job.video_info.video_id.lower()
                or (job.video_info.channel and search_lower in job.video_info.channel.lower())
            ]

        # Status filter
        if query.status_filter:
            filtered = [job for job in filtered if job.status in query.status_filter]

        # Date filters
        if query.date_from:
            filtered = [job for job in filtered if job.processing.created_at >= query.date_from]
        if query.date_to:
            filtered = [job for job in filtered if job.processing.created_at <= query.date_to]

        # Starred filter
        if query.starred_only:
            filtered = [job for job in filtered if job.is_starred]

        # Tags filter
        if query.tags:
            filtered = [job for job in filtered if any(tag in job.tags for tag in query.tags)]

        return filtered

    def _apply_sorting(self, jobs: List[JobMetadata], query: JobHistoryQuery) -> List[JobMetadata]:
        """Apply sorting to job list."""
        reverse = query.sort_order == "desc"

        if query.sort_by == "created_at":
            return sorted(jobs, key=lambda x: x.processing.created_at, reverse=reverse)
        elif query.sort_by == "title":
            return sorted(jobs, key=lambda x: x.video_info.title, reverse=reverse)
        elif query.sort_by == "duration":
            return sorted(jobs, key=lambda x: x.video_info.duration or 0, reverse=reverse)
        elif query.sort_by == "word_count":
            return sorted(jobs, key=lambda x: x.metadata.word_count or 0, reverse=reverse)
        elif query.sort_by == "processing_time":
            return sorted(jobs, key=lambda x: x.metadata.processing_time_seconds or 0, reverse=reverse)

        return jobs

    async def get_job_detail(self, video_id: str) -> Optional[JobDetailResponse]:
        """Get detailed information for a specific job."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Load file contents
        transcript_content = None
        transcript_segments = None
        summary_content = None
        file_exists = {}

        # Load transcript content
        if job_metadata.files.transcript:
            transcript_path = self.storage_dirs["base"] / job_metadata.files.transcript
            if transcript_path.exists():
                transcript_content = transcript_path.read_text(encoding='utf-8')
                file_exists["transcript"] = True
            else:
                file_exists["transcript"] = False

        # Load transcript segments
        if job_metadata.files.transcript_json:
            json_path = self.storage_dirs["base"] / job_metadata.files.transcript_json
            if json_path.exists():
                transcript_segments = json.loads(json_path.read_text())
                file_exists["transcript_json"] = True
            else:
                file_exists["transcript_json"] = False

        # Check other files
        for file_key, file_path in {
            "audio": job_metadata.files.audio,
            "audio_metadata": job_metadata.files.audio_metadata,
            "summary": job_metadata.files.summary
        }.items():
            if file_path:
                full_path = self.storage_dirs["base"] / file_path
                file_exists[file_key] = full_path.exists()

        # Update access tracking
        job_metadata.last_accessed = datetime.utcnow()
        job_metadata.access_count += 1
        await self._save_job_metadata(video_id, job_metadata)

        return JobDetailResponse(
            job=job_metadata,
            transcript_content=transcript_content,
            transcript_segments=transcript_segments,
            summary_content=summary_content,
            file_exists=file_exists
        )

    async def update_job(self, video_id: str, **updates) -> Optional[JobMetadata]:
        """Update job metadata."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return None

        # Apply updates
        for key, value in updates.items():
            if hasattr(job_metadata, key):
                setattr(job_metadata, key, value)

        job_metadata.processing.last_processed_at = datetime.utcnow()
        await self._save_job_metadata(video_id, job_metadata)

        return job_metadata

    async def delete_job(self, video_id: str, delete_files: bool = False) -> bool:
        """Delete a job and optionally its associated files."""
        job_metadata = await self._load_job_metadata(video_id)
        if not job_metadata:
            return False

        # Delete files if requested
        if delete_files:
            for file_path in [
                job_metadata.files.audio,
                job_metadata.files.audio_metadata,
                job_metadata.files.transcript,
                job_metadata.files.transcript_json,
                job_metadata.files.summary
            ]:
                if file_path:
                    full_path = self.storage_dirs["base"] / file_path
                    if full_path.exists():
                        full_path.unlink()
                        logger.info(f"Deleted file: {full_path}")

        # Delete job metadata file
        job_file = self.jobs_dir / f"{video_id}.json"
        if job_file.exists():
            job_file.unlink()

        # Update index
        index = await self._load_index()
        if index and video_id in index.jobs:
            index.jobs.remove(video_id)
            index.total_jobs -= 1
            index.last_updated = datetime.utcnow()
            await self._save_index(index)

        logger.info(f"Deleted job: {video_id}")
        return True
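

# --- Usage sketch (illustrative only, not part of the service) ---
# A minimal example of how this service might be driven, assuming the
# JobHistoryQuery model accepts `page` and `page_size` keyword arguments
# with defaults for its other fields; the `main` coroutine below is
# hypothetical and not referenced anywhere else in the codebase.
#
# async def main():
#     service = JobHistoryService()
#     await service.initialize_index()
#     history = await service.get_job_history(JobHistoryQuery(page=1, page_size=10))
#     for job in history.jobs:
#         print(job.video_info.title, job.status)
#
# asyncio.run(main())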