youtube-summarizer/backend/services/batch_processing_service.py

"""
Batch processing service for handling multiple video summarizations
"""
import asyncio
import re
import json
import zipfile
import tempfile
import os
from typing import List, Dict, Optional, Any
from datetime import datetime, timedelta
import uuid
from sqlalchemy.orm import Session
import logging

from backend.models.batch_job import BatchJob, BatchJobItem
from backend.models.summary import Summary
from backend.services.summary_pipeline import SummaryPipeline
from backend.services.notification_service import NotificationService
from backend.core.websocket_manager import websocket_manager
from backend.models.pipeline import PipelineConfig
logger = logging.getLogger(__name__)


class BatchProcessingService:
    """Service for processing multiple YouTube videos in batch"""

    def __init__(
        self,
        db_session: Session,
        summary_pipeline: Optional[SummaryPipeline] = None,
        notification_service: Optional[NotificationService] = None
    ):
        self.db = db_session
        self.pipeline = summary_pipeline
        self.notifications = notification_service
        self.active_jobs: Dict[str, asyncio.Task] = {}

    def _validate_youtube_url(self, url: str) -> bool:
        """Validate if URL is a valid YouTube URL"""
        youtube_regex = r'(https?://)?(www\.)?(youtube\.com/(watch\?v=|embed/|v/)|youtu\.be/|m\.youtube\.com/watch\?v=)[\w\-]+'
        return bool(re.match(youtube_regex, url))

    def _extract_video_id(self, url: str) -> Optional[str]:
        """Extract video ID from YouTube URL"""
        patterns = [
            r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
            r'(?:embed\/)([0-9A-Za-z_-]{11})',
            r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None
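
    # Illustrative examples (not part of the original module) of how the two URL
    # helpers above behave, assuming standard 11-character YouTube video IDs:
    #   "https://www.youtube.com/watch?v=dQw4w9WgXcQ" -> valid, extracted ID "dQw4w9WgXcQ"
    #   "https://youtu.be/dQw4w9WgXcQ"                -> valid, extracted ID "dQw4w9WgXcQ"
    #   "https://vimeo.com/12345"                     -> rejected by _validate_youtube_url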

    async def create_batch_job(
        self,
        user_id: str,
        urls: List[str],
        name: Optional[str] = None,
        model: str = "deepseek",
        summary_length: str = "standard",
        options: Optional[Dict] = None
    ) -> BatchJob:
        """Create a new batch processing job"""
        # Validate and deduplicate URLs
        validated_urls = []
        seen_ids = set()
        for url in urls:
            if self._validate_youtube_url(url):
                video_id = self._extract_video_id(url)
                if video_id and video_id not in seen_ids:
                    validated_urls.append(url)
                    seen_ids.add(video_id)

        if not validated_urls:
            raise ValueError("No valid YouTube URLs provided")

        # Create batch job
        batch_job = BatchJob(
            user_id=user_id,
            name=name or f"Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}",
            urls=validated_urls,
            total_videos=len(validated_urls),
            model=model,
            summary_length=summary_length,
            options=options or {},
            status="pending"
        )
        self.db.add(batch_job)
        self.db.flush()  # Get the ID

        # Create job items
        for idx, url in enumerate(validated_urls):
            item = BatchJobItem(
                batch_job_id=batch_job.id,
                url=url,
                position=idx,
                video_id=self._extract_video_id(url)
            )
            self.db.add(item)
        self.db.commit()

        # Start processing in background
        task = asyncio.create_task(self._process_batch(batch_job.id))
        self.active_jobs[batch_job.id] = task

        logger.info(f"Created batch job {batch_job.id} with {len(validated_urls)} videos")
        return batch_job
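
    # A minimal usage sketch (illustrative only; the calling route, session factory
    # and pipeline wiring are assumptions, not part of this module):
    #
    #     service = BatchProcessingService(db_session, summary_pipeline=pipeline)
    #     job = await service.create_batch_job(
    #         user_id="user-123",
    #         urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
    #         model="deepseek",
    #     )
    #     # job.id can then be passed to get_batch_status() or cancel_batch_job()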

    async def _process_batch(self, batch_job_id: str):
        """Process all videos in a batch sequentially"""
        batch_job = None
        try:
            # Get batch job
            batch_job = self.db.query(BatchJob).filter_by(id=batch_job_id).first()
            if not batch_job:
                logger.error(f"Batch job {batch_job_id} not found")
                return

            # Update status to processing
            batch_job.status = "processing"
            batch_job.started_at = datetime.utcnow()
            self.db.commit()

            # Send initial progress update
            await self._send_progress_update(batch_job)

            # Get all items to process
            items = self.db.query(BatchJobItem).filter_by(
                batch_job_id=batch_job_id
            ).order_by(BatchJobItem.position).all()

            # Process each item
            for item in items:
                if batch_job.status == "cancelled":
                    logger.info(f"Batch job {batch_job_id} cancelled")
                    break

                # Process the item; re-run it while the failure handler has marked it
                # "pending" again, so the configured retries actually happen
                await self._process_single_item(item, batch_job)
                while item.status == "pending" and batch_job.status != "cancelled":
                    await self._process_single_item(item, batch_job)

                # Update progress
                await self._send_progress_update(batch_job)

                # Small delay between videos to avoid rate limiting
                await asyncio.sleep(2)

            # Finalize batch
            if batch_job.status != "cancelled":
                batch_job.status = "completed"
                batch_job.completed_at = datetime.utcnow()

                # Calculate total processing time
                if batch_job.started_at:
                    batch_job.total_processing_time = (
                        batch_job.completed_at - batch_job.started_at
                    ).total_seconds()

                # Generate export file
                try:
                    export_url = await self._generate_export(batch_job_id)
                    batch_job.export_url = export_url
                except Exception as e:
                    logger.error(f"Failed to generate export for batch {batch_job_id}: {e}")

                self.db.commit()

                # Send completion notification
                await self._send_completion_notification(batch_job)

            # Final progress update
            await self._send_progress_update(batch_job)

        except Exception as e:
            logger.error(f"Error processing batch {batch_job_id}: {e}")
            if batch_job is not None:
                batch_job.status = "failed"
                self.db.commit()
        finally:
            # Clean up active job
            if batch_job_id in self.active_jobs:
                del self.active_jobs[batch_job_id]

    async def _process_single_item(self, item: BatchJobItem, batch_job: BatchJob):
        """Process a single video item in the batch"""
        try:
            # Update item status
            item.status = "processing"
            item.started_at = datetime.utcnow()
            self.db.commit()

            # Create pipeline config
            config = PipelineConfig(
                model=batch_job.model,
                summary_length=batch_job.summary_length,
                **batch_job.options
            )

            # Process video using the pipeline
            if self.pipeline:
                # Start pipeline processing
                pipeline_job_id = await self.pipeline.process_video(
                    video_url=item.url,
                    config=config
                )

                # Wait for completion (with timeout)
                result = await self._wait_for_pipeline_completion(
                    pipeline_job_id,
                    timeout=600  # 10 minutes max per video
                )

                if result and result.status == "completed":
                    # Create summary record
                    summary = Summary(
                        user_id=batch_job.user_id,
                        video_url=item.url,
                        video_id=item.video_id,
                        video_title=result.video_metadata.get("title") if result.video_metadata else None,
                        channel_name=result.video_metadata.get("channel") if result.video_metadata else None,
                        duration_seconds=result.video_metadata.get("duration") if result.video_metadata else None,
                        summary_text=result.summary,
                        key_points=result.key_points,
                        model_used=batch_job.model,
                        confidence_score=result.confidence_score,
                        quality_score=result.quality_score,
                        processing_time=result.processing_time,
                        cost_data=result.cost_data
                    )
                    self.db.add(summary)
                    self.db.flush()

                    # Update item with success
                    item.status = "completed"
                    item.summary_id = summary.id
                    item.video_title = summary.video_title
                    item.channel_name = summary.channel_name
                    item.duration_seconds = summary.duration_seconds
                    item.cost_usd = result.cost_data.get("total_cost_usd", 0) if result.cost_data else 0

                    # Update batch counters
                    batch_job.completed_videos += 1
                    batch_job.total_cost_usd += item.cost_usd
                else:
                    # Processing failed
                    error_msg = result.error if result else "Pipeline timeout"
                    await self._handle_item_failure(item, batch_job, error_msg, "processing_error")
            else:
                # No pipeline available (shouldn't happen in production)
                await self._handle_item_failure(item, batch_job, "Pipeline not available", "system_error")

        except Exception as e:
            logger.error(f"Error processing item {item.id}: {e}")
            await self._handle_item_failure(item, batch_job, str(e), "exception")
        finally:
            # Update item completion time
            item.completed_at = datetime.utcnow()
            if item.started_at:
                item.processing_time_seconds = (
                    item.completed_at - item.started_at
                ).total_seconds()
            self.db.commit()

    async def _handle_item_failure(
        self,
        item: BatchJobItem,
        batch_job: BatchJob,
        error_message: str,
        error_type: str
    ):
        """Handle a failed item with retry logic"""
        item.retry_count += 1

        if item.retry_count < item.max_retries:
            # Will retry later
            item.status = "pending"
            logger.info(f"Item {item.id} failed, will retry ({item.retry_count}/{item.max_retries})")
        else:
            # Max retries reached
            item.status = "failed"
            item.error_message = error_message
            item.error_type = error_type
            batch_job.failed_videos += 1
            logger.error(f"Item {item.id} failed after {item.retry_count} retries: {error_message}")

    async def _wait_for_pipeline_completion(
        self,
        pipeline_job_id: str,
        timeout: int = 600
    ) -> Optional[Any]:
        """Wait for pipeline job to complete with timeout"""
        start_time = datetime.utcnow()

        while (datetime.utcnow() - start_time).total_seconds() < timeout:
            if self.pipeline:
                result = await self.pipeline.get_pipeline_result(pipeline_job_id)
                if result and result.status in ["completed", "failed"]:
                    return result
            await asyncio.sleep(2)

        logger.warning(f"Pipeline job {pipeline_job_id} timed out after {timeout} seconds")
        return None
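
    # For reference: with the defaults above, the pipeline is polled every 2 seconds
    # for up to 600 seconds, i.e. at most ~300 status checks per video.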

    async def _send_progress_update(self, batch_job: BatchJob):
        """Send progress update via WebSocket"""
        # Get current processing item
        current_item = self.db.query(BatchJobItem).filter_by(
            batch_job_id=batch_job.id,
            status="processing"
        ).first()

        progress_data = {
            "batch_job_id": batch_job.id,
            "status": batch_job.status,
            "name": batch_job.name,
            "progress": {
                "total": batch_job.total_videos,
                "completed": batch_job.completed_videos,
                "failed": batch_job.failed_videos,
                "percentage": batch_job.get_progress_percentage()
            },
            "current_item": {
                "url": current_item.url,
                "position": current_item.position + 1,
                "video_title": current_item.video_title
            } if current_item else None,
            "estimated_completion": self._estimate_completion_time(batch_job),
            "export_url": batch_job.export_url
        }

        # Send via WebSocket to subscribers
        await websocket_manager.broadcast_to_job(
            f"batch_{batch_job.id}",
            {
                "type": "batch_progress",
                "data": progress_data
            }
        )
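
    # Shape of the broadcast payload (values below are illustrative, not real data):
    #   {
    #       "type": "batch_progress",
    #       "data": {
    #           "batch_job_id": "…",
    #           "status": "processing",
    #           "progress": {"total": 5, "completed": 2, "failed": 0, "percentage": 40.0},
    #           "current_item": {"url": "…", "position": 3, "video_title": "…"},
    #           ...
    #       }
    #   }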

    def _estimate_completion_time(self, batch_job: BatchJob) -> Optional[str]:
        """Estimate completion time based on average processing time"""
        if batch_job.completed_videos == 0:
            return None

        # Calculate average time per video
        elapsed = (datetime.utcnow() - batch_job.started_at).total_seconds()
        avg_time_per_video = elapsed / batch_job.completed_videos

        # Estimate remaining time
        remaining_videos = batch_job.total_videos - batch_job.completed_videos - batch_job.failed_videos
        estimated_seconds = remaining_videos * avg_time_per_video

        estimated_completion = datetime.utcnow() + timedelta(seconds=estimated_seconds)
        return estimated_completion.isoformat()
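
    # Worked example (illustrative): if 2 of 6 videos finished in the first 240 s,
    # the average is 120 s/video; with 4 videos remaining (and none failed), the
    # estimate is now + 4 * 120 s = now + 480 s.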

    async def _send_completion_notification(self, batch_job: BatchJob):
        """Send completion notification"""
        if self.notifications:
            await self.notifications.send_notification(
                user_id=batch_job.user_id,
                type="batch_complete",
                title=f"Batch Processing Complete: {batch_job.name}",
                message=f"Processed {batch_job.completed_videos} videos successfully, {batch_job.failed_videos} failed.",
                data={
                    "batch_job_id": batch_job.id,
                    "export_url": batch_job.export_url
                }
            )

    async def _generate_export(self, batch_job_id: str) -> str:
        """Generate ZIP export of all summaries in the batch"""
        batch_job = self.db.query(BatchJob).filter_by(id=batch_job_id).first()
        if not batch_job:
            return ""

        # Get all completed items with summaries
        items = self.db.query(BatchJobItem).filter_by(
            batch_job_id=batch_job_id,
            status="completed"
        ).all()
        if not items:
            return ""

        # Create temporary ZIP file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
            with zipfile.ZipFile(tmp_file.name, 'w') as zip_file:
                # Add metadata file
                metadata = {
                    "batch_name": batch_job.name,
                    "total_videos": batch_job.total_videos,
                    "completed": batch_job.completed_videos,
                    "failed": batch_job.failed_videos,
                    "created_at": batch_job.created_at.isoformat() if batch_job.created_at else None,
                    "completed_at": batch_job.completed_at.isoformat() if batch_job.completed_at else None,
                    "total_cost_usd": batch_job.total_cost_usd
                }
                zip_file.writestr("batch_metadata.json", json.dumps(metadata, indent=2))

                # Add each summary
                for item in items:
                    if item.summary_id:
                        summary = self.db.query(Summary).filter_by(id=item.summary_id).first()
                        if summary:
                            # Create filename from video title or ID
                            safe_title = re.sub(r'[^\w\s-]', '', summary.video_title or f"video_{item.position}")
                            safe_title = re.sub(r'[-\s]+', '-', safe_title)

                            # Export as JSON
                            summary_data = {
                                "video_url": summary.video_url,
                                "video_title": summary.video_title,
                                "channel_name": summary.channel_name,
                                "summary": summary.summary_text,
                                "key_points": summary.key_points,
                                "created_at": summary.created_at.isoformat() if summary.created_at else None
                            }
                            zip_file.writestr(
                                f"summaries/{safe_title}.json",
                                json.dumps(summary_data, indent=2)
                            )

                            # Also export as markdown
                            markdown_content = f"""# {summary.video_title}

**URL**: {summary.video_url}
**Channel**: {summary.channel_name}
**Date**: {summary.created_at.strftime('%Y-%m-%d') if summary.created_at else 'N/A'}

## Summary

{summary.summary_text}

## Key Points

{chr(10).join([f"- {point}" for point in (summary.key_points or [])])}
"""
                            zip_file.writestr(
                                f"summaries/{safe_title}.md",
                                markdown_content
                            )

        # Move to permanent location (in real app, upload to S3 or similar)
        export_path = f"/tmp/batch_exports/{batch_job_id}.zip"
        os.makedirs(os.path.dirname(export_path), exist_ok=True)
        os.rename(tmp_file.name, export_path)

        # Return URL (in real app, return S3 URL)
        return f"/api/batch/{batch_job_id}/download"
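
    # Resulting archive layout (derived from the writestr calls above):
    #   batch_metadata.json
    #   summaries/<safe_title>.json
    #   summaries/<safe_title>.md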

    async def cancel_batch_job(self, batch_job_id: str, user_id: str) -> bool:
        """Cancel a running batch job"""
        batch_job = self.db.query(BatchJob).filter_by(
            id=batch_job_id,
            user_id=user_id,
            status="processing"
        ).first()
        if not batch_job:
            return False

        batch_job.status = "cancelled"
        self.db.commit()

        # Cancel the async task if it exists
        if batch_job_id in self.active_jobs:
            self.active_jobs[batch_job_id].cancel()

        logger.info(f"Cancelled batch job {batch_job_id}")
        return True
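
    # Cancellation is cooperative: the "cancelled" status stops the processing loop
    # at the next item boundary, while task.cancel() interrupts any await in progress.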

    async def get_batch_status(self, batch_job_id: str, user_id: str) -> Optional[Dict]:
        """Get detailed status of a batch job"""
        batch_job = self.db.query(BatchJob).filter_by(
            id=batch_job_id,
            user_id=user_id
        ).first()
        if not batch_job:
            return None

        items = self.db.query(BatchJobItem).filter_by(
            batch_job_id=batch_job_id
        ).order_by(BatchJobItem.position).all()

        return {
            "id": batch_job.id,
            "name": batch_job.name,
            "status": batch_job.status,
            "progress": {
                "total": batch_job.total_videos,
                "completed": batch_job.completed_videos,
                "failed": batch_job.failed_videos,
                "percentage": batch_job.get_progress_percentage()
            },
            "items": [item.to_dict() for item in items],
            "created_at": batch_job.created_at.isoformat() if batch_job.created_at else None,
            "started_at": batch_job.started_at.isoformat() if batch_job.started_at else None,
            "completed_at": batch_job.completed_at.isoformat() if batch_job.completed_at else None,
            "export_url": batch_job.export_url,
            "total_cost_usd": batch_job.total_cost_usd,
            "estimated_completion": self._estimate_completion_time(batch_job)