15 KiB
15 KiB
Story 3.4: Batch Processing - Implementation Plan
🎯 Objective
Implement batch processing capability to allow users to summarize multiple YouTube videos at once, with progress tracking, error handling, and bulk export functionality.
📋 Pre-Implementation Checklist
Prerequisites ✅
- Story 3.3 (Summary History Management) complete
- Authentication system working
- Summary pipeline operational
- Database migrations working
Environment Setup
# Backend
cd apps/youtube-summarizer/backend
source ../../../venv/bin/activate # Or your venv path
pip install aiofiles # For async file operations
pip install python-multipart # For file uploads
# Frontend
cd apps/youtube-summarizer/frontend
npm install react-dropzone # For file upload UI
🏗️ Implementation Plan
Phase 1: Database Foundation (Day 1 Morning)
1.1 Create Database Models
# backend/models/batch_job.py
import uuid

from sqlalchemy import Column, DateTime, ForeignKey, Integer, JSON, String, Text
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import relationship

from backend.models.base import Model
class BatchJob(Model):
    """A user-submitted batch of YouTube URLs to summarize.

    Tracks the batch configuration, denormalized progress counters, and
    the export artifact produced when processing finishes.
    """

    __tablename__ = "batch_jobs"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    user_id = Column(String, ForeignKey("users.id"), nullable=False)
    name = Column(String(255))
    # Lifecycle: pending -> processing -> completed | cancelled
    status = Column(String(50), default="pending")

    # Configuration
    urls = Column(JSON, nullable=False)  # deduplicated list of source URLs
    model = Column(String(50))           # LLM provider, e.g. "anthropic"
    summary_length = Column(String(20))  # e.g. "standard"
    options = Column(JSON)               # free-form per-batch options

    # Progress (denormalized from items so polling is a single-row read)
    total_videos = Column(Integer, nullable=False)
    completed_videos = Column(Integer, default=0)
    failed_videos = Column(Integer, default=0)

    # Timestamps assigned by the processing service; the columns must exist
    # on the model because _process_batch sets started_at / completed_at.
    started_at = Column(DateTime)
    completed_at = Column(DateTime)

    # Results
    results = Column(JSON)            # array of per-video result payloads
    export_url = Column(String(500))  # download link for the generated export

    # Relationships
    user = relationship("User", back_populates="batch_jobs")
    items = relationship("BatchJobItem", back_populates="batch_job", cascade="all, delete-orphan")
class BatchJobItem(Model):
    # One URL within a batch job; tracks its own status and outcome so a
    # single failed video never blocks the rest of the queue.
    __tablename__ = "batch_job_items"

    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    batch_job_id = Column(String, ForeignKey("batch_jobs.id"), nullable=False)
    url = Column(String(500), nullable=False)
    # 0-based position preserving submission order within the batch
    position = Column(Integer, nullable=False)
    status = Column(String(50), default="pending")

    # Results
    video_id = Column(String(20))    # parsed YouTube video id, once known
    video_title = Column(String(500))
    summary_id = Column(String, ForeignKey("summaries.id"))  # set on success
    error_message = Column(Text)     # populated when processing this URL fails
    retry_count = Column(Integer, default=0)

    # Relationships
    batch_job = relationship("BatchJob", back_populates="items")
    summary = relationship("Summary")
1.2 Create Migration
cd backend
PYTHONPATH=/path/to/youtube-summarizer python3 -m alembic revision --autogenerate -m "Add batch processing tables"
1.3 Update User Model
# In backend/models/user.py, add:
batch_jobs = relationship("BatchJob", back_populates="user", cascade="all, delete-orphan")
Phase 2: Batch Processing Service (Day 1 Afternoon - Day 2 Morning)
2.1 Create Batch Service
# backend/services/batch_processing_service.py
import asyncio
from typing import List, Dict, Optional
from datetime import datetime
import uuid
from sqlalchemy.orm import Session
from backend.services.summary_pipeline import SummaryPipeline
from backend.models.batch_job import BatchJob, BatchJobItem
from backend.core.websocket_manager import websocket_manager
class BatchProcessingService:
    """Creates batch jobs and processes their videos sequentially.

    Sequential processing keeps memory and DB-connection usage flat; a
    single session (``self.db``) is shared for the whole batch rather
    than one per video.
    """

    # Accepted YouTube link shapes (watch, embed, and short links).
    _VALID_PREFIXES = (
        "https://www.youtube.com/watch?v=",
        "https://youtube.com/watch?v=",
        "http://www.youtube.com/watch?v=",
        "http://youtube.com/watch?v=",
        "https://www.youtube.com/embed/",
        "https://youtube.com/embed/",
        "https://youtu.be/",
        "http://youtu.be/",
    )

    def __init__(self, db_session: Session):
        self.db = db_session
        # job id -> running asyncio task, used for cancellation/cleanup
        self.active_jobs: Dict[str, asyncio.Task] = {}

    @staticmethod
    def _validate_youtube_url(url: str) -> bool:
        """Return True if *url* looks like a supported YouTube video link."""
        if not isinstance(url, str):
            return False
        return url.strip().startswith(BatchProcessingService._VALID_PREFIXES)

    async def create_batch_job(
        self,
        user_id: str,
        urls: List[str],
        name: Optional[str] = None,
        model: str = "anthropic",
        summary_length: str = "standard"
    ) -> BatchJob:
        """Create a batch job plus one item per URL and start processing.

        Invalid URLs are dropped; duplicates are removed with the first
        occurrence winning, so submission order is preserved (a plain
        ``set`` would scramble it).
        """
        valid_urls = list(dict.fromkeys(
            u for u in urls if self._validate_youtube_url(u)
        ))

        # Generate the id eagerly: the Column default only fires at
        # flush/INSERT time, so reading batch_job.id before a flush would
        # yield None and break the items' foreign keys.
        batch_job = BatchJob(
            id=str(uuid.uuid4()),
            user_id=user_id,
            name=name or f"Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}",
            urls=valid_urls,
            total_videos=len(valid_urls),
            model=model,
            summary_length=summary_length,
            status="pending"
        )
        self.db.add(batch_job)

        for idx, url in enumerate(valid_urls):
            self.db.add(BatchJobItem(
                batch_job_id=batch_job.id,
                url=url,
                position=idx
            ))
        self.db.commit()

        # Fire-and-forget background processing; keep the handle so the
        # job can be tracked and cleaned up.
        task = asyncio.create_task(self._process_batch(batch_job.id))
        self.active_jobs[batch_job.id] = task
        return batch_job

    async def _process_batch(self, batch_job_id: str):
        """Process all items of a batch in order, then finalize it."""
        batch_job = self.db.query(BatchJob).filter_by(id=batch_job_id).first()
        if not batch_job:
            return
        try:
            batch_job.status = "processing"
            batch_job.started_at = datetime.utcnow()
            self.db.commit()

            pipeline = SummaryPipeline(...)  # TODO: initialize with real dependencies

            items = self.db.query(BatchJobItem).filter_by(
                batch_job_id=batch_job_id
            ).order_by(BatchJobItem.position).all()

            for item in items:
                # Refresh so a cancellation committed by the API endpoint
                # (possibly via another session) is observed here.
                self.db.refresh(batch_job)
                if batch_job.status == "cancelled":
                    break
                # _process_single_item try/excepts internally so one failed
                # video never stops the queue.
                await self._process_single_item(item, batch_job, pipeline)
                await self._send_progress_update(batch_job)

            if batch_job.status != "cancelled":
                batch_job.status = "completed"
                batch_job.completed_at = datetime.utcnow()
                batch_job.export_url = await self._generate_export(batch_job_id)
            self.db.commit()
        finally:
            # Always drop the task handle, even if processing raised,
            # so a crashed batch does not leak into active_jobs forever.
            self.active_jobs.pop(batch_job_id, None)
2.2 Add Progress Broadcasting
async def _send_progress_update(self, batch_job: BatchJob):
    """Broadcast the batch's current progress on its WebSocket channel.

    Subscribers listen on the "batch_<job id>" channel and receive a
    "batch_progress" message with counters and the in-flight item.
    """
    total = batch_job.total_videos or 0
    completed = batch_job.completed_videos or 0
    progress_data = {
        "batch_job_id": batch_job.id,
        "status": batch_job.status,
        "progress": {
            "total": total,
            "completed": completed,
            "failed": batch_job.failed_videos,
            # Guard the division: a batch whose URLs were all invalid has
            # total_videos == 0 and would otherwise raise ZeroDivisionError.
            "percentage": (completed / total * 100) if total else 0.0
        },
        "current_item": self._get_current_item(batch_job)
    }
    await websocket_manager.broadcast_to_job(
        f"batch_{batch_job.id}",
        {
            "type": "batch_progress",
            "data": progress_data
        }
    )
Phase 3: API Endpoints (Day 2 Afternoon)
3.1 Create Batch Router
# backend/api/batch.py
from datetime import datetime
from typing import List, Optional

from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from pydantic import BaseModel

router = APIRouter(prefix="/api/batch", tags=["batch"])


class BatchJobRequest(BaseModel):
    """Payload for POST /api/batch/create."""
    name: Optional[str] = None  # optional display name; server generates one if absent
    urls: List[str]
    model: str = "anthropic"
    summary_length: str = "standard"


class BatchJobResponse(BaseModel):
    """Public representation of a freshly created batch job."""
    id: str
    name: str
    status: str
    total_videos: int
    created_at: datetime

    class Config:
        # Required so BatchJobResponse.from_orm(batch_job) can read
        # attributes off the SQLAlchemy model.
        orm_mode = True
@router.post("/create", response_model=BatchJobResponse)
async def create_batch_job(
    request: BatchJobRequest,
    background_tasks: BackgroundTasks,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Create a new batch processing job.

    Delegates to BatchProcessingService, which validates/deduplicates the
    URLs, persists the job and its items, and starts background
    processing; the endpoint returns immediately with the job summary.
    """
    service = BatchProcessingService(db)
    batch_job = await service.create_batch_job(
        user_id=current_user.id,
        urls=request.urls,
        name=request.name,
        model=request.model,
        summary_length=request.summary_length
    )
    # NOTE(review): from_orm requires orm_mode (pydantic v1) or
    # from_attributes (v2) on BatchJobResponse — confirm it is set.
    return BatchJobResponse.from_orm(batch_job)
@router.get("/{job_id}")
async def get_batch_status(
    job_id: str,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Get batch job status and progress.

    The query is scoped to the current user, so one user cannot read
    another user's jobs (a foreign id simply 404s).
    """
    batch_job = db.query(BatchJob).filter_by(
        id=job_id,
        user_id=current_user.id
    ).first()
    if not batch_job:
        raise HTTPException(status_code=404, detail="Batch job not found")
    return {
        "id": batch_job.id,
        "status": batch_job.status,
        "progress": {
            "total": batch_job.total_videos,
            "completed": batch_job.completed_videos,
            "failed": batch_job.failed_videos
        },
        # Serialize items explicitly: raw ORM instances are not
        # JSON-serializable without a declared response model.
        "items": [
            {
                "id": item.id,
                "url": item.url,
                "position": item.position,
                "status": item.status,
                "video_title": item.video_title,
                "summary_id": item.summary_id,
                "error_message": item.error_message,
            }
            for item in batch_job.items
        ],
        "export_url": batch_job.export_url
    }
@router.post("/{job_id}/cancel")
async def cancel_batch_job(
    job_id: str,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Cancel a pending or running batch job owned by the current user."""
    batch_job = db.query(BatchJob).filter(
        BatchJob.id == job_id,
        BatchJob.user_id == current_user.id,
        # A job that has not started yet should be cancellable too —
        # matching only "processing" would make pending jobs uncancellable.
        BatchJob.status.in_(["pending", "processing"])
    ).first()
    if not batch_job:
        raise HTTPException(status_code=404, detail="Active batch job not found")
    batch_job.status = "cancelled"
    # The processing loop re-reads status between items and stops.
    db.commit()
    return {"message": "Batch job cancelled"}
3.2 Add to Main App
# In backend/main.py
from backend.api.batch import router as batch_router
app.include_router(batch_router)
Phase 4: Frontend Implementation (Day 3)
4.1 Create Batch API Service
// frontend/src/services/batchAPI.ts
/** Request payload for POST /api/batch/create. */
export interface BatchJobRequest {
  name?: string;           // optional display name; backend generates one if absent
  urls: string[];
  model?: string;          // summarization provider, e.g. "anthropic"
  summary_length?: string; // e.g. "standard"
}

/** Batch job as returned by the backend status endpoints. */
export interface BatchJob {
  id: string;
  name: string;
  status: 'pending' | 'processing' | 'completed' | 'cancelled';
  total_videos: number;
  completed_videos: number;
  failed_videos: number;
  // NOTE(review): BatchJobItem must be declared/imported in this module.
  items: BatchJobItem[];
  export_url?: string; // present once the batch has completed
}
/** Thin client for the /api/batch endpoints. */
class BatchAPI {
  /** Authorization header built from the stored access token. */
  private authHeaders(): Record<string, string> {
    return { Authorization: `Bearer ${localStorage.getItem('access_token')}` };
  }

  /** Throw on non-2xx responses so callers never parse error bodies as data. */
  private async checkOk(response: Response): Promise<Response> {
    if (!response.ok) {
      throw new Error(`Batch API request failed: ${response.status}`);
    }
    return response;
  }

  /** Create a batch job and return its initial state. */
  async createBatchJob(request: BatchJobRequest): Promise<BatchJob> {
    const response = await fetch('/api/batch/create', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', ...this.authHeaders() },
      body: JSON.stringify(request)
    });
    return (await this.checkOk(response)).json();
  }

  /** Fetch current status/progress for a batch job. */
  async getBatchStatus(jobId: string): Promise<BatchJob> {
    const response = await fetch(`/api/batch/${jobId}`, {
      headers: this.authHeaders()
    });
    return (await this.checkOk(response)).json();
  }

  /** Request cancellation of a running batch job. */
  async cancelBatchJob(jobId: string): Promise<void> {
    const response = await fetch(`/api/batch/${jobId}/cancel`, {
      method: 'POST',
      headers: this.authHeaders()
    });
    await this.checkOk(response);
  }
}

export const batchAPI = new BatchAPI();
4.2 Create Batch Processing Page
// frontend/src/pages/batch/BatchProcessingPage.tsx
import React, { useState, useEffect } from 'react';
import { BatchInputForm } from '@/components/batch/BatchInputForm';
import { BatchProgress } from '@/components/batch/BatchProgress';
import { useBatchProcessing } from '@/hooks/useBatchProcessing';
export function BatchProcessingPage() {
const {
createBatch,
currentBatch,
isProcessing,
progress,
cancelBatch
} = useBatchProcessing();
return (
<div className="container mx-auto py-8">
<h1 className="text-3xl font-bold mb-8">Batch Video Processing</h1>
{!isProcessing ? (
<BatchInputForm onSubmit={createBatch} />
) : (
<BatchProgress
batch={currentBatch}
progress={progress}
onCancel={cancelBatch}
/>
)}
</div>
);
}
Phase 5: Testing & Polish (Day 4)
5.1 Test Script
# test_batch_processing.py
import asyncio
import httpx

BASE_URL = "http://localhost:8000"


async def test_batch_processing():
    """End-to-end smoke test: login, create a batch, poll until terminal.

    The original draft referenced an undefined ``client``; an
    httpx.AsyncClient must be constructed (and closed) here.
    """
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=30.0) as client:
        # Login
        login_response = await client.post("/api/auth/login", json={
            "email": "test@example.com",
            "password": "TestPass123!"
        })
        login_response.raise_for_status()
        token = login_response.json()["access_token"]
        auth = {"Authorization": f"Bearer {token}"}

        # Create batch job — the middle URL is intentionally invalid to
        # exercise the per-item error handling.
        batch_response = await client.post(
            "/api/batch/create",
            headers=auth,
            json={
                "urls": [
                    "https://youtube.com/watch?v=dQw4w9WgXcQ",
                    "https://youtube.com/watch?v=invalid",
                    "https://youtube.com/watch?v=9bZkp7q19f0"
                ],
                "name": "Test Batch"
            }
        )
        batch_response.raise_for_status()
        job_id = batch_response.json()["id"]

        # Poll until the job reaches a terminal state.
        while True:
            status_response = await client.get(f"/api/batch/{job_id}", headers=auth)
            status = status_response.json()
            print(f"Status: {status['status']}, Progress: {status['progress']}")
            if status['status'] in ['completed', 'cancelled']:
                break
            await asyncio.sleep(2)


if __name__ == "__main__":
    asyncio.run(test_batch_processing())
🔥 Common Pitfalls & Solutions
Pitfall 1: Memory Issues with Large Batches
Solution: Process videos sequentially, not in parallel
Pitfall 2: Long Processing Times
Solution: Add WebSocket updates and clear progress indicators
Pitfall 3: Failed Videos Blocking Queue
Solution: Try-catch each video, continue on failure
Pitfall 4: Database Connection Exhaustion
Solution: Use single session per batch, not per video
Pitfall 5: WebSocket Connection Loss
Solution: Implement reconnection logic in frontend
📊 Success Metrics
- Can process 10+ videos in a batch
- Progress updates every 2-3 seconds
- Failed videos don't stop processing
- Export ZIP contains all summaries
- UI clearly shows current status
- Can cancel batch mid-processing
- Handles duplicate URLs gracefully
🚀 Quick Start Commands
# Start backend with batch support
cd backend
PYTHONPATH=/path/to/youtube-summarizer python3 main.py
# Start frontend
cd frontend
npm run dev
# Run batch test
python3 test_batch_processing.py
📝 Testing Checklist
Manual Testing
- Upload 5 valid YouTube URLs
- Include 2 invalid URLs in batch
- Cancel batch after 2 videos
- Export completed batch as ZIP
- Process batch with 10+ videos
- Test with different models
- Verify progress percentage accuracy
Automated Testing
- Unit test URL validation
- Unit test batch creation
- Integration test full batch flow
- Test export generation
- Test cancellation handling
🎯 Definition of Done
- Database models created and migrated
- Batch processing service working
- All API endpoints functional
- Frontend UI complete
- Progress updates via WebSocket
- Export functionality working
- Error handling robust
- Tests passing
- Documentation updated
Ready to implement Story 3.4! This will add powerful batch processing capabilities to the YouTube Summarizer.