diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..d2a31bc --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,849 @@ +# AGENTS.md - YouTube Summarizer Development Standards + +This document defines development workflows, standards, and best practices for the YouTube Summarizer project. It serves as a guide for both human developers and AI agents working on this codebase. + +## Table of Contents +1. [Development Workflow](#1-development-workflow) +2. [Code Standards](#2-code-standards) +3. [Testing Requirements](#3-testing-requirements) +4. [Documentation Standards](#4-documentation-standards) +5. [Git Workflow](#5-git-workflow) +6. [API Design Standards](#6-api-design-standards) +7. [Database Operations](#7-database-operations) +8. [Performance Guidelines](#8-performance-guidelines) +9. [Security Protocols](#9-security-protocols) +10. [Deployment Process](#10-deployment-process) + +## 1. Development Workflow + +### Task-Driven Development + +All development follows the Task Master workflow: + +```bash +# 1. Get next task +task-master next + +# 2. Review task details +task-master show <id> + +# 3. Expand if needed +task-master expand --id=<id> --research + +# 4. Set to in-progress +task-master set-status --id=<id> --status=in-progress + +# 5. Implement feature +# ... code implementation ... + +# 6. Test implementation +pytest tests/ + +# 7. Update task with notes +task-master update-subtask --id=<id> --prompt="Implementation details..." + +# 8. Mark complete +task-master set-status --id=<id> --status=done +``` + +### Feature Implementation Checklist + +- [ ] Review task requirements +- [ ] Write unit tests first (TDD) +- [ ] Implement feature +- [ ] Add integration tests +- [ ] Update documentation +- [ ] Run full test suite +- [ ] Update task status +- [ ] Commit with descriptive message + +## 2. 
Code Standards + +### Python Style Guide + +```python +""" +Module docstring describing purpose and usage +""" + +from typing import List, Optional, Dict, Any +import asyncio +from datetime import datetime + +# Constants in UPPER_CASE +DEFAULT_TIMEOUT = 30 +MAX_RETRIES = 3 + +class YouTubeSummarizer: + """ + Class for summarizing YouTube videos. + + Attributes: + model: AI model to use for summarization + cache: Cache service instance + """ + + def __init__(self, model: str = "openai"): + """Initialize summarizer with specified model.""" + self.model = model + self.cache = CacheService() + + async def summarize( + self, + video_url: str, + options: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Summarize a YouTube video. + + Args: + video_url: YouTube video URL + options: Optional summarization parameters + + Returns: + Dictionary containing summary and metadata + + Raises: + YouTubeError: If video cannot be accessed + AIServiceError: If summarization fails + """ + # Implementation here + pass +``` + +### Type Hints + +Always use type hints for better code quality: + +```python +from typing import Union, List, Optional, Dict, Any, Tuple +from pydantic import BaseModel, HttpUrl + +async def process_video( + url: HttpUrl, + models: List[str], + max_length: Optional[int] = None +) -> Tuple[str, Dict[str, Any]]: + """Process video with type safety.""" + pass +``` + +### Async/Await Pattern + +Use async for all I/O operations: + +```python +async def fetch_transcript(video_id: str) -> str: + """Fetch transcript asynchronously.""" + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + return await response.text() + +# Use asyncio.gather for parallel operations +results = await asyncio.gather( + fetch_transcript(id1), + fetch_transcript(id2), + fetch_transcript(id3) +) +``` + +## 3. 
Testing Requirements + +### Test Structure + +``` +tests/ +├── unit/ +│ ├── test_youtube_service.py +│ ├── test_summarizer_service.py +│ └── test_cache_service.py +├── integration/ +│ ├── test_api_endpoints.py +│ └── test_database.py +├── fixtures/ +│ ├── sample_transcripts.json +│ └── mock_responses.py +└── conftest.py +``` + +### Unit Test Example + +```python +# tests/unit/test_youtube_service.py +import pytest +from unittest.mock import Mock, patch, AsyncMock +from src.services.youtube import YouTubeService + +class TestYouTubeService: + @pytest.fixture + def youtube_service(self): + return YouTubeService() + + @pytest.fixture + def mock_transcript(self): + return [ + {"text": "Hello world", "start": 0.0, "duration": 2.0}, + {"text": "This is a test", "start": 2.0, "duration": 3.0} + ] + + @pytest.mark.asyncio + async def test_extract_transcript_success( + self, + youtube_service, + mock_transcript + ): + with patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') as mock_get: + mock_get.return_value = mock_transcript + + result = await youtube_service.extract_transcript("test_id") + + assert result == mock_transcript + mock_get.assert_called_once_with("test_id") + + def test_extract_video_id_various_formats(self, youtube_service): + test_cases = [ + ("https://www.youtube.com/watch?v=abc123", "abc123"), + ("https://youtu.be/xyz789", "xyz789"), + ("https://youtube.com/embed/qwe456", "qwe456"), + ("https://www.youtube.com/watch?v=test&t=123", "test") + ] + + for url, expected_id in test_cases: + assert youtube_service.extract_video_id(url) == expected_id +``` + +### Integration Test Example + +```python +# tests/integration/test_api_endpoints.py +import pytest +from fastapi.testclient import TestClient +from src.main import app + +@pytest.fixture +def client(): + return TestClient(app) + +class TestSummarizationAPI: + @pytest.mark.asyncio + async def test_summarize_endpoint(self, client): + response = client.post("/api/summarize", json={ + "url": 
"https://youtube.com/watch?v=test123", + "model": "openai", + "options": {"max_length": 500} + }) + + assert response.status_code == 200 + data = response.json() + assert "job_id" in data + assert data["status"] == "processing" + + @pytest.mark.asyncio + async def test_get_summary(self, client): + # First create a summary + create_response = client.post("/api/summarize", json={ + "url": "https://youtube.com/watch?v=test123" + }) + job_id = create_response.json()["job_id"] + + # Then retrieve it + get_response = client.get(f"/api/summary/{job_id}") + assert get_response.status_code in [200, 202] # 202 if still processing +``` + +### Test Coverage Requirements + +- Minimum 80% code coverage +- 100% coverage for critical paths +- All edge cases tested +- Error conditions covered + +```bash +# Run tests with coverage +pytest tests/ --cov=src --cov-report=html --cov-report=term + +# Coverage report should show: +# src/services/youtube.py 95% +# src/services/summarizer.py 88% +# src/api/routes.py 92% +``` + +## 4. Documentation Standards + +### Code Documentation + +Every module, class, and function must have docstrings: + +```python +""" +Module: YouTube Transcript Extractor + +This module provides functionality to extract transcripts from YouTube videos +using multiple fallback methods. + +Example: + >>> extractor = TranscriptExtractor() + >>> transcript = await extractor.extract("video_id") +""" + +def extract_transcript( + video_id: str, + language: str = "en", + include_auto_generated: bool = True +) -> List[Dict[str, Any]]: + """ + Extract transcript from YouTube video. + + This function attempts to extract transcripts using the following priority: + 1. Manual captions in specified language + 2. Auto-generated captions if allowed + 3. 
Translated captions as fallback + + Args: + video_id: YouTube video identifier + language: ISO 639-1 language code (default: "en") + include_auto_generated: Whether to use auto-generated captions + + Returns: + List of transcript segments with text, start time, and duration + + Raises: + TranscriptNotAvailable: If no transcript can be extracted + + Example: + >>> transcript = extract_transcript("dQw4w9WgXcQ", "en") + >>> print(transcript[0]) + {"text": "Never gonna give you up", "start": 0.0, "duration": 3.5} + """ + pass +``` + +### API Documentation + +Use FastAPI's automatic documentation features: + +```python +from fastapi import APIRouter, HTTPException, status +from pydantic import BaseModel, Field + +router = APIRouter() + +class SummarizeRequest(BaseModel): + """Request model for video summarization.""" + + url: str = Field( + ..., + description="YouTube video URL", + example="https://youtube.com/watch?v=dQw4w9WgXcQ" + ) + model: str = Field( + "auto", + description="AI model to use (openai, anthropic, deepseek, auto)", + example="openai" + ) + max_length: Optional[int] = Field( + None, + description="Maximum summary length in words", + ge=50, + le=5000 + ) + +@router.post( + "/summarize", + response_model=SummarizeResponse, + status_code=status.HTTP_200_OK, + summary="Summarize YouTube Video", + description="Submit a YouTube video URL for AI-powered summarization" +) +async def summarize_video(request: SummarizeRequest): + """ + Summarize a YouTube video using AI. + + This endpoint accepts a YouTube URL and returns a job ID for tracking + the summarization progress. Use the /summary/{job_id} endpoint to + retrieve the completed summary. + """ + pass +``` + +## 5. 
Git Workflow + +### Branch Naming + +```bash +# Feature branches +feature/task-2-youtube-extraction +feature/task-3-ai-summarization + +# Bugfix branches +bugfix/transcript-encoding-error +bugfix/rate-limit-handling + +# Hotfix branches +hotfix/critical-api-error +``` + +### Commit Messages + +Follow conventional commits: + +```bash +# Format: <type>(<scope>): <subject> + +# Examples: +feat(youtube): add transcript extraction service +fix(api): handle rate limiting correctly +docs(readme): update installation instructions +test(youtube): add edge case tests +refactor(cache): optimize cache key generation +perf(summarizer): implement parallel processing +chore(deps): update requirements.txt +``` + +### Pull Request Template + +```markdown +## Task Reference +- Task ID: #3 +- Task Title: Develop AI Summary Generation Service + +## Description +Brief description of changes made + +## Changes Made +- [ ] Implemented YouTube transcript extraction +- [ ] Added multi-model AI support +- [ ] Created caching layer +- [ ] Added comprehensive tests + +## Testing +- [ ] Unit tests pass +- [ ] Integration tests pass +- [ ] Manual testing completed +- [ ] Coverage > 80% + +## Documentation +- [ ] Code documented +- [ ] API docs updated +- [ ] README updated if needed + +## Screenshots (if applicable) +[Add screenshots here] +``` + +## 6. 
API Design Standards + +### RESTful Principles + +```python +# Good API design +GET /api/summaries # List all summaries +GET /api/summaries/{id} # Get specific summary +POST /api/summaries # Create new summary +PUT /api/summaries/{id} # Update summary +DELETE /api/summaries/{id} # Delete summary + +# Status codes +200 OK # Successful GET/PUT +201 Created # Successful POST +202 Accepted # Processing async request +204 No Content # Successful DELETE +400 Bad Request # Invalid input +401 Unauthorized # Missing/invalid auth +403 Forbidden # No permission +404 Not Found # Resource doesn't exist +429 Too Many Requests # Rate limited +500 Internal Error # Server error +``` + +### Response Format + +```python +# Success response +{ + "success": true, + "data": { + "id": "uuid", + "video_id": "abc123", + "summary": "...", + "metadata": {} + }, + "timestamp": "2025-01-25T10:00:00Z" +} + +# Error response +{ + "success": false, + "error": { + "code": "TRANSCRIPT_NOT_AVAILABLE", + "message": "Could not extract transcript from video", + "details": "No captions available in requested language" + }, + "timestamp": "2025-01-25T10:00:00Z" +} +``` + +### Pagination + +```python +@router.get("/summaries") +async def list_summaries( + page: int = Query(1, ge=1), + limit: int = Query(20, ge=1, le=100), + sort: str = Query("created_at", regex="^(created_at|updated_at|title)$"), + order: str = Query("desc", regex="^(asc|desc)$") +): + """List summaries with pagination.""" + return { + "data": summaries, + "pagination": { + "page": page, + "limit": limit, + "total": total_count, + "pages": math.ceil(total_count / limit) + } + } +``` + +## 7. 
Database Operations + +### SQLAlchemy Models + +```python +from sqlalchemy import Column, String, Text, DateTime, Float, JSON +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.dialects.postgresql import UUID +import uuid + +Base = declarative_base() + +class Summary(Base): + __tablename__ = "summaries" + + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + video_id = Column(String(20), nullable=False, index=True) + video_url = Column(Text, nullable=False) + video_title = Column(Text) + transcript = Column(Text) + summary = Column(Text) + key_points = Column(JSON) + chapters = Column(JSON) + model_used = Column(String(50)) + processing_time = Column(Float) + created_at = Column(DateTime, default=datetime.utcnow) + updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) + + def to_dict(self): + """Convert to dictionary for API responses.""" + return { + "id": str(self.id), + "video_id": self.video_id, + "video_title": self.video_title, + "summary": self.summary, + "key_points": self.key_points, + "chapters": self.chapters, + "model_used": self.model_used, + "created_at": self.created_at.isoformat() + } +``` + +### Database Migrations + +Use Alembic for migrations: + +```bash +# Create new migration +alembic revision --autogenerate -m "Add chapters column" + +# Apply migrations +alembic upgrade head + +# Rollback +alembic downgrade -1 +``` + +### Query Optimization + +```python +from sqlalchemy import select, and_ +from sqlalchemy.orm import selectinload + +# Efficient querying with joins +async def get_summaries_with_metadata(session, user_id: str): + stmt = ( + select(Summary) + .options(selectinload(Summary.metadata)) + .where(Summary.user_id == user_id) + .order_by(Summary.created_at.desc()) + .limit(10) + ) + + result = await session.execute(stmt) + return result.scalars().all() +``` + +## 8. 
Performance Guidelines + +### Caching Strategy + +```python +from functools import lru_cache +import redis +import hashlib +import json + +class CacheService: + def __init__(self): + self.redis = redis.Redis(decode_responses=True) + self.ttl = 3600 # 1 hour default + + def get_key(self, prefix: str, **kwargs) -> str: + """Generate cache key from parameters.""" + data = json.dumps(kwargs, sort_keys=True) + hash_digest = hashlib.md5(data.encode()).hexdigest() + return f"{prefix}:{hash_digest}" + + async def get_or_set(self, key: str, func, ttl: int = None): + """Get from cache or compute and set.""" + # Try cache first + cached = self.redis.get(key) + if cached: + return json.loads(cached) + + # Compute result + result = await func() + + # Cache result + self.redis.setex( + key, + ttl or self.ttl, + json.dumps(result) + ) + + return result +``` + +### Async Processing + +```python +from celery import Celery +from typing import Dict, Any + +celery_app = Celery('youtube_summarizer') + +@celery_app.task +async def process_video_task(video_url: str, options: Dict[str, Any]): + """Background task for video processing.""" + try: + # Extract transcript + transcript = await extract_transcript(video_url) + + # Generate summary + summary = await generate_summary(transcript, options) + + # Save to database + await save_summary(video_url, summary) + + return {"status": "completed", "summary_id": summary.id} + except Exception as e: + return {"status": "failed", "error": str(e)} +``` + +### Performance Monitoring + +```python +import time +from functools import wraps +import logging + +logger = logging.getLogger(__name__) + +def measure_performance(func): + """Decorator to measure function performance.""" + @wraps(func) + async def wrapper(*args, **kwargs): + start = time.perf_counter() + try: + result = await func(*args, **kwargs) + elapsed = time.perf_counter() - start + logger.info(f"{func.__name__} took {elapsed:.3f}s") + return result + except Exception as e: + elapsed = 
time.perf_counter() - start + logger.error(f"{func.__name__} failed after {elapsed:.3f}s: {e}") + raise + return wrapper +``` + +## 9. Security Protocols + +### Input Validation + +```python +from pydantic import BaseModel, validator, HttpUrl +import re + +class VideoURLValidator(BaseModel): + url: HttpUrl + + @validator('url') + def validate_youtube_url(cls, v): + youtube_regex = re.compile( + r'(https?://)?(www\.)?(youtube\.com|youtu\.be)/.+' + ) + if not youtube_regex.match(str(v)): + raise ValueError('Invalid YouTube URL') + return v +``` + +### API Key Management + +```python +from pydantic import BaseSettings + +class Settings(BaseSettings): + """Application settings with validation.""" + + # API Keys (never hardcode!) + openai_api_key: str + anthropic_api_key: str + youtube_api_key: Optional[str] = None + + # Security + secret_key: str + allowed_origins: List[str] = ["http://localhost:3000"] + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + case_sensitive = False + +settings = Settings() +``` + +### Rate Limiting + +```python +from fastapi import Request, HTTPException +from fastapi_limiter import FastAPILimiter +from fastapi_limiter.depends import RateLimiter +import redis.asyncio as redis + +# Initialize rate limiter +async def init_rate_limiter(): + redis_client = redis.from_url("redis://localhost:6379", encoding="utf-8", decode_responses=True) + await FastAPILimiter.init(redis_client) + +# Apply rate limiting +@router.post("/summarize", dependencies=[Depends(RateLimiter(times=10, seconds=60))]) +async def summarize_video(request: SummarizeRequest): + """Rate limited to 10 requests per minute.""" + pass +``` + +## 10. Deployment Process + +### Docker Configuration + +```dockerfile +# Dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
+ +# Run application +CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8082"] +``` + +### Environment Management + +```bash +# .env.development +DEBUG=true +DATABASE_URL=sqlite:///./dev.db +LOG_LEVEL=DEBUG + +# .env.production +DEBUG=false +DATABASE_URL=postgresql://user:pass@db:5432/youtube_summarizer +LOG_LEVEL=INFO +``` + +### Health Checks + +```python +@router.get("/health") +async def health_check(): + """Health check endpoint for monitoring.""" + checks = { + "api": "healthy", + "database": await check_database(), + "cache": await check_cache(), + "ai_service": await check_ai_service() + } + + all_healthy = all(v == "healthy" for v in checks.values()) + + return { + "status": "healthy" if all_healthy else "degraded", + "checks": checks, + "timestamp": datetime.utcnow().isoformat() + } +``` + +### Monitoring + +```python +from prometheus_client import Counter, Histogram, generate_latest + +# Metrics +request_count = Counter('youtube_requests_total', 'Total requests') +request_duration = Histogram('youtube_request_duration_seconds', 'Request duration') +summary_generation_time = Histogram('summary_generation_seconds', 'Summary generation time') + +@router.get("/metrics") +async def metrics(): + """Prometheus metrics endpoint.""" + return Response(generate_latest(), media_type="text/plain") +``` + +## Agent-Specific Instructions + +### For AI Agents + +When working on this codebase: + +1. **Always check Task Master first**: `task-master next` +2. **Follow TDD**: Write tests before implementation +3. **Use type hints**: All functions must have type annotations +4. **Document changes**: Update docstrings and comments +5. **Test thoroughly**: Run full test suite before marking complete +6. 
**Update task status**: Keep Task Master updated with progress + +### Quality Checklist + +Before marking any task as complete: + +- [ ] All tests pass (`pytest tests/`) +- [ ] Code coverage > 80% (`pytest --cov=src`) +- [ ] No linting errors (`ruff check src/`) +- [ ] Type checking passes (`mypy src/`) +- [ ] Documentation updated +- [ ] Task Master updated +- [ ] Changes committed with proper message + +## Conclusion + +This guide ensures consistent, high-quality development across all contributors to the YouTube Summarizer project. Follow these standards to maintain code quality, performance, and security. + +--- + +*Last Updated: 2025-01-25* +*Version: 1.0.0* \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index ec26a18..1808fce 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,5 +1,405 @@ -# Claude Code Instructions +# CLAUDE.md - YouTube Summarizer -## Task Master AI Instructions -**Import Task Master's development workflow commands and guidelines, treat as if import is in the main CLAUDE.md file.** -@./.taskmaster/CLAUDE.md +This file provides guidance to Claude Code (claude.ai/code) when working with the YouTube Summarizer project. + +## Project Overview + +An AI-powered web application that automatically extracts, transcribes, and summarizes YouTube videos. The application supports multiple AI models (OpenAI, Anthropic, DeepSeek), provides various export formats, and includes intelligent caching for efficiency. 
+ +**Status**: Development Phase - 12 tasks (0% complete) managed via Task Master + +## Quick Start Commands + +```bash +# Development +cd apps/youtube-summarizer +source venv/bin/activate # Activate virtual environment +python src/main.py # Run the application (port 8082) + +# Task Management +task-master list # View all tasks +task-master next # Get next task to work on +task-master show <id> # View task details +task-master set-status --id=<id> --status=done # Mark task complete + +# Testing +pytest tests/ -v # Run tests +pytest tests/ --cov=src # Run with coverage + +# Git Operations +git add . +git commit -m "feat: implement task X.Y" +git push origin main +``` + +## Architecture + +``` +YouTube Summarizer +├── API Layer (FastAPI) +│ ├── /api/summarize - Submit URL for summarization +│ ├── /api/summary/{id} - Retrieve summary +│ └── /api/export/{id} - Export in various formats +├── Service Layer +│ ├── YouTube Service - Transcript extraction +│ ├── AI Service - Summary generation +│ └── Cache Service - Performance optimization +└── Data Layer + ├── SQLite/PostgreSQL - Summary storage + └── Redis (optional) - Caching layer +``` + +## Development Workflow + +### 1. Check Current Task +```bash +task-master next +task-master show <id> +``` + +### 2. Implement Feature +Follow the task details and implement in appropriate modules: +- API endpoints → `src/api/` +- Business logic → `src/services/` +- Utilities → `src/utils/` + +### 3. Test Implementation +```bash +# Unit tests +pytest tests/unit/test_<module>.py -v + +# Integration tests +pytest tests/integration/ -v + +# Manual testing +python src/main.py +# Visit http://localhost:8082/docs for API testing +``` + +### 4. 
Update Task Status +```bash +# Log progress +task-master update-subtask --id=<id> --prompt="Implemented X, tested Y" + +# Mark complete +task-master set-status --id=<id> --status=done +``` + +## Key Implementation Areas + +### YouTube Integration (`src/services/youtube.py`) +```python +# Primary: youtube-transcript-api +from youtube_transcript_api import YouTubeTranscriptApi + +# Fallback: yt-dlp for metadata +import yt_dlp + +# Extract video ID from various URL formats +# Handle multiple subtitle languages +# Implement retry logic for failures +``` + +### AI Summarization (`src/services/summarizer.py`) +```python +# Multi-model support +class SummarizerService: + def __init__(self): + self.models = { + 'openai': OpenAISummarizer(), + 'anthropic': AnthropicSummarizer(), + 'deepseek': DeepSeekSummarizer() + } + + async def summarize(self, transcript, model='auto'): + # Implement model selection logic + # Handle token limits + # Generate structured summaries +``` + +### Caching Strategy (`src/services/cache.py`) +```python +# Cache at multiple levels: +# 1. Transcript cache (by video_id) +# 2. Summary cache (by video_id + model + params) +# 3. 
Export cache (by summary_id + format) + +# Use hash for cache keys +import hashlib + +def get_cache_key(video_id: str, model: str, params: dict) -> str: + key_data = f"{video_id}:{model}:{json.dumps(params, sort_keys=True)}" + return hashlib.sha256(key_data.encode()).hexdigest() +``` + +## API Endpoint Patterns + +### FastAPI Best Practices +```python +from fastapi import APIRouter, HTTPException, BackgroundTasks +from pydantic import BaseModel, HttpUrl + +router = APIRouter(prefix="/api", tags=["summarization"]) + +class SummarizeRequest(BaseModel): + url: HttpUrl + model: str = "auto" + options: dict = {} + +@router.post("/summarize") +async def summarize_video( + request: SummarizeRequest, + background_tasks: BackgroundTasks +): + # Validate URL + # Extract video ID + # Check cache + # Queue for processing if needed + # Return job ID for status checking +``` + +## Database Schema + +```sql +-- Main summaries table +CREATE TABLE summaries ( + id UUID PRIMARY KEY, + video_id VARCHAR(20) NOT NULL, + video_title TEXT, + video_url TEXT NOT NULL, + transcript TEXT, + summary TEXT, + key_points JSONB, + chapters JSONB, + model_used VARCHAR(50), + processing_time FLOAT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Cache for performance +CREATE INDEX idx_video_id ON summaries(video_id); +CREATE INDEX idx_created_at ON summaries(created_at); +``` + +## Error Handling + +```python +class YouTubeError(Exception): + """Base exception for YouTube-related errors""" + pass + +class TranscriptNotAvailable(YouTubeError): + """Raised when transcript cannot be extracted""" + pass + +class AIServiceError(Exception): + """Base exception for AI service errors""" + pass + +class TokenLimitExceeded(AIServiceError): + """Raised when content exceeds model token limit""" + pass + +# Global error handler +@app.exception_handler(YouTubeError) +async def youtube_error_handler(request, exc): + return JSONResponse( + 
status_code=400, + content={"error": str(exc), "type": "youtube_error"} + ) +``` + +## Environment Variables + +```bash +# Required +OPENAI_API_KEY=sk-... # At least one AI key required +ANTHROPIC_API_KEY=sk-ant-... +DEEPSEEK_API_KEY=sk-... +DATABASE_URL=sqlite:///./data/youtube_summarizer.db +SECRET_KEY=your-secret-key + +# Optional but recommended +YOUTUBE_API_KEY=AIza... # For metadata and quota +REDIS_URL=redis://localhost:6379/0 +RATE_LIMIT_PER_MINUTE=30 +MAX_VIDEO_LENGTH_MINUTES=180 +``` + +## Testing Guidelines + +### Unit Test Structure +```python +# tests/unit/test_youtube_service.py +import pytest +from unittest.mock import Mock, patch +from src.services.youtube import YouTubeService + +@pytest.fixture +def youtube_service(): + return YouTubeService() + +def test_extract_video_id(youtube_service): + urls = [ + ("https://youtube.com/watch?v=abc123", "abc123"), + ("https://youtu.be/xyz789", "xyz789"), + ("https://www.youtube.com/embed/qwe456", "qwe456") + ] + for url, expected_id in urls: + assert youtube_service.extract_video_id(url) == expected_id +``` + +### Integration Test Pattern +```python +# tests/integration/test_api.py +from fastapi.testclient import TestClient +from src.main import app + +client = TestClient(app) + +def test_summarize_endpoint(): + response = client.post("/api/summarize", json={ + "url": "https://youtube.com/watch?v=test123", + "model": "openai" + }) + assert response.status_code == 200 + assert "job_id" in response.json() +``` + +## Performance Optimization + +1. **Async Everything**: Use async/await for all I/O operations +2. **Background Tasks**: Process summaries in background +3. **Caching Layers**: + - Memory cache for hot data + - Database cache for persistence + - CDN for static exports +4. **Rate Limiting**: Implement per-IP and per-user limits +5. **Token Optimization**: + - Chunk long transcripts + - Use map-reduce for summaries + - Implement progressive summarization + +## Security Considerations + +1. 
**Input Validation**: Validate all YouTube URLs +2. **API Key Management**: Use environment variables, never commit keys +3. **Rate Limiting**: Prevent abuse and API exhaustion +4. **CORS Configuration**: Restrict to known domains in production +5. **SQL Injection Prevention**: Use parameterized queries +6. **XSS Protection**: Sanitize all user inputs +7. **Authentication**: Implement JWT for user sessions (Phase 3) + +## Common Issues and Solutions + +### Issue: Transcript Not Available +```python +# Solution: Implement fallback chain +try: + transcript = await get_youtube_transcript(video_id) +except TranscriptNotAvailable: + # Try auto-generated captions + transcript = await get_auto_captions(video_id) + if not transcript: + # Use audio transcription as last resort + transcript = await transcribe_audio(video_id) +``` + +### Issue: Token Limit Exceeded +```python +# Solution: Implement chunking +def chunk_transcript(transcript, max_tokens=3000): + chunks = [] + current_chunk = [] + current_tokens = 0 + + for segment in transcript: + segment_tokens = count_tokens(segment) + if current_tokens + segment_tokens > max_tokens: + chunks.append(current_chunk) + current_chunk = [segment] + current_tokens = segment_tokens + else: + current_chunk.append(segment) + current_tokens += segment_tokens + + if current_chunk: + chunks.append(current_chunk) + + return chunks +``` + +### Issue: Rate Limiting +```python +# Solution: Implement exponential backoff +import asyncio +from typing import Optional + +async def retry_with_backoff( + func, + max_retries: int = 3, + initial_delay: float = 1.0 +) -> Optional[Any]: + delay = initial_delay + for attempt in range(max_retries): + try: + return await func() + except RateLimitError: + if attempt == max_retries - 1: + raise + await asyncio.sleep(delay) + delay *= 2 # Exponential backoff +``` + +## Development Tips + +1. **Start with Task 1**: Setup and environment configuration +2. 
**Test Early**: Write tests as you implement features +3. **Use Type Hints**: Improve code quality and IDE support +4. **Document APIs**: Use FastAPI's automatic documentation +5. **Log Everything**: Implement comprehensive logging for debugging +6. **Cache Aggressively**: Reduce API calls and improve response times +7. **Handle Errors Gracefully**: Provide helpful error messages to users + +## Task Master Integration + +This project uses Task Master for task management. Key commands: + +```bash +# View current progress +task-master list + +# Get detailed task info +task-master show 1 + +# Expand task into subtasks +task-master expand --id=1 --research + +# Update task with progress +task-master update-task --id=1 --prompt="Completed API structure" + +# Complete task +task-master set-status --id=1 --status=done +``` + +## Related Documentation + +- [Project README](README.md) - General project information +- [AGENTS.md](AGENTS.md) - Development workflow and standards +- [Task Master Guide](.taskmaster/CLAUDE.md) - Task management details +- [API Documentation](http://localhost:8082/docs) - Interactive API docs (when running) + +## Current Focus Areas (Based on Task Master) + +1. **Task 1**: Setup Project Structure and Environment ⬅️ Start here +2. **Task 2**: Implement YouTube Transcript Extraction +3. **Task 3**: Develop AI Summary Generation Service +4. **Task 4**: Create Basic Frontend Interface +5. **Task 5**: Implement FastAPI Backend Endpoints + +Remember to check task dependencies and complete prerequisites before moving to dependent tasks. + +--- + +*This guide is specifically tailored for Claude Code development on the YouTube Summarizer project.* \ No newline at end of file