# youtube-summarizer/backend/api/transcripts.py

from fastapi import APIRouter, Depends, BackgroundTasks, HTTPException, status
from typing import Dict, Any, Optional
import time
import uuid
import logging
from backend.models.transcript import (
TranscriptRequest,
TranscriptResponse,
JobResponse,
JobStatusResponse,
# Dual transcript models
DualTranscriptRequest,
DualTranscriptResponse,
TranscriptSource,
ProcessingTimeEstimate
)
from backend.services.transcript_service import TranscriptService
from backend.services.transcript_processor import TranscriptProcessor
from backend.services.dual_transcript_service import DualTranscriptService
from backend.services.mock_cache import MockCacheClient
from backend.services.service_factory import ServiceFactory
from backend.core.exceptions import TranscriptExtractionError
# Module-level logger for this router.
logger = logging.getLogger(__name__)
# All routes below are mounted under /api/transcripts.
router = APIRouter(prefix="/api/transcripts", tags=["transcripts"])
# Shared service instances using factory
cache_client = ServiceFactory.create_cache_client()
transcript_service = ServiceFactory.create_transcript_service()
transcript_processor = TranscriptProcessor()
dual_transcript_service = DualTranscriptService()
# In-memory job storage (mock implementation): job_id -> status dict.
# NOTE(review): not shared across worker processes, lost on restart, and
# never pruned — entries accumulate for the life of the process.
job_storage: Dict[str, Dict[str, Any]] = {}
async def extract_transcript_job(job_id: str, video_id: str,
                                 language_preference: str,
                                 transcript_service: TranscriptService):
    """Background worker that extracts, cleans, and stores a transcript.

    Progress updates and the final outcome are written to the module-level
    ``job_storage`` dict under ``job_id``; clients poll them via the
    ``/jobs/{job_id}`` endpoint.

    Args:
        job_id: Identifier created by the submitting endpoint.
        video_id: YouTube video ID to extract.
        language_preference: Preferred language code for the transcript.
        transcript_service: Service instance performing the extraction.
    """
    try:
        # Mark the job as started.
        job_storage[job_id] = {
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video ID..."
        }
        # Mirror the initial status into the cache (1 hour TTL).
        await cache_client.set(f"job:{job_id}", job_storage[job_id], ttl=3600)

        # Alias the live record; mutations below are visible to pollers.
        record = job_storage[job_id]
        record["progress_percentage"] = 30
        record["current_step"] = "Extracting transcript..."
        extraction = await transcript_service.extract_transcript(video_id, language_preference)

        record["progress_percentage"] = 70
        record["current_step"] = "Processing content..."

        if extraction.success and extraction.transcript:
            text = transcript_processor.clean_transcript(extraction.transcript)
            text_stats = transcript_service.extract_metadata(text)
            payload = TranscriptResponse(
                video_id=video_id,
                transcript=text,
                segments=extraction.segments,  # segments passed through from the extraction result
                metadata=extraction.metadata,
                extraction_method=extraction.method.value,
                language=language_preference,
                word_count=text_stats["word_count"],
                cached=extraction.from_cache,
                processing_time_seconds=extraction.metadata.processing_time_seconds if extraction.metadata else 0
            )
            job_storage[job_id] = {
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": payload.model_dump()
            }
        else:
            job_storage[job_id] = {
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": extraction.error
            }
    except Exception as e:
        logger.error(f"Job {job_id} failed: {str(e)}")
        job_storage[job_id] = {
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "JOB_FAILED",
                "message": str(e)
            }
        }
@router.get("/{video_id}", response_model=TranscriptResponse)
async def get_transcript(
    video_id: str,
    language_preference: str = "en",
    include_metadata: bool = True
):
    """
    Get transcript for a YouTube video.

    Args:
        video_id: YouTube video ID
        language_preference: Preferred language code
        include_metadata: Whether to include metadata in the response

    Returns:
        TranscriptResponse with transcript and metadata; on extraction
        failure the response carries the error in-band (HTTP 200).
    """
    started = time.time()
    try:
        extraction = await transcript_service.extract_transcript(video_id, language_preference)

        if not (extraction.success and extraction.transcript):
            # Extraction failed: report the failure inside the response body
            # rather than via an HTTP error status.
            return TranscriptResponse(
                video_id=video_id,
                transcript=None,
                extraction_method="failed",
                language=language_preference,
                word_count=0,
                cached=False,
                processing_time_seconds=time.time() - started,
                error=extraction.error
            )

        cleaned = transcript_processor.clean_transcript(extraction.transcript)
        fields = {
            "video_id": video_id,
            "transcript": cleaned,
            "segments": extraction.segments,  # segments passed through from the extraction result
            "extraction_method": extraction.method.value,
            "language": language_preference,
            "word_count": len(cleaned.split()),
            "cached": extraction.from_cache,
            "processing_time_seconds": time.time() - started
        }
        if include_metadata and extraction.metadata:
            fields["metadata"] = extraction.metadata
        return TranscriptResponse(**fields)
    except Exception as e:
        logger.error(f"Failed to get transcript for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to extract transcript: {str(e)}"
        )
@router.post("/extract", response_model=JobResponse)
async def extract_transcript_async(
    request: TranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start an async transcript extraction job.

    Args:
        request: Transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with the job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Seed the job record before scheduling the worker so a status poll
    # arriving right after this response still finds the job.
    # NOTE(review): the stored status is "pending" while the response below
    # says "processing" — confirm clients do not rely on these matching.
    initial_state = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing..."
    }
    job_storage[job_id] = initial_state

    background_tasks.add_task(
        extract_transcript_job,
        job_id=job_id,
        video_id=request.video_id,
        language_preference=request.language_preference,
        transcript_service=transcript_service
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message="Transcript extraction started"
    )
@router.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_extraction_status(job_id: str):
    """
    Get the status of a transcript extraction job.

    Args:
        job_id: Job ID returned by the extract endpoint

    Returns:
        JobStatusResponse with the current job status

    Raises:
        HTTPException: 404 when the job ID is unknown.
    """
    job = job_storage.get(job_id)
    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    status_response = JobStatusResponse(
        job_id=job_id,
        status=job["status"],
        progress_percentage=job.get("progress_percentage", 0),
        current_step=job.get("current_step")
    )
    # Attach the terminal payload: a parsed result on success, the raw
    # error mapping on failure.
    if job["status"] == "completed" and "result" in job:
        status_response.result = TranscriptResponse(**job["result"])
    elif job["status"] == "failed" and "error" in job:
        status_response.error = job["error"]
    return status_response
@router.post("/{video_id}/chunk", response_model=Dict[str, Any])
async def chunk_transcript(
    video_id: str,
    max_tokens: int = 3000
):
    """
    Get a transcript in chunks for large content.

    Args:
        video_id: YouTube video ID
        max_tokens: Maximum tokens per chunk

    Returns:
        Chunked transcript data

    Raises:
        HTTPException: 404 when no transcript is available.
    """
    extraction = await transcript_service.extract_transcript(video_id)
    if not (extraction.success and extraction.transcript):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Transcript not available for this video"
        )

    # Normalize the raw transcript, then split it into token-bounded pieces.
    cleaned = transcript_processor.clean_transcript(extraction.transcript)
    pieces = transcript_processor.chunk_transcript(cleaned, max_tokens)
    return {
        "video_id": video_id,
        "total_chunks": len(pieces),
        "chunks": [piece.model_dump() for piece in pieces],
        "metadata": {
            "total_words": len(cleaned.split()),
            "extraction_method": extraction.method.value
        }
    }
@router.get("/cache/stats", response_model=Dict[str, Any])
async def get_cache_stats():
    """Return cache statistics for monitoring."""
    stats = cache_client.get_stats()
    return stats
# ====== DUAL TRANSCRIPT ENDPOINTS ======
@router.post("/dual/extract", response_model=JobResponse)
async def extract_dual_transcript(
    request: DualTranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start a dual transcript extraction job.

    Supports YouTube captions, Whisper AI transcription, or both for
    comparison.

    Args:
        request: Dual transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with the job ID for status tracking
    """
    job_id = str(uuid.uuid4())
    source_name = request.transcript_source.value

    # Register the job before handing off to the background worker.
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing dual transcript extraction...",
        "source": source_name
    }

    background_tasks.add_task(
        extract_dual_transcript_job,
        job_id=job_id,
        request=request
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message=f"Dual transcript extraction started ({source_name})"
    )
async def extract_dual_transcript_job(job_id: str, request: DualTranscriptRequest):
    """Background job for dual transcript extraction.

    Runs YouTube/Whisper (or both) extraction via ``dual_transcript_service``
    and records progress and the final outcome in the module-level
    ``job_storage`` entry for ``job_id``.

    Args:
        job_id: Identifier of the pre-registered job entry.
        request: The original request carrying the video URL and source choice.
    """
    try:
        # Derive the canonical video ID from whatever URL form the client sent.
        video_id = extract_video_id_from_url(request.video_url)

        job_storage[job_id].update({
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video URL..."
        })

        async def progress_callback(message: str):
            # Advance progress in 10% steps, saturating at 90% so that
            # only successful completion reports 100%.
            current_progress = job_storage[job_id]["progress_percentage"]
            new_progress = min(90, current_progress + 10)
            job_storage[job_id].update({
                "progress_percentage": new_progress,
                "current_step": message
            })

        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=request.video_url,
            source=request.transcript_source,
            progress_callback=progress_callback
        )

        if result.success:
            # Build the API response model from the service result.
            response = DualTranscriptResponse(
                video_id=result.video_id,
                source=result.source,
                youtube_transcript=result.youtube_transcript,
                youtube_metadata=result.youtube_metadata,
                whisper_transcript=result.whisper_transcript,
                whisper_metadata=result.whisper_metadata,
                comparison=result.comparison,
                processing_time_seconds=result.processing_time_seconds,
                success=result.success,
                error=result.error
            )
            job_storage[job_id].update({
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            })
        else:
            # Use the same {code, message} error shape as the exception path
            # below so status consumers only have to handle one format.
            job_storage[job_id].update({
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": {
                    "code": "DUAL_TRANSCRIPT_FAILED",
                    "message": result.error or "Unknown error"
                }
            })
    except Exception as e:
        logger.error(f"Dual transcript job {job_id} failed: {str(e)}")
        job_storage[job_id].update({
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "DUAL_TRANSCRIPT_FAILED",
                "message": str(e)
            }
        })
@router.get("/dual/jobs/{job_id}", response_model=JobStatusResponse)
async def get_dual_transcript_status(job_id: str):
    """
    Get the status of a dual transcript extraction job.

    Args:
        job_id: Job ID from the dual extract endpoint

    Returns:
        JobStatusResponse with the current job status and results

    Raises:
        HTTPException: 404 when the job ID is unknown.
    """
    job = job_storage.get(job_id)
    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    response = JobStatusResponse(
        job_id=job_id,
        status=job["status"],
        progress_percentage=job.get("progress_percentage", 0),
        current_step=job.get("current_step")
    )
    # HACK: JobStatusResponse.result expects a TranscriptResponse, but this
    # job produces a DualTranscriptResponse, so the completed payload is
    # smuggled through the error field under the "dual_result" key.
    # A dedicated dual-job status model would remove this workaround.
    if job["status"] == "completed" and "result" in job:
        response.error = {"dual_result": job["result"]}
    elif job["status"] == "failed" and "error" in job:
        response.error = job["error"]
    return response
@router.post("/dual/estimate", response_model=ProcessingTimeEstimate)
async def estimate_dual_transcript_time(
    video_url: str,
    transcript_source: TranscriptSource,
    video_duration_seconds: Optional[float] = None
):
    """
    Estimate processing time for dual transcript extraction.

    Args:
        video_url: YouTube video URL
        transcript_source: Which transcript source(s) to estimate
        video_duration_seconds: Video duration if known (saves a metadata call)

    Returns:
        ProcessingTimeEstimate with time estimates
    """
    try:
        # Without a known duration fall back to a 10-minute default rather
        # than fetching video metadata just for an estimate.
        duration = video_duration_seconds if video_duration_seconds is not None else 600
        estimates = dual_transcript_service.estimate_processing_time(
            duration, transcript_source
        )

        import datetime
        # NOTE(review): naive local time; an aware UTC timestamp would be
        # less ambiguous for API clients — confirm before changing.
        estimated_completion = None
        total = estimates.get("total")
        if total:
            completion_time = datetime.datetime.now() + datetime.timedelta(seconds=total)
            estimated_completion = completion_time.isoformat()

        return ProcessingTimeEstimate(
            youtube_seconds=estimates.get("youtube"),
            whisper_seconds=estimates.get("whisper"),
            total_seconds=estimates.get("total"),
            estimated_completion=estimated_completion
        )
    except Exception as e:
        logger.error(f"Failed to estimate processing time: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to estimate processing time: {str(e)}"
        )
@router.get("/dual/compare/{video_id}")
async def compare_transcript_sources(
    video_id: str,
    video_url: str
):
    """
    Compare YouTube captions vs Whisper transcription for a video.

    Convenience endpoint that forces extraction of both transcripts and
    returns detailed comparison metrics.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL

    Returns:
        Detailed comparison between transcript sources

    Raises:
        HTTPException: 500 when extraction fails, 400 when a comparison
            cannot be generated.
    """
    try:
        # Always request BOTH sources so a comparison can be computed.
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=video_url,
            source=TranscriptSource.BOTH
        )

        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to extract transcripts: {result.error}"
            )
        if not result.has_comparison:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Unable to generate comparison - both transcripts are required"
            )

        comparison = result.comparison
        return {
            "video_id": video_id,
            "comparison": comparison.model_dump() if comparison else None,
            "youtube_available": result.has_youtube,
            "whisper_available": result.has_whisper,
            "processing_time_seconds": result.processing_time_seconds,
            "recommendation": comparison.recommendation if comparison else None
        }
    except HTTPException:
        # Re-raise FastAPI errors untouched so their status codes survive.
        raise
    except Exception as e:
        logger.error(f"Failed to compare transcripts for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to compare transcripts: {str(e)}"
        )
def extract_video_id_from_url(url: str) -> str:
    """
    Extract a YouTube video ID from various URL formats.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    Args:
        url: A YouTube video URL in any of the supported shapes.

    Returns:
        The extracted video ID.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    import re
    # Each pattern captures the ID up to the next URL delimiter (&, ?, #,
    # / or end of string); patterns are tried in order, first match wins.
    # Excluding "/" keeps a trailing path segment out of the captured ID.
    patterns = [
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/shorts\/|youtube\.com\/live\/)([^&\n?#\/]+)',
        r'youtube\.com.*[?&]v=([^&\n?#]+)'
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")