from fastapi import APIRouter, Depends, BackgroundTasks, HTTPException, status
from typing import Dict, Any, Optional
import time
import uuid
import logging

from backend.models.transcript import (
    TranscriptRequest,
    TranscriptResponse,
    JobResponse,
    JobStatusResponse,
    # Dual transcript models
    DualTranscriptRequest,
    DualTranscriptResponse,
    TranscriptSource,
    ProcessingTimeEstimate
)
from backend.services.transcript_service import TranscriptService
from backend.services.transcript_processor import TranscriptProcessor
from backend.services.dual_transcript_service import DualTranscriptService
from backend.services.mock_cache import MockCacheClient
from backend.services.service_factory import ServiceFactory
from backend.core.exceptions import TranscriptExtractionError

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/transcripts", tags=["transcripts"])

# Shared service instances using factory
cache_client = ServiceFactory.create_cache_client()
transcript_service = ServiceFactory.create_transcript_service()
transcript_processor = TranscriptProcessor()
dual_transcript_service = DualTranscriptService()

# In-memory job storage (mock implementation)
job_storage: Dict[str, Dict[str, Any]] = {}
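
# Sketch of the job record shape used throughout this module (informal, not an
# enforced schema; inferred from the handlers below):
#   {
#       "status": "pending" | "processing" | "completed" | "failed",
#       "progress_percentage": int,   # 0-100
#       "current_step": str,          # human-readable progress message
#       "result": dict,               # response.model_dump() on success
#       "error": dict | str,          # error payload on failure
#   }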


async def extract_transcript_job(job_id: str, video_id: str,
                                 language_preference: str,
                                 transcript_service: TranscriptService):
    """Background job for transcript extraction"""
    try:
        # Update job status
        job_storage[job_id] = {
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video ID..."
        }

        # Simulate progress updates
        await cache_client.set(f"job:{job_id}", job_storage[job_id], ttl=3600)

        # Extract transcript
        job_storage[job_id]["progress_percentage"] = 30
        job_storage[job_id]["current_step"] = "Extracting transcript..."

        result = await transcript_service.extract_transcript(video_id, language_preference)

        # Process transcript
        job_storage[job_id]["progress_percentage"] = 70
        job_storage[job_id]["current_step"] = "Processing content..."

        if result.success and result.transcript:
            cleaned_transcript = transcript_processor.clean_transcript(result.transcript)
            metadata = transcript_service.extract_metadata(cleaned_transcript)

            # Create response
            response = TranscriptResponse(
                video_id=video_id,
                transcript=cleaned_transcript,
                segments=result.segments,  # Include segments from transcript result
                metadata=result.metadata,
                extraction_method=result.method.value,
                language=language_preference,
                word_count=metadata["word_count"],
                cached=result.from_cache,
                processing_time_seconds=result.metadata.processing_time_seconds if result.metadata else 0
            )

            job_storage[job_id] = {
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            }
        else:
            job_storage[job_id] = {
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": result.error
            }

    except Exception as e:
        logger.error(f"Job {job_id} failed: {str(e)}")
        job_storage[job_id] = {
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "JOB_FAILED",
                "message": str(e)
            }
        }


@router.get("/{video_id}", response_model=TranscriptResponse)
async def get_transcript(
    video_id: str,
    language_preference: str = "en",
    include_metadata: bool = True
):
    """
    Get transcript for a YouTube video.

    Args:
        video_id: YouTube video ID
        language_preference: Preferred language code
        include_metadata: Whether to include metadata

    Returns:
        TranscriptResponse with transcript and metadata
    """
    start_time = time.time()

    try:
        result = await transcript_service.extract_transcript(video_id, language_preference)

        if result.success and result.transcript:
            # Clean and process transcript
            cleaned_transcript = transcript_processor.clean_transcript(result.transcript)

            response_data = {
                "video_id": video_id,
                "transcript": cleaned_transcript,
                "segments": result.segments,  # Include segments from transcript result
                "extraction_method": result.method.value,
                "language": language_preference,
                "word_count": len(cleaned_transcript.split()),
                "cached": result.from_cache,
                "processing_time_seconds": time.time() - start_time
            }

            if include_metadata and result.metadata:
                response_data["metadata"] = result.metadata

            return TranscriptResponse(**response_data)
        else:
            # Return error response
            return TranscriptResponse(
                video_id=video_id,
                transcript=None,
                extraction_method="failed",
                language=language_preference,
                word_count=0,
                cached=False,
                processing_time_seconds=time.time() - start_time,
                error=result.error
            )

    except Exception as e:
        logger.error(f"Failed to get transcript for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to extract transcript: {str(e)}"
        )
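
# Example request against the endpoint above (illustrative; the video ID and
# host/port are assumptions, the path prefix comes from the router):
#   GET http://localhost:8000/api/transcripts/dQw4w9WgXcQ?language_preference=en
# On success the JSON body mirrors TranscriptResponse: video_id, transcript,
# segments, extraction_method, language, word_count, cached,
# processing_time_seconds, and (optionally) metadata.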


@router.post("/extract", response_model=JobResponse)
async def extract_transcript_async(
    request: TranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start async transcript extraction job.

    Args:
        request: Transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Initialize job status
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing..."
    }

    # Start background extraction
    background_tasks.add_task(
        extract_transcript_job,
        job_id=job_id,
        video_id=request.video_id,
        language_preference=request.language_preference,
        transcript_service=transcript_service
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message="Transcript extraction started"
    )
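
# Typical client flow for async extraction (illustrative sketch):
#   1. POST /api/transcripts/extract with a TranscriptRequest body
#      (video_id, language_preference) -> returns {"job_id": "<uuid>", ...}
#   2. Poll GET /api/transcripts/jobs/{job_id} until status is "completed"
#      (result holds the TranscriptResponse) or "failed" (see the error field).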


@router.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_extraction_status(job_id: str):
    """
    Get status of transcript extraction job.

    Args:
        job_id: Job ID from extract endpoint

    Returns:
        JobStatusResponse with current job status
    """
    if job_id not in job_storage:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    job_data = job_storage[job_id]

    response = JobStatusResponse(
        job_id=job_id,
        status=job_data["status"],
        progress_percentage=job_data.get("progress_percentage", 0),
        current_step=job_data.get("current_step")
    )

    if job_data["status"] == "completed" and "result" in job_data:
        response.result = TranscriptResponse(**job_data["result"])
    elif job_data["status"] == "failed" and "error" in job_data:
        response.error = job_data["error"]

    return response


@router.post("/{video_id}/chunk", response_model=Dict[str, Any])
async def chunk_transcript(
    video_id: str,
    max_tokens: int = 3000
):
    """
    Get transcript in chunks for large content.

    Args:
        video_id: YouTube video ID
        max_tokens: Maximum tokens per chunk

    Returns:
        Chunked transcript data
    """
    # Get transcript first
    result = await transcript_service.extract_transcript(video_id)

    if not result.success or not result.transcript:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Transcript not available for this video"
        )

    # Clean and chunk transcript
    cleaned = transcript_processor.clean_transcript(result.transcript)
    chunks = transcript_processor.chunk_transcript(cleaned, max_tokens)

    return {
        "video_id": video_id,
        "total_chunks": len(chunks),
        "chunks": [chunk.model_dump() for chunk in chunks],
        "metadata": {
            "total_words": len(cleaned.split()),
            "extraction_method": result.method.value
        }
    }
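
# Shape of the chunk response above (illustrative values; chunk fields depend
# on the TranscriptProcessor chunk model):
#   {
#       "video_id": "...",
#       "total_chunks": 3,
#       "chunks": [{...}, {...}, {...}],
#       "metadata": {"total_words": 4200, "extraction_method": "..."}
#   }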


@router.get("/cache/stats", response_model=Dict[str, Any])
async def get_cache_stats():
    """Get cache statistics for monitoring"""
    return cache_client.get_stats()


# ====== DUAL TRANSCRIPT ENDPOINTS ======

@router.post("/dual/extract", response_model=JobResponse)
async def extract_dual_transcript(
    request: DualTranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start dual transcript extraction job.

    Supports YouTube captions, Whisper AI transcription, or both for comparison.

    Args:
        request: Dual transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Initialize job status
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing dual transcript extraction...",
        "source": request.transcript_source.value
    }

    # Start background extraction
    background_tasks.add_task(
        extract_dual_transcript_job,
        job_id=job_id,
        request=request
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message=f"Dual transcript extraction started ({request.transcript_source.value})"
    )
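
# Typical dual-extraction flow (illustrative sketch):
#   1. POST /api/transcripts/dual/extract with a DualTranscriptRequest body
#      (video_url plus transcript_source: YouTube captions, Whisper, or both).
#   2. Poll GET /api/transcripts/dual/jobs/{job_id}; on completion the dual
#      result is surfaced via the status response (see the note in
#      get_dual_transcript_status below).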


async def extract_dual_transcript_job(job_id: str, request: DualTranscriptRequest):
    """Background job for dual transcript extraction"""
    try:
        # Extract video ID from URL (assuming URL format like the frontend)
        video_id = extract_video_id_from_url(request.video_url)

        # Update job status
        job_storage[job_id].update({
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video URL..."
        })

        # Progress callback function
        async def progress_callback(message: str):
            current_progress = job_storage[job_id]["progress_percentage"]
            new_progress = min(90, current_progress + 10)
            job_storage[job_id].update({
                "progress_percentage": new_progress,
                "current_step": message
            })

        # Extract transcript using dual service
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=request.video_url,
            source=request.transcript_source,
            progress_callback=progress_callback
        )

        if result.success:
            # Create API response from service result
            response = DualTranscriptResponse(
                video_id=result.video_id,
                source=result.source,
                youtube_transcript=result.youtube_transcript,
                youtube_metadata=result.youtube_metadata,
                whisper_transcript=result.whisper_transcript,
                whisper_metadata=result.whisper_metadata,
                comparison=result.comparison,
                processing_time_seconds=result.processing_time_seconds,
                success=result.success,
                error=result.error
            )

            job_storage[job_id].update({
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            })
        else:
            job_storage[job_id].update({
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": {"message": result.error or "Unknown error"}
            })

    except Exception as e:
        logger.error(f"Dual transcript job {job_id} failed: {str(e)}")
        job_storage[job_id].update({
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "DUAL_TRANSCRIPT_FAILED",
                "message": str(e)
            }
        })


@router.get("/dual/jobs/{job_id}", response_model=JobStatusResponse)
async def get_dual_transcript_status(job_id: str):
    """
    Get status of dual transcript extraction job.

    Args:
        job_id: Job ID from dual extract endpoint

    Returns:
        JobStatusResponse with current job status and results
    """
    if job_id not in job_storage:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    job_data = job_storage[job_id]

    response = JobStatusResponse(
        job_id=job_id,
        status=job_data["status"],
        progress_percentage=job_data.get("progress_percentage", 0),
        current_step=job_data.get("current_step")
    )

    # Note: for dual transcripts the result is returned in a custom format,
    # since JobStatusResponse expects a TranscriptResponse but we have a
    # DualTranscriptResponse.
    if job_data["status"] == "completed" and "result" in job_data:
        # For now, the dual result is placed in the error field as a workaround;
        # a real implementation would define a dedicated response model.
        response.error = {"dual_result": job_data["result"]}
    elif job_data["status"] == "failed" and "error" in job_data:
        response.error = job_data["error"]

    return response
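
# Client note (based on the workaround above): for completed dual jobs the
# DualTranscriptResponse payload is returned under the status response's
# error field as {"dual_result": ...}, not under result.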


@router.post("/dual/estimate", response_model=ProcessingTimeEstimate)
async def estimate_dual_transcript_time(
    video_url: str,
    transcript_source: TranscriptSource,
    video_duration_seconds: Optional[float] = None
):
    """
    Estimate processing time for dual transcript extraction.

    Args:
        video_url: YouTube video URL
        transcript_source: Which transcript source(s) to estimate
        video_duration_seconds: Video duration if known (saves a metadata call)

    Returns:
        ProcessingTimeEstimate with time estimates
    """
    try:
        # If duration not provided, we'd need to get it from video metadata
        # For now, assume a default duration of 10 minutes for estimation
        if video_duration_seconds is None:
            video_duration_seconds = 600  # 10 minutes default

        estimates = dual_transcript_service.estimate_processing_time(
            video_duration_seconds, transcript_source
        )

        # Convert to ISO timestamp for estimated completion
        import datetime
        estimated_completion = None
        if estimates.get("total"):
            completion_time = datetime.datetime.now() + datetime.timedelta(
                seconds=estimates["total"]
            )
            estimated_completion = completion_time.isoformat()

        return ProcessingTimeEstimate(
            youtube_seconds=estimates.get("youtube"),
            whisper_seconds=estimates.get("whisper"),
            total_seconds=estimates.get("total"),
            estimated_completion=estimated_completion
        )

    except Exception as e:
        logger.error(f"Failed to estimate processing time: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to estimate processing time: {str(e)}"
        )
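
# Example estimate response (values are illustrative only; the actual numbers
# come from DualTranscriptService.estimate_processing_time):
#   {
#       "youtube_seconds": 2.0,
#       "whisper_seconds": 45.0,
#       "total_seconds": 47.0,
#       "estimated_completion": "2024-01-01T12:00:47"
#   }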


@router.get("/dual/compare/{video_id}")
async def compare_transcript_sources(
    video_id: str,
    video_url: str
):
    """
    Compare YouTube captions vs Whisper transcription for a video.

    This is a convenience endpoint that forces both transcripts
    and returns detailed comparison metrics.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL

    Returns:
        Detailed comparison between transcript sources
    """
    try:
        # Force both transcripts for comparison
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=video_url,
            source=TranscriptSource.BOTH
        )

        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to extract transcripts: {result.error}"
            )

        if not result.has_comparison:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Unable to generate comparison - both transcripts are required"
            )

        return {
            "video_id": video_id,
            "comparison": result.comparison.model_dump() if result.comparison else None,
            "youtube_available": result.has_youtube,
            "whisper_available": result.has_whisper,
            "processing_time_seconds": result.processing_time_seconds,
            "recommendation": result.comparison.recommendation if result.comparison else None
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to compare transcripts for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to compare transcripts: {str(e)}"
        )


def extract_video_id_from_url(url: str) -> str:
    """
    Extract YouTube video ID from various URL formats.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    """
    import re

    patterns = [
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([^&\n?#]+)',
        r'youtube\.com.*[?&]v=([^&\n?#]+)'
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    raise ValueError(f"Could not extract video ID from URL: {url}")
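

# Informal examples of the URL formats handled above (the video ID is illustrative):
#   extract_video_id_from_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  -> "dQw4w9WgXcQ"
#   extract_video_id_from_url("https://youtu.be/dQw4w9WgXcQ")                 -> "dQw4w9WgXcQ"
#   extract_video_id_from_url("https://www.youtube.com/embed/dQw4w9WgXcQ")    -> "dQw4w9WgXcQ"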