"""FastAPI router for transcript extraction endpoints.

Provides single-source transcript extraction (sync and async job-based),
transcript chunking, cache stats, and "dual" extraction that can combine
YouTube captions with Whisper AI transcription for comparison.
"""

from fastapi import APIRouter, Depends, BackgroundTasks, HTTPException, status
from typing import Dict, Any, Optional
import datetime
import logging
import re
import time
import uuid

from backend.models.transcript import (
    TranscriptRequest,
    TranscriptResponse,
    JobResponse,
    JobStatusResponse,
    # Dual transcript models
    DualTranscriptRequest,
    DualTranscriptResponse,
    TranscriptSource,
    ProcessingTimeEstimate
)
from backend.services.transcript_service import TranscriptService
from backend.services.transcript_processor import TranscriptProcessor
from backend.services.dual_transcript_service import DualTranscriptService
from backend.services.mock_cache import MockCacheClient
from backend.services.service_factory import ServiceFactory
from backend.core.exceptions import TranscriptExtractionError

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/transcripts", tags=["transcripts"])

# Shared service instances using factory
cache_client = ServiceFactory.create_cache_client()
transcript_service = ServiceFactory.create_transcript_service()
transcript_processor = TranscriptProcessor()
dual_transcript_service = DualTranscriptService()

# In-memory job storage (mock implementation).
# NOTE(review): a plain module-level dict is not shared across worker
# processes and is lost on restart — fine for the mock setup, but a real
# deployment would need an external store (e.g. Redis).
job_storage: Dict[str, Dict[str, Any]] = {}

# Precompiled YouTube-URL patterns for extract_video_id_from_url.
# Hoisted to module level so they are compiled once at import time instead
# of being looked up/compiled on every call.
_VIDEO_ID_PATTERNS = [
    re.compile(r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([^&\n?#]+)'),
    re.compile(r'youtube\.com.*[?&]v=([^&\n?#]+)'),
]


async def extract_transcript_job(job_id: str, video_id: str, language_preference: str,
                                 transcript_service: TranscriptService):
    """Background job for transcript extraction.

    Mutates ``job_storage[job_id]`` through the pending -> processing ->
    completed/failed lifecycle. Never raises: any exception is recorded on
    the job entry instead.

    Args:
        job_id: Key into ``job_storage`` (entry created by the caller).
        video_id: YouTube video ID to extract.
        language_preference: Preferred language code.
        transcript_service: Service to perform the extraction (parameter
            intentionally shadows the module-level instance; callers pass
            it by keyword).
    """
    try:
        # Update job status
        job_storage[job_id] = {
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video ID..."
        }

        # Simulate progress updates
        # NOTE(review): only this initial snapshot is written to the cache;
        # later in-memory updates are never synced — confirm intended.
        await cache_client.set(f"job:{job_id}", job_storage[job_id], ttl=3600)

        # Extract transcript
        job_storage[job_id]["progress_percentage"] = 30
        job_storage[job_id]["current_step"] = "Extracting transcript..."

        result = await transcript_service.extract_transcript(video_id, language_preference)

        # Process transcript
        job_storage[job_id]["progress_percentage"] = 70
        job_storage[job_id]["current_step"] = "Processing content..."

        if result.success and result.transcript:
            cleaned_transcript = transcript_processor.clean_transcript(result.transcript)
            metadata = transcript_service.extract_metadata(cleaned_transcript)

            # Create response
            response = TranscriptResponse(
                video_id=video_id,
                transcript=cleaned_transcript,
                segments=result.segments,  # Include segments from transcript result
                metadata=result.metadata,
                extraction_method=result.method.value,
                language=language_preference,
                word_count=metadata["word_count"],
                cached=result.from_cache,
                processing_time_seconds=result.metadata.processing_time_seconds if result.metadata else 0
            )

            job_storage[job_id] = {
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            }
        else:
            job_storage[job_id] = {
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": result.error
            }

    except Exception as e:
        logger.error(f"Job {job_id} failed: {str(e)}")
        job_storage[job_id] = {
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "JOB_FAILED",
                "message": str(e)
            }
        }


@router.get("/{video_id}", response_model=TranscriptResponse)
async def get_transcript(
    video_id: str,
    language_preference: str = "en",
    include_metadata: bool = True
):
    """
    Get transcript for a YouTube video.

    Args:
        video_id: YouTube video ID
        language_preference: Preferred language code
        include_metadata: Whether to include metadata

    Returns:
        TranscriptResponse with transcript and metadata
    """
    start_time = time.time()

    try:
        result = await transcript_service.extract_transcript(video_id, language_preference)

        if result.success and result.transcript:
            # Clean and process transcript
            cleaned_transcript = transcript_processor.clean_transcript(result.transcript)

            response_data = {
                "video_id": video_id,
                "transcript": cleaned_transcript,
                "segments": result.segments,  # Include segments from transcript result
                "extraction_method": result.method.value,
                "language": language_preference,
                "word_count": len(cleaned_transcript.split()),
                "cached": result.from_cache,
                "processing_time_seconds": time.time() - start_time
            }

            if include_metadata and result.metadata:
                response_data["metadata"] = result.metadata

            return TranscriptResponse(**response_data)
        else:
            # Return error response (still HTTP 200; error details in body)
            return TranscriptResponse(
                video_id=video_id,
                transcript=None,
                extraction_method="failed",
                language=language_preference,
                word_count=0,
                cached=False,
                processing_time_seconds=time.time() - start_time,
                error=result.error
            )

    except Exception as e:
        logger.error(f"Failed to get transcript for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to extract transcript: {str(e)}"
        )


@router.post("/extract", response_model=JobResponse)
async def extract_transcript_async(
    request: TranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start async transcript extraction job.

    Args:
        request: Transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Initialize job status
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing..."
    }

    # Start background extraction
    background_tasks.add_task(
        extract_transcript_job,
        job_id=job_id,
        video_id=request.video_id,
        language_preference=request.language_preference,
        transcript_service=transcript_service
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message="Transcript extraction started"
    )


@router.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_extraction_status(job_id: str):
    """
    Get status of transcript extraction job.

    Args:
        job_id: Job ID from extract endpoint

    Returns:
        JobStatusResponse with current job status

    Raises:
        HTTPException: 404 if the job ID is unknown.
    """
    if job_id not in job_storage:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    job_data = job_storage[job_id]

    response = JobStatusResponse(
        job_id=job_id,
        status=job_data["status"],
        progress_percentage=job_data.get("progress_percentage", 0),
        current_step=job_data.get("current_step")
    )

    if job_data["status"] == "completed" and "result" in job_data:
        response.result = TranscriptResponse(**job_data["result"])
    elif job_data["status"] == "failed" and "error" in job_data:
        response.error = job_data["error"]

    return response


@router.post("/{video_id}/chunk", response_model=Dict[str, Any])
async def chunk_transcript(
    video_id: str,
    max_tokens: int = 3000
):
    """
    Get transcript in chunks for large content.

    Args:
        video_id: YouTube video ID
        max_tokens: Maximum tokens per chunk

    Returns:
        Chunked transcript data

    Raises:
        HTTPException: 404 if no transcript is available for the video.
    """
    # Get transcript first
    result = await transcript_service.extract_transcript(video_id)

    if not result.success or not result.transcript:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Transcript not available for this video"
        )

    # Clean and chunk transcript
    cleaned = transcript_processor.clean_transcript(result.transcript)
    chunks = transcript_processor.chunk_transcript(cleaned, max_tokens)

    return {
        "video_id": video_id,
        "total_chunks": len(chunks),
        "chunks": [chunk.model_dump() for chunk in chunks],
        "metadata": {
            "total_words": len(cleaned.split()),
            "extraction_method": result.method.value
        }
    }


@router.get("/cache/stats", response_model=Dict[str, Any])
async def get_cache_stats():
    """Get cache statistics for monitoring."""
    # NOTE(review): get_stats() is called synchronously while set() is
    # awaited elsewhere — presumably a sync method on the client; confirm.
    return cache_client.get_stats()


# ====== DUAL TRANSCRIPT ENDPOINTS ======

@router.post("/dual/extract", response_model=JobResponse)
async def extract_dual_transcript(
    request: DualTranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start dual transcript extraction job.

    Supports YouTube captions, Whisper AI transcription, or both for comparison.

    Args:
        request: Dual transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Initialize job status
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing dual transcript extraction...",
        "source": request.transcript_source.value
    }

    # Start background extraction
    background_tasks.add_task(
        extract_dual_transcript_job,
        job_id=job_id,
        request=request
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message=f"Dual transcript extraction started ({request.transcript_source.value})"
    )


async def extract_dual_transcript_job(job_id: str, request: DualTranscriptRequest):
    """Background job for dual transcript extraction.

    Mutates ``job_storage[job_id]`` (entry created by the caller) and never
    raises: failures are recorded on the job entry.

    Args:
        job_id: Key into ``job_storage``.
        request: The original dual extraction request (carries the video URL
            and which source(s) to extract).
    """
    try:
        # Extract video ID from URL (assuming URL format like the frontend)
        video_id = extract_video_id_from_url(request.video_url)

        # Update job status
        job_storage[job_id].update({
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video URL..."
        })

        # Progress callback function: bump progress by 10 per message,
        # capped at 90 so only completion reaches 100.
        async def progress_callback(message: str):
            current_progress = job_storage[job_id]["progress_percentage"]
            new_progress = min(90, current_progress + 10)
            job_storage[job_id].update({
                "progress_percentage": new_progress,
                "current_step": message
            })

        # Extract transcript using dual service
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=request.video_url,
            source=request.transcript_source,
            progress_callback=progress_callback
        )

        if result.success:
            # Create API response from service result
            response = DualTranscriptResponse(
                video_id=result.video_id,
                source=result.source,
                youtube_transcript=result.youtube_transcript,
                youtube_metadata=result.youtube_metadata,
                whisper_transcript=result.whisper_transcript,
                whisper_metadata=result.whisper_metadata,
                comparison=result.comparison,
                processing_time_seconds=result.processing_time_seconds,
                success=result.success,
                error=result.error
            )

            job_storage[job_id].update({
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            })
        else:
            job_storage[job_id].update({
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": {"message": result.error or "Unknown error"}
            })

    except Exception as e:
        logger.error(f"Dual transcript job {job_id} failed: {str(e)}")
        job_storage[job_id].update({
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "DUAL_TRANSCRIPT_FAILED",
                "message": str(e)
            }
        })


@router.get("/dual/jobs/{job_id}", response_model=JobStatusResponse)
async def get_dual_transcript_status(job_id: str):
    """
    Get status of dual transcript extraction job.

    Args:
        job_id: Job ID from dual extract endpoint

    Returns:
        JobStatusResponse with current job status and results

    Raises:
        HTTPException: 404 if the job ID is unknown.
    """
    if job_id not in job_storage:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    job_data = job_storage[job_id]

    response = JobStatusResponse(
        job_id=job_id,
        status=job_data["status"],
        progress_percentage=job_data.get("progress_percentage", 0),
        current_step=job_data.get("current_step")
    )

    # Note: For dual transcripts, we'll return the result in a custom format
    # since JobStatusResponse expects TranscriptResponse, but we have DualTranscriptResponse
    if job_data["status"] == "completed" and "result" in job_data:
        # For now, we'll put the dual result in the error field as a workaround
        # In a real implementation, we'd create a new response model
        # NOTE(review): clients depend on this "dual_result" key — do not
        # change without coordinating with the frontend.
        response.error = {"dual_result": job_data["result"]}
    elif job_data["status"] == "failed" and "error" in job_data:
        response.error = job_data["error"]

    return response


@router.post("/dual/estimate", response_model=ProcessingTimeEstimate)
async def estimate_dual_transcript_time(
    video_url: str,
    transcript_source: TranscriptSource,
    video_duration_seconds: Optional[float] = None
):
    """
    Estimate processing time for dual transcript extraction.

    Args:
        video_url: YouTube video URL
        transcript_source: Which transcript source(s) to estimate
        video_duration_seconds: Video duration if known (saves a metadata call)

    Returns:
        ProcessingTimeEstimate with time estimates
    """
    try:
        # If duration not provided, we'd need to get it from video metadata
        # For now, assume a default duration of 10 minutes for estimation
        if video_duration_seconds is None:
            video_duration_seconds = 600  # 10 minutes default

        estimates = dual_transcript_service.estimate_processing_time(
            video_duration_seconds,
            transcript_source
        )

        # Convert to ISO timestamp for estimated completion.
        # NOTE(review): naive local time — consider datetime.now(tz=UTC) if
        # clients compare this against their own clocks.
        estimated_completion = None
        if estimates.get("total"):
            completion_time = datetime.datetime.now() + datetime.timedelta(
                seconds=estimates["total"]
            )
            estimated_completion = completion_time.isoformat()

        return ProcessingTimeEstimate(
            youtube_seconds=estimates.get("youtube"),
            whisper_seconds=estimates.get("whisper"),
            total_seconds=estimates.get("total"),
            estimated_completion=estimated_completion
        )

    except Exception as e:
        logger.error(f"Failed to estimate processing time: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to estimate processing time: {str(e)}"
        )


@router.get("/dual/compare/{video_id}")
async def compare_transcript_sources(
    video_id: str,
    video_url: str
):
    """
    Compare YouTube captions vs Whisper transcription for a video.

    This is a convenience endpoint that forces both transcripts
    and returns detailed comparison metrics.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL

    Returns:
        Detailed comparison between transcript sources

    Raises:
        HTTPException: 500 on extraction failure, 400 when a comparison
            cannot be produced (one of the two transcripts is missing).
    """
    try:
        # Force both transcripts for comparison
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=video_url,
            source=TranscriptSource.BOTH
        )

        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to extract transcripts: {result.error}"
            )

        if not result.has_comparison:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Unable to generate comparison - both transcripts are required"
            )

        return {
            "video_id": video_id,
            "comparison": result.comparison.model_dump() if result.comparison else None,
            "youtube_available": result.has_youtube,
            "whisper_available": result.has_whisper,
            "processing_time_seconds": result.processing_time_seconds,
            "recommendation": result.comparison.recommendation if result.comparison else None
        }

    except HTTPException:
        # Re-raise our own deliberate HTTP errors untouched
        raise
    except Exception as e:
        logger.error(f"Failed to compare transcripts for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to compare transcripts: {str(e)}"
        )


def extract_video_id_from_url(url: str) -> str:
    """
    Extract YouTube video ID from various URL formats.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID

    Args:
        url: A YouTube video URL in one of the supported formats.

    Returns:
        The extracted video ID string.

    Raises:
        ValueError: If no pattern matches the URL.
    """
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)

    raise ValueError(f"Could not extract video ID from URL: {url}")