# youtube-summarizer/backend/api/transcripts.py

from fastapi import APIRouter, Depends, BackgroundTasks, HTTPException, status
from typing import Dict, Any, Optional
import time
import uuid
import logging
from backend.models.transcript import (
TranscriptRequest,
TranscriptResponse,
JobResponse,
JobStatusResponse,
# Dual transcript models
DualTranscriptRequest,
DualTranscriptResponse,
TranscriptSource,
ProcessingTimeEstimate
)
from backend.services.transcript_service import TranscriptService
from backend.services.transcript_processor import TranscriptProcessor
from backend.services.dual_transcript_service import DualTranscriptService
from backend.services.mock_cache import MockCacheClient
from backend.services.service_factory import ServiceFactory
from backend.core.exceptions import TranscriptExtractionError
# Module-level logger for this router.
logger = logging.getLogger(__name__)
# All routes below are mounted under /api/transcripts.
router = APIRouter(prefix="/api/transcripts", tags=["transcripts"])
# Shared service instances using factory
cache_client = ServiceFactory.create_cache_client()
transcript_service = ServiceFactory.create_transcript_service()
transcript_processor = TranscriptProcessor()
dual_transcript_service = DualTranscriptService()
# In-memory job storage (mock implementation): job_id -> status dict.
# NOTE(review): not shared across worker processes, lost on restart, and
# never pruned — entries accumulate for the life of the process.
job_storage: Dict[str, Dict[str, Any]] = {}
async def extract_transcript_job(job_id: str, video_id: str,
                                 language_preference: str,
                                 transcript_service: TranscriptService):
    """Background worker that extracts, cleans, and stores a transcript.

    Progress updates and the final outcome are written to the module-level
    ``job_storage`` dict under ``job_id``; clients poll them via the
    ``/jobs/{job_id}`` endpoint.

    Args:
        job_id: Identifier created by the submitting endpoint.
        video_id: YouTube video ID to extract.
        language_preference: Preferred language code for the transcript.
        transcript_service: Service instance performing the extraction.
    """
    try:
        # Mark the job as started.
        job_storage[job_id] = {
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video ID..."
        }
        # Mirror the initial status into the cache (1 hour TTL).
        await cache_client.set(f"job:{job_id}", job_storage[job_id], ttl=3600)

        # Alias the live record; mutations below are visible to pollers.
        record = job_storage[job_id]
        record["progress_percentage"] = 30
        record["current_step"] = "Extracting transcript..."
        extraction = await transcript_service.extract_transcript(video_id, language_preference)

        record["progress_percentage"] = 70
        record["current_step"] = "Processing content..."

        if extraction.success and extraction.transcript:
            text = transcript_processor.clean_transcript(extraction.transcript)
            text_stats = transcript_service.extract_metadata(text)
            payload = TranscriptResponse(
                video_id=video_id,
                transcript=text,
                segments=extraction.segments,  # segments passed through from the extraction result
                metadata=extraction.metadata,
                extraction_method=extraction.method.value,
                language=language_preference,
                word_count=text_stats["word_count"],
                cached=extraction.from_cache,
                processing_time_seconds=extraction.metadata.processing_time_seconds if extraction.metadata else 0
            )
            job_storage[job_id] = {
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": payload.model_dump()
            }
        else:
            job_storage[job_id] = {
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": extraction.error
            }
    except Exception as e:
        logger.error(f"Job {job_id} failed: {str(e)}")
        job_storage[job_id] = {
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "JOB_FAILED",
                "message": str(e)
            }
        }
@router.get("/{video_id}", response_model=TranscriptResponse)
async def get_transcript(
    video_id: str,
    language_preference: str = "en",
    include_metadata: bool = True
):
    """
    Get transcript for a YouTube video.

    Args:
        video_id: YouTube video ID
        language_preference: Preferred language code
        include_metadata: Whether to include metadata in the response

    Returns:
        TranscriptResponse with transcript and metadata; on extraction
        failure the response carries the error in-band (HTTP 200).
    """
    started = time.time()
    try:
        extraction = await transcript_service.extract_transcript(video_id, language_preference)

        if not (extraction.success and extraction.transcript):
            # Extraction failed: report the failure inside the response body
            # rather than via an HTTP error status.
            return TranscriptResponse(
                video_id=video_id,
                transcript=None,
                extraction_method="failed",
                language=language_preference,
                word_count=0,
                cached=False,
                processing_time_seconds=time.time() - started,
                error=extraction.error
            )

        cleaned = transcript_processor.clean_transcript(extraction.transcript)
        fields = {
            "video_id": video_id,
            "transcript": cleaned,
            "segments": extraction.segments,  # segments passed through from the extraction result
            "extraction_method": extraction.method.value,
            "language": language_preference,
            "word_count": len(cleaned.split()),
            "cached": extraction.from_cache,
            "processing_time_seconds": time.time() - started
        }
        if include_metadata and extraction.metadata:
            fields["metadata"] = extraction.metadata
        return TranscriptResponse(**fields)
    except Exception as e:
        logger.error(f"Failed to get transcript for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to extract transcript: {str(e)}"
        )
@router.post("/extract", response_model=JobResponse)
async def extract_transcript_async(
    request: TranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start an async transcript extraction job.

    Args:
        request: Transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with the job ID for status tracking
    """
    job_id = str(uuid.uuid4())

    # Seed the job record before scheduling the worker so a status poll
    # arriving right after this response still finds the job.
    # NOTE(review): the stored status is "pending" while the response below
    # says "processing" — confirm clients do not rely on these matching.
    initial_state = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing..."
    }
    job_storage[job_id] = initial_state

    background_tasks.add_task(
        extract_transcript_job,
        job_id=job_id,
        video_id=request.video_id,
        language_preference=request.language_preference,
        transcript_service=transcript_service
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message="Transcript extraction started"
    )
@router.get("/jobs/{job_id}", response_model=JobStatusResponse)
async def get_extraction_status(job_id: str):
    """
    Get the status of a transcript extraction job.

    Args:
        job_id: Job ID returned by the extract endpoint

    Returns:
        JobStatusResponse with the current job status

    Raises:
        HTTPException: 404 when the job ID is unknown.
    """
    job = job_storage.get(job_id)
    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    status_response = JobStatusResponse(
        job_id=job_id,
        status=job["status"],
        progress_percentage=job.get("progress_percentage", 0),
        current_step=job.get("current_step")
    )
    # Attach the terminal payload: a parsed result on success, the raw
    # error mapping on failure.
    if job["status"] == "completed" and "result" in job:
        status_response.result = TranscriptResponse(**job["result"])
    elif job["status"] == "failed" and "error" in job:
        status_response.error = job["error"]
    return status_response
@router.post("/{video_id}/chunk", response_model=Dict[str, Any])
async def chunk_transcript(
    video_id: str,
    max_tokens: int = 3000
):
    """
    Get a transcript in chunks for large content.

    Args:
        video_id: YouTube video ID
        max_tokens: Maximum tokens per chunk

    Returns:
        Chunked transcript data

    Raises:
        HTTPException: 404 when no transcript is available.
    """
    extraction = await transcript_service.extract_transcript(video_id)
    if not (extraction.success and extraction.transcript):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Transcript not available for this video"
        )

    # Normalize the raw transcript, then split it into token-bounded pieces.
    cleaned = transcript_processor.clean_transcript(extraction.transcript)
    pieces = transcript_processor.chunk_transcript(cleaned, max_tokens)
    return {
        "video_id": video_id,
        "total_chunks": len(pieces),
        "chunks": [piece.model_dump() for piece in pieces],
        "metadata": {
            "total_words": len(cleaned.split()),
            "extraction_method": extraction.method.value
        }
    }
@router.get("/cache/stats", response_model=Dict[str, Any])
async def get_cache_stats():
    """Return cache statistics for monitoring."""
    stats = cache_client.get_stats()
    return stats
# ====== DUAL TRANSCRIPT ENDPOINTS ======
@router.post("/dual/extract", response_model=JobResponse)
async def extract_dual_transcript(
    request: DualTranscriptRequest,
    background_tasks: BackgroundTasks
):
    """
    Start a dual transcript extraction job.

    Supports YouTube captions, Whisper AI transcription, or both for
    comparison.

    Args:
        request: Dual transcript extraction request
        background_tasks: FastAPI background tasks

    Returns:
        JobResponse with the job ID for status tracking
    """
    job_id = str(uuid.uuid4())
    source_name = request.transcript_source.value

    # Register the job before handing off to the background worker.
    job_storage[job_id] = {
        "status": "pending",
        "progress_percentage": 0,
        "current_step": "Initializing dual transcript extraction...",
        "source": source_name
    }

    background_tasks.add_task(
        extract_dual_transcript_job,
        job_id=job_id,
        request=request
    )

    return JobResponse(
        job_id=job_id,
        status="processing",
        message=f"Dual transcript extraction started ({source_name})"
    )
async def extract_dual_transcript_job(job_id: str, request: DualTranscriptRequest):
    """Background job for dual transcript extraction.

    Runs YouTube/Whisper (or both) extraction via ``dual_transcript_service``
    and records progress and the final outcome in the module-level
    ``job_storage`` entry for ``job_id``.

    Args:
        job_id: Identifier of the pre-registered job entry.
        request: The original request carrying the video URL and source choice.
    """
    try:
        # Derive the canonical video ID from whatever URL form the client sent.
        video_id = extract_video_id_from_url(request.video_url)

        job_storage[job_id].update({
            "status": "processing",
            "progress_percentage": 10,
            "current_step": "Validating video URL..."
        })

        async def progress_callback(message: str):
            # Advance progress in 10% steps, saturating at 90% so that
            # only successful completion reports 100%.
            current_progress = job_storage[job_id]["progress_percentage"]
            new_progress = min(90, current_progress + 10)
            job_storage[job_id].update({
                "progress_percentage": new_progress,
                "current_step": message
            })

        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=request.video_url,
            source=request.transcript_source,
            progress_callback=progress_callback
        )

        if result.success:
            # Build the API response model from the service result.
            response = DualTranscriptResponse(
                video_id=result.video_id,
                source=result.source,
                youtube_transcript=result.youtube_transcript,
                youtube_metadata=result.youtube_metadata,
                whisper_transcript=result.whisper_transcript,
                whisper_metadata=result.whisper_metadata,
                comparison=result.comparison,
                processing_time_seconds=result.processing_time_seconds,
                success=result.success,
                error=result.error
            )
            job_storage[job_id].update({
                "status": "completed",
                "progress_percentage": 100,
                "current_step": "Complete",
                "result": response.model_dump()
            })
        else:
            # Use the same {code, message} error shape as the exception path
            # below so status consumers only have to handle one format.
            job_storage[job_id].update({
                "status": "failed",
                "progress_percentage": 0,
                "current_step": "Failed",
                "error": {
                    "code": "DUAL_TRANSCRIPT_FAILED",
                    "message": result.error or "Unknown error"
                }
            })
    except Exception as e:
        logger.error(f"Dual transcript job {job_id} failed: {str(e)}")
        job_storage[job_id].update({
            "status": "failed",
            "progress_percentage": 0,
            "current_step": "Failed",
            "error": {
                "code": "DUAL_TRANSCRIPT_FAILED",
                "message": str(e)
            }
        })
@router.get("/dual/jobs/{job_id}", response_model=JobStatusResponse)
async def get_dual_transcript_status(job_id: str):
    """
    Get the status of a dual transcript extraction job.

    Args:
        job_id: Job ID from the dual extract endpoint

    Returns:
        JobStatusResponse with the current job status and results

    Raises:
        HTTPException: 404 when the job ID is unknown.
    """
    job = job_storage.get(job_id)
    if job is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Job {job_id} not found"
        )

    response = JobStatusResponse(
        job_id=job_id,
        status=job["status"],
        progress_percentage=job.get("progress_percentage", 0),
        current_step=job.get("current_step")
    )
    # HACK: JobStatusResponse.result expects a TranscriptResponse, but this
    # job produces a DualTranscriptResponse, so the completed payload is
    # smuggled through the error field under the "dual_result" key.
    # A dedicated dual-job status model would remove this workaround.
    if job["status"] == "completed" and "result" in job:
        response.error = {"dual_result": job["result"]}
    elif job["status"] == "failed" and "error" in job:
        response.error = job["error"]
    return response
@router.post("/dual/estimate", response_model=ProcessingTimeEstimate)
async def estimate_dual_transcript_time(
    video_url: str,
    transcript_source: TranscriptSource,
    video_duration_seconds: Optional[float] = None
):
    """
    Estimate processing time for dual transcript extraction.

    Args:
        video_url: YouTube video URL
        transcript_source: Which transcript source(s) to estimate
        video_duration_seconds: Video duration if known (saves a metadata call)

    Returns:
        ProcessingTimeEstimate with time estimates
    """
    try:
        # Without a known duration fall back to a 10-minute default rather
        # than fetching video metadata just for an estimate.
        duration = video_duration_seconds if video_duration_seconds is not None else 600
        estimates = dual_transcript_service.estimate_processing_time(
            duration, transcript_source
        )

        import datetime
        # NOTE(review): naive local time; an aware UTC timestamp would be
        # less ambiguous for API clients — confirm before changing.
        estimated_completion = None
        total = estimates.get("total")
        if total:
            completion_time = datetime.datetime.now() + datetime.timedelta(seconds=total)
            estimated_completion = completion_time.isoformat()

        return ProcessingTimeEstimate(
            youtube_seconds=estimates.get("youtube"),
            whisper_seconds=estimates.get("whisper"),
            total_seconds=estimates.get("total"),
            estimated_completion=estimated_completion
        )
    except Exception as e:
        logger.error(f"Failed to estimate processing time: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to estimate processing time: {str(e)}"
        )
@router.get("/dual/compare/{video_id}")
async def compare_transcript_sources(
    video_id: str,
    video_url: str
):
    """
    Compare YouTube captions vs Whisper transcription for a video.

    Convenience endpoint that forces extraction of both transcripts and
    returns detailed comparison metrics.

    Args:
        video_id: YouTube video ID
        video_url: Full YouTube video URL

    Returns:
        Detailed comparison between transcript sources

    Raises:
        HTTPException: 500 when extraction fails, 400 when a comparison
            cannot be generated.
    """
    try:
        # Always request BOTH sources so a comparison can be computed.
        result = await dual_transcript_service.get_transcript(
            video_id=video_id,
            video_url=video_url,
            source=TranscriptSource.BOTH
        )

        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to extract transcripts: {result.error}"
            )
        if not result.has_comparison:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Unable to generate comparison - both transcripts are required"
            )

        comparison = result.comparison
        return {
            "video_id": video_id,
            "comparison": comparison.model_dump() if comparison else None,
            "youtube_available": result.has_youtube,
            "whisper_available": result.has_whisper,
            "processing_time_seconds": result.processing_time_seconds,
            "recommendation": comparison.recommendation if comparison else None
        }
    except HTTPException:
        # Re-raise FastAPI errors untouched so their status codes survive.
        raise
    except Exception as e:
        logger.error(f"Failed to compare transcripts for {video_id}: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to compare transcripts: {str(e)}"
        )
def extract_video_id_from_url(url: str) -> str:
    """
    Extract a YouTube video ID from various URL formats.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    Args:
        url: A YouTube video URL in any of the supported shapes.

    Returns:
        The extracted video ID.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    import re
    # Each pattern captures the ID up to the next URL delimiter (&, ?, #,
    # / or end of string); patterns are tried in order, first match wins.
    # Excluding "/" keeps a trailing path segment out of the captured ID.
    patterns = [
        r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/shorts\/|youtube\.com\/live\/)([^&\n?#\/]+)',
        r'youtube\.com.*[?&]v=([^&\n?#]+)'
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")