# youtube-summarizer/backend/integrations/langchain_tools.py

"""
LangChain integration for YouTube Summarizer API
Provides LangChain-compatible tools and wrappers for agent frameworks
"""

import json
import logging
from typing import Any, Dict, List, Optional, Type
from datetime import datetime

try:
    from langchain.tools import BaseTool
    from langchain.callbacks.manager import AsyncCallbackManagerForToolRun, CallbackManagerForToolRun
    from pydantic import BaseModel, Field
    LANGCHAIN_AVAILABLE = True
except ImportError:
    # Graceful fallback when LangChain is not installed: define minimal
    # stand-ins so the schema and tool classes below can still be evaluated.
    class BaseTool:
        """Mock BaseTool for when LangChain is not available"""
        name: str = ""
        description: str = ""

    class BaseModel:
        """Mock BaseModel for when Pydantic from LangChain is not available"""
        pass

    def Field(*args, **kwargs):
        """Mock of pydantic's ``Field`` used when pydantic is not importable.

        Accepts the same call shapes this module uses with the real ``Field``:
        an optional positional default (``...`` meaning "required") plus
        arbitrary keyword metadata. Positional arguments must be accepted,
        otherwise ``Field(..., description=...)`` raises ``TypeError`` while
        the schema classes are being defined.

        Returns:
            The ``default`` keyword if given, else the positional default
            unless it is ``...``, else ``None``. All other metadata is ignored.
        """
        if "default" in kwargs:
            return kwargs["default"]
        if args and args[0] is not ...:
            return args[0]
        return None

    # The callback manager types are only used inside Optional[...] annotations.
    CallbackManagerForToolRun = None
    AsyncCallbackManagerForToolRun = None
    LANGCHAIN_AVAILABLE = False

# Import backend services
# The backend services are optional. If any of these imports fails, the
# imported names stay undefined and BACKEND_SERVICES_AVAILABLE is False;
# the tool classes below check the flag before using the services.
try:
    from ..services.dual_transcript_service import DualTranscriptService
    from ..services.summary_pipeline import SummaryPipeline
    from ..services.batch_processing_service import BatchProcessingService
    from ..models.transcript import TranscriptSource, WhisperModelSize
    from ..models.batch import BatchJobStatus
    BACKEND_SERVICES_AVAILABLE = True
except ImportError:
    BACKEND_SERVICES_AVAILABLE = False

# Module-level logger shared by every tool in this file.
logger = logging.getLogger(__name__)

# Input schemas for LangChain tools
class TranscriptExtractionInput(BaseModel):
    """Input schema for transcript extraction"""
    # Used as the ``args_schema`` of YouTubeTranscriptTool.
    # ``...`` as the first argument marks the field as required (no default).
    video_url: str = Field(..., description="YouTube video URL to extract transcript from")
    # Plain string, not an enum: the accepted values are listed only in the
    # description. The async extraction path upper-cases the value to look up
    # a TranscriptSource member.
    source: str = Field(
        default="youtube",
        description="Transcript source: 'youtube' (captions), 'whisper' (AI), or 'both' (comparison)"
    )
    # Plain string as well; upper-cased to look up a WhisperModelSize member.
    whisper_model: str = Field(
        default="base",
        description="Whisper model size: tiny, base, small, medium, large"
    )

class SummarizationInput(BaseModel):
    """Input schema for video summarization"""
    # Used as the ``args_schema`` of YouTubeSummarizationTool.
    # ``...`` as the first argument marks the field as required (no default).
    video_url: str = Field(..., description="YouTube video URL to summarize")
    # The allowed values are listed only in the description; nothing in this
    # schema restricts the string. The mock generator falls back to the
    # "standard" text for an unknown summary type.
    summary_type: str = Field(
        default="comprehensive",
        description="Summary type: brief, standard, comprehensive, or detailed"
    )
    # Echoed back in the response; the mock generator does not otherwise use it.
    format: str = Field(
        default="structured",
        description="Output format: structured, bullet_points, paragraph, or narrative"
    )
    # When False the response carries an empty ``key_points`` list.
    extract_key_points: bool = Field(default=True, description="Whether to extract key points")

class BatchProcessingInput(BaseModel):
    """Input schema for batch processing"""
    # Used as the ``args_schema`` of YouTubeBatchTool.
    # ``...`` as the first argument marks the field as required (no default).
    video_urls: List[str] = Field(..., description="List of YouTube video URLs to process")
    # Optional; YouTubeBatchTool generates a timestamped name when this is empty.
    batch_name: Optional[str] = Field(None, description="Optional name for the batch")
    # Plain string; the values named in the description are not enforced here.
    processing_type: str = Field(default="summarize", description="Type of processing: transcribe or summarize")

class VideoSearchInput(BaseModel):
    """Input schema for video search"""
    # Used as the ``args_schema`` of YouTubeSearchTool.
    # ``...`` as the first argument marks the field as required (no default).
    query: str = Field(..., description="Search query for processed videos")
    # Applied by YouTubeSearchTool as a slice bound on the result list; no
    # lower or upper bound is enforced by this schema.
    limit: int = Field(default=10, description="Maximum number of results to return")

# LangChain Tools

class YouTubeTranscriptTool(BaseTool):
    """LangChain tool for extracting YouTube video transcripts"""

    name: str = "youtube_transcript"
    description: str = """Extract transcript from YouTube videos using captions or AI.

    Supports three modes:
    - 'youtube': Fast extraction using YouTube's captions
    - 'whisper': High-quality AI transcription using OpenAI Whisper
    - 'both': Comparison mode that provides both methods with quality analysis

    Input: video_url (required), source (optional), whisper_model (optional)
    Returns: Transcript text with metadata and quality metrics"""

    args_schema: Type[BaseModel] = TranscriptExtractionInput

    # Declared at class level so the attribute is a known field when BaseTool
    # is the real, pydantic-based LangChain class: pydantic models reject
    # assignment to undeclared attributes, which made __init__ raise.
    # With the mock BaseTool this is just an ordinary class attribute.
    dual_transcript_service: Any = None

    def __init__(self):
        """Create the tool and try to attach a DualTranscriptService.

        The service stays ``None`` when the backend imports failed or when
        constructing the service raises; the tool then returns mock data.
        """
        super().__init__()
        self.dual_transcript_service = None
        if BACKEND_SERVICES_AVAILABLE:
            try:
                self.dual_transcript_service = DualTranscriptService()
            except Exception as e:
                logger.warning(f"Could not initialize DualTranscriptService: {e}")

    def _run(
        self,
        video_url: str,
        source: str = "youtube",
        whisper_model: str = "base",
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        # For sync execution, we'll return a structured response
        return self._execute_extraction(video_url, source, whisper_model)

    async def _arun(
        self,
        video_url: str,
        source: str = "youtube",
        whisper_model: str = "base",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return await self._execute_extraction_async(video_url, source, whisper_model)

    def _resolve_enum_member(self, enum_cls: Any, raw_value: Any, fallback_name: str, label: str) -> Any:
        """Map a user-supplied string to a member of ``enum_cls``.

        The lookup is by upper-cased attribute name. Any value that cannot be
        resolved (unknown name, or a non-string without ``upper()``) falls
        back to ``enum_cls.<fallback_name>`` and is logged.
        """
        try:
            return getattr(enum_cls, raw_value.upper())
        except AttributeError:
            logger.warning("Unknown %s %r; falling back to %s", label, raw_value, fallback_name)
            return getattr(enum_cls, fallback_name)

    def _json_default(self, value: Any) -> Any:
        """``json.dumps`` fallback for objects it cannot encode natively.

        NOTE(review): the type returned by
        ``DualTranscriptService.extract_transcript`` is defined outside this
        file. If it is a pydantic-style model it is exported through
        ``model_dump()`` / ``dict()``; anything else is stringified so that
        serialization does not turn a successful extraction into an error.
        """
        for exporter_name in ("model_dump", "dict"):
            exporter = getattr(value, exporter_name, None)
            if callable(exporter):
                return exporter()
        return str(value)

    def _execute_extraction(self, video_url: str, source: str, whisper_model: str) -> str:
        """Execute transcript extraction (sync fallback).

        No extraction is performed on this path: with a service attached it
        returns a notice pointing to the async method, otherwise a mock
        transcript. Always returns a JSON string.
        """
        try:
            if self.dual_transcript_service and BACKEND_SERVICES_AVAILABLE:
                # This is a simplified sync wrapper - in production you'd want proper async handling
                result = {
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "whisper_model": whisper_model,
                    "message": "Transcript extraction initiated. Use async method for real processing.",
                    "note": "Sync execution provides limited functionality. Use arun() for full features."
                }
                return json.dumps(result, indent=2)
            else:
                # Mock response
                return json.dumps({
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "transcript": f"[Mock transcript for {video_url}] This is a sample transcript extracted using {source} method.",
                    "metadata": {
                        "duration": 300,
                        "word_count": 45,
                        "quality_score": 0.85,
                        "processing_time": 2.1
                    },
                    "mock": True
                }, indent=2)

        except Exception as e:
            # Logged for parity with the async path.
            logger.error(f"Error in transcript extraction: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_extraction_async(self, video_url: str, source: str, whisper_model: str) -> str:
        """Execute transcript extraction (async).

        With a service attached, builds a TranscriptRequest and awaits
        ``extract_transcript``; otherwise returns a mock transcript. Any
        exception is logged and reported as ``{"success": false, ...}``.
        Always returns a JSON string.
        """
        try:
            if self.dual_transcript_service and BACKEND_SERVICES_AVAILABLE:
                # Real async execution
                from ..models.transcript import TranscriptRequest

                # Convert each string to its enum independently, so that an
                # invalid whisper_model no longer discards a valid source
                # (and vice versa).
                transcript_source = self._resolve_enum_member(
                    TranscriptSource, source, "YOUTUBE", "source"
                )
                whisper_size = self._resolve_enum_member(
                    WhisperModelSize, whisper_model, "BASE", "whisper_model"
                )

                request = TranscriptRequest(
                    video_url=video_url,
                    source=transcript_source,
                    whisper_model=whisper_size
                )

                result = await self.dual_transcript_service.extract_transcript(request)

                return json.dumps({
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "result": result,
                    "langchain_tool": "youtube_transcript"
                }, indent=2, default=self._json_default)
            else:
                # Enhanced mock response for async
                return json.dumps({
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "transcript": f"[Async Mock] Comprehensive transcript extracted from {video_url} using {source}. This simulates real async processing with {whisper_model} model quality.",
                    "metadata": {
                        "duration": 847,
                        "word_count": 6420,
                        "quality_score": 0.92,
                        "processing_time": 45.2,
                        "confidence_score": 0.96
                    },
                    "mock": True,
                    "async_processed": True
                }, indent=2)

        except Exception as e:
            logger.error(f"Error in async transcript extraction: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)


class YouTubeSummarizationTool(BaseTool):
    """LangChain tool for summarizing YouTube videos"""

    name: str = "youtube_summarize"
    description: str = """Generate AI-powered summaries of YouTube videos with customizable options.

    Provides comprehensive summarization with multiple output formats:
    - Brief: Quick overview (2-3 sentences)
    - Standard: Balanced summary with key points
    - Comprehensive: Detailed analysis with insights
    - Detailed: Complete breakdown with timestamps

    Input: video_url (required), summary_type (optional), format (optional)
    Returns: Structured summary with key points, insights, and metadata"""

    args_schema: Type[BaseModel] = SummarizationInput

    # Declared at class level so the attribute is a known field when BaseTool
    # is the real, pydantic-based LangChain class: pydantic models reject
    # assignment to undeclared attributes, which made __init__ raise.
    # With the mock BaseTool this is just an ordinary class attribute.
    summary_pipeline: Any = None

    def __init__(self):
        """Create the tool without a summary pipeline.

        Note: SummaryPipeline requires proper dependency injection in a real
        implementation, so none is constructed here and every call is served
        by the mock generator.
        """
        super().__init__()
        self.summary_pipeline = None

    def _run(
        self,
        video_url: str,
        summary_type: str = "comprehensive",
        format: str = "structured",
        extract_key_points: bool = True,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return self._execute_summarization(video_url, summary_type, format, extract_key_points)

    async def _arun(
        self,
        video_url: str,
        summary_type: str = "comprehensive",
        format: str = "structured",
        extract_key_points: bool = True,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return await self._execute_summarization_async(video_url, summary_type, format, extract_key_points)

    def _execute_summarization(self, video_url: str, summary_type: str, format: str, extract_key_points: bool) -> str:
        """Execute summarization (sync).

        Returns the mock summary as a JSON string, or a
        ``{"success": false, "error": ...}`` document if building it raises.
        """
        try:
            # Mock comprehensive response
            mock_summary = self._generate_mock_summary(video_url, summary_type, format, extract_key_points)
            return json.dumps(mock_summary, indent=2)

        except Exception as e:
            # Logged for parity with the async path.
            logger.error(f"Error in summarization: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_summarization_async(self, video_url: str, summary_type: str, format: str, extract_key_points: bool) -> str:
        """Execute summarization (async).

        Returns the mock summary (flagged ``async_processed``) as a JSON
        string, or a ``{"success": false, "error": ...}`` document on error.
        """
        try:
            # TODO: real async execution through self.summary_pipeline would
            # go here; until then the mock below is returned in every case.

            # Enhanced mock for async
            mock_summary = self._generate_mock_summary(video_url, summary_type, format, extract_key_points, async_mode=True)
            return json.dumps(mock_summary, indent=2)

        except Exception as e:
            logger.error(f"Error in async summarization: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    def _generate_mock_summary(self, video_url: str, summary_type: str, format: str, extract_key_points: bool, async_mode: bool = False) -> Dict[str, Any]:
        """Generate mock summary response.

        Args:
            video_url: Echoed back; its last 8 characters go into the title.
            summary_type: Selects the canned text; unknown values use "standard".
            format: Echoed back unchanged.
            extract_key_points: When False, ``key_points`` is an empty list.
            async_mode: Switches the reported processing time and the
                ``async_processed`` flag.

        Returns:
            A JSON-serializable dict describing the mock summary.
        """
        summaries = {
            "brief": "This video provides a concise overview of advanced techniques and practical applications.",
            "standard": "The video explores key concepts and methodologies, providing practical examples and real-world applications. The presenter demonstrates step-by-step approaches and discusses common challenges and solutions.",
            "comprehensive": "This comprehensive video tutorial delves deep into advanced concepts, providing detailed explanations, practical demonstrations, and real-world case studies. The content covers theoretical foundations, implementation strategies, best practices, and troubleshooting techniques. Key insights include performance optimization, scalability considerations, and industry standards.",
            "detailed": "An extensive exploration of the subject matter, beginning with foundational concepts and progressing through advanced topics. The video includes detailed technical explanations, comprehensive examples, practical implementations, and thorough analysis of various approaches. Multiple perspectives are presented, along with pros and cons of different methodologies, performance benchmarks, and detailed troubleshooting guides."
        }

        # Resolve the text once; it feeds both the summary and its word count.
        summary_text = summaries.get(summary_type, summaries["standard"])

        key_points = [
            "Introduction to core concepts and terminology",
            "Practical implementation strategies and best practices",
            "Common challenges and proven solution approaches",
            "Performance optimization techniques and benchmarks",
            "Real-world case studies and industry applications",
            "Troubleshooting guide and error resolution methods"
        ] if extract_key_points else []

        return {
            "success": True,
            "video_url": video_url,
            "summary_type": summary_type,
            "format": format,
            "summary": summary_text,
            "key_points": key_points,
            "insights": [
                "Strong educational value with practical applications",
                "Well-structured content with logical progression",
                "Comprehensive coverage of advanced topics"
            ],
            "metadata": {
                "video_title": f"Tutorial Video - {video_url[-8:]}",
                "duration": 847,
                "processing_time": 23.4 if async_mode else 5.2,
                "quality_score": 0.94,
                "confidence_score": 0.91,
                "word_count": len(summary_text.split()),
                "generated_at": datetime.now().isoformat()
            },
            "langchain_tool": "youtube_summarize",
            "mock": True,
            "async_processed": async_mode
        }


class YouTubeBatchTool(BaseTool):
    """LangChain tool for batch processing multiple YouTube videos"""

    name: str = "youtube_batch"
    description: str = """Process multiple YouTube videos in batch mode for efficient bulk operations.

    Supports batch transcription and summarization of video lists:
    - Parallel processing for faster completion
    - Progress tracking for all videos in batch
    - Consolidated results with individual video status
    - Cost optimization through batch processing

    Input: video_urls (list, required), batch_name (optional), processing_type (optional)
    Returns: Batch job details with processing status and results"""

    args_schema: Type[BaseModel] = BatchProcessingInput

    # Declared at class level so the attribute is a known field when BaseTool
    # is the real, pydantic-based LangChain class: pydantic models reject
    # assignment to undeclared attributes, which made __init__ raise.
    # With the mock BaseTool this is just an ordinary class attribute.
    batch_service: Any = None

    def __init__(self):
        """Create the tool and try to attach a BatchProcessingService.

        The service stays ``None`` when the backend imports failed or when
        constructing the service raises.
        """
        super().__init__()
        self.batch_service = None
        if BACKEND_SERVICES_AVAILABLE:
            try:
                self.batch_service = BatchProcessingService()
            except Exception as e:
                logger.warning(f"Could not initialize BatchProcessingService: {e}")

    def _run(
        self,
        video_urls: List[str],
        batch_name: Optional[str] = None,
        processing_type: str = "summarize",
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return self._execute_batch_processing(video_urls, batch_name, processing_type)

    async def _arun(
        self,
        video_urls: List[str],
        batch_name: Optional[str] = None,
        processing_type: str = "summarize",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return await self._execute_batch_processing_async(video_urls, batch_name, processing_type)

    def _execute_batch_processing(self, video_urls: List[str], batch_name: Optional[str], processing_type: str) -> str:
        """Execute batch processing (sync).

        Returns a mock "queued" batch job as a JSON string. An empty
        ``video_urls`` or any exception yields
        ``{"success": false, "error": ...}`` instead.
        """
        try:
            # An empty batch has nothing to process; report it rather than
            # announcing a successfully created zero-video job.
            if not video_urls:
                return json.dumps(
                    {"success": False, "error": "video_urls must contain at least one URL"},
                    indent=2
                )

            batch_id = f"langchain_batch_{int(datetime.now().timestamp())}"
            batch_name = batch_name or f"LangChain Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}"

            return json.dumps({
                "success": True,
                "batch_id": batch_id,
                "batch_name": batch_name,
                "processing_type": processing_type,
                "video_count": len(video_urls),
                "status": "queued",
                "estimated_completion": f"{len(video_urls) * 2} minutes",
                "videos": video_urls,
                "message": f"Batch job created with {len(video_urls)} videos",
                "langchain_tool": "youtube_batch",
                "mock": True
            }, indent=2)

        except Exception as e:
            # Logged for parity with the async path.
            logger.error(f"Error in batch processing: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_batch_processing_async(self, video_urls: List[str], batch_name: Optional[str], processing_type: str) -> str:
        """Execute batch processing (async).

        Returns a mock "processing" batch job as a JSON string. An empty
        ``video_urls`` or any exception yields
        ``{"success": false, "error": ...}`` instead.
        """
        try:
            # An empty batch has nothing to process; report it rather than
            # announcing a successfully started zero-video job.
            if not video_urls:
                return json.dumps(
                    {"success": False, "error": "video_urls must contain at least one URL"},
                    indent=2
                )

            # TODO: real async batch processing through self.batch_service
            # would go here; until then the mock below is returned.

            batch_id = f"langchain_batch_async_{int(datetime.now().timestamp())}"
            batch_name = batch_name or f"LangChain Async Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}"

            return json.dumps({
                "success": True,
                "batch_id": batch_id,
                "batch_name": batch_name,
                "processing_type": processing_type,
                "video_count": len(video_urls),
                "status": "processing",
                "progress": 0.15,
                "completed_videos": 0,
                "failed_videos": 0,
                # One decimal place keeps float noise out of the message.
                "estimated_completion": f"{len(video_urls) * 1.8:.1f} minutes",
                "videos": video_urls,
                "message": f"Async batch processing started for {len(video_urls)} videos",
                "langchain_tool": "youtube_batch",
                "mock": True,
                "async_processed": True
            }, indent=2)

        except Exception as e:
            logger.error(f"Error in async batch processing: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)


class YouTubeSearchTool(BaseTool):
    """LangChain tool for searching processed YouTube videos"""

    name: str = "youtube_search"
    description: str = """Search through previously processed YouTube videos and summaries.

    Provides intelligent search across:
    - Video titles and descriptions
    - Generated summaries and transcripts
    - Key points and insights
    - Metadata and tags

    Input: query (required), limit (optional)
    Returns: Ranked search results with relevance scores and metadata"""

    args_schema: Type[BaseModel] = VideoSearchInput

    def _run(
        self,
        query: str,
        limit: int = 10,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return self._execute_search(query, limit)

    async def _arun(
        self,
        query: str,
        limit: int = 10,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution.

        Returns a JSON string. ``run_manager`` is accepted for LangChain
        compatibility and is not used.
        """
        return await self._execute_search_async(query, limit)

    def _execute_search(self, query: str, limit: int) -> str:
        """Execute search (sync).

        Returns mock results as a JSON string, or a
        ``{"success": false, "error": ...}`` document if building them raises.
        """
        try:
            mock_results = self._generate_mock_search_results(query, limit)
            return json.dumps({
                "success": True,
                "query": query,
                "limit": limit,
                "total_results": len(mock_results),
                "results": mock_results,
                "search_time": 0.08,
                "langchain_tool": "youtube_search",
                "mock": True
            }, indent=2)

        except Exception as e:
            # Logged for parity with the async path.
            logger.error(f"Error in search: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_search_async(self, query: str, limit: int) -> str:
        """Execute search (async).

        Returns the enhanced mock results as a JSON string, or a
        ``{"success": false, "error": ...}`` document on error.
        """
        try:
            # Enhanced mock for async with more sophisticated results
            mock_results = self._generate_mock_search_results(query, limit, enhanced=True)
            return json.dumps({
                "success": True,
                "query": query,
                "limit": limit,
                "total_results": len(mock_results),
                "results": mock_results,
                "search_time": 0.05,  # Faster async search
                "relevance_algorithm": "semantic_similarity_v2",
                "langchain_tool": "youtube_search",
                "mock": True,
                "async_processed": True
            }, indent=2)

        except Exception as e:
            logger.error(f"Error in async search: {e}")
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    def _generate_mock_search_results(self, query: str, limit: int, enhanced: bool = False) -> List[Dict[str, Any]]:
        """Generate mock search results.

        Args:
            query: Interpolated into the titles, summaries, key points and tags.
            limit: Maximum number of results; negative values yield no results.
            enhanced: When True, each result gets extra scoring, engagement
                and tag fields.

        Returns:
            At most ``limit`` result dicts, taken from two canned entries.
        """
        base_results = [
            {
                "video_id": "dQw4w9WgXcQ",
                "title": f"Advanced Tutorial: {query.title()} Fundamentals",
                "channel": "TechEducation Pro",
                "duration": 847,
                "relevance_score": 0.95,
                "summary": f"Comprehensive guide covering {query} concepts with practical examples and real-world applications.",
                "url": "https://youtube.com/watch?v=dQw4w9WgXcQ",
                "key_points": [
                    f"Introduction to {query}",
                    "Implementation strategies",
                    "Best practices and optimization"
                ],
                "processed_at": "2024-01-20T10:30:00Z"
            },
            {
                "video_id": "abc123xyz789",
                "title": f"Mastering {query.title()}: Expert Techniques",
                "channel": "DevSkills Academy",
                "duration": 1200,
                "relevance_score": 0.87,
                "summary": f"Deep dive into advanced {query} techniques with expert insights and industry case studies.",
                "url": "https://youtube.com/watch?v=abc123xyz789",
                "key_points": [
                    f"Advanced {query} patterns",
                    "Performance optimization",
                    "Industry best practices"
                ],
                "processed_at": "2024-01-19T15:45:00Z"
            }
        ]

        if enhanced:
            # Add more sophisticated mock data for async results
            for result in base_results:
                result.update({
                    "semantic_score": result["relevance_score"] * 0.98,
                    "content_quality": 0.92,
                    "engagement_metrics": {
                        "views": 125680,
                        "likes": 4521,
                        "comments": 387
                    },
                    "tags": [query.lower(), "tutorial", "advanced", "education"],
                    "transcript_matches": 15,
                    "summary_matches": 8
                })

        # A negative bound would slice from the end of the list (e.g. [:-1]
        # returns one result); treat it as "no results" instead.
        if limit is not None and limit < 0:
            limit = 0

        return base_results[:limit]


# Tool collection for easy registration

def get_youtube_langchain_tools() -> List[BaseTool]:
    """Instantiate one of each YouTube Summarizer tool.

    Logs a warning when LangChain is missing, but still builds the tools
    (they then derive from the mock BaseTool).

    Returns:
        Fresh transcript, summarization, batch and search tool instances,
        in that order.
    """
    if not LANGCHAIN_AVAILABLE:
        logger.warning("LangChain not available. Tools will have limited functionality.")

    tool_classes = (
        YouTubeTranscriptTool,
        YouTubeSummarizationTool,
        YouTubeBatchTool,
        YouTubeSearchTool,
    )
    return [tool_cls() for tool_cls in tool_classes]

# Utility functions for LangChain integration

def create_youtube_toolkit():
    """Build the full tool list for a LangChain agent.

    Returns:
        The list from ``get_youtube_langchain_tools()``, or ``None`` (after
        logging an error) when LangChain is not installed.
    """
    if LANGCHAIN_AVAILABLE:
        return get_youtube_langchain_tools()

    logger.error("LangChain not available. Cannot create toolkit.")
    return None

def register_youtube_tools_with_agent(agent):
    """Register YouTube tools with a LangChain agent.

    The agent is handled generically: tools are appended to ``agent.tools``
    if that attribute exists, otherwise passed to ``agent.add_tools``.

    Args:
        agent: Any object exposing a ``tools`` list or an ``add_tools`` method.

    Returns:
        True if the tools were handed to the agent. False if LangChain is
        unavailable, the agent offers neither registration hook, or
        registration raised.
    """
    if not LANGCHAIN_AVAILABLE:
        logger.error("LangChain not available. Cannot register tools.")
        return False

    try:
        tools = get_youtube_langchain_tools()
        # Implementation depends on the specific agent type
        # This is a generic interface
        if hasattr(agent, 'tools'):
            agent.tools.extend(tools)
        elif hasattr(agent, 'add_tools'):
            agent.add_tools(tools)
        else:
            # Nothing was registered, so do not report success.
            logger.error(
                "Cannot register tools: agent of type %s has neither 'tools' nor 'add_tools'.",
                type(agent).__name__,
            )
            return False
        return True
    except Exception as e:
        logger.error(f"Error registering tools: {e}")
        return False

# Example usage and documentation

if __name__ == "__main__":
    # Demo: build every tool and print its name with a 50-character
    # preview of its description.
    demo_tools = get_youtube_langchain_tools()
    print(f"Created {len(demo_tools)} LangChain tools:")
    for demo_tool in demo_tools:
        preview = demo_tool.description[:50]
        print(f"- {demo_tool.name}: {preview}...")