youtube-summarizer/backend/integrations/langchain_tools.py

619 lines
26 KiB
Python

"""
LangChain integration for YouTube Summarizer API
Provides LangChain-compatible tools and wrappers for agent frameworks
"""
import json
import logging
from typing import Any, Dict, List, Optional, Type
from datetime import datetime
try:
    from langchain.tools import BaseTool
    from langchain.callbacks.manager import (
        AsyncCallbackManagerForToolRun,
        CallbackManagerForToolRun,
    )
    from pydantic import BaseModel, Field

    LANGCHAIN_AVAILABLE = True
except ImportError:
    # Graceful fallback when LangChain is not installed: the stand-ins below
    # keep this module importable so the tool classes can still be defined.

    class BaseTool:
        """Mock BaseTool for when LangChain is not available"""

        name: str = ""
        description: str = ""

    class BaseModel:
        """Mock BaseModel for when Pydantic from LangChain is not available"""

    def Field(default=None, **kwargs):
        """Mock of ``pydantic.Field`` that simply yields the declared default.

        The schemas in this module call ``Field`` both positionally
        (``Field(..., description=...)``, ``Field(None, ...)``) and by keyword
        (``Field(default="x", ...)``), so the mock must accept both forms;
        a keyword-only signature raises ``TypeError`` at import time.

        Args:
            default: Default value of the field. ``...`` marks a required
                field, which has no default.
            **kwargs: Remaining ``Field`` options (``description`` etc.);
                accepted and ignored.

        Returns:
            ``None`` for required fields, otherwise ``default``.
        """
        return None if default is ... else default

    CallbackManagerForToolRun = None
    AsyncCallbackManagerForToolRun = None
    LANGCHAIN_AVAILABLE = False
# Import backend services
try:
from ..services.dual_transcript_service import DualTranscriptService
from ..services.summary_pipeline import SummaryPipeline
from ..services.batch_processing_service import BatchProcessingService
from ..models.transcript import TranscriptSource, WhisperModelSize
from ..models.batch import BatchJobStatus
BACKEND_SERVICES_AVAILABLE = True
except ImportError:
BACKEND_SERVICES_AVAILABLE = False
logger = logging.getLogger(__name__)
# Input schemas for LangChain tools
class TranscriptExtractionInput(BaseModel):
    """Input schema for transcript extraction"""

    # Required: the video to pull a transcript from.
    video_url: str = Field(
        ...,
        description="YouTube video URL to extract transcript from",
    )

    # Which extraction path to use; defaults to YouTube captions.
    source: str = Field(
        description="Transcript source: 'youtube' (captions), 'whisper' (AI), or 'both' (comparison)",
        default="youtube",
    )

    # Whisper model size; defaults to "base".
    whisper_model: str = Field(
        description="Whisper model size: tiny, base, small, medium, large",
        default="base",
    )
class SummarizationInput(BaseModel):
    """Input schema for video summarization"""

    # Required: the video to summarize.
    video_url: str = Field(
        ...,
        description="YouTube video URL to summarize",
    )

    # Level of detail of the summary; defaults to "comprehensive".
    summary_type: str = Field(
        description="Summary type: brief, standard, comprehensive, or detailed",
        default="comprehensive",
    )

    # Layout of the returned summary; defaults to "structured".
    format: str = Field(
        description="Output format: structured, bullet_points, paragraph, or narrative",
        default="structured",
    )

    # Toggle for including a key-point list in the result.
    extract_key_points: bool = Field(
        description="Whether to extract key points",
        default=True,
    )
class BatchProcessingInput(BaseModel):
    """Input schema for batch processing"""

    # Required: every video that belongs to the batch.
    video_urls: List[str] = Field(
        ...,
        description="List of YouTube video URLs to process",
    )

    # Human-readable label; the tool generates one when this is left unset.
    batch_name: Optional[str] = Field(
        None,
        description="Optional name for the batch",
    )

    # Operation applied to each video; defaults to "summarize".
    processing_type: str = Field(
        description="Type of processing: transcribe or summarize",
        default="summarize",
    )
class VideoSearchInput(BaseModel):
    """Input schema for video search"""

    # Required: free-text query matched against processed videos.
    query: str = Field(
        ...,
        description="Search query for processed videos",
    )

    # Upper bound on the number of hits returned; defaults to 10.
    limit: int = Field(
        description="Maximum number of results to return",
        default=10,
    )
# LangChain Tools
class YouTubeTranscriptTool(BaseTool):
    """LangChain tool for extracting YouTube video transcripts.

    When the backend services import successfully and a
    ``DualTranscriptService`` can be constructed, the async path performs a
    real extraction through it. Otherwise both the sync and async paths
    return mock payloads flagged with ``"mock": True``. Every execution path
    returns a JSON string; failures are reported as
    ``{"success": false, "error": ...}`` instead of raising.
    """

    name: str = "youtube_transcript"
    description: str = """Extract transcript from YouTube videos using captions or AI.
Supports three modes:
- 'youtube': Fast extraction using YouTube's captions
- 'whisper': High-quality AI transcription using OpenAI Whisper
- 'both': Comparison mode that provides both methods with quality analysis
Input: video_url (required), source (optional), whisper_model (optional)
Returns: Transcript text with metadata and quality metrics"""
    args_schema: Type[BaseModel] = TranscriptExtractionInput

    # Declared as a field because LangChain's BaseTool is a pydantic model and
    # rejects assignment to attributes that were never declared on the class.
    dual_transcript_service: Any = None

    def __init__(self, **kwargs: Any) -> None:
        """Create the tool and, when possible, its backing transcript service.

        Args:
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        # Only build a service when none was supplied and the backend imported.
        if self.dual_transcript_service is None and BACKEND_SERVICES_AVAILABLE:
            try:
                self.dual_transcript_service = DualTranscriptService()
            except Exception as e:
                # Best effort: fall back to mock responses rather than failing.
                logger.warning("Could not initialize DualTranscriptService: %s", e)

    def _run(
        self,
        video_url: str,
        source: str = "youtube",
        whisper_model: str = "base",
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution.

        Returns a JSON string. With a live service this is only an
        acknowledgement payload; the real extraction happens in ``_arun``.
        """
        return self._execute_extraction(video_url, source, whisper_model)

    async def _arun(
        self,
        video_url: str,
        source: str = "youtube",
        whisper_model: str = "base",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution; returns the extraction result as JSON."""
        return await self._execute_extraction_async(video_url, source, whisper_model)

    def _serialize_result(self, result: Any) -> Any:
        """Convert a service result into something ``json.dumps`` can handle.

        NOTE(review): the return type of ``extract_transcript`` is not visible
        from this module. Objects exposing ``model_dump()`` or ``dict()``
        (pydantic-style) are converted through those; anything else is
        returned unchanged.
        """
        for dumper_name in ("model_dump", "dict"):
            dumper = getattr(result, dumper_name, None)
            if callable(dumper):
                return dumper()
        return result

    def _execute_extraction(self, video_url: str, source: str, whisper_model: str) -> str:
        """Execute transcript extraction (sync fallback).

        Args:
            video_url: YouTube video URL.
            source: Requested transcript source name.
            whisper_model: Requested Whisper model size name.

        Returns:
            JSON string: an acknowledgement when a service is available, a
            mock transcript otherwise, or an error payload on failure.
        """
        try:
            if self.dual_transcript_service and BACKEND_SERVICES_AVAILABLE:
                # This is a simplified sync wrapper - in production you'd want
                # proper async handling. No extraction is started here.
                result = {
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "whisper_model": whisper_model,
                    "message": "Transcript extraction initiated. Use async method for real processing.",
                    "note": "Sync execution provides limited functionality. Use arun() for full features."
                }
                return json.dumps(result, indent=2)

            # Mock response
            return json.dumps({
                "success": True,
                "video_url": video_url,
                "source": source,
                "transcript": f"[Mock transcript for {video_url}] This is a sample transcript extracted using {source} method.",
                "metadata": {
                    "duration": 300,
                    "word_count": 45,
                    "quality_score": 0.85,
                    "processing_time": 2.1
                },
                "mock": True
            }, indent=2)
        except Exception as e:
            # Logged for parity with the async path.
            logger.error("Error in transcript extraction: %s", e)
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_extraction_async(self, video_url: str, source: str, whisper_model: str) -> str:
        """Execute transcript extraction (async).

        Args:
            video_url: YouTube video URL.
            source: Transcript source name, matched case-insensitively against
                ``TranscriptSource``; unknown names fall back to ``YOUTUBE``.
            whisper_model: Whisper model size name, matched case-insensitively
                against ``WhisperModelSize``; unknown names fall back to
                ``BASE``.

        Returns:
            JSON string with the service result, a mock transcript when no
            service is available, or an error payload on failure.
        """
        try:
            if self.dual_transcript_service and BACKEND_SERVICES_AVAILABLE:
                # Real async execution
                from ..models.transcript import TranscriptRequest

                # Convert strings to enums. Each value falls back on its own,
                # so an unknown model size does not discard a valid source.
                transcript_source = getattr(
                    TranscriptSource, source.upper(), TranscriptSource.YOUTUBE
                )
                whisper_size = getattr(
                    WhisperModelSize, whisper_model.upper(), WhisperModelSize.BASE
                )

                request = TranscriptRequest(
                    video_url=video_url,
                    source=transcript_source,
                    whisper_model=whisper_size
                )
                result = await self.dual_transcript_service.extract_transcript(request)

                # default=str is deliberate: nested values such as enums or
                # datetimes are emitted as strings instead of failing the call.
                return json.dumps({
                    "success": True,
                    "video_url": video_url,
                    "source": source,
                    "result": self._serialize_result(result),
                    "langchain_tool": "youtube_transcript"
                }, indent=2, default=str)

            # Enhanced mock response for async
            return json.dumps({
                "success": True,
                "video_url": video_url,
                "source": source,
                "transcript": f"[Async Mock] Comprehensive transcript extracted from {video_url} using {source}. This simulates real async processing with {whisper_model} model quality.",
                "metadata": {
                    "duration": 847,
                    "word_count": 6420,
                    "quality_score": 0.92,
                    "processing_time": 45.2,
                    "confidence_score": 0.96
                },
                "mock": True,
                "async_processed": True
            }, indent=2)
        except Exception as e:
            logger.error("Error in async transcript extraction: %s", e)
            return json.dumps({"success": False, "error": str(e)}, indent=2)
class YouTubeSummarizationTool(BaseTool):
    """LangChain tool for summarizing YouTube videos.

    No real summarization pipeline is wired in yet: both the sync and async
    paths return canned summaries flagged with ``"mock": True``. Every
    execution path returns a JSON string; failures are reported as
    ``{"success": false, "error": ...}`` instead of raising.
    """

    name: str = "youtube_summarize"
    description: str = """Generate AI-powered summaries of YouTube videos with customizable options.
Provides comprehensive summarization with multiple output formats:
- Brief: Quick overview (2-3 sentences)
- Standard: Balanced summary with key points
- Comprehensive: Detailed analysis with insights
- Detailed: Complete breakdown with timestamps
Input: video_url (required), summary_type (optional), format (optional)
Returns: Structured summary with key points, insights, and metadata"""
    args_schema: Type[BaseModel] = SummarizationInput

    # Declared as a field because LangChain's BaseTool is a pydantic model and
    # rejects assignment to attributes that were never declared on the class.
    summary_pipeline: Any = None

    def __init__(self, **kwargs: Any) -> None:
        """Create the tool.

        Args:
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        # Note: SummaryPipeline requires proper dependency injection in real
        # implementation, so no pipeline is constructed here.

    def _run(
        self,
        video_url: str,
        summary_type: str = "comprehensive",
        format: str = "structured",
        extract_key_points: bool = True,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution; returns the summary payload as JSON."""
        return self._execute_summarization(video_url, summary_type, format, extract_key_points)

    async def _arun(
        self,
        video_url: str,
        summary_type: str = "comprehensive",
        format: str = "structured",
        extract_key_points: bool = True,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution; returns the summary payload as JSON."""
        return await self._execute_summarization_async(video_url, summary_type, format, extract_key_points)

    def _execute_summarization(self, video_url: str, summary_type: str, format: str, extract_key_points: bool) -> str:
        """Execute summarization (sync); always produces the mock summary."""
        try:
            # Mock comprehensive response
            mock_summary = self._generate_mock_summary(video_url, summary_type, format, extract_key_points)
            return json.dumps(mock_summary, indent=2)
        except Exception as e:
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_summarization_async(self, video_url: str, summary_type: str, format: str, extract_key_points: bool) -> str:
        """Execute summarization (async); currently produces the mock summary."""
        try:
            # TODO: real async execution through self.summary_pipeline would go
            # here once a pipeline is available; until then the mock is used.
            mock_summary = self._generate_mock_summary(video_url, summary_type, format, extract_key_points, async_mode=True)
            return json.dumps(mock_summary, indent=2)
        except Exception as e:
            logger.error("Error in async summarization: %s", e)
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    def _generate_mock_summary(self, video_url: str, summary_type: str, format: str, extract_key_points: bool, async_mode: bool = False) -> Dict[str, Any]:
        """Generate mock summary response.

        Args:
            video_url: Video URL; its last 8 characters are used in the
                mock title.
            summary_type: One of brief/standard/comprehensive/detailed.
                Unknown values use the "standard" text, while the requested
                value is still echoed back in the payload.
            format: Requested output format; echoed back, not applied.
            extract_key_points: When false, ``key_points`` is an empty list.
            async_mode: Selects the async ``processing_time`` figure and sets
                the ``async_processed`` flag.

        Returns:
            JSON-serializable dict describing the mock summary.
        """
        summaries = {
            "brief": "This video provides a concise overview of advanced techniques and practical applications.",
            "standard": "The video explores key concepts and methodologies, providing practical examples and real-world applications. The presenter demonstrates step-by-step approaches and discusses common challenges and solutions.",
            "comprehensive": "This comprehensive video tutorial delves deep into advanced concepts, providing detailed explanations, practical demonstrations, and real-world case studies. The content covers theoretical foundations, implementation strategies, best practices, and troubleshooting techniques. Key insights include performance optimization, scalability considerations, and industry standards.",
            "detailed": "An extensive exploration of the subject matter, beginning with foundational concepts and progressing through advanced topics. The video includes detailed technical explanations, comprehensive examples, practical implementations, and thorough analysis of various approaches. Multiple perspectives are presented, along with pros and cons of different methodologies, performance benchmarks, and detailed troubleshooting guides."
        }
        # Resolve the text once; it feeds both the summary and its word count.
        summary_text = summaries.get(summary_type, summaries["standard"])

        key_points = [
            "Introduction to core concepts and terminology",
            "Practical implementation strategies and best practices",
            "Common challenges and proven solution approaches",
            "Performance optimization techniques and benchmarks",
            "Real-world case studies and industry applications",
            "Troubleshooting guide and error resolution methods"
        ] if extract_key_points else []

        return {
            "success": True,
            "video_url": video_url,
            "summary_type": summary_type,
            "format": format,
            "summary": summary_text,
            "key_points": key_points,
            "insights": [
                "Strong educational value with practical applications",
                "Well-structured content with logical progression",
                "Comprehensive coverage of advanced topics"
            ],
            "metadata": {
                "video_title": f"Tutorial Video - {video_url[-8:]}",
                "duration": 847,
                "processing_time": 23.4 if async_mode else 5.2,
                "quality_score": 0.94,
                "confidence_score": 0.91,
                "word_count": len(summary_text.split()),
                "generated_at": datetime.now().isoformat()
            },
            "langchain_tool": "youtube_summarize",
            "mock": True,
            "async_processed": async_mode
        }
class YouTubeBatchTool(BaseTool):
    """LangChain tool for batch processing multiple YouTube videos.

    A ``BatchProcessingService`` is constructed when the backend imports
    succeed, but no real processing is wired in yet: both paths return mock
    job descriptions flagged with ``"mock": True``. Every execution path
    returns a JSON string; failures (including an empty URL list) are
    reported as ``{"success": false, "error": ...}`` instead of raising.
    """

    name: str = "youtube_batch"
    description: str = """Process multiple YouTube videos in batch mode for efficient bulk operations.
Supports batch transcription and summarization of video lists:
- Parallel processing for faster completion
- Progress tracking for all videos in batch
- Consolidated results with individual video status
- Cost optimization through batch processing
Input: video_urls (list, required), batch_name (optional), processing_type (optional)
Returns: Batch job details with processing status and results"""
    args_schema: Type[BaseModel] = BatchProcessingInput

    # Declared as a field because LangChain's BaseTool is a pydantic model and
    # rejects assignment to attributes that were never declared on the class.
    batch_service: Any = None

    def __init__(self, **kwargs: Any) -> None:
        """Create the tool and, when possible, its backing batch service.

        Args:
            **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
        """
        super().__init__(**kwargs)
        # Only build a service when none was supplied and the backend imported.
        if self.batch_service is None and BACKEND_SERVICES_AVAILABLE:
            try:
                self.batch_service = BatchProcessingService()
            except Exception as e:
                # Best effort: fall back to mock responses rather than failing.
                logger.warning("Could not initialize BatchProcessingService: %s", e)

    def _run(
        self,
        video_urls: List[str],
        batch_name: Optional[str] = None,
        processing_type: str = "summarize",
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution; returns the batch job description as JSON."""
        return self._execute_batch_processing(video_urls, batch_name, processing_type)

    async def _arun(
        self,
        video_urls: List[str],
        batch_name: Optional[str] = None,
        processing_type: str = "summarize",
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution; returns the batch job description as JSON."""
        return await self._execute_batch_processing_async(video_urls, batch_name, processing_type)

    def _execute_batch_processing(self, video_urls: List[str], batch_name: Optional[str], processing_type: str) -> str:
        """Execute batch processing (sync).

        Args:
            video_urls: Videos to include; must not be empty.
            batch_name: Optional label; a timestamped one is generated when
                it is missing or empty.
            processing_type: Echoed back in the payload.

        Returns:
            JSON string describing a queued mock batch, or an error payload.
        """
        try:
            # A batch with nothing in it is a caller error, not a success.
            if not video_urls:
                return json.dumps({"success": False, "error": "video_urls must contain at least one URL"}, indent=2)

            video_count = len(video_urls)
            batch_id = f"langchain_batch_{int(datetime.now().timestamp())}"
            batch_name = batch_name or f"LangChain Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}"
            return json.dumps({
                "success": True,
                "batch_id": batch_id,
                "batch_name": batch_name,
                "processing_type": processing_type,
                "video_count": video_count,
                "status": "queued",
                "estimated_completion": f"{video_count * 2} minutes",
                "videos": video_urls,
                "message": f"Batch job created with {video_count} videos",
                "langchain_tool": "youtube_batch",
                "mock": True
            }, indent=2)
        except Exception as e:
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_batch_processing_async(self, video_urls: List[str], batch_name: Optional[str], processing_type: str) -> str:
        """Execute batch processing (async).

        Args:
            video_urls: Videos to include; must not be empty.
            batch_name: Optional label; a timestamped one is generated when
                it is missing or empty.
            processing_type: Echoed back in the payload.

        Returns:
            JSON string describing an in-progress mock batch, or an error
            payload.
        """
        try:
            # A batch with nothing in it is a caller error, not a success.
            if not video_urls:
                return json.dumps({"success": False, "error": "video_urls must contain at least one URL"}, indent=2)

            # TODO: real async batch processing through self.batch_service
            # would go here; until then a mock job description is returned.
            video_count = len(video_urls)
            batch_id = f"langchain_batch_async_{int(datetime.now().timestamp())}"
            batch_name = batch_name or f"LangChain Async Batch {datetime.now().strftime('%Y-%m-%d %H:%M')}"
            return json.dumps({
                "success": True,
                "batch_id": batch_id,
                "batch_name": batch_name,
                "processing_type": processing_type,
                "video_count": video_count,
                "status": "processing",
                "progress": 0.15,
                "completed_videos": 0,
                "failed_videos": 0,
                # One decimal place keeps float artefacts out of the text.
                "estimated_completion": f"{video_count * 1.8:.1f} minutes",
                "videos": video_urls,
                "message": f"Async batch processing started for {video_count} videos",
                "langchain_tool": "youtube_batch",
                "mock": True,
                "async_processed": True
            }, indent=2)
        except Exception as e:
            logger.error("Error in async batch processing: %s", e)
            return json.dumps({"success": False, "error": str(e)}, indent=2)
class YouTubeSearchTool(BaseTool):
    """LangChain tool for searching processed YouTube videos.

    No search backend is wired in: results are generated from two fixed mock
    entries whose text embeds the query, and every payload is flagged with
    ``"mock": True``. Every execution path returns a JSON string; failures
    are reported as ``{"success": false, "error": ...}`` instead of raising.
    """

    name: str = "youtube_search"
    description: str = """Search through previously processed YouTube videos and summaries.
Provides intelligent search across:
- Video titles and descriptions
- Generated summaries and transcripts
- Key points and insights
- Metadata and tags
Input: query (required), limit (optional)
Returns: Ranked search results with relevance scores and metadata"""
    args_schema: Type[BaseModel] = VideoSearchInput

    def _run(
        self,
        query: str,
        limit: int = 10,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous execution; returns the search payload as JSON."""
        return self._execute_search(query, limit)

    async def _arun(
        self,
        query: str,
        limit: int = 10,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Asynchronous execution; returns the enriched search payload as JSON."""
        return await self._execute_search_async(query, limit)

    def _execute_search(self, query: str, limit: int) -> str:
        """Execute search (sync) and wrap the mock results in a JSON envelope."""
        try:
            mock_results = self._generate_mock_search_results(query, limit)
            return json.dumps({
                "success": True,
                "query": query,
                "limit": limit,
                "total_results": len(mock_results),
                "results": mock_results,
                "search_time": 0.08,
                "langchain_tool": "youtube_search",
                "mock": True
            }, indent=2)
        except Exception as e:
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    async def _execute_search_async(self, query: str, limit: int) -> str:
        """Execute search (async) using the enriched mock results."""
        try:
            # Enhanced mock for async with more sophisticated results
            mock_results = self._generate_mock_search_results(query, limit, enhanced=True)
            return json.dumps({
                "success": True,
                "query": query,
                "limit": limit,
                "total_results": len(mock_results),
                "results": mock_results,
                "search_time": 0.05,  # Faster async search
                "relevance_algorithm": "semantic_similarity_v2",
                "langchain_tool": "youtube_search",
                "mock": True,
                "async_processed": True
            }, indent=2)
        except Exception as e:
            logger.error("Error in async search: %s", e)
            return json.dumps({"success": False, "error": str(e)}, indent=2)

    def _generate_mock_search_results(self, query: str, limit: int, enhanced: bool = False) -> List[Dict[str, Any]]:
        """Generate mock search results.

        Args:
            query: Search text, embedded in the mock titles, summaries,
                key points and (when enhanced) tags.
            limit: Maximum number of results. Zero or a negative value yields
                an empty list; ``None`` returns every mock entry.
            enhanced: When true, each returned entry also carries scoring,
                engagement and tag fields.

        Returns:
            At most ``limit`` mock result dicts.
        """
        base_results = [
            {
                "video_id": "dQw4w9WgXcQ",
                "title": f"Advanced Tutorial: {query.title()} Fundamentals",
                "channel": "TechEducation Pro",
                "duration": 847,
                "relevance_score": 0.95,
                "summary": f"Comprehensive guide covering {query} concepts with practical examples and real-world applications.",
                "url": "https://youtube.com/watch?v=dQw4w9WgXcQ",
                "key_points": [
                    f"Introduction to {query}",
                    "Implementation strategies",
                    "Best practices and optimization"
                ],
                "processed_at": "2024-01-20T10:30:00Z"
            },
            {
                "video_id": "abc123xyz789",
                "title": f"Mastering {query.title()}: Expert Techniques",
                "channel": "DevSkills Academy",
                "duration": 1200,
                "relevance_score": 0.87,
                "summary": f"Deep dive into advanced {query} techniques with expert insights and industry case studies.",
                "url": "https://youtube.com/watch?v=abc123xyz789",
                "key_points": [
                    f"Advanced {query} patterns",
                    "Performance optimization",
                    "Industry best practices"
                ],
                "processed_at": "2024-01-19T15:45:00Z"
            }
        ]

        # Clamp before slicing: a negative stop index would drop entries from
        # the end of the list instead of returning nothing.
        max_results = len(base_results) if limit is None else max(limit, 0)
        selected = base_results[:max_results]

        if enhanced:
            # Add more sophisticated mock data for async results
            for result in selected:
                result.update({
                    "semantic_score": result["relevance_score"] * 0.98,
                    "content_quality": 0.92,
                    "engagement_metrics": {
                        "views": 125680,
                        "likes": 4521,
                        "comments": 387
                    },
                    "tags": [query.lower(), "tutorial", "advanced", "education"],
                    "transcript_matches": 15,
                    "summary_matches": 8
                })

        return selected
# Tool collection for easy registration
def get_youtube_langchain_tools() -> List[BaseTool]:
    """Instantiate one of each YouTube Summarizer tool.

    A warning is logged when LangChain is not installed; the tools are still
    created and returned in that case.

    Returns:
        New transcript, summarization, batch and search tool instances,
        in that order.
    """
    if not LANGCHAIN_AVAILABLE:
        logger.warning("LangChain not available. Tools will have limited functionality.")

    # Instantiated left to right, matching the order of the returned list.
    tool_classes = (
        YouTubeTranscriptTool,
        YouTubeSummarizationTool,
        YouTubeBatchTool,
        YouTubeSearchTool,
    )
    return [tool_cls() for tool_cls in tool_classes]
# Utility functions for LangChain integration
def create_youtube_toolkit():
    """Build the full list of YouTube tools for a LangChain agent.

    Returns:
        The list from ``get_youtube_langchain_tools()``, or ``None`` (after
        logging an error) when LangChain is not installed.
    """
    if LANGCHAIN_AVAILABLE:
        return get_youtube_langchain_tools()

    logger.error("LangChain not available. Cannot create toolkit.")
    return None
def register_youtube_tools_with_agent(agent):
    """Register YouTube tools with a LangChain agent.

    This is a generic interface: the tools are appended to ``agent.tools``
    when that attribute exists, otherwise handed to ``agent.add_tools``.

    Args:
        agent: Agent object exposing either a ``tools`` attribute with an
            ``extend`` method or an ``add_tools`` method.

    Returns:
        ``True`` when the tools were handed to the agent. ``False`` when
        LangChain is unavailable, the agent offers neither registration hook,
        or registration raised (the error is logged, not propagated).
    """
    if not LANGCHAIN_AVAILABLE:
        logger.error("LangChain not available. Cannot register tools.")
        return False

    try:
        tools = get_youtube_langchain_tools()
        # Implementation depends on the specific agent type
        if hasattr(agent, 'tools'):
            agent.tools.extend(tools)
        elif hasattr(agent, 'add_tools'):
            agent.add_tools(tools)
        else:
            # Nothing was registered, so success must not be reported.
            logger.error("Agent exposes neither 'tools' nor 'add_tools'. Cannot register tools.")
            return False
        return True
    except Exception as e:
        logger.error("Error registering tools: %s", e)
        return False
# Example usage and documentation
if __name__ == "__main__":
    # Demo entry point: build every tool and print a one-line preview of each.
    available_tools = get_youtube_langchain_tools()
    print(f"Created {len(available_tools)} LangChain tools:")
    for entry in available_tools:
        # Only the first 50 characters of the description are shown.
        preview = entry.description[:50]
        print(f"- {entry.name}: {preview}...")