#!/usr/bin/env python3
"""Direct summary generation using DeepSeek without the API."""

import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime

# Add backend to path
sys.path.append(str(Path(__file__).parent.parent))

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

from backend.services.deepseek_summarizer import DeepSeekSummarizer
from backend.services.ai_service import SummaryRequest, SummaryLength


async def generate_summary_direct(video_id: str, transcript_path: str):
    """Generate summary directly using DeepSeek service."""
    print(f"\nšŸ“¹ Processing video: {video_id}")

    # Load transcript
    with open(transcript_path, 'r') as f:
        transcript_data = json.load(f)

    # Extract text from segments
    if isinstance(transcript_data, list):
        transcript_text = ' '.join([segment.get('text', '') for segment in transcript_data])
    else:
        transcript_text = transcript_data.get('text', '')

    print(f"šŸ“ Transcript length: {len(transcript_text)} characters")
    print(f"šŸ“ Preview: {transcript_text[:200]}...")

    # Get API key
    api_key = os.getenv('DEEPSEEK_API_KEY')
    if not api_key:
        print("āŒ DEEPSEEK_API_KEY not found in environment")
        return None

    # Initialize DeepSeek service
    summarizer = DeepSeekSummarizer(api_key=api_key)
    await summarizer.initialize()
    print(f"šŸ¤– Using DeepSeek model: {summarizer.default_config.model_name}")

    # Create summary request
    request = SummaryRequest(
        transcript=transcript_text,
        length=SummaryLength.STANDARD,
        focus_areas=["key points", "main topics", "conclusions"],
        language="en"
    )

    # Estimate cost
    cost = summarizer.estimate_cost(transcript_text, SummaryLength.STANDARD)
    print(f"šŸ’° Estimated cost: ${cost:.6f}")

    try:
        print("šŸš€ Generating summary with DeepSeek...")
        result = await summarizer.generate_summary(request)

        # Save summary to file system
        summary_dir = Path(f"video_storage/summaries/{video_id}")
        summary_dir.mkdir(parents=True, exist_ok=True)

        # Create timestamp for unique file name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save full result
        summary_file = summary_dir / f"summary_{timestamp}.json"
        summary_data = {
            "video_id": video_id,
            "generated_at": datetime.now().isoformat(),
            "model": "deepseek-chat",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "confidence_score": result.confidence_score,
            "processing_metadata": result.processing_metadata,
            "cost_data": result.cost_data,
            "transcript_length": len(transcript_text)
        }

        with open(summary_file, 'w') as f:
            json.dump(summary_data, f, indent=2)

        print(f"āœ… Summary saved to: {summary_file}")

        # Also save a markdown version for easy reading
        md_file = summary_dir / f"summary_{timestamp}.md"
        with open(md_file, 'w') as f:
            f.write("# Video Summary\n")
            f.write(f"**Video ID:** {video_id}\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("**Model:** DeepSeek Chat\n\n")
            f.write(f"## Summary\n{result.summary}\n\n")
            f.write("## Key Points\n")
            for point in result.key_points:
                f.write(f"- {point}\n")
            f.write("\n")
            f.write("## Main Themes\n")
            for theme in result.main_themes:
                f.write(f"- {theme}\n")
            f.write("\n")
            f.write("## Actionable Insights\n")
            for insight in result.actionable_insights:
                f.write(f"- {insight}\n")
            f.write("\n")
            f.write("---\n")
            f.write(f"*Confidence Score: {result.confidence_score:.2%}*\n")
            f.write(f"*Processing Cost: ${result.cost_data['total_cost_usd']:.6f}*\n")

        print(f"šŸ“„ Markdown saved to: {md_file}")

        return result
    except Exception as e:
        print(f"āŒ Error generating summary: {e}")
        import traceback
        traceback.print_exc()
        return None


def format_summary(result):
    """Format and display the summary."""
    if not result:
        return

    print("\n" + "=" * 60)
    print("šŸ“Š GENERATED SUMMARY")
    print("=" * 60)

    print("\nšŸ“ Summary:")
    print("-" * 40)
    print(result.summary)

    print("\nšŸŽÆ Key Points:")
    print("-" * 40)
    for i, point in enumerate(result.key_points, 1):
        print(f"{i}. {point}")

    print("\nšŸ“š Main Themes:")
    print("-" * 40)
    for theme in result.main_themes:
        print(f"• {theme}")

    print("\nšŸ’” Actionable Insights:")
    print("-" * 40)
    for insight in result.actionable_insights:
        print(f"→ {insight}")

    print(f"\nšŸ“Š Confidence Score: {result.confidence_score:.2%}")
    print(f"šŸ’° Total Cost: ${result.cost_data['total_cost_usd']:.6f}")


async def check_existing_summaries(video_id: str):
    """Check for existing summaries for a video."""
    summary_dir = Path(f"video_storage/summaries/{video_id}")
    if summary_dir.exists():
        summaries = list(summary_dir.glob("summary_*.json"))
        if summaries:
            print(f"\nšŸ“š Found {len(summaries)} existing summary(ies) for video {video_id}:")
            for summary_file in sorted(summaries):
                print(f"  - {summary_file.name}")
            return summaries
    return []


async def main():
    """Process videos and generate summaries."""
    videos = [
        ("jNQXAC9IVRw", "First YouTube video - Me at the zoo"),
        ("DCquejfz04A", "China decoupling video"),
    ]

    print("šŸš€ YouTube Video Summary Generator (Direct)")
    print("=" * 60)

    for video_id, description in videos:
        transcript_path = Path(f"video_storage/transcripts/{video_id}.json")

        print(f"\nšŸ“¹ Video: {description}")
        print(f"   ID: {video_id}")

        if not transcript_path.exists():
            print("   āŒ No transcript found")
            continue

        # Check for existing summaries
        existing = await check_existing_summaries(video_id)
        if existing:
            print("   āš ļø Summary already exists. Generating new version...")

        # Generate summary
        result = await generate_summary_direct(video_id, str(transcript_path))

        if result:
            format_summary(result)
            print("\nāœ… Summary generated successfully!")
        else:
            print("\nāŒ Failed to generate summary")

    print("\n" + "=" * 60)
    print("✨ All processing complete!")


if __name__ == "__main__":
    asyncio.run(main())
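
# Usage sketch (assumptions, not verified against the repo layout): the script sits one
# directory below the project root so the sys.path tweak above makes `backend` importable,
# it is run from the project root so the relative video_storage/ paths resolve, and
# DEEPSEEK_API_KEY is supplied via the environment or a .env file, e.g.:
#
#   DEEPSEEK_API_KEY=... python scripts/generate_summary_direct.py   # illustrative path/filename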