youtube-summarizer/scripts/generate_summary.py

133 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""Generate summary for a video using the DeepSeek API."""
import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime
import httpx
# Make the parent of the directory containing this script importable by
# appending it to the module search path.
# NOTE(review): the original comment called this the "backend" path — confirm
# that the parent directory is where the backend package actually lives.
sys.path.append(str(Path(__file__).parent.parent))
def _extract_transcript_text(transcript_data):
    """Flatten parsed transcript JSON into one plain-text string.

    A list is treated as a sequence of segment objects whose ``text`` fields
    are joined with single spaces; anything else is treated as an object with
    a top-level ``text`` field. Missing or null ``text`` values count as "".
    """
    if isinstance(transcript_data, list):
        # `or ''` also covers segments whose "text" is explicitly null.
        return ' '.join(segment.get('text') or '' for segment in transcript_data)
    return transcript_data.get('text') or ''


async def generate_summary_via_api(
    video_id: str,
    transcript_path: str,
    *,
    api_url: str = "http://localhost:8000/api/summarize",
    timeout: float = 60.0,
):
    """Generate a summary for a transcript by calling the summarize endpoint.

    The transcript JSON is read from ``transcript_path``, flattened to plain
    text and POSTed to ``api_url``. On an HTTP 200 reply the decoded JSON body
    is written to ``video_storage/summaries/<video_id>/summary_<timestamp>.json``
    (relative to the current working directory) and returned.

    Args:
        video_id: Identifier used for log output and for the summary folder name.
        transcript_path: Path to the transcript JSON file.
        api_url: Endpoint that receives the summarize request.
        timeout: Timeout in seconds passed to the HTTP client.

    Returns:
        The decoded JSON response on success; ``None`` when the transcript
        contains no text, the API answers with a non-200 status, or any error
        occurs while calling the API or saving the result.

    Raises:
        OSError: If the transcript file cannot be opened.
        json.JSONDecodeError: If the transcript file is not valid JSON.
    """
    print(f"\n📹 Processing video: {video_id}")

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(transcript_path, 'r', encoding='utf-8') as f:
        transcript_data = json.load(f)

    transcript_text = _extract_transcript_text(transcript_data)
    print(f"📝 Transcript length: {len(transcript_text)} characters")

    # Nothing to summarize: skip the API round-trip entirely.
    if not transcript_text.strip():
        print(f"❌ Transcript is empty: {transcript_path}")
        return None

    print(f"📝 First 200 chars: {transcript_text[:200]}...")

    payload = {
        "transcript": transcript_text,
        "length": "standard",
        "focus_areas": ["key points", "main topics", "conclusions"],
        "include_timestamps": False,
    }

    async with httpx.AsyncClient(timeout=timeout) as client:
        try:
            print("🤖 Calling DeepSeek API to generate summary...")
            response = await client.post(api_url, json=payload)

            if response.status_code != 200:
                print(f"❌ API Error: {response.status_code}")
                print(f"Response: {response.text}")
                return None

            result = response.json()

            # Each run gets its own timestamped file so earlier summaries
            # for the same video are never overwritten.
            summary_dir = Path(f"video_storage/summaries/{video_id}")
            summary_dir.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            summary_file = summary_dir / f"summary_{timestamp}.json"
            with open(summary_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)

            print(f"✅ Summary saved to: {summary_file}")
            return result
        except Exception as e:
            # Deliberately broad: this is the script's best-effort boundary.
            # Network failures, undecodable responses and disk errors are all
            # reported the same way and signalled to the caller via None.
            print(f"❌ Error calling API: {e}")
            return None
def format_summary(result: dict) -> None:
    """Print a summary API result to stdout in a readable layout.

    Sections are printed in this order: main summary, numbered key points,
    main themes, actionable insights, then the optional confidence score and
    processing cost. Optional fields that are missing or null are treated as
    empty instead of raising.

    Args:
        result: Decoded summary response. A falsy value (``None`` or ``{}``)
            prints nothing.
    """
    if not result:
        return

    rule = "-" * 40

    print("\n" + "=" * 60)
    print("📊 SUMMARY RESULTS")
    print("=" * 60)

    print("\n📝 Main Summary:")
    print(rule)
    print(result.get('summary') or 'No summary available')

    # `or []` guards against list fields that are present but null.
    print("\n🎯 Key Points:")
    print(rule)
    for i, point in enumerate(result.get('key_points') or [], 1):
        print(f"{i}. {point}")

    print("\n📚 Main Themes:")
    print(rule)
    for theme in result.get('main_themes') or []:
        print(f"{theme}")

    print("\n💡 Actionable Insights:")
    print(rule)
    for insight in result.get('actionable_insights') or []:
        print(f"{insight}")

    # The percent format only works on numbers; skip null or other values.
    score = result.get('confidence_score')
    if isinstance(score, (int, float)):
        print(f"\n📊 Confidence Score: {score:.2%}")

    cost_data = result.get('cost_data')
    if isinstance(cost_data, dict):
        cost = cost_data.get('total_cost_usd') or 0
        print(f"💰 Processing Cost: ${cost:.6f}")
async def main(video_id: str = "jNQXAC9IVRw", transcript_path=None):
    """Summarize one video's transcript and print the formatted result.

    Args:
        video_id: YouTube video ID to process. Defaults to "jNQXAC9IVRw"
            ("Me at the zoo", the first YouTube video).
        transcript_path: Path (``str`` or ``Path``) to the transcript JSON.
            When omitted, ``video_storage/transcripts/<video_id>.json`` under
            the parent of this script's directory is used, so the script no
            longer depends on one machine's absolute path.
    """
    if transcript_path is None:
        project_root = Path(__file__).resolve().parent.parent
        transcript_path = (
            project_root / "video_storage" / "transcripts" / f"{video_id}.json"
        )
    else:
        transcript_path = Path(transcript_path)

    print("🚀 YouTube Video Summary Generator (via API)")
    print("=" * 60)
    print(f"Video ID: {video_id}")
    print(f"Transcript: {transcript_path}")

    if not transcript_path.exists():
        print(f"❌ Transcript file not found: {transcript_path}")
        return

    result = await generate_summary_via_api(video_id, str(transcript_path))

    if result:
        format_summary(result)
        print("\n✅ Summary generation complete!")
    else:
        print("\n❌ Failed to generate summary")
# Script entry point: when executed directly (not imported), run the async
# main() coroutine to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())