youtube-summarizer/scripts/generate_summary_direct.py

218 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""Direct summary generation using DeepSeek without the API.

Reads transcripts from ``video_storage/transcripts/<video_id>.json`` and
writes JSON and Markdown summaries under
``video_storage/summaries/<video_id>/``. Both paths are relative to the
current working directory. The ``DEEPSEEK_API_KEY`` environment variable must
be set for a summary to be generated.

NOTE(review): "without the API" presumably means bypassing this project's own
backend HTTP API rather than DeepSeek's -- confirm and reword if so.
"""
import asyncio
import json
import os
import sys
from pathlib import Path
from datetime import datetime
# Put the directory two levels above this file on sys.path so that the
# ``backend.*`` imports below resolve when the script is run directly.
sys.path.append(str(Path(__file__).parent.parent))
# Load environment variables from a .env file (python-dotenv).
# This runs before the backend imports -- presumably so those modules can see
# the variables at import time; confirm before reordering.
from dotenv import load_dotenv
load_dotenv()
from backend.services.deepseek_summarizer import DeepSeekSummarizer
from backend.services.ai_service import SummaryRequest, SummaryLength
def _extract_transcript_text(transcript_data) -> str:
    """Flatten loaded transcript JSON into one string.

    A list is treated as a sequence of segment objects whose ``text`` values
    are joined with single spaces; anything else is treated as an object with
    a top-level ``text`` key. Missing or null ``text`` values become ``""``.
    """
    if isinstance(transcript_data, list):
        return ' '.join((segment.get('text') or '') for segment in transcript_data)
    return transcript_data.get('text') or ''


async def generate_summary_direct(video_id: str, transcript_path: str):
    """Generate a summary for one video by calling the DeepSeek service directly.

    Loads the transcript JSON at ``transcript_path``, passes its text to
    ``DeepSeekSummarizer`` and saves the result as both JSON and Markdown in
    ``video_storage/summaries/<video_id>/`` (relative to the current working
    directory).

    Args:
        video_id: Identifier used in log output and in the output directory
            name.
        transcript_path: Path to a JSON file holding either a list of segment
            objects with a ``text`` key, or one object with a top-level
            ``text`` key.

    Returns:
        The object returned by ``summarizer.generate_summary``, or ``None``
        when the transcript is empty, ``DEEPSEEK_API_KEY`` is not set, or
        generating/saving the summary raises.

    Raises:
        OSError: If the transcript file cannot be opened.
        json.JSONDecodeError: If the transcript file is not valid JSON.
    """
    print(f"\n📹 Processing video: {video_id}")

    # Transcripts and summaries routinely contain non-ASCII text, so every
    # file here is opened as UTF-8 rather than the platform default encoding.
    with open(transcript_path, 'r', encoding='utf-8') as f:
        transcript_data = json.load(f)

    transcript_text = _extract_transcript_text(transcript_data)
    print(f"📝 Transcript length: {len(transcript_text)} characters")
    print(f"📝 Preview: {transcript_text[:200]}...")

    # Do not spend an API call on a transcript with nothing in it.
    if not transcript_text.strip():
        print("❌ Transcript is empty; nothing to summarize")
        return None

    api_key = os.getenv('DEEPSEEK_API_KEY')
    if not api_key:
        print("❌ DEEPSEEK_API_KEY not found in environment")
        return None

    summarizer = DeepSeekSummarizer(api_key=api_key)
    await summarizer.initialize()
    print(f"🤖 Using DeepSeek model: {summarizer.default_config.model_name}")

    request = SummaryRequest(
        transcript=transcript_text,
        length=SummaryLength.STANDARD,
        focus_areas=["key points", "main topics", "conclusions"],
        language="en"
    )

    cost = summarizer.estimate_cost(transcript_text, SummaryLength.STANDARD)
    print(f"💰 Estimated cost: ${cost:.6f}")

    try:
        print("🚀 Generating summary with DeepSeek...")
        result = await summarizer.generate_summary(request)

        summary_dir = Path(f"video_storage/summaries/{video_id}")
        summary_dir.mkdir(parents=True, exist_ok=True)

        # Sample the clock once so the file names, the JSON "generated_at"
        # field and the Markdown header all describe the same instant.
        generated_at = datetime.now()
        timestamp = generated_at.strftime("%Y%m%d_%H%M%S")

        summary_file = summary_dir / f"summary_{timestamp}.json"
        summary_data = {
            "video_id": video_id,
            "generated_at": generated_at.isoformat(),
            "model": "deepseek-chat",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "confidence_score": result.confidence_score,
            "processing_metadata": result.processing_metadata,
            "cost_data": result.cost_data,
            "transcript_length": len(transcript_text)
        }
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2)
        print(f"✅ Summary saved to: {summary_file}")

        # Also save a Markdown version for easy reading.
        md_file = summary_dir / f"summary_{timestamp}.md"
        with open(md_file, 'w', encoding='utf-8') as f:
            f.write("# Video Summary\n")
            f.write(f"**Video ID:** {video_id}\n")
            f.write(f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("**Model:** DeepSeek Chat\n\n")
            f.write(f"## Summary\n{result.summary}\n\n")
            f.write("## Key Points\n")
            for point in result.key_points:
                f.write(f"- {point}\n")
            f.write("\n")
            f.write("## Main Themes\n")
            for theme in result.main_themes:
                f.write(f"- {theme}\n")
            f.write("\n")
            f.write("## Actionable Insights\n")
            for insight in result.actionable_insights:
                f.write(f"- {insight}\n")
            f.write("\n")
            f.write("---\n")
            f.write(f"*Confidence Score: {result.confidence_score:.2%}*\n")
            f.write(f"*Processing Cost: ${result.cost_data['total_cost_usd']:.6f}*\n")
        print(f"📄 Markdown saved to: {md_file}")
        return result
    except Exception as e:
        # Boundary handler: report the failure and let the caller move on to
        # the next video instead of aborting the whole run.
        print(f"❌ Error generating summary: {e}")
        import traceback
        traceback.print_exc()
        return None
def format_summary(result):
    """Print a generated summary to stdout as a sectioned report.

    Does nothing when ``result`` is falsy. Otherwise prints, in order: the
    summary text, the key points (numbered from 1), the main themes, the
    actionable insights, the confidence score as a percentage, and the total
    cost read from ``result.cost_data['total_cost_usd']``.
    """
    if not result:
        return

    banner = "=" * 60
    rule = "-" * 40

    def heading(title):
        # Every section starts with its title followed by a dashed rule.
        print(title)
        print(rule)

    print("\n" + banner)
    print("📊 GENERATED SUMMARY")
    print(banner)

    heading("\n📝 Summary:")
    print(result.summary)

    heading("\n🎯 Key Points:")
    for number, point in enumerate(result.key_points, start=1):
        print(f"{number}. {point}")

    # The two remaining list sections share one layout. Attributes are looked
    # up only when their section is reached, keeping the output order intact.
    plain_sections = (
        ("\n📚 Main Themes:", "main_themes"),
        ("\n💡 Actionable Insights:", "actionable_insights"),
    )
    for title, attribute in plain_sections:
        heading(title)
        for entry in getattr(result, attribute):
            print(f"{entry}")

    print(f"\n📊 Confidence Score: {result.confidence_score:.2%}")
    print(f"💰 Total Cost: ${result.cost_data['total_cost_usd']:.6f}")
async def check_existing_summaries(video_id: str):
    """Return the saved JSON summaries for ``video_id``, listing them on stdout.

    Looks in ``video_storage/summaries/<video_id>/`` (relative to the current
    working directory) for files matching ``summary_*.json``. When at least
    one is found, their names are printed in sorted order and the list of
    paths is returned; otherwise an empty list is returned and nothing is
    printed.
    """
    storage = Path(f"video_storage/summaries/{video_id}")
    found = list(storage.glob("summary_*.json")) if storage.exists() else []

    if not found:
        return []

    print(f"\n📚 Found {len(found)} existing summary(ies) for video {video_id}:")
    for path in sorted(found):
        print(f" - {path.name}")
    return found
async def main():
    """Generate summaries for the built-in list of videos, one after another.

    For each video, the transcript is expected at
    ``video_storage/transcripts/<video_id>.json`` (relative to the current
    working directory). Videos without a transcript are skipped. A new
    summary is generated even when earlier ones already exist.
    """
    banner = "=" * 60
    catalogue = {
        "jNQXAC9IVRw": "First YouTube video - Me at the zoo",
        "DCquejfz04A": "China decoupling video",
    }

    print("🚀 YouTube Video Summary Generator (Direct)")
    print(banner)

    for video_id, description in catalogue.items():
        transcript_path = Path(f"video_storage/transcripts/{video_id}.json")
        print(f"\n📹 Video: {description}")
        print(f" ID: {video_id}")

        if not transcript_path.exists():
            print(" ❌ No transcript found")
            continue

        # Existing summaries are only reported; they never block a new run.
        if await check_existing_summaries(video_id):
            print(" ⚠️ Summary already exists. Generating new version...")

        result = await generate_summary_direct(video_id, str(transcript_path))
        if not result:
            print("\n❌ Failed to generate summary")
            continue

        format_summary(result)
        print("\n✅ Summary generated successfully!")

    print("\n" + banner)
    print("✨ All processing complete!")
# Run the async entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())