218 lines
7.1 KiB
Python
218 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Direct summary generation using DeepSeek without the API."""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Add the directory two levels above this file to sys.path so the `backend` package can be imported
|
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
# Load environment variables
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
|
|
from backend.services.deepseek_summarizer import DeepSeekSummarizer
|
|
from backend.services.ai_service import SummaryRequest, SummaryLength
|
|
|
|
def _extract_transcript_text(transcript_data):
    """Return the plain transcript text from parsed transcript JSON.

    A list is treated as a sequence of segment dicts whose ``text`` values are
    joined with single spaces (segments without ``text`` contribute an empty
    string).  Any other value is treated as a dict with a top-level ``text``
    key, defaulting to an empty string.
    """
    if isinstance(transcript_data, list):
        return ' '.join(segment.get('text', '') for segment in transcript_data)
    return transcript_data.get('text', '')


def _write_markdown_summary(md_file, video_id, generated_at, result):
    """Write a human-readable Markdown rendering of *result* to *md_file*."""
    bullet_sections = (
        ("Key Points", result.key_points),
        ("Main Themes", result.main_themes),
        ("Actionable Insights", result.actionable_insights),
    )
    # Explicit UTF-8: summaries may contain non-ASCII text, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write("# Video Summary\n")
        f.write(f"**Video ID:** {video_id}\n")
        f.write(f"**Generated:** {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("**Model:** DeepSeek Chat\n\n")

        f.write(f"## Summary\n{result.summary}\n\n")

        for heading, items in bullet_sections:
            f.write(f"## {heading}\n")
            for item in items:
                f.write(f"- {item}\n")
            f.write("\n")

        f.write("---\n")
        f.write(f"*Confidence Score: {result.confidence_score:.2%}*\n")
        f.write(f"*Processing Cost: ${result.cost_data['total_cost_usd']:.6f}*\n")


async def generate_summary_direct(video_id: str, transcript_path: str):
    """Generate a summary for one video by calling the DeepSeek service directly.

    The transcript JSON at ``transcript_path`` is loaded and flattened to text,
    a ``SummaryRequest`` with ``SummaryLength.STANDARD`` is sent to a
    ``DeepSeekSummarizer``, and the result is written to
    ``video_storage/summaries/<video_id>/summary_<timestamp>.json`` plus a
    matching ``.md`` file (paths are relative to the current working
    directory).  Progress is reported with ``print``.

    Args:
        video_id: Identifier used in log output and in the output directory name.
        transcript_path: Path to a JSON transcript: either a list of segment
            dicts with a ``text`` key or a dict with a top-level ``text`` key.

    Returns:
        The object returned by ``summarizer.generate_summary`` on success, or
        ``None`` when ``DEEPSEEK_API_KEY`` is not set or when generating or
        saving the summary raises (the error and traceback are printed).

    Raises:
        OSError, json.JSONDecodeError: If the transcript cannot be opened or
            parsed; these are not caught here.  Errors from
            ``summarizer.initialize`` and ``summarizer.estimate_cost`` also
            propagate, because those calls happen before the ``try`` block.
    """
    print(f"\n📹 Processing video: {video_id}")

    # Load transcript (explicit UTF-8 so decoding does not depend on the locale)
    with open(transcript_path, 'r', encoding='utf-8') as f:
        transcript_data = json.load(f)

    transcript_text = _extract_transcript_text(transcript_data)

    print(f"📝 Transcript length: {len(transcript_text)} characters")
    print(f"📝 Preview: {transcript_text[:200]}...")

    # Get API key
    api_key = os.getenv('DEEPSEEK_API_KEY')
    if not api_key:
        print("❌ DEEPSEEK_API_KEY not found in environment")
        return None

    # Initialize DeepSeek service
    summarizer = DeepSeekSummarizer(api_key=api_key)
    await summarizer.initialize()

    print(f"🤖 Using DeepSeek model: {summarizer.default_config.model_name}")

    # Create summary request
    request = SummaryRequest(
        transcript=transcript_text,
        length=SummaryLength.STANDARD,
        focus_areas=["key points", "main topics", "conclusions"],
        language="en"
    )

    # Estimate cost
    cost = summarizer.estimate_cost(transcript_text, SummaryLength.STANDARD)
    print(f"💰 Estimated cost: ${cost:.6f}")

    try:
        print("🚀 Generating summary with DeepSeek...")
        result = await summarizer.generate_summary(request)

        # Save summary to file system
        summary_dir = Path(f"video_storage/summaries/{video_id}")
        summary_dir.mkdir(parents=True, exist_ok=True)

        # Take the time once so the file name, the JSON field and the Markdown
        # header all report the same instant.
        generated_at = datetime.now()
        timestamp = generated_at.strftime("%Y%m%d_%H%M%S")

        # Save full result
        summary_file = summary_dir / f"summary_{timestamp}.json"
        summary_data = {
            "video_id": video_id,
            "generated_at": generated_at.isoformat(),
            # NOTE(review): hard-coded label; the model actually used is
            # summarizer.default_config.model_name (printed above) - confirm
            # they always match.
            "model": "deepseek-chat",
            "summary": result.summary,
            "key_points": result.key_points,
            "main_themes": result.main_themes,
            "actionable_insights": result.actionable_insights,
            "confidence_score": result.confidence_score,
            "processing_metadata": result.processing_metadata,
            "cost_data": result.cost_data,
            "transcript_length": len(transcript_text)
        }

        with open(summary_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII summary text readable in the file.
            json.dump(summary_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Summary saved to: {summary_file}")

        # Also save a markdown version for easy reading
        md_file = summary_dir / f"summary_{timestamp}.md"
        _write_markdown_summary(md_file, video_id, generated_at, result)

        print(f"📄 Markdown saved to: {md_file}")

        return result

    except Exception as e:
        # Boundary for a single video: report the failure and let the caller
        # carry on with the next one.
        print(f"❌ Error generating summary: {e}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
def _print_bullets(title, lines):
    """Print a section heading, a 40-dash rule, then each entry of *lines*."""
    print(title)
    print("-" * 40)
    for line in lines:
        print(line)


def format_summary(result):
    """Print a generated summary to stdout in a sectioned layout.

    Args:
        result: Object exposing ``summary``, ``key_points``, ``main_themes``,
            ``actionable_insights``, ``confidence_score`` and ``cost_data``
            (subscriptable by ``'total_cost_usd'``).  A falsy value prints
            nothing.

    Returns:
        None.

    List fields that are ``None`` are shown as empty sections, and a missing
    or unavailable ``total_cost_usd`` is shown as ``n/a``.  This function runs
    after the summary has already been generated and saved, so a display
    problem should not abort processing of the remaining videos.
    """
    if not result:
        return

    banner = "=" * 60
    print("\n" + banner)
    print("📊 GENERATED SUMMARY")
    print(banner)

    _print_bullets("\n📝 Summary:", [result.summary])
    _print_bullets(
        "\n🎯 Key Points:",
        (f"{i}. {point}" for i, point in enumerate(result.key_points or (), 1)),
    )
    _print_bullets(
        "\n📚 Main Themes:",
        (f"• {theme}" for theme in result.main_themes or ()),
    )
    _print_bullets(
        "\n💡 Actionable Insights:",
        (f"→ {insight}" for insight in result.actionable_insights or ()),
    )

    print(f"\n📊 Confidence Score: {result.confidence_score:.2%}")

    # Keep the original subscript access, but do not let a missing key or a
    # None cost_data raise out of a purely cosmetic step.
    try:
        total_cost = result.cost_data['total_cost_usd']
    except (KeyError, TypeError):
        total_cost = None

    if total_cost is None:
        print("💰 Total Cost: n/a")
    else:
        print(f"💰 Total Cost: ${total_cost:.6f}")
|
|
|
|
async def check_existing_summaries(video_id: str):
    """List previously saved JSON summaries for a video.

    Looks in ``video_storage/summaries/<video_id>`` (relative to the current
    working directory) for files matching ``summary_*.json`` and, when any are
    found, prints their names.

    Args:
        video_id: Identifier naming the per-video summary directory.

    Returns:
        A list of ``Path`` objects sorted by path, or an empty list when the
        directory does not exist or holds no matching files.  The returned
        list is the same sorted list that is printed, so its order does not
        depend on the filesystem's directory ordering.
    """
    summary_dir = Path(f"video_storage/summaries/{video_id}")
    if not summary_dir.exists():
        return []

    # Sort once and use the same list for both the listing and the return value.
    summaries = sorted(summary_dir.glob("summary_*.json"))
    if not summaries:
        return []

    print(f"\n📚 Found {len(summaries)} existing summary(ies) for video {video_id}:")
    for summary_file in summaries:
        print(f" - {summary_file.name}")
    return summaries
|
|
|
|
# Videos processed when main() is called without an explicit list:
# (video_id, human-readable description) pairs.
DEFAULT_VIDEOS = [
    ("jNQXAC9IVRw", "First YouTube video - Me at the zoo"),
    ("DCquejfz04A", "China decoupling video"),
]


async def main(videos=None):
    """Generate and display summaries for a batch of videos.

    For each video the transcript is expected at
    ``video_storage/transcripts/<video_id>.json`` (relative to the current
    working directory).  Videos without a transcript are reported and
    skipped.  Existing summaries are listed but do not prevent a new one from
    being generated.

    Args:
        videos: Optional iterable of ``(video_id, description)`` pairs.
            Defaults to ``DEFAULT_VIDEOS`` when omitted or ``None``.

    Returns:
        None.  All output goes to stdout.
    """
    if videos is None:
        videos = DEFAULT_VIDEOS

    print("🚀 YouTube Video Summary Generator (Direct)")
    print("=" * 60)

    for video_id, description in videos:
        transcript_path = Path(f"video_storage/transcripts/{video_id}.json")

        print(f"\n📹 Video: {description}")
        print(f" ID: {video_id}")

        if not transcript_path.exists():
            print(" ❌ No transcript found")
            continue

        # Check for existing summaries
        existing = await check_existing_summaries(video_id)

        if existing:
            print(" ⚠️ Summary already exists. Generating new version...")

        # Generate summary
        result = await generate_summary_direct(video_id, str(transcript_path))

        if result:
            format_summary(result)
            print("\n✅ Summary generated successfully!")
        else:
            print("\n❌ Failed to generate summary")

    print("\n" + "=" * 60)
    print("✨ All processing complete!")


if __name__ == "__main__":
    # Script entry point: run the async batch to completion.
    asyncio.run(main())