# youtube-summarizer/backend/services/exporters/json_exporter.py
#
# 141 lines
# 6.6 KiB
# Python

"""
JSON Exporter for YouTube Summaries
Exports summaries to structured JSON format with full metadata
"""
import json
import tempfile
from typing import Dict, Any, Optional
from ..export_service import BaseExporter
class JSONExporter(BaseExporter):
    """Export summaries to a structured JSON file.

    The exporter reshapes the flat summary dictionary returned by
    ``_prepare_summary_data`` into nested sections (video, transcript,
    summary, chapters, related content, processing, user data, branding,
    export options), strips ``None`` values, and writes the result to a
    temporary ``.json`` file.
    """

    async def export(
        self,
        summary_data: Dict[str, Any],
        template: Optional[str] = None,
        branding: Optional[Dict[str, Any]] = None
    ) -> str:
        """Export a summary to a JSON file and return the file path.

        Args:
            summary_data: Raw summary data; it is passed through
                ``self._prepare_summary_data`` before being restructured.
            template: Optional template name, recorded under
                ``export_options.template`` (omitted from the output
                when ``None``).
            branding: Optional branding dictionary, stored under the
                top-level ``branding`` key (omitted when ``None``).

        Returns:
            Path of the temporary ``.json`` file that was written. The file
            is created with ``delete=False``, so it is not removed
            automatically; cleanup is presumably the caller's job.

        Raises:
            KeyError: If the prepared data lacks ``export_metadata`` or its
                ``exported_at`` / ``exporter_version`` entries.
        """
        data = self._prepare_summary_data(summary_data)

        # Drop None values so the exported document stays compact.
        json_data = self._clean_none_values(
            self._build_json_payload(data, template, branding)
        )

        # NOTE(review): this is a synchronous file write inside a coroutine;
        # kept as-is to preserve the existing behaviour.
        # The encoding is pinned to UTF-8 because ensure_ascii=False emits
        # raw non-ASCII characters; relying on the locale's default encoding
        # can fail or produce a file that is not valid UTF-8 JSON.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.json', delete=False, encoding='utf-8'
        ) as f:
            json.dump(json_data, f, indent=2, default=str, ensure_ascii=False)
            return f.name

    @staticmethod
    def _dict_section(data: Dict[str, Any], key: str) -> Dict[str, Any]:
        """Return the nested dictionary at ``key``, or ``{}`` if missing/None."""
        # `data.get(key, {})` alone still yields None when the key is present
        # with a None value, which would break the chained `.get` lookups.
        return data.get(key) or {}

    def _build_json_payload(
        self,
        data: Dict[str, Any],
        template: Optional[str],
        branding: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Arrange the prepared summary data into the nested export layout."""
        export_meta = data["export_metadata"]
        video_meta = self._dict_section(data, "video_metadata")
        quality = self._dict_section(data, "quality_metrics")
        sentiment = self._dict_section(data, "sentiment")
        processing = self._dict_section(data, "processing_metadata")
        cost = self._dict_section(data, "cost_data")

        return {
            "youtube_summarizer_export": {
                "version": "1.0",
                "exported_at": export_meta["exported_at"],
                "exporter_version": export_meta["exporter_version"],
            },
            "video": {
                "id": data.get("video_id"),
                "url": data.get("video_url"),
                "metadata": {
                    "title": video_meta.get("title"),
                    "channel": video_meta.get("channel_name"),
                    "channel_id": video_meta.get("channel_id"),
                    "duration_seconds": video_meta.get("duration"),
                    "published_at": video_meta.get("published_at"),
                    "view_count": video_meta.get("view_count"),
                    "like_count": video_meta.get("like_count"),
                    "comment_count": video_meta.get("comment_count"),
                    "description": video_meta.get("description"),
                    "tags": video_meta.get("tags", []),
                    "thumbnail_url": video_meta.get("thumbnail_url"),
                    "categories": video_meta.get("categories", []),
                },
            },
            "transcript": {
                "language": data.get("transcript_language", "en"),
                "segments": data.get("transcript_segments", []),
                "full_text": data.get("transcript_text"),
                "word_count": data.get("word_count"),
                "duration_seconds": data.get("transcript_duration"),
            },
            "summary": {
                "text": data.get("summary"),
                "key_points": data.get("key_points", []),
                "main_themes": data.get("main_themes", []),
                "actionable_insights": data.get("actionable_insights", []),
                "confidence_score": data.get("confidence_score"),
                "quality_metrics": {
                    "completeness": quality.get("completeness"),
                    "coherence": quality.get("coherence"),
                    "relevance": quality.get("relevance"),
                    "accuracy": quality.get("accuracy"),
                },
                "sentiment_analysis": {
                    "overall_sentiment": sentiment.get("overall"),
                    "positive_score": sentiment.get("positive"),
                    "negative_score": sentiment.get("negative"),
                    "neutral_score": sentiment.get("neutral"),
                },
                "topics": data.get("topics", []),
                "entities": data.get("entities", []),
                "keywords": data.get("keywords", []),
            },
            "chapters": data.get("chapters", []),
            "related_content": {
                "recommended_videos": data.get("recommended_videos", []),
                "related_topics": data.get("related_topics", []),
                "external_links": data.get("external_links", []),
            },
            "processing": {
                "metadata": {
                    "model": processing.get("model"),
                    "model_version": processing.get("model_version"),
                    "processing_time_seconds": processing.get("processing_time_seconds"),
                    "timestamp": processing.get("timestamp"),
                    "cache_hit": processing.get("cache_hit", False),
                    "pipeline_version": processing.get("pipeline_version"),
                },
                "cost_data": {
                    "input_tokens": cost.get("input_tokens"),
                    "output_tokens": cost.get("output_tokens"),
                    "total_tokens": cost.get("total_tokens"),
                    "estimated_cost_usd": cost.get("estimated_cost_usd"),
                    "model_pricing": cost.get("model_pricing"),
                },
                "quality_score": data.get("quality_score"),
                "errors": data.get("processing_errors", []),
                "warnings": data.get("processing_warnings", []),
            },
            "user_data": {
                "user_id": data.get("user_id"),
                "session_id": data.get("session_id"),
                "preferences": data.get("user_preferences", {}),
                "customization": data.get("customization", {}),
            },
            "branding": branding,
            "export_options": {
                "template": template,
                "include_metadata": True,
                "format_version": "1.0",
            },
        }

    def _clean_none_values(self, data: Any) -> Any:
        """Recursively remove None values from dictionaries.

        Dictionary entries whose value is ``None`` are dropped. Lists are
        traversed so nested dictionaries are cleaned too, but ``None`` items
        inside a list are kept, and dictionaries left empty are not pruned.
        Any other value is returned unchanged.
        """
        if isinstance(data, dict):
            return {
                key: self._clean_none_values(value)
                for key, value in data.items()
                if value is not None
            }
        if isinstance(data, list):
            return [self._clean_none_values(item) for item in data]
        return data

    def get_file_extension(self) -> str:
        """Return the file extension used for exported files."""
        return "json"