# Source path: youtube-summarizer/backend/services/enhanced_markdown_formatter.py
# (File-viewer metadata at export time: 526 lines, 19 KiB, Python.)

"""Enhanced Markdown Formatter for professional export documents.
This service creates professional markdown documents with executive summaries,
timestamped sections, table of contents, and consistent formatting.
"""
import asyncio
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any, List, Optional

from ..core.exceptions import ServiceError
from ..services.executive_summary_generator import ExecutiveSummary, ExecutiveSummaryGenerator
from ..services.timestamp_processor import TimestampedSection, TimestampProcessor
# Module-level logger, named after this module via __name__; used by the
# formatter below for initialization, warning, and error messages.
logger = logging.getLogger(__name__)
@dataclass
class MarkdownExportConfig:
    """Configuration for markdown export.

    Attributes:
        include_executive_summary: Generate and render the executive summary
            section.
        include_timestamps: Generate and render timestamped sections (only
            effective when transcript data is supplied to the formatter).
        include_toc: Render a table of contents when sections are available.
        section_detail_level: One of "brief", "standard" or "detailed".
            "brief" truncates the main content to its first 10 lines and
            omits per-section summaries; "detailed" additionally renders key
            points and a content preview per section. Any other value
            behaves like "standard".
        include_metadata_header: Render the document header.
        include_footer: Render the document footer.
        custom_template_id: Optional template identifier.
            NOTE(review): not read by the formatter in this module --
            presumably consumed elsewhere; confirm.
    """

    include_executive_summary: bool = True
    include_timestamps: bool = True
    include_toc: bool = True
    section_detail_level: str = "standard"  # brief, standard, detailed
    include_metadata_header: bool = True
    include_footer: bool = True
    custom_template_id: Optional[str] = None
@dataclass
class EnhancedMarkdownExport:
    """Result of enhanced markdown export.

    Attributes:
        markdown_content: The fully assembled markdown document.
        executive_summary: The generated executive summary, or None when it
            was disabled or its generation failed.
        sections: Timestamped sections detected from the transcript (empty
            when disabled, unavailable, or failed).
        table_of_contents: Rendered table of contents ("" when not produced).
        metadata: Descriptive metadata about the export (title, URL, config
            flags, and optional summary/section statistics).
        quality_score: Heuristic quality score for the export.
        processing_time_seconds: Wall-clock time spent building the export.
        export_config: The configuration the export was produced with.
        created_at: Timestamp at which the export result was created.
    """

    # Project types are referenced as forward-reference strings so that the
    # annotations are not evaluated while the class body executes.
    markdown_content: str
    executive_summary: Optional["ExecutiveSummary"]
    sections: List["TimestampedSection"]
    table_of_contents: str
    metadata: Dict[str, Any]
    quality_score: float
    processing_time_seconds: float
    export_config: "MarkdownExportConfig"
    created_at: datetime
class EnhancedMarkdownFormatter:
    """Service for creating professional markdown documents.

    The formatter combines an executive summary (from the executive summary
    generator), timestamped sections and a table of contents (from the
    timestamp processor), and the caller-supplied content into one markdown
    document, and reports a heuristic quality score and export metadata.

    NOTE: project types in signatures are written as forward-reference
    strings so they are not evaluated when the class is defined.
    """

    # A bold-wrapped jump link exactly as emitted by
    # _format_timestamped_sections: **[🎬 text](url)**. Group 1 is the bare link.
    _BOLD_JUMP_LINK = re.compile(r"\*\*(\[🎬[^\]]*\]\([^)]*\))\*\*")

    def __init__(
        self,
        executive_generator: Optional["ExecutiveSummaryGenerator"] = None,
        timestamp_processor: Optional["TimestampProcessor"] = None,
    ):
        """Initialize enhanced markdown formatter.

        Args:
            executive_generator: Service for executive summaries; a default
                instance is created when omitted.
            timestamp_processor: Service for timestamp processing; a default
                instance is created when omitted.
        """
        self.executive_generator = executive_generator or ExecutiveSummaryGenerator()
        self.timestamp_processor = timestamp_processor or TimestampProcessor()

        # Formatting configuration
        self.max_line_length = 80
        self.heading_levels = {
            "title": "#",
            "section": "##",
            "subsection": "###",
            "detail": "####",
        }
        logger.info("EnhancedMarkdownFormatter initialized")

    async def create_enhanced_export(
        self,
        video_title: str,
        video_url: str,
        content: str,
        transcript_data: Optional[List[Dict[str, Any]]] = None,
        export_config: Optional["MarkdownExportConfig"] = None,
    ) -> "EnhancedMarkdownExport":
        """Create comprehensive enhanced markdown export.

        Args:
            video_title: Title of the video
            video_url: YouTube video URL
            content: Main content/summary text
            transcript_data: Raw transcript data with timestamps
            export_config: Export configuration options (defaults are used
                when omitted)

        Returns:
            Enhanced markdown export result

        Raises:
            ServiceError: If any step of the export fails; the original
                exception is attached as the cause.
        """
        start_time = datetime.now()
        config = export_config or MarkdownExportConfig()

        try:
            want_summary = config.include_executive_summary
            # Sections need both the flag and actual transcript data.
            want_sections = bool(config.include_timestamps and transcript_data)

            # Generate the enabled components concurrently.
            pending = []
            if want_summary:
                pending.append(self._generate_executive_summary(content, video_title))
            if want_sections:
                pending.append(
                    self._generate_timestamp_sections(
                        transcript_data, video_url, video_title
                    )
                )
            outcomes = iter(await asyncio.gather(*pending, return_exceptions=True))

            # Results come back in submission order; a failed component is
            # logged and degrades to "not available" instead of aborting.
            executive_summary = None
            if want_summary:
                executive_summary = self._discard_failure(
                    next(outcomes), "Executive summary"
                )

            sections = []
            if want_sections:
                section_result = self._discard_failure(
                    next(outcomes), "Timestamp section"
                )
                if section_result:
                    sections = section_result.sections

            # Generate table of contents
            toc = ""
            if config.include_toc and sections:
                toc = await self.timestamp_processor.generate_table_of_contents(sections)

            # Assemble final markdown document
            markdown_content = await self._assemble_markdown_document(
                video_title=video_title,
                video_url=video_url,
                content=content,
                executive_summary=executive_summary,
                sections=sections,
                table_of_contents=toc,
                config=config,
            )

            quality_score = self._calculate_export_quality(
                executive_summary, sections, markdown_content
            )
            metadata = self._generate_export_metadata(
                video_title, video_url, executive_summary, sections, config
            )
            processing_time = (datetime.now() - start_time).total_seconds()

            return EnhancedMarkdownExport(
                markdown_content=markdown_content,
                executive_summary=executive_summary,
                sections=sections,
                table_of_contents=toc,
                metadata=metadata,
                quality_score=quality_score,
                processing_time_seconds=processing_time,
                export_config=config,
                created_at=datetime.now(),
            )
        except Exception as e:
            logger.error("Error creating enhanced export: %s", e)
            # Chain the cause so the original traceback is not lost.
            raise ServiceError(f"Enhanced export creation failed: {str(e)}") from e

    @staticmethod
    def _discard_failure(result: Any, component: str) -> Any:
        """Return *result*, or None (after logging) if it is an exception.

        Used on values returned by ``asyncio.gather(..., return_exceptions=True)``.
        """
        if isinstance(result, BaseException):
            logger.warning("%s generation failed: %s", component, result)
            return None
        return result

    async def _generate_executive_summary(
        self,
        content: str,
        video_title: str,
    ) -> Optional["ExecutiveSummary"]:
        """Generate executive summary component.

        Best effort: any failure is logged as a warning and None is returned.
        """
        try:
            return await self.executive_generator.generate_executive_summary(
                content=content,
                video_title=video_title,
                summary_type="business",
            )
        except Exception as e:
            logger.warning("Executive summary generation failed: %s", e)
            return None

    async def _generate_timestamp_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        video_url: str,
        video_title: str,
    ):
        """Generate timestamp sections component.

        Returns whatever the timestamp processor's section detection returns
        (the caller reads its ``sections`` attribute), or None on failure.
        """
        try:
            return await self.timestamp_processor.detect_semantic_sections(
                transcript_data=transcript_data,
                video_url=video_url,
                video_title=video_title,
            )
        except Exception as e:
            logger.warning("Timestamp section generation failed: %s", e)
            return None

    async def _assemble_markdown_document(
        self,
        video_title: str,
        video_url: str,
        content: str,
        executive_summary: Optional["ExecutiveSummary"],
        sections: List["TimestampedSection"],
        table_of_contents: str,
        config: "MarkdownExportConfig",
    ) -> str:
        """Assemble final markdown document.

        Parts are emitted in a fixed order (header, executive summary, table
        of contents, main content, timestamped sections, footer); empty parts
        are dropped and the rest are separated by one blank line.
        """
        document_parts = []

        # 1. Metadata header
        if config.include_metadata_header:
            if executive_summary:
                header = await self.executive_generator.generate_metadata_header(
                    executive_summary, video_title, video_url
                )
            else:
                header = self._generate_basic_header(video_title, video_url)
            document_parts.append(header)

        # 2. Executive summary section
        if config.include_executive_summary and executive_summary:
            document_parts.append(
                self._format_executive_summary_section(executive_summary)
            )

        # 3. Table of contents
        if config.include_toc and table_of_contents:
            document_parts.append(table_of_contents)

        # 4. Main content section (always present)
        document_parts.append(self._format_main_content_section(content, config))

        # 5. Timestamped sections
        if config.include_timestamps and sections:
            document_parts.append(self._format_timestamped_sections(sections, config))

        # 6. Footer
        if config.include_footer:
            if executive_summary:
                footer = await self.executive_generator.generate_executive_footer(
                    executive_summary
                )
            else:
                footer = self._generate_basic_footer()
            document_parts.append(footer)

        # Join all parts with proper spacing
        return "\n\n".join(filter(None, document_parts))

    def _generate_basic_header(self, video_title: str, video_url: str) -> str:
        """Generate basic header when executive summary not available."""
        analysis_date = datetime.now().strftime("%B %d, %Y")
        return (
            f"# {video_title}\n"
            f"**Analysis Date**: {analysis_date}\n"
            f"**Source**: {video_url}\n"
        )

    def _format_executive_summary_section(
        self, executive_summary: "ExecutiveSummary"
    ) -> str:
        """Format executive summary as markdown section.

        Optional sub-sections (key metrics, business value, action items,
        strategic implications) are rendered only when they are non-empty.
        """
        section_parts = [
            "## Executive Summary",
            "",
            executive_summary.overview,
        ]

        # Add key metrics if available
        metrics = executive_summary.key_metrics
        if metrics:
            section_parts.extend([
                "",
                "### Key Metrics",
                f"- **Duration**: {metrics.duration_minutes} minutes",
                f"- **Complexity**: {metrics.complexity_level.title()}",
                f"- **Main Topics**: {', '.join(metrics.main_topics[:3])}",
            ])

        # Add business value if available
        if executive_summary.business_value:
            section_parts.extend([
                "",
                "### Business Value",
                executive_summary.business_value,
            ])

        # Add action items
        if executive_summary.action_items:
            section_parts.extend(["", "### Action Items"])
            section_parts.extend(
                f"- {item}" for item in executive_summary.action_items
            )

        # Add strategic implications
        if executive_summary.strategic_implications:
            section_parts.extend(["", "### Strategic Implications"])
            section_parts.extend(
                f"- {implication}"
                for implication in executive_summary.strategic_implications
            )

        return "\n".join(section_parts)

    def _format_main_content_section(
        self,
        content: str,
        config: "MarkdownExportConfig",
    ) -> str:
        """Format main content section.

        For the "brief" detail level, content longer than 10 lines is cut to
        its first 10 lines and a truncation notice is appended.
        """
        if config.section_detail_level == "brief":
            content_lines = content.split("\n")
            if len(content_lines) > 10:
                content = (
                    "\n".join(content_lines[:10])
                    + "\n\n*[Content truncated for brief format]*"
                )
        return f"## Content Analysis\n{content}"

    def _format_timestamped_sections(
        self,
        sections: List["TimestampedSection"],
        config: "MarkdownExportConfig",
    ) -> str:
        """Format timestamped sections.

        Every section gets a heading and a jump link; summaries are skipped
        for "brief", and key points plus a content preview of at most 500
        characters are added for "detailed".
        """
        if not sections:
            return ""

        detail_level = config.section_detail_level
        section_parts = ["## Detailed Sections", ""]

        for section in sections:
            timestamp_display = self.timestamp_processor.seconds_to_timestamp(
                section.start_timestamp
            )

            # Section header with timestamp
            section_parts.append(f"### [{timestamp_display}] {section.title}")
            section_parts.append("")

            # YouTube link
            section_parts.append(
                f"**[🎬 Jump to this section]({section.youtube_link})**"
            )
            section_parts.append("")

            # Section summary
            if section.summary and detail_level != "brief":
                section_parts.append(f"*{section.summary}*")
                section_parts.append("")

            # Key points
            if section.key_points and detail_level == "detailed":
                section_parts.append("**Key Points:**")
                section_parts.extend(f"- {point}" for point in section.key_points)
                section_parts.append("")

            # Section content (for detailed format)
            if detail_level == "detailed" and section.content:
                # Limit content length for readability
                content_preview = section.content[:500]
                if len(section.content) > 500:
                    content_preview += "..."
                section_parts.append("**Content:**")
                section_parts.append(content_preview)
                section_parts.append("")

        return "\n".join(section_parts)

    def _generate_basic_footer(self) -> str:
        """Generate basic footer when executive summary not available."""
        generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return (
            "\n"
            "---\n"
            f"**Generated**: {generated_at}\n"
            "*This analysis was generated using AI and is intended for "
            "informational purposes.*\n"
        )

    def _calculate_export_quality(
        self,
        executive_summary: Optional["ExecutiveSummary"],
        sections: List["TimestampedSection"],
        markdown_content: str,
    ) -> float:
        """Calculate overall quality score for export.

        The score is the plain average of the available factors: executive
        summary confidence (when key metrics exist), mean section confidence
        (when sections exist), a document-length factor and a
        structure-completeness factor. The last two are always present.
        """
        quality_factors = []

        # Executive summary quality. key_metrics is optional (the summary
        # section renderer treats it that way too), so guard before use.
        if executive_summary and executive_summary.key_metrics:
            quality_factors.append(executive_summary.key_metrics.confidence_score)

        # Sections quality
        if sections:
            quality_factors.append(
                sum(s.confidence_score for s in sections) / len(sections)
            )

        # Content length
        content_length = len(markdown_content)
        if 1000 <= content_length <= 50000:  # Good length range
            quality_factors.append(0.9)
        elif content_length < 1000:
            quality_factors.append(0.6)
        else:
            quality_factors.append(0.7)

        # Structure completeness. Headings are matched at line start: a plain
        # substring test for "# " would also be satisfied by "## " headings.
        lines = markdown_content.splitlines()
        structure_score = 0.0
        if any(line.startswith("# ") for line in lines):  # Has title
            structure_score += 0.2
        if any(line.startswith("## ") for line in lines):  # Has sections
            structure_score += 0.3
        if "[" in markdown_content and "](" in markdown_content:  # Has links
            structure_score += 0.3
        if "**" in markdown_content:  # Has bold formatting
            structure_score += 0.2
        quality_factors.append(structure_score)

        return sum(quality_factors) / len(quality_factors)

    def _generate_export_metadata(
        self,
        video_title: str,
        video_url: str,
        executive_summary: Optional["ExecutiveSummary"],
        sections: List["TimestampedSection"],
        config: "MarkdownExportConfig",
    ) -> Dict[str, Any]:
        """Generate metadata for export.

        Always contains the video title/URL, export format, creation time and
        the config flags; adds "executive_summary" and "sections" entries when
        those components are present. When the summary has no key metrics,
        its confidence score and word count are reported as None.
        """
        metadata = {
            "video_title": video_title,
            "video_url": video_url,
            "export_format": "enhanced_markdown",
            "created_at": datetime.now().isoformat(),
            "config": {
                "include_executive_summary": config.include_executive_summary,
                "include_timestamps": config.include_timestamps,
                "include_toc": config.include_toc,
                "section_detail_level": config.section_detail_level,
            },
        }

        if executive_summary:
            metrics = executive_summary.key_metrics
            metadata["executive_summary"] = {
                "generated": True,
                "confidence_score": metrics.confidence_score if metrics else None,
                "processing_time": executive_summary.processing_time_seconds,
                "word_count": metrics.word_count if metrics else None,
            }

        if sections:
            metadata["sections"] = {
                "total_sections": len(sections),
                "avg_confidence": (
                    sum(s.confidence_score for s in sections) / len(sections)
                ),
                "total_duration": max(s.end_timestamp for s in sections),
            }

        return metadata

    async def create_table_of_contents_only(
        self,
        sections: List["TimestampedSection"],
    ) -> str:
        """Create standalone table of contents."""
        return await self.timestamp_processor.generate_table_of_contents(sections)

    def format_for_platform(self, markdown_content: str, platform: str) -> str:
        """Format markdown for specific platforms (GitHub, Notion, etc.).

        Args:
            markdown_content: Markdown document to adapt.
            platform: Target platform name, matched case-insensitively
                against "github", "notion" and "obsidian".

        Returns:
            The adapted markdown; unchanged for any other platform name.
        """
        platform_formatters = {
            "github": self._format_for_github,
            "notion": self._format_for_notion,
            "obsidian": self._format_for_obsidian,
        }
        formatter = platform_formatters.get(platform.lower())
        return formatter(markdown_content) if formatter else markdown_content

    def _format_for_github(self, content: str) -> str:
        """Optimize for GitHub markdown rendering."""
        # GitHub supports most standard markdown features
        return content

    def _format_for_notion(self, content: str) -> str:
        """Optimize for Notion markdown import.

        Removes the bold wrapper around jump links. The wrapper closes with
        ")**" (after the URL), so both markers are stripped together; removing
        only the opening "**" would leave a dangling "**" behind each link.
        """
        content = self._BOLD_JUMP_LINK.sub(r"\1", content)
        # Previous simplifications, kept for content that does not match the
        # full jump-link pattern above.
        content = content.replace("**[🎬", "[🎬")
        content = content.replace("]**", "]")
        return content

    def _format_for_obsidian(self, content: str) -> str:
        """Optimize for Obsidian markdown."""
        # Obsidian supports wiki-style links and other features
        # Add backlink support if needed
        return content

    def get_formatter_stats(self) -> Dict[str, Any]:
        """Get formatter configuration and statistics."""
        return {
            "service_name": "EnhancedMarkdownFormatter",
            "max_line_length": self.max_line_length,
            "heading_levels": self.heading_levels,
            "supported_platforms": ["github", "notion", "obsidian", "standard"],
        }