"""Enhanced Markdown Formatter for professional export documents.

This service creates professional markdown documents with executive summaries,
timestamped sections, table of contents, and consistent formatting.
"""
|
|
|
|
import asyncio
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any, List, Optional

from ..core.exceptions import ServiceError
from ..services.executive_summary_generator import ExecutiveSummary, ExecutiveSummaryGenerator
from ..services.timestamp_processor import TimestampedSection, TimestampProcessor
|
|
|
|
# Module-level logger, named after this module so records can be filtered per module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class MarkdownExportConfig:
    """Switches controlling which parts of a markdown export are produced.

    Every field has a default, so ``MarkdownExportConfig()`` yields a full
    export at the "standard" detail level.
    """

    # Generate and render the executive summary section.
    include_executive_summary: bool = True
    # Generate and render the timestamped sections (needs transcript data).
    include_timestamps: bool = True
    # Render a table of contents when sections are available.
    include_toc: bool = True
    # One of "brief", "standard" or "detailed"; the value is not validated here.
    section_detail_level: str = "standard"
    # Emit the title/metadata header at the top of the document.
    include_metadata_header: bool = True
    # Emit the footer at the bottom of the document.
    include_footer: bool = True
    # NOTE(review): not read anywhere in this module - presumably consumed by
    # callers or a templating layer; confirm before relying on it.
    custom_template_id: Optional[str] = None
|
|
|
|
|
|
@dataclass
class EnhancedMarkdownExport:
    """Value object returned by an enhanced markdown export.

    All fields are required and positional order is part of the interface.
    """

    # The fully assembled markdown document.
    markdown_content: str
    # The generated executive summary, or None when disabled or unavailable.
    executive_summary: Optional[ExecutiveSummary]
    # Timestamped sections included in the document (may be empty).
    sections: List[TimestampedSection]
    # Rendered table of contents; empty string when none was generated.
    table_of_contents: str
    # Free-form descriptive metadata about the export.
    metadata: Dict[str, Any]
    # Heuristic quality score for the export.
    quality_score: float
    # Seconds spent producing the export.
    processing_time_seconds: float
    # The configuration the export was produced with.
    export_config: MarkdownExportConfig
    # When the export object was created.
    created_at: datetime
|
|
|
|
|
|
class EnhancedMarkdownFormatter:
    """Service for creating professional markdown documents.

    Combines an optional executive summary, optional timestamped sections, a
    table of contents, the main content and a header/footer into one markdown
    document, and reports a heuristic quality score and metadata with it.
    """

    # Number of content lines kept when ``section_detail_level`` is "brief".
    BRIEF_CONTENT_MAX_LINES = 10
    # Maximum characters of a section's raw content shown in "detailed" mode.
    SECTION_PREVIEW_MAX_CHARS = 500
    # Matches the bold-wrapped jump link written by _format_timestamped_sections,
    # i.e. ``**[🎬 ...](url)**``; group 1 is the bare link without the bold markers.
    _BOLD_JUMP_LINK = re.compile(r"\*\*(\[🎬[^\]]*\]\([^)]*\))\*\*")

    def __init__(
        self,
        executive_generator: Optional[ExecutiveSummaryGenerator] = None,
        timestamp_processor: Optional[TimestampProcessor] = None
    ):
        """Initialize enhanced markdown formatter.

        Args:
            executive_generator: Service for executive summaries; a default
                instance is created when omitted.
            timestamp_processor: Service for timestamp processing; a default
                instance is created when omitted.
        """
        # Compare against None so that a supplied collaborator is never
        # replaced just because it happens to evaluate as falsy.
        self.executive_generator = (
            executive_generator if executive_generator is not None
            else ExecutiveSummaryGenerator()
        )
        self.timestamp_processor = (
            timestamp_processor if timestamp_processor is not None
            else TimestampProcessor()
        )

        # Formatting configuration
        self.max_line_length = 80
        self.heading_levels = {
            "title": "#",
            "section": "##",
            "subsection": "###",
            "detail": "####"
        }

        logger.info("EnhancedMarkdownFormatter initialized")

    async def create_enhanced_export(
        self,
        video_title: str,
        video_url: str,
        content: str,
        transcript_data: Optional[List[Dict[str, Any]]] = None,
        export_config: Optional[MarkdownExportConfig] = None
    ) -> EnhancedMarkdownExport:
        """Create comprehensive enhanced markdown export.

        Args:
            video_title: Title of the video
            video_url: YouTube video URL
            content: Main content/summary text
            transcript_data: Raw transcript data with timestamps; timestamped
                sections are skipped when this is missing or empty
            export_config: Export configuration options; defaults are used
                when omitted

        Returns:
            Enhanced markdown export result

        Raises:
            ServiceError: If any step of the export fails; the original
                exception is chained as ``__cause__``.
        """
        # Monotonic clock: the elapsed time must not be affected by wall-clock
        # adjustments that happen while the export is running.
        started = time.perf_counter()
        config = export_config if export_config is not None else MarkdownExportConfig()

        try:
            # Collect the optional components under a label so each result can
            # be matched back to its producer after running them concurrently.
            pending = {}
            if config.include_executive_summary:
                pending["summary"] = self._generate_executive_summary(content, video_title)
            if config.include_timestamps and transcript_data:
                pending["sections"] = self._generate_timestamp_sections(
                    transcript_data, video_url, video_title
                )

            outcomes = await asyncio.gather(*pending.values(), return_exceptions=True)

            resolved = {}
            for label, outcome in zip(pending, outcomes):
                # BaseException (not Exception) so that a CancelledError handed
                # back by gather is treated as a failed component instead of
                # being used as if it were a real result.
                if isinstance(outcome, BaseException):
                    logger.warning("Export component %r failed: %s", label, outcome)
                    outcome = None
                resolved[label] = outcome

            executive_summary = resolved.get("summary")
            section_result = resolved.get("sections")
            sections = section_result.sections if section_result else []

            # Generate table of contents
            toc = ""
            if config.include_toc and sections:
                toc = await self.timestamp_processor.generate_table_of_contents(sections)

            # Assemble final markdown document
            markdown_content = await self._assemble_markdown_document(
                video_title=video_title,
                video_url=video_url,
                content=content,
                executive_summary=executive_summary,
                sections=sections,
                table_of_contents=toc,
                config=config
            )

            quality_score = self._calculate_export_quality(
                executive_summary, sections, markdown_content
            )
            metadata = self._generate_export_metadata(
                video_title, video_url, executive_summary, sections, config
            )

            return EnhancedMarkdownExport(
                markdown_content=markdown_content,
                executive_summary=executive_summary,
                sections=sections,
                table_of_contents=toc,
                metadata=metadata,
                quality_score=quality_score,
                processing_time_seconds=time.perf_counter() - started,
                export_config=config,
                created_at=datetime.now()
            )

        except Exception as e:
            logger.error("Error creating enhanced export: %s", e, exc_info=True)
            raise ServiceError(f"Enhanced export creation failed: {str(e)}") from e

    async def _generate_executive_summary(
        self,
        content: str,
        video_title: str
    ) -> Optional[ExecutiveSummary]:
        """Generate the executive summary component.

        Best effort: any failure is logged and reported as ``None`` so the
        export can continue without a summary.
        """
        try:
            return await self.executive_generator.generate_executive_summary(
                content=content,
                video_title=video_title,
                summary_type="business"
            )
        except Exception as e:
            logger.warning("Executive summary generation failed: %s", e)
            return None

    async def _generate_timestamp_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        video_url: str,
        video_title: str
    ):
        """Generate the timestamp sections component.

        Returns whatever the timestamp processor's section detection returns
        (the caller reads its ``sections`` attribute), or ``None`` when
        detection fails; failures are logged, not raised.
        """
        try:
            return await self.timestamp_processor.detect_semantic_sections(
                transcript_data=transcript_data,
                video_url=video_url,
                video_title=video_title
            )
        except Exception as e:
            logger.warning("Timestamp section generation failed: %s", e)
            return None

    async def _assemble_markdown_document(
        self,
        video_title: str,
        video_url: str,
        content: str,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        table_of_contents: str,
        config: MarkdownExportConfig
    ) -> str:
        """Assemble the final markdown document.

        Parts are emitted in a fixed order (header, executive summary, table
        of contents, main content, timestamped sections, footer); empty parts
        are dropped and the rest are joined with a blank line between them.
        """
        document_parts = []

        # 1. Metadata header: the richer variant needs an executive summary.
        if config.include_metadata_header:
            if executive_summary:
                header = await self.executive_generator.generate_metadata_header(
                    executive_summary, video_title, video_url
                )
            else:
                header = self._generate_basic_header(video_title, video_url)
            document_parts.append(header)

        # 2. Executive summary section
        if config.include_executive_summary and executive_summary:
            document_parts.append(
                self._format_executive_summary_section(executive_summary)
            )

        # 3. Table of contents
        if config.include_toc and table_of_contents:
            document_parts.append(table_of_contents)

        # 4. Main content section (always present)
        document_parts.append(self._format_main_content_section(content, config))

        # 5. Timestamped sections
        if config.include_timestamps and sections:
            document_parts.append(
                self._format_timestamped_sections(sections, config)
            )

        # 6. Footer: mirrors the header's choice of rich vs. basic variant.
        if config.include_footer:
            if executive_summary:
                footer = await self.executive_generator.generate_executive_footer(
                    executive_summary
                )
            else:
                footer = self._generate_basic_footer()
            document_parts.append(footer)

        # filter(None, ...) removes parts that came back empty.
        return '\n\n'.join(filter(None, document_parts))

    def _generate_basic_header(self, video_title: str, video_url: str) -> str:
        """Generate basic header when executive summary not available."""
        return f"""# {video_title}

**Analysis Date**: {datetime.now().strftime("%B %d, %Y")}
**Source**: {video_url}

"""

    def _format_executive_summary_section(self, executive_summary: ExecutiveSummary) -> str:
        """Format executive summary as markdown section.

        The overview is always rendered; key metrics, business value, action
        items and strategic implications are rendered only when present.
        """
        section_parts = [
            "## Executive Summary",
            "",
            executive_summary.overview
        ]

        metrics = executive_summary.key_metrics
        if metrics:
            section_parts.extend([
                "",
                "### Key Metrics",
                f"- **Duration**: {metrics.duration_minutes} minutes",
                f"- **Complexity**: {metrics.complexity_level.title()}",
                # Only the first three topics are listed.
                f"- **Main Topics**: {', '.join(metrics.main_topics[:3])}"
            ])

        if executive_summary.business_value:
            section_parts.extend([
                "",
                "### Business Value",
                executive_summary.business_value
            ])

        if executive_summary.action_items:
            section_parts.extend(["", "### Action Items"])
            section_parts.extend(
                f"- {item}" for item in executive_summary.action_items
            )

        if executive_summary.strategic_implications:
            section_parts.extend(["", "### Strategic Implications"])
            section_parts.extend(
                f"- {implication}"
                for implication in executive_summary.strategic_implications
            )

        return '\n'.join(section_parts)

    def _format_main_content_section(
        self,
        content: str,
        config: MarkdownExportConfig
    ) -> str:
        """Format main content section.

        At the "brief" detail level, content longer than
        ``BRIEF_CONTENT_MAX_LINES`` lines is cut to that many lines and a
        truncation notice is appended.
        """
        if config.section_detail_level == "brief":
            content_lines = content.split('\n')
            if len(content_lines) > self.BRIEF_CONTENT_MAX_LINES:
                kept = '\n'.join(content_lines[:self.BRIEF_CONTENT_MAX_LINES])
                content = kept + "\n\n*[Content truncated for brief format]*"

        return f"""## Content Analysis

{content}"""

    def _format_timestamped_sections(
        self,
        sections: List[TimestampedSection],
        config: MarkdownExportConfig
    ) -> str:
        """Format timestamped sections.

        Every section gets a heading and a jump link. The summary is added
        unless the detail level is "brief"; key points and a content preview
        are added only at the "detailed" level.

        Returns:
            The rendered markdown, or an empty string when there are no
            sections.
        """
        if not sections:
            return ""

        detail_level = config.section_detail_level
        section_parts = [
            "## Detailed Sections",
            ""
        ]

        for section in sections:
            timestamp_display = self.timestamp_processor.seconds_to_timestamp(
                section.start_timestamp
            )

            # The heading shows the formatted start time; the clickable link
            # is emitted on its own line below it.
            section_parts.append(f"### [{timestamp_display}] {section.title}")
            section_parts.append("")
            section_parts.append(f"**[🎬 Jump to this section]({section.youtube_link})**")
            section_parts.append("")

            if section.summary and detail_level != "brief":
                section_parts.append(f"*{section.summary}*")
                section_parts.append("")

            if section.key_points and detail_level == "detailed":
                section_parts.append("**Key Points:**")
                section_parts.extend(f"- {point}" for point in section.key_points)
                section_parts.append("")

            if detail_level == "detailed" and section.content:
                # Limit content length for readability.
                content_preview = section.content[:self.SECTION_PREVIEW_MAX_CHARS]
                if len(section.content) > self.SECTION_PREVIEW_MAX_CHARS:
                    content_preview += "..."

                section_parts.append("**Content:**")
                section_parts.append(content_preview)
                section_parts.append("")

        return '\n'.join(section_parts)

    def _generate_basic_footer(self) -> str:
        """Generate basic footer when executive summary not available."""
        return f"""

---

**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
*This analysis was generated using AI and is intended for informational purposes.*
"""

    def _calculate_export_quality(
        self,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        markdown_content: str
    ) -> float:
        """Calculate overall quality score for export.

        The score is the plain average of the factors that apply: the
        executive summary's confidence (when it carries key metrics), the
        mean section confidence (when there are sections), a document-length
        factor and a structure-completeness factor.
        """
        quality_factors = []

        # key_metrics is optional elsewhere in this class, so a summary
        # without metrics simply contributes no factor instead of raising.
        if executive_summary and executive_summary.key_metrics:
            quality_factors.append(executive_summary.key_metrics.confidence_score)

        if sections:
            quality_factors.append(
                sum(s.confidence_score for s in sections) / len(sections)
            )

        # Content length: best inside the 1000..50000 character range.
        content_length = len(markdown_content)
        if 1000 <= content_length <= 50000:
            quality_factors.append(0.9)
        elif content_length < 1000:
            quality_factors.append(0.6)
        else:
            quality_factors.append(0.7)

        # Structure completeness. Headings are matched at the start of a line:
        # a substring test for "# " would also match "## " and "### ", which
        # made the title check succeed for every document.
        lines = markdown_content.splitlines()
        structure_score = 0.0
        if any(line.startswith("# ") for line in lines):  # Has title
            structure_score += 0.2
        if any(line.startswith("## ") for line in lines):  # Has sections
            structure_score += 0.3
        if "[" in markdown_content and "](" in markdown_content:  # Has links
            structure_score += 0.3
        if "**" in markdown_content:  # Has bold formatting
            structure_score += 0.2
        quality_factors.append(structure_score)

        # At least the length and structure factors are always present.
        return sum(quality_factors) / len(quality_factors)

    def _generate_export_metadata(
        self,
        video_title: str,
        video_url: str,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        config: MarkdownExportConfig
    ) -> Dict[str, Any]:
        """Generate metadata for export.

        Always contains the video identity, export format, creation time and
        the relevant config switches. An ``executive_summary`` entry and a
        ``sections`` entry are added when those components exist.
        """
        metadata = {
            "video_title": video_title,
            "video_url": video_url,
            "export_format": "enhanced_markdown",
            "created_at": datetime.now().isoformat(),
            "config": {
                "include_executive_summary": config.include_executive_summary,
                "include_timestamps": config.include_timestamps,
                "include_toc": config.include_toc,
                "section_detail_level": config.section_detail_level
            }
        }

        if executive_summary:
            # Metric-derived values are None when the summary has no metrics.
            metrics = executive_summary.key_metrics
            metadata["executive_summary"] = {
                "generated": True,
                "confidence_score": metrics.confidence_score if metrics else None,
                "processing_time": executive_summary.processing_time_seconds,
                "word_count": metrics.word_count if metrics else None
            }

        if sections:
            metadata["sections"] = {
                "total_sections": len(sections),
                "avg_confidence": sum(s.confidence_score for s in sections) / len(sections),
                # Reported as the latest end timestamp among the sections.
                "total_duration": max(s.end_timestamp for s in sections)
            }

        return metadata

    async def create_table_of_contents_only(
        self,
        sections: List[TimestampedSection]
    ) -> str:
        """Create standalone table of contents."""
        return await self.timestamp_processor.generate_table_of_contents(sections)

    def format_for_platform(self, markdown_content: str, platform: str) -> str:
        """Format markdown for specific platforms (GitHub, Notion, etc.).

        Args:
            markdown_content: The markdown to adapt.
            platform: Platform name, matched case-insensitively and ignoring
                surrounding whitespace.

        Returns:
            The adapted markdown, or the input unchanged for a platform
            without a dedicated adapter (including ``None`` or "").
        """
        adapters = {
            "github": self._format_for_github,
            "notion": self._format_for_notion,
            "obsidian": self._format_for_obsidian,
        }
        adapter = adapters.get((platform or "").strip().lower())
        return adapter(markdown_content) if adapter else markdown_content

    def _format_for_github(self, content: str) -> str:
        """Optimize for GitHub markdown rendering (currently a no-op)."""
        return content

    def _format_for_notion(self, content: str) -> str:
        """Optimize for Notion markdown import.

        Removes the bold markers around the section jump links, turning
        ``**[🎬 ...](url)**`` into ``[🎬 ...](url)``. Both markers are removed
        together, and other bold text is left untouched.
        """
        return self._BOLD_JUMP_LINK.sub(r"\1", content)

    def _format_for_obsidian(self, content: str) -> str:
        """Optimize for Obsidian markdown (currently a no-op)."""
        return content

    def get_formatter_stats(self) -> Dict[str, Any]:
        """Get formatter configuration and statistics."""
        return {
            "service_name": "EnhancedMarkdownFormatter",
            "max_line_length": self.max_line_length,
            # Copy so callers cannot mutate the formatter's own mapping.
            "heading_levels": dict(self.heading_levels),
            "supported_platforms": ["github", "notion", "obsidian", "standard"]
        }