youtube-summarizer/backend/services/enhanced_export_service.py

714 lines
26 KiB
Python

"""Enhanced export service with executive summaries and timestamped sections."""
import re
import logging
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime
from dataclasses import dataclass
from enum import Enum
from pydantic import BaseModel
from ..models.transcript import TranscriptSegment
from ..core.exceptions import ServiceError
from .deepseek_service import DeepSeekService
# Module-level logger named after this module; the service methods in this file log through it.
logger = logging.getLogger(__name__)
class ExportFormat(str, Enum):
    """Output formats an enhanced export can be requested in.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``ExportFormat.PDF == "pdf"``).
    """

    MARKDOWN = "markdown"
    HTML = "html"
    PDF = "pdf"
    JSON = "json"
@dataclass
class VideoMetadata:
    """Descriptive facts about the source video that an export is built from."""

    # Required identification fields.
    video_id: str
    title: str
    channel: str
    duration: int  # length of the video, in seconds

    # Optional extras; each one stays ``None`` when it is not supplied.
    view_count: Optional[int] = None
    upload_date: Optional[str] = None
    description: Optional[str] = None
class ExecutiveSummary(BaseModel):
    """Business-oriented summary of a video aimed at executive readers."""

    # Executive overview text, expected to be 2-3 paragraphs long.
    overview: str
    # Free-form metrics such as duration, word count and topics.
    key_metrics: Dict[str, Any]
    # Primary topics covered by the video.
    main_topics: List[str]
    # Statement of the business value; may be left out.
    business_value: Optional[str] = None
    # Actionable follow-ups for executives.
    action_items: List[str]
    # Sentiment scores keyed by score name.
    sentiment_analysis: Dict[str, float]
class TimestampedSection(BaseModel):
    """One section of the video together with its timestamp navigation data."""

    # Section number within the export.
    index: int
    title: str
    # Offset, in seconds, at which the section starts.
    start_timestamp: int
    # Offset at which the section ends.
    end_timestamp: int
    # Link to the video for this section.
    youtube_link: str
    # Text belonging to the section.
    content: str
    # Brief summary of the section.
    summary: str
    key_points: List[str]
class ExportConfig(BaseModel):
    """Options that control what an enhanced export contains."""

    # Requested output format; Markdown unless overridden.
    format: ExportFormat = ExportFormat.MARKDOWN
    # Toggles for the optional parts of the export.
    include_executive_summary: bool = True
    include_timestamps: bool = True
    include_toc: bool = True
    # One of "brief", "standard" or "detailed".
    section_detail_level: str = "standard"
    # Optional identifier of a custom template.
    custom_template_id: Optional[str] = None
class EnhancedMarkdownExport(BaseModel):
    """Complete result of an enhanced export run."""

    # Identifier of the summary this export was generated for.
    summary_id: str
    video_metadata: VideoMetadata
    # Optional because ``generate_enhanced_export`` passes ``None`` here when
    # ``ExportConfig.include_executive_summary`` is False; a required field
    # would make that configuration fail model validation.
    executive_summary: Optional[ExecutiveSummary] = None
    # Table-of-contents entries, one string per section.
    table_of_contents: List[str]
    sections: List[TimestampedSection]
    # The rendered Markdown document.
    markdown_content: str
    # Additional details about the export run.
    metadata: Dict[str, Any]
    quality_score: float
    processing_time_seconds: float
    created_at: datetime
class EnhancedExportService:
    """Builds enhanced exports that combine an executive summary with timestamped sections."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Set up the service with the AI backend used for content generation.

        Args:
            ai_service: DeepSeek AI service to use. When omitted (or falsy),
                a new ``DeepSeekService`` is created.
        """
        if not ai_service:
            ai_service = DeepSeekService()
        self.ai_service = ai_service
async def generate_enhanced_export(
    self,
    summary_id: str,
    transcript: str,
    video_metadata: VideoMetadata,
    config: Optional[ExportConfig] = None
) -> EnhancedMarkdownExport:
    """Generate an enhanced export with every feature enabled by ``config``.

    The steps run in order: executive summary, timestamped sections, table
    of contents, Markdown rendering and quality scoring.

    Args:
        summary_id: Summary ID, used for tracking and log messages.
        transcript: Video transcript text.
        video_metadata: Information about the video.
        config: Export configuration; a default ``ExportConfig`` is used
            when omitted.

    Returns:
        The complete enhanced export.

    Raises:
        ServiceError: If the transcript is empty or shorter than 50
            characters once stripped, or if any generation step fails. In
            the latter case the original exception is attached as the cause.
    """
    if not transcript or len(transcript.strip()) < 50:
        raise ServiceError("Transcript too short for enhanced export")

    config = config or ExportConfig()
    start_time = datetime.now()
    logger.info(f"Starting enhanced export for summary {summary_id}")

    try:
        # 1. Executive summary (skipped when disabled in the config).
        executive_summary = None
        if config.include_executive_summary:
            executive_summary = await self._generate_executive_summary(
                transcript, video_metadata
            )

        # 2. Timestamped sections.
        sections = []
        if config.include_timestamps:
            sections = await self._create_timestamped_sections(
                transcript, video_metadata, config.section_detail_level
            )

        # 3. Table of contents, derived from the sections.
        table_of_contents = []
        if config.include_toc:
            table_of_contents = self._generate_table_of_contents(sections)

        # 4. Markdown document.
        markdown_content = await self._generate_markdown_content(
            video_metadata, executive_summary, sections, table_of_contents, config
        )

        # 5. Quality score.
        quality_score = self._calculate_export_quality(
            executive_summary, sections, markdown_content
        )

        processing_time = (datetime.now() - start_time).total_seconds()
        result = EnhancedMarkdownExport(
            summary_id=summary_id,
            video_metadata=video_metadata,
            executive_summary=executive_summary,
            table_of_contents=table_of_contents,
            sections=sections,
            markdown_content=markdown_content,
            metadata={
                "export_config": config.dict(),
                "processing_time": processing_time,
                "sections_count": len(sections),
                "word_count": len(markdown_content.split())
            },
            quality_score=quality_score,
            processing_time_seconds=processing_time,
            created_at=start_time
        )
        logger.info(f"Enhanced export completed for summary {summary_id} in {processing_time:.2f}s")
        return result
    except Exception as e:
        # logger.exception records the traceback, and ``from e`` keeps the
        # root cause attached to the ServiceError that callers receive.
        logger.exception(f"Error generating enhanced export for {summary_id}: {e}")
        raise ServiceError(f"Enhanced export failed: {str(e)}") from e
async def _generate_executive_summary(
    self,
    transcript: str,
    video_metadata: VideoMetadata
) -> ExecutiveSummary:
    """Generate an executive summary with a business focus.

    The AI service is asked for a JSON document matching ``ExecutiveSummary``.
    Locally calculated metrics (duration, word count, video id) always
    override whatever the model reports for those keys. If the response is
    not valid JSON, a generic fallback summary is returned whose overview is
    the first 500 characters of the raw response.

    Args:
        transcript: Video transcript.
        video_metadata: Video information.

    Returns:
        Executive summary.

    Raises:
        ServiceError: If the AI call fails or its JSON cannot be turned into
            an ``ExecutiveSummary``.
    """
    # Imported before the ``try`` block: an import inside it makes ``json`` a
    # local name that is still unbound when the AI call raises first, and
    # evaluating ``except json.JSONDecodeError`` would then itself fail with
    # UnboundLocalError instead of reaching the ServiceError handler.
    import json

    max_transcript_chars = 6000  # Limit for token constraints
    duration_minutes = video_metadata.duration // 60
    word_count = len(transcript.split())
    calculated_metrics = {
        "duration_minutes": duration_minutes,
        "word_count": word_count,
        "video_id": video_metadata.video_id
    }

    system_prompt = """You are an executive assistant creating a high-level summary for business leaders.
Focus on business value, strategic insights, ROI implications, and actionable intelligence.
Your audience consists of executives who need to quickly understand the key value and decisions points.
Provide your response as valid JSON with this exact structure:
{
"overview": "2-3 paragraph executive overview emphasizing business value and strategic insights",
"key_metrics": {
"duration_minutes": 45,
"word_count": 1200,
"complexity_level": "intermediate",
"primary_audience": "technical professionals"
},
"main_topics": ["List of 4-6 primary topics covered"],
"business_value": "Clear statement of business value and ROI implications",
"action_items": ["List of 3-5 specific actionable items for executives"],
"sentiment_analysis": {
"overall_sentiment": 0.7,
"confidence_level": 0.8,
"business_optimism": 0.6
}
}"""

    # The transcript excerpt is interpolated on its own line; nothing but the
    # transcript text may follow it, since everything here is sent verbatim.
    prompt = f"""Video Title: {video_metadata.title}
Channel: {video_metadata.channel}
Duration: {duration_minutes} minutes
Word Count: {word_count} words
Please create an executive summary of the following video transcript. Focus on business implications, strategic value, and actionable insights that would be relevant to decision-makers and executives.
Transcript:
{transcript[:max_transcript_chars]}
Create a comprehensive executive summary that captures the strategic value and business implications."""

    # Bound up front so the JSON fallback can always inspect it.
    response = None
    try:
        response = await self.ai_service.generate_response(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=1200
        )
        summary_data = json.loads(response)

        # The model may omit key_metrics or return a non-dict value; start
        # from an empty dict in that case rather than failing the summary.
        reported_metrics = summary_data.get("key_metrics")
        if not isinstance(reported_metrics, dict):
            reported_metrics = {}
        reported_metrics.update(calculated_metrics)
        summary_data["key_metrics"] = reported_metrics

        return ExecutiveSummary(**summary_data)
    except json.JSONDecodeError:
        # Fallback if JSON parsing fails
        logger.warning("Executive summary response was not valid JSON, creating fallback")
        return ExecutiveSummary(
            overview=response[:500] if response else "Executive summary generation failed.",
            key_metrics={**calculated_metrics, "complexity_level": "unknown"},
            main_topics=["Content analysis", "Business insights"],
            business_value="Detailed analysis available in full summary.",
            action_items=["Review full content", "Assess implementation options"],
            sentiment_analysis={"overall_sentiment": 0.5, "confidence_level": 0.3}
        )
    except Exception as e:
        logger.error(f"Error generating executive summary: {e}")
        raise ServiceError(f"Executive summary generation failed: {str(e)}") from e
async def _create_timestamped_sections(
    self,
    transcript: str,
    video_metadata: VideoMetadata,
    detail_level: str = "standard"
) -> List[TimestampedSection]:
    """Split the transcript into sections and enrich each one.

    Every non-blank section gets an AI-generated title and summary, locally
    extracted key points, and a YouTube link that starts at the section's
    start time.

    Args:
        transcript: Video transcript.
        video_metadata: Video metadata (supplies the video id and duration).
        detail_level: Level of detail requested for section summaries.

    Returns:
        List of timestamped sections.
    """
    detected = self._detect_section_breaks(transcript)
    if not detected:
        # Nothing detected: treat the whole transcript as one section that
        # spans the full video.
        logger.warning("No sections detected, creating single section")
        detected = [(transcript, 0, video_metadata.duration)]

    built = []
    for number, (text, begins_at, ends_at) in enumerate(detected, start=1):
        if not text.strip():
            continue

        heading = await self._generate_section_title(text, number)
        digest = await self._generate_section_summary(text, detail_level)
        highlights = self._extract_key_points(text)
        link = f"https://youtube.com/watch?v={video_metadata.video_id}&t={begins_at}s"

        built.append(
            TimestampedSection(
                index=number,
                title=heading,
                start_timestamp=begins_at,
                end_timestamp=ends_at,
                youtube_link=link,
                content=text,
                summary=digest,
                key_points=highlights
            )
        )

    logger.info(f"Created {len(built)} timestamped sections")
    return built
def _detect_section_breaks(self, transcript: str) -> List[Tuple[str, int, int]]:
"""Detect natural section breaks in transcript.
Args:
transcript: Full transcript text
Returns:
List of (content, start_time, end_time) tuples
"""
# Simple heuristic-based section detection
# In a real implementation, this could use more sophisticated NLP
paragraphs = transcript.split('\n\n')
if not paragraphs:
return [(transcript, 0, 300)] # Default 5 minute section
sections = []
current_content = ""
section_start = 0
words_per_minute = 150 # Average speaking rate
for i, paragraph in enumerate(paragraphs):
current_content += paragraph + "\n\n"
# Create section break every ~500 words or at natural breaks
word_count = len(current_content.split())
if word_count > 500 or i == len(paragraphs) - 1:
# Estimate timestamps based on word count
section_duration = (word_count / words_per_minute) * 60
section_end = section_start + int(section_duration)
sections.append((current_content.strip(), section_start, section_end))
# Reset for next section
section_start = section_end
current_content = ""
return sections
async def _generate_section_title(self, content: str, section_index: int) -> str:
"""Generate descriptive title for a section.
Args:
content: Section content
section_index: Section number
Returns:
Section title
"""
# Extract first meaningful sentence or topic
sentences = content.split('.')[:3] # First 3 sentences
preview = '. '.join(sentences)[:200]
try:
prompt = f"Create a concise, descriptive title (4-8 words) for this video section:\n\n{preview}"
title = await self.ai_service.generate_response(
prompt=prompt,
system_prompt="Generate clear, descriptive section titles for video content. Keep titles under 8 words.",
temperature=0.4,
max_tokens=50
)
# Clean up title
title = title.strip().strip('"\'')
if len(title) > 60: # Too long, truncate
title = title[:57] + "..."
return title or f"Section {section_index}"
except Exception:
# Fallback to simple title
logger.debug(f"Could not generate title for section {section_index}, using fallback")
return f"Section {section_index}"
async def _generate_section_summary(self, content: str, detail_level: str) -> str:
"""Generate summary for a section.
Args:
content: Section content
detail_level: Level of detail
Returns:
Section summary
"""
if detail_level == "brief":
max_tokens = 100
target = "1-2 sentences"
elif detail_level == "detailed":
max_tokens = 300
target = "2-3 paragraphs"
else: # standard
max_tokens = 150
target = "2-3 sentences"
try:
prompt = f"Summarize this video section in {target}:\n\n{content[:1000]}"
summary = await self.ai_service.generate_response(
prompt=prompt,
system_prompt=f"Create concise {target} summaries of video sections that capture the main points.",
temperature=0.3,
max_tokens=max_tokens
)
return summary.strip()
except Exception:
# Fallback to first sentence(s)
sentences = content.split('.')[:2]
return '. '.join(sentences) + "."
def _extract_key_points(self, content: str) -> List[str]:
"""Extract key points from section content.
Args:
content: Section content
Returns:
List of key points
"""
# Simple extraction based on sentence importance
sentences = [s.strip() for s in content.split('.') if s.strip()]
# Score sentences by length and keyword presence
important_keywords = [
'important', 'key', 'main', 'crucial', 'essential', 'critical',
'should', 'must', 'need', 'remember', 'note that', 'takeaway'
]
scored_sentences = []
for sentence in sentences[:10]: # Limit to first 10 sentences
if len(sentence.split()) < 5: # Skip very short sentences
continue
score = 0
sentence_lower = sentence.lower()
# Score based on keywords
for keyword in important_keywords:
if keyword in sentence_lower:
score += 2
# Score based on sentence length (moderate length preferred)
word_count = len(sentence.split())
if 8 <= word_count <= 20:
score += 1
scored_sentences.append((score, sentence))
# Sort by score and take top points
scored_sentences.sort(key=lambda x: x[0], reverse=True)
key_points = [sentence for score, sentence in scored_sentences[:5] if score > 0]
return key_points
def _generate_table_of_contents(self, sections: List[TimestampedSection]) -> List[str]:
    """Build one table-of-contents line per section.

    Each entry has the form ``[<timestamp>] <title>``, where the timestamp
    is the section's start time as formatted by ``_format_timestamp``.

    Args:
        sections: List of timestamped sections.

    Returns:
        Table of contents entries, in section order.
    """
    return [
        f"[{self._format_timestamp(section.start_timestamp)}] {section.title}"
        for section in sections
    ]
def _format_timestamp(self, seconds: int) -> str:
"""Format seconds as HH:MM:SS or MM:SS.
Args:
seconds: Time in seconds
Returns:
Formatted timestamp string
"""
hours = seconds // 3600
minutes = (seconds % 3600) // 60
secs = seconds % 60
if hours > 0:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
else:
return f"{minutes:02d}:{secs:02d}"
async def _generate_markdown_content(
    self,
    video_metadata: VideoMetadata,
    executive_summary: Optional[ExecutiveSummary],
    sections: List[TimestampedSection],
    table_of_contents: List[str],
    config: ExportConfig
) -> str:
    """Generate the complete Markdown document.

    The document consists of a header with the video details, then (each
    only when present and enabled in ``config``) the executive summary, the
    table of contents and the detailed sections, followed by a footer.

    Args:
        video_metadata: Video metadata.
        executive_summary: Executive summary, or None when not generated.
        sections: Timestamped sections.
        table_of_contents: TOC entries.
        config: Export configuration.

    Returns:
        Complete markdown document.
    """
    # Markdown renders a hard line break only when a line ends in two or
    # more spaces; with a single trailing space the header fields collapse
    # into one paragraph.
    hard_break = "  "
    # Taken once so the header and footer always show the same time.
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # 1. Document header
    markdown_lines = [
        f"# {video_metadata.title}",
        "",
        f"**Channel:** {video_metadata.channel}{hard_break}",
        f"**Duration:** {self._format_timestamp(video_metadata.duration)}{hard_break}",
        f"**Video ID:** {video_metadata.video_id}{hard_break}",
        f"**Analysis Date:** {generated_at}{hard_break}",
        "",
        "---",
        ""
    ]

    # 2. Executive Summary
    if executive_summary and config.include_executive_summary:
        markdown_lines.extend([
            "## 📊 Executive Summary",
            "",
            executive_summary.overview,
            "",
            "### Key Metrics",
            ""
        ])
        markdown_lines.extend(
            f"- **{key.replace('_', ' ').title()}:** {value}"
            for key, value in executive_summary.key_metrics.items()
        )

        markdown_lines.extend(["", "### Action Items", ""])
        markdown_lines.extend(f"- {item}" for item in executive_summary.action_items)

        if executive_summary.business_value:
            markdown_lines.extend([
                "",
                "### Business Value",
                "",
                executive_summary.business_value
            ])
        markdown_lines.extend(["", "---", ""])

    # 3. Table of Contents
    if table_of_contents and config.include_toc:
        markdown_lines.extend([
            "## 📋 Table of Contents",
            ""
        ])
        markdown_lines.extend(
            f"{i}. {entry}" for i, entry in enumerate(table_of_contents, 1)
        )
        markdown_lines.extend(["", "---", ""])

    # 4. Detailed Sections
    if sections and config.include_timestamps:
        markdown_lines.extend([
            "## 📝 Detailed Analysis",
            ""
        ])
        for section in sections:
            timestamp_formatted = self._format_timestamp(section.start_timestamp)

            # Section header with timestamp
            markdown_lines.extend([
                f"### [{timestamp_formatted}] {section.title}",
                "",
                f"**🔗 [Jump to video]({section.youtube_link})**",
                ""
            ])

            # Section summary
            if section.summary:
                markdown_lines.extend([
                    "#### Summary",
                    "",
                    section.summary,
                    ""
                ])

            # Key points
            if section.key_points:
                markdown_lines.extend([
                    "#### Key Points",
                    ""
                ])
                markdown_lines.extend(f"- {point}" for point in section.key_points)
                markdown_lines.append("")

            markdown_lines.extend(["---", ""])

    # 5. Footer
    markdown_lines.extend([
        "## 📄 Document Information",
        "",
        f"- **Generated:** {generated_at}",
        f"- **Source:** [YouTube Video](https://youtube.com/watch?v={video_metadata.video_id})",
        "- **Analysis Type:** Enhanced Export with Timestamps",
        f"- **Sections:** {len(sections)}",
        "",
        "*This analysis was generated using AI-powered video summarization technology.*"
    ])

    return "\n".join(markdown_lines)
def _calculate_export_quality(
    self,
    executive_summary: Optional[ExecutiveSummary],
    sections: List[TimestampedSection],
    markdown_content: str
) -> float:
    """Score the export as the mean of up to three quality factors.

    A factor is included for the executive summary and for the sections
    only when they are present; the Markdown factor is always included, so
    there is always at least one factor to average.

    Args:
        executive_summary: Executive summary, or None when not generated.
        sections: Timestamped sections.
        markdown_content: Generated markdown.

    Returns:
        Quality score between 0.0 and 1.0, rounded to two decimals.
    """
    factor_scores = []

    # Executive summary: substantial overview, enough action items, and a
    # business value statement.
    if executive_summary:
        summary_checks = (
            (len(executive_summary.overview) > 200, 0.3),
            (len(executive_summary.action_items) >= 3, 0.2),
            (bool(executive_summary.business_value), 0.3),
        )
        factor_scores.append(
            sum((weight for passed, weight in summary_checks if passed), 0.0)
        )

    # Sections: average length, share with a summary, share with key points.
    if sections:
        total = len(sections)
        mean_length = sum(len(s.content) for s in sections) / total
        summarized = sum(1 for s in sections if s.summary)
        with_points = sum(1 for s in sections if s.key_points)
        section_checks = (
            (mean_length > 200, 0.3),
            (summarized / total > 0.8, 0.4),
            (with_points / total > 0.5, 0.3),
        )
        factor_scores.append(
            sum((weight for passed, weight in section_checks if passed), 0.0)
        )

    # Markdown: overall size, presence of headings, presence of links.
    markdown_checks = (
        (len(markdown_content) > 2000, 0.4),
        ("## " in markdown_content, 0.3),
        ("[" in markdown_content and "](" in markdown_content, 0.3),
    )
    factor_scores.append(
        sum((weight for passed, weight in markdown_checks if passed), 0.0)
    )

    # Overall quality is the average of the collected factors.
    return round(sum(factor_scores) / len(factor_scores), 2)