"""Enhanced export service with executive summaries and timestamped sections."""

import json
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel

from ..models.transcript import TranscriptSegment
from ..core.exceptions import ServiceError
from .deepseek_service import DeepSeekService

logger = logging.getLogger(__name__)


class ExportFormat(str, Enum):
    """Supported export formats."""

    MARKDOWN = "markdown"
    HTML = "html"
    PDF = "pdf"
    JSON = "json"


@dataclass
class VideoMetadata:
    """Video metadata for export context."""

    video_id: str
    title: str
    channel: str
    duration: int  # seconds
    view_count: Optional[int] = None
    upload_date: Optional[str] = None
    description: Optional[str] = None


class ExecutiveSummary(BaseModel):
    """Executive summary with key business insights."""

    overview: str  # 2-3 paragraph executive overview
    key_metrics: Dict[str, Any]  # Duration, word count, topics, etc.
    main_topics: List[str]  # Primary topics covered
    business_value: Optional[str] = None  # Business value proposition
    action_items: List[str]  # Actionable items for executives
    sentiment_analysis: Dict[str, float]  # Sentiment scores


class TimestampedSection(BaseModel):
    """Section with timestamp navigation."""

    index: int
    title: str
    start_timestamp: int  # seconds
    end_timestamp: int
    youtube_link: str
    content: str
    summary: str  # Brief section summary
    key_points: List[str]


class ExportConfig(BaseModel):
    """Configuration for enhanced export."""

    format: ExportFormat = ExportFormat.MARKDOWN
    include_executive_summary: bool = True
    include_timestamps: bool = True
    include_toc: bool = True
    section_detail_level: str = "standard"  # brief, standard, detailed
    custom_template_id: Optional[str] = None


class EnhancedMarkdownExport(BaseModel):
    """Complete enhanced export result."""

    summary_id: str
    video_metadata: VideoMetadata
    # None when the export was generated with include_executive_summary=False.
    executive_summary: Optional[ExecutiveSummary] = None
    table_of_contents: List[str]
    sections: List[TimestampedSection]
    markdown_content: str
    metadata: Dict[str, Any]
    quality_score: float
    processing_time_seconds: float
    created_at: datetime


class EnhancedExportService:
    """Service for generating enhanced exports with executive summaries and timestamps."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize the enhanced export service.

        Args:
            ai_service: DeepSeek AI service for content generation. A new
                DeepSeekService is created when omitted.
        """
        self.ai_service = ai_service or DeepSeekService()

    async def generate_enhanced_export(
        self,
        summary_id: str,
        transcript: str,
        video_metadata: VideoMetadata,
        config: Optional[ExportConfig] = None
    ) -> EnhancedMarkdownExport:
        """Generate enhanced export with all features.

        Args:
            summary_id: Summary ID for tracking
            transcript: Video transcript text
            video_metadata: Video information
            config: Export configuration (defaults to ExportConfig())

        Returns:
            Complete enhanced export

        Raises:
            ServiceError: If the transcript has fewer than 50 non-whitespace-
                trimmed characters, or if any generation step fails.
        """
        if not transcript or len(transcript.strip()) < 50:
            raise ServiceError("Transcript too short for enhanced export")

        config = config or ExportConfig()
        start_time = datetime.now()

        logger.info(f"Starting enhanced export for summary {summary_id}")

        try:
            # 1. Generate executive summary (skipped -> None when disabled)
            executive_summary = await self._generate_executive_summary(
                transcript, video_metadata
            ) if config.include_executive_summary else None

            # 2. Detect and create timestamped sections
            sections = await self._create_timestamped_sections(
                transcript, video_metadata, config.section_detail_level
            ) if config.include_timestamps else []

            # 3. Generate table of contents
            table_of_contents = self._generate_table_of_contents(
                sections
            ) if config.include_toc else []

            # 4. Generate markdown content
            markdown_content = await self._generate_markdown_content(
                video_metadata, executive_summary, sections, table_of_contents, config
            )

            # 5. Calculate quality score
            quality_score = self._calculate_export_quality(
                executive_summary, sections, markdown_content
            )

            processing_time = (datetime.now() - start_time).total_seconds()

            result = EnhancedMarkdownExport(
                summary_id=summary_id,
                video_metadata=video_metadata,
                executive_summary=executive_summary,
                table_of_contents=table_of_contents,
                sections=sections,
                markdown_content=markdown_content,
                metadata={
                    "export_config": config.dict(),
                    "processing_time": processing_time,
                    "sections_count": len(sections),
                    "word_count": len(markdown_content.split())
                },
                quality_score=quality_score,
                processing_time_seconds=processing_time,
                created_at=start_time
            )

            logger.info(f"Enhanced export completed for summary {summary_id} in {processing_time:.2f}s")
            return result

        except Exception as e:
            logger.error(f"Error generating enhanced export for {summary_id}: {e}")
            # Chain the cause so the original traceback is preserved.
            raise ServiceError(f"Enhanced export failed: {str(e)}") from e

    async def _generate_executive_summary(
        self,
        transcript: str,
        video_metadata: VideoMetadata
    ) -> ExecutiveSummary:
        """Generate executive summary with business focus.

        The AI service is asked for a JSON document; if the reply is not
        valid JSON a generic fallback summary is built from the raw reply.

        Args:
            transcript: Video transcript
            video_metadata: Video information

        Returns:
            Executive summary

        Raises:
            ServiceError: If the AI call fails or the parsed reply cannot be
                turned into an ExecutiveSummary.
        """
        duration_minutes = video_metadata.duration // 60
        word_count = len(transcript.split())

        system_prompt = """You are an executive assistant creating a high-level summary for business leaders.
Focus on business value, strategic insights, ROI implications, and actionable intelligence.
Your audience consists of executives who need to quickly understand the key value and decisions points.

Provide your response as valid JSON with this exact structure:
{
    "overview": "2-3 paragraph executive overview emphasizing business value and strategic insights",
    "key_metrics": {
        "duration_minutes": 45,
        "word_count": 1200,
        "complexity_level": "intermediate",
        "primary_audience": "technical professionals"
    },
    "main_topics": ["List of 4-6 primary topics covered"],
    "business_value": "Clear statement of business value and ROI implications",
    "action_items": ["List of 3-5 specific actionable items for executives"],
    "sentiment_analysis": {
        "overall_sentiment": 0.7,
        "confidence_level": 0.8,
        "business_optimism": 0.6
    }
}"""

        # Only the first 6000 characters of the transcript are sent, to stay
        # within token constraints. (This note must stay outside the f-string,
        # otherwise it becomes part of the prompt.)
        prompt = f"""Video Title: {video_metadata.title}
Channel: {video_metadata.channel}
Duration: {duration_minutes} minutes
Word Count: {word_count} words

Please create an executive summary of the following video transcript. Focus on business implications,
strategic value, and actionable insights that would be relevant to decision-makers and executives.

Transcript:
{transcript[:6000]}

Create a comprehensive executive summary that captures the strategic value and business implications."""

        # Pre-bind so the JSON fallback handler can never hit an unbound name.
        response = ""

        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.3,
                max_tokens=1200
            )

            # Parse AI response
            summary_data = json.loads(response)

            # Ensure key_metrics exists, then overwrite the model's guesses
            # with the values calculated here.
            key_metrics = summary_data.get("key_metrics")
            if not isinstance(key_metrics, dict):
                key_metrics = {}
            key_metrics.update({
                "duration_minutes": duration_minutes,
                "word_count": word_count,
                "video_id": video_metadata.video_id
            })
            summary_data["key_metrics"] = key_metrics

            return ExecutiveSummary(**summary_data)

        except json.JSONDecodeError:
            # Fallback if JSON parsing fails
            logger.warning("Executive summary response was not valid JSON, creating fallback")

            return ExecutiveSummary(
                overview=response[:500] if response else "Executive summary generation failed.",
                key_metrics={
                    "duration_minutes": duration_minutes,
                    "word_count": word_count,
                    "video_id": video_metadata.video_id,
                    "complexity_level": "unknown"
                },
                main_topics=["Content analysis", "Business insights"],
                business_value="Detailed analysis available in full summary.",
                action_items=["Review full content", "Assess implementation options"],
                sentiment_analysis={"overall_sentiment": 0.5, "confidence_level": 0.3}
            )

        except Exception as e:
            logger.error(f"Error generating executive summary: {e}")
            raise ServiceError(f"Executive summary generation failed: {str(e)}") from e

    async def _create_timestamped_sections(
        self,
        transcript: str,
        video_metadata: VideoMetadata,
        detail_level: str = "standard"
    ) -> List[TimestampedSection]:
        """Create timestamped sections from transcript.

        Args:
            transcript: Video transcript
            video_metadata: Video metadata
            detail_level: Level of detail for sections

        Returns:
            List of timestamped sections
        """
        # First, detect natural section breaks in the transcript
        raw_sections = self._detect_section_breaks(transcript)

        if not raw_sections:
            logger.warning("No sections detected, creating single section")
            raw_sections = [(transcript, 0, video_metadata.duration)]

        sections = []

        for i, (section_content, start_time, end_time) in enumerate(raw_sections):
            if not section_content.strip():
                continue

            # Generate section title using AI
            section_title = await self._generate_section_title(section_content, i + 1)

            # Generate section summary
            section_summary = await self._generate_section_summary(
                section_content, detail_level
            )

            # Extract key points
            key_points = self._extract_key_points(section_content)

            # Create YouTube timestamp link
            youtube_link = f"https://youtube.com/watch?v={video_metadata.video_id}&t={start_time}s"

            section = TimestampedSection(
                index=i + 1,
                title=section_title,
                start_timestamp=start_time,
                end_timestamp=end_time,
                youtube_link=youtube_link,
                content=section_content,
                summary=section_summary,
                key_points=key_points
            )

            sections.append(section)

        logger.info(f"Created {len(sections)} timestamped sections")
        return sections

    def _detect_section_breaks(self, transcript: str) -> List[Tuple[str, int, int]]:
        """Detect natural section breaks in transcript.

        Paragraphs (split on blank lines) are accumulated until a section
        exceeds ~500 words. Timestamps are estimates derived from word count
        at a fixed speaking rate, not real caption timings.

        Args:
            transcript: Full transcript text

        Returns:
            List of (content, start_time, end_time) tuples
        """
        # Simple heuristic-based section detection
        # In a real implementation, this could use more sophisticated NLP

        paragraphs = transcript.split('\n\n')
        if not paragraphs:
            return [(transcript, 0, 300)]  # Default 5 minute section

        sections = []
        current_content = ""
        section_start = 0
        words_per_minute = 150  # Average speaking rate

        for i, paragraph in enumerate(paragraphs):
            current_content += paragraph + "\n\n"

            # Create section break every ~500 words or at natural breaks
            word_count = len(current_content.split())

            if word_count > 500 or i == len(paragraphs) - 1:
                # Estimate timestamps based on word count
                section_duration = (word_count / words_per_minute) * 60
                section_end = section_start + int(section_duration)

                sections.append((current_content.strip(), section_start, section_end))

                # Reset for next section
                section_start = section_end
                current_content = ""

        return sections

    async def _generate_section_title(self, content: str, section_index: int) -> str:
        """Generate descriptive title for a section.

        Falls back to "Section N" when the AI call fails or returns nothing.

        Args:
            content: Section content
            section_index: Section number

        Returns:
            Section title
        """
        # Extract first meaningful sentence or topic
        sentences = content.split('.')[:3]  # First 3 sentences
        preview = '. '.join(sentences)[:200]

        try:
            prompt = f"Create a concise, descriptive title (4-8 words) for this video section:\n\n{preview}"

            title = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt="Generate clear, descriptive section titles for video content. Keep titles under 8 words.",
                temperature=0.4,
                max_tokens=50
            )

            # Clean up title
            title = title.strip().strip('"\'')
            if len(title) > 60:  # Too long, truncate
                title = title[:57] + "..."

            return title or f"Section {section_index}"

        except Exception:
            # Deliberate best-effort: a missing title must not fail the export
            logger.debug(f"Could not generate title for section {section_index}, using fallback")
            return f"Section {section_index}"

    async def _generate_section_summary(self, content: str, detail_level: str) -> str:
        """Generate summary for a section.

        Falls back to the first two sentences of the content when the AI
        call fails.

        Args:
            content: Section content
            detail_level: Level of detail ("brief", "detailed"; anything
                else is treated as "standard")

        Returns:
            Section summary
        """
        if detail_level == "brief":
            max_tokens = 100
            target = "1-2 sentences"
        elif detail_level == "detailed":
            max_tokens = 300
            target = "2-3 paragraphs"
        else:  # standard
            max_tokens = 150
            target = "2-3 sentences"

        try:
            prompt = f"Summarize this video section in {target}:\n\n{content[:1000]}"

            summary = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=f"Create concise {target} summaries of video sections that capture the main points.",
                temperature=0.3,
                max_tokens=max_tokens
            )

            return summary.strip()

        except Exception:
            # Fallback to first sentence(s)
            sentences = content.split('.')[:2]
            return '. '.join(sentences) + "."

    def _extract_key_points(self, content: str) -> List[str]:
        """Extract key points from section content.

        Only the first 10 sentences are considered; up to 5 with a positive
        score are returned, highest score first.

        Args:
            content: Section content

        Returns:
            List of key points
        """
        # Simple extraction based on sentence importance
        sentences = [s.strip() for s in content.split('.') if s.strip()]

        # Score sentences by length and keyword presence
        # NOTE: keywords are matched as substrings, so e.g. 'key' also
        # matches inside 'monkey'.
        important_keywords = [
            'important', 'key', 'main', 'crucial', 'essential', 'critical',
            'should', 'must', 'need', 'remember', 'note that', 'takeaway'
        ]

        scored_sentences = []
        for sentence in sentences[:10]:  # Limit to first 10 sentences
            if len(sentence.split()) < 5:  # Skip very short sentences
                continue

            score = 0
            sentence_lower = sentence.lower()

            # Score based on keywords
            for keyword in important_keywords:
                if keyword in sentence_lower:
                    score += 2

            # Score based on sentence length (moderate length preferred)
            word_count = len(sentence.split())
            if 8 <= word_count <= 20:
                score += 1

            scored_sentences.append((score, sentence))

        # Sort by score and take top points (sort is stable, so ties keep
        # their original order)
        scored_sentences.sort(key=lambda x: x[0], reverse=True)
        key_points = [sentence for score, sentence in scored_sentences[:5] if score > 0]

        return key_points

    def _generate_table_of_contents(self, sections: List[TimestampedSection]) -> List[str]:
        """Generate table of contents from sections.

        Args:
            sections: List of timestamped sections

        Returns:
            Table of contents entries
        """
        toc_entries = []

        for section in sections:
            # Format timestamp as HH:MM:SS
            timestamp_formatted = self._format_timestamp(section.start_timestamp)

            # Create TOC entry with timestamp and title
            entry = f"[{timestamp_formatted}] {section.title}"
            toc_entries.append(entry)

        return toc_entries

    def _format_timestamp(self, seconds: int) -> str:
        """Format seconds as HH:MM:SS or MM:SS.

        Args:
            seconds: Time in seconds

        Returns:
            Formatted timestamp string (hours are omitted when zero)
        """
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        secs = seconds % 60

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        else:
            return f"{minutes:02d}:{secs:02d}"

    async def _generate_markdown_content(
        self,
        video_metadata: VideoMetadata,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        table_of_contents: List[str],
        config: ExportConfig
    ) -> str:
        """Generate complete markdown content.

        Args:
            video_metadata: Video metadata
            executive_summary: Executive summary (may be None)
            sections: Timestamped sections
            table_of_contents: TOC entries
            config: Export configuration

        Returns:
            Complete markdown document
        """
        markdown_lines = []

        # 1. Document header
        # Each metadata line ends with two spaces: that is Markdown's hard
        # line break, keeping the fields on separate rendered lines.
        markdown_lines.extend([
            f"# {video_metadata.title}",
            "",
            f"**Channel:** {video_metadata.channel}  ",
            f"**Duration:** {self._format_timestamp(video_metadata.duration)}  ",
            f"**Video ID:** {video_metadata.video_id}  ",
            f"**Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  ",
            "",
            "---",
            ""
        ])

        # 2. Executive Summary
        if executive_summary and config.include_executive_summary:
            markdown_lines.extend([
                "## 📊 Executive Summary",
                "",
                executive_summary.overview,
                "",
                "### Key Metrics",
                ""
            ])

            for key, value in executive_summary.key_metrics.items():
                markdown_lines.append(f"- **{key.replace('_', ' ').title()}:** {value}")

            markdown_lines.extend(["", "### Action Items", ""])

            for item in executive_summary.action_items:
                markdown_lines.append(f"- {item}")

            if executive_summary.business_value:
                markdown_lines.extend([
                    "",
                    "### Business Value",
                    "",
                    executive_summary.business_value
                ])

            markdown_lines.extend(["", "---", ""])

        # 3. Table of Contents
        if table_of_contents and config.include_toc:
            markdown_lines.extend([
                "## 📋 Table of Contents",
                ""
            ])

            for i, entry in enumerate(table_of_contents, 1):
                markdown_lines.append(f"{i}. {entry}")

            markdown_lines.extend(["", "---", ""])

        # 4. Detailed Sections
        if sections and config.include_timestamps:
            markdown_lines.extend([
                "## 📝 Detailed Analysis",
                ""
            ])

            for section in sections:
                timestamp_formatted = self._format_timestamp(section.start_timestamp)

                # Section header with timestamp
                markdown_lines.extend([
                    f"### [{timestamp_formatted}] {section.title}",
                    "",
                    f"**🔗 [Jump to video]({section.youtube_link})**",
                    ""
                ])

                # Section summary
                if section.summary:
                    markdown_lines.extend([
                        "#### Summary",
                        "",
                        section.summary,
                        ""
                    ])

                # Key points
                if section.key_points:
                    markdown_lines.extend([
                        "#### Key Points",
                        ""
                    ])

                    for point in section.key_points:
                        markdown_lines.append(f"- {point}")

                    markdown_lines.append("")

                markdown_lines.extend(["---", ""])

        # 5. Footer
        markdown_lines.extend([
            "## 📄 Document Information",
            "",
            f"- **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"- **Source:** [YouTube Video](https://youtube.com/watch?v={video_metadata.video_id})",
            "- **Analysis Type:** Enhanced Export with Timestamps",
            f"- **Sections:** {len(sections)}",
            "",
            "*This analysis was generated using AI-powered video summarization technology.*"
        ])

        return "\n".join(markdown_lines)

    def _calculate_export_quality(
        self,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        markdown_content: str
    ) -> float:
        """Calculate quality score for the export.

        The score is the mean of up to three component scores (executive
        summary, sections, markdown); components whose input is absent are
        left out of the average.

        Args:
            executive_summary: Executive summary (may be None)
            sections: Timestamped sections
            markdown_content: Generated markdown

        Returns:
            Quality score between 0.0 and 1.0
        """
        quality_factors = []

        # Executive summary quality (component tops out at 0.8)
        if executive_summary:
            exec_score = 0.0
            if len(executive_summary.overview) > 200:  # Substantial overview
                exec_score += 0.3
            if len(executive_summary.action_items) >= 3:  # Good actionable items
                exec_score += 0.2
            if executive_summary.business_value:  # Business value present
                exec_score += 0.3
            quality_factors.append(exec_score)

        # Sections quality
        if sections:
            section_score = 0.0
            avg_section_length = sum(len(s.content) for s in sections) / len(sections)
            if avg_section_length > 200:  # Substantial sections
                section_score += 0.3

            sections_with_summaries = sum(1 for s in sections if s.summary)
            if sections_with_summaries / len(sections) > 0.8:  # Most have summaries
                section_score += 0.4

            sections_with_points = sum(1 for s in sections if s.key_points)
            if sections_with_points / len(sections) > 0.5:  # Half have key points
                section_score += 0.3

            quality_factors.append(section_score)

        # Markdown quality
        markdown_score = 0.0
        if len(markdown_content) > 2000:  # Substantial content
            markdown_score += 0.4
        if "## " in markdown_content:  # Proper structure
            markdown_score += 0.3
        if "[" in markdown_content and "](" in markdown_content:  # Has links
            markdown_score += 0.3

        quality_factors.append(markdown_score)

        # Overall quality is average of factors
        if quality_factors:
            return round(sum(quality_factors) / len(quality_factors), 2)
        else:
            return 0.5  # Default middle score