# youtube-summarizer/backend/services/enhanced_export_service.py

"""Enhanced export service with executive summaries and timestamped sections."""

import re
import logging
from typing import Dict, List, Optional, Any, Tuple
from datetime import datetime
from dataclasses import dataclass
from enum import Enum

from pydantic import BaseModel

from ..models.transcript import TranscriptSegment
from ..core.exceptions import ServiceError
from .deepseek_service import DeepSeekService

logger = logging.getLogger(__name__)


class ExportFormat(str, Enum):
    """Supported export formats.

    The class mixes in ``str``, so every member is also a string and compares
    equal to its lowercase value (for example ``ExportFormat.PDF == "pdf"``).
    """
    # Member values are the lowercase identifiers of each format.
    MARKDOWN = "markdown"
    HTML = "html"
    PDF = "pdf"
    JSON = "json"


@dataclass
class VideoMetadata:
    """Video metadata for export context.

    The first four fields are required; the remaining ones default to ``None``.
    """
    # Identifier interpolated into the ``watch?v=`` links built by the export service.
    video_id: str
    # Title used as the top-level heading of the generated document.
    title: str
    # Channel name shown in the document header.
    channel: str
    # Video length in seconds.
    duration: int  # seconds
    # Optional extras; presumably mirror the YouTube listing — confirm with the caller.
    view_count: Optional[int] = None
    upload_date: Optional[str] = None
    description: Optional[str] = None


class ExecutiveSummary(BaseModel):
    """Executive summary with key business insights.

    Only ``business_value`` has a default; every other field must be supplied.
    """
    # 2-3 paragraph executive overview.
    overview: str  # 2-3 paragraph executive overview
    # Free-form metrics such as duration, word count and topics.
    key_metrics: Dict[str, Any]  # Duration, word count, topics, etc.
    # Primary topics covered by the video.
    main_topics: List[str]  # Primary topics covered
    # Business value proposition; may be omitted.
    business_value: Optional[str] = None  # Business value proposition
    # Actionable items aimed at executives.
    action_items: List[str]  # Actionable items for executives
    # Named sentiment scores; the value range is not enforced here.
    sentiment_analysis: Dict[str, float]  # Sentiment scores


class TimestampedSection(BaseModel):
    """Section with timestamp navigation.

    Describes one contiguous part of the video: where it starts and ends, a
    link to it, its raw text, and the derived summary and key points.
    """
    # Position of the section in the document (the export service numbers from 1).
    index: int
    # Short heading shown for the section.
    title: str
    # Offset of the section start, in seconds.
    start_timestamp: int  # seconds
    # Offset of the section end; presumably also in seconds — confirm with the producer.
    end_timestamp: int
    # Link that opens the video for this section.
    youtube_link: str
    # Transcript text belonging to the section.
    content: str
    # Brief summary of the section.
    summary: str  # Brief section summary
    # Highlighted statements taken from the section.
    key_points: List[str]


class ExportConfig(BaseModel):
    """Configuration for enhanced export.

    Every field has a default, so ``ExportConfig()`` is a valid configuration.
    """
    # Requested output format; defaults to Markdown.
    format: ExportFormat = ExportFormat.MARKDOWN
    # Whether an executive summary is generated and rendered.
    include_executive_summary: bool = True
    # Whether timestamped sections are generated and rendered.
    include_timestamps: bool = True
    # Whether a table of contents is generated and rendered.
    include_toc: bool = True
    # One of "brief", "standard" or "detailed".
    section_detail_level: str = "standard"  # brief, standard, detailed
    # Optional template identifier; how it is consumed is not visible here — TODO confirm.
    custom_template_id: Optional[str] = None


class EnhancedMarkdownExport(BaseModel):
    """Complete enhanced export result.

    Bundles the generated Markdown document with the structured pieces it was
    built from, plus bookkeeping about the export run.
    """
    # Identifier of the summary this export belongs to.
    summary_id: str
    # Metadata of the source video.
    video_metadata: VideoMetadata
    # None when the export was produced with include_executive_summary=False;
    # generate_enhanced_export passes None in that case, so the field must
    # accept it instead of failing validation.
    executive_summary: Optional[ExecutiveSummary] = None
    # Table-of-contents entries, one string per section.
    table_of_contents: List[str]
    # Timestamped sections of the video.
    sections: List[TimestampedSection]
    # The rendered Markdown document.
    markdown_content: str
    # Free-form details about the export run.
    metadata: Dict[str, Any]
    # Heuristic quality score of the export.
    quality_score: float
    # Wall-clock time spent generating the export, in seconds.
    processing_time_seconds: float
    # Moment the export was started.
    created_at: datetime


class EnhancedExportService:
    """Service for generating enhanced exports with executive summaries and timestamps."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize the enhanced export service.

        Args:
            ai_service: DeepSeek AI service for content generation
        """
        self.ai_service = ai_service or DeepSeekService()

    async def generate_enhanced_export(
        self,
        summary_id: str,
        transcript: str,
        video_metadata: VideoMetadata,
        config: Optional[ExportConfig] = None
    ) -> EnhancedMarkdownExport:
        """Generate enhanced export with all features.

        Args:
            summary_id: Summary ID for tracking
            transcript: Video transcript text
            video_metadata: Video information
            config: Export configuration

        Returns:
            Complete enhanced export
        """
        if not transcript or len(transcript.strip()) < 50:
            raise ServiceError("Transcript too short for enhanced export")

        config = config or ExportConfig()
        start_time = datetime.now()

        logger.info(f"Starting enhanced export for summary {summary_id}")

        try:
            # 1. Generate executive summary
            executive_summary = await self._generate_executive_summary(
                transcript, video_metadata
            ) if config.include_executive_summary else None

            # 2. Detect and create timestamped sections
            sections = await self._create_timestamped_sections(
                transcript, video_metadata, config.section_detail_level
            ) if config.include_timestamps else []

            # 3. Generate table of contents
            table_of_contents = self._generate_table_of_contents(
                sections
            ) if config.include_toc else []

            # 4. Generate markdown content
            markdown_content = await self._generate_markdown_content(
                video_metadata, executive_summary, sections, table_of_contents, config
            )

            # 5. Calculate quality score
            quality_score = self._calculate_export_quality(
                executive_summary, sections, markdown_content
            )

            processing_time = (datetime.now() - start_time).total_seconds()

            result = EnhancedMarkdownExport(
                summary_id=summary_id,
                video_metadata=video_metadata,
                executive_summary=executive_summary,
                table_of_contents=table_of_contents,
                sections=sections,
                markdown_content=markdown_content,
                metadata={
                    "export_config": config.dict(),
                    "processing_time": processing_time,
                    "sections_count": len(sections),
                    "word_count": len(markdown_content.split())
                },
                quality_score=quality_score,
                processing_time_seconds=processing_time,
                created_at=start_time
            )

            logger.info(f"Enhanced export completed for summary {summary_id} in {processing_time:.2f}s")
            return result

        except Exception as e:
            logger.error(f"Error generating enhanced export for {summary_id}: {e}")
            raise ServiceError(f"Enhanced export failed: {str(e)}")

    async def _generate_executive_summary(
        self,
        transcript: str,
        video_metadata: VideoMetadata
    ) -> ExecutiveSummary:
        """Generate executive summary with business focus.

        Args:
            transcript: Video transcript
            video_metadata: Video information

        Returns:
            Executive summary
        """
        duration_minutes = video_metadata.duration // 60
        word_count = len(transcript.split())

        system_prompt = """You are an executive assistant creating a high-level summary for business leaders.
        Focus on business value, strategic insights, ROI implications, and actionable intelligence.
        Your audience consists of executives who need to quickly understand the key value and decisions points.

        Provide your response as valid JSON with this exact structure:
        {
          "overview": "2-3 paragraph executive overview emphasizing business value and strategic insights",
          "key_metrics": {
            "duration_minutes": 45,
            "word_count": 1200,
            "complexity_level": "intermediate",
            "primary_audience": "technical professionals"
          },
          "main_topics": ["List of 4-6 primary topics covered"],
          "business_value": "Clear statement of business value and ROI implications",
          "action_items": ["List of 3-5 specific actionable items for executives"],
          "sentiment_analysis": {
            "overall_sentiment": 0.7,
            "confidence_level": 0.8,
            "business_optimism": 0.6
          }
        }"""

        prompt = f"""Video Title: {video_metadata.title}
Channel: {video_metadata.channel}
Duration: {duration_minutes} minutes
Word Count: {word_count} words

Please create an executive summary of the following video transcript. Focus on business implications, strategic value, and actionable insights that would be relevant to decision-makers and executives.

Transcript:
{transcript[:6000]}  # Limit for token constraints

Create a comprehensive executive summary that captures the strategic value and business implications."""

        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.3,
                max_tokens=1200
            )

            # Parse AI response
            import json
            summary_data = json.loads(response)

            # Ensure required fields and add calculated metrics
            summary_data["key_metrics"].update({
                "duration_minutes": duration_minutes,
                "word_count": word_count,
                "video_id": video_metadata.video_id
            })

            return ExecutiveSummary(**summary_data)

        except json.JSONDecodeError:
            # Fallback if JSON parsing fails
            logger.warning("Executive summary response was not valid JSON, creating fallback")
            return ExecutiveSummary(
                overview=response[:500] if response else "Executive summary generation failed.",
                key_metrics={
                    "duration_minutes": duration_minutes,
                    "word_count": word_count,
                    "video_id": video_metadata.video_id,
                    "complexity_level": "unknown"
                },
                main_topics=["Content analysis", "Business insights"],
                business_value="Detailed analysis available in full summary.",
                action_items=["Review full content", "Assess implementation options"],
                sentiment_analysis={"overall_sentiment": 0.5, "confidence_level": 0.3}
            )
        except Exception as e:
            logger.error(f"Error generating executive summary: {e}")
            raise ServiceError(f"Executive summary generation failed: {str(e)}")

    async def _create_timestamped_sections(
        self,
        transcript: str,
        video_metadata: VideoMetadata,
        detail_level: str = "standard"
    ) -> List[TimestampedSection]:
        """Create timestamped sections from transcript.

        Args:
            transcript: Video transcript
            video_metadata: Video metadata
            detail_level: Level of detail for sections

        Returns:
            List of timestamped sections
        """
        # First, detect natural section breaks in the transcript
        raw_sections = self._detect_section_breaks(transcript)

        if not raw_sections:
            logger.warning("No sections detected, creating single section")
            raw_sections = [(transcript, 0, video_metadata.duration)]

        sections = []

        for i, (section_content, start_time, end_time) in enumerate(raw_sections):
            if not section_content.strip():
                continue

            # Generate section title using AI
            section_title = await self._generate_section_title(section_content, i + 1)

            # Generate section summary
            section_summary = await self._generate_section_summary(
                section_content, detail_level
            )

            # Extract key points
            key_points = self._extract_key_points(section_content)

            # Create YouTube timestamp link
            youtube_link = f"https://youtube.com/watch?v={video_metadata.video_id}&t={start_time}s"

            section = TimestampedSection(
                index=i + 1,
                title=section_title,
                start_timestamp=start_time,
                end_timestamp=end_time,
                youtube_link=youtube_link,
                content=section_content,
                summary=section_summary,
                key_points=key_points
            )

            sections.append(section)

        logger.info(f"Created {len(sections)} timestamped sections")
        return sections

    def _detect_section_breaks(self, transcript: str) -> List[Tuple[str, int, int]]:
        """Detect natural section breaks in transcript.

        Args:
            transcript: Full transcript text

        Returns:
            List of (content, start_time, end_time) tuples
        """
        # Simple heuristic-based section detection
        # In a real implementation, this could use more sophisticated NLP

        paragraphs = transcript.split('\n\n')
        if not paragraphs:
            return [(transcript, 0, 300)]  # Default 5 minute section

        sections = []
        current_content = ""
        section_start = 0
        words_per_minute = 150  # Average speaking rate

        for i, paragraph in enumerate(paragraphs):
            current_content += paragraph + "\n\n"

            # Create section break every ~500 words or at natural breaks
            word_count = len(current_content.split())

            if word_count > 500 or i == len(paragraphs) - 1:
                # Estimate timestamps based on word count
                section_duration = (word_count / words_per_minute) * 60
                section_end = section_start + int(section_duration)

                sections.append((current_content.strip(), section_start, section_end))

                # Reset for next section
                section_start = section_end
                current_content = ""

        return sections

    async def _generate_section_title(self, content: str, section_index: int) -> str:
        """Generate descriptive title for a section.

        Args:
            content: Section content
            section_index: Section number

        Returns:
            Section title
        """
        # Extract first meaningful sentence or topic
        sentences = content.split('.')[:3]  # First 3 sentences
        preview = '. '.join(sentences)[:200]

        try:
            prompt = f"Create a concise, descriptive title (4-8 words) for this video section:\n\n{preview}"

            title = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt="Generate clear, descriptive section titles for video content. Keep titles under 8 words.",
                temperature=0.4,
                max_tokens=50
            )

            # Clean up title
            title = title.strip().strip('"\'')
            if len(title) > 60:  # Too long, truncate
                title = title[:57] + "..."

            return title or f"Section {section_index}"

        except Exception:
            # Fallback to simple title
            logger.debug(f"Could not generate title for section {section_index}, using fallback")
            return f"Section {section_index}"

    async def _generate_section_summary(self, content: str, detail_level: str) -> str:
        """Generate summary for a section.

        Args:
            content: Section content
            detail_level: Level of detail

        Returns:
            Section summary
        """
        if detail_level == "brief":
            max_tokens = 100
            target = "1-2 sentences"
        elif detail_level == "detailed":
            max_tokens = 300
            target = "2-3 paragraphs"
        else:  # standard
            max_tokens = 150
            target = "2-3 sentences"

        try:
            prompt = f"Summarize this video section in {target}:\n\n{content[:1000]}"

            summary = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=f"Create concise {target} summaries of video sections that capture the main points.",
                temperature=0.3,
                max_tokens=max_tokens
            )

            return summary.strip()

        except Exception:
            # Fallback to first sentence(s)
            sentences = content.split('.')[:2]
            return '. '.join(sentences) + "."

    def _extract_key_points(self, content: str) -> List[str]:
        """Extract key points from section content.

        Args:
            content: Section content

        Returns:
            List of key points
        """
        # Simple extraction based on sentence importance
        sentences = [s.strip() for s in content.split('.') if s.strip()]

        # Score sentences by length and keyword presence
        important_keywords = [
            'important', 'key', 'main', 'crucial', 'essential', 'critical',
            'should', 'must', 'need', 'remember', 'note that', 'takeaway'
        ]

        scored_sentences = []
        for sentence in sentences[:10]:  # Limit to first 10 sentences
            if len(sentence.split()) < 5:  # Skip very short sentences
                continue

            score = 0
            sentence_lower = sentence.lower()

            # Score based on keywords
            for keyword in important_keywords:
                if keyword in sentence_lower:
                    score += 2

            # Score based on sentence length (moderate length preferred)
            word_count = len(sentence.split())
            if 8 <= word_count <= 20:
                score += 1

            scored_sentences.append((score, sentence))

        # Sort by score and take top points
        scored_sentences.sort(key=lambda x: x[0], reverse=True)
        key_points = [sentence for score, sentence in scored_sentences[:5] if score > 0]

        return key_points

    def _generate_table_of_contents(self, sections: List[TimestampedSection]) -> List[str]:
        """Generate table of contents from sections.

        Args:
            sections: List of timestamped sections

        Returns:
            Table of contents entries
        """
        toc_entries = []

        for section in sections:
            # Format timestamp as HH:MM:SS
            timestamp_formatted = self._format_timestamp(section.start_timestamp)

            # Create TOC entry with timestamp and title
            entry = f"[{timestamp_formatted}] {section.title}"
            toc_entries.append(entry)

        return toc_entries

    def _format_timestamp(self, seconds: int) -> str:
        """Format seconds as HH:MM:SS or MM:SS.

        Args:
            seconds: Time in seconds

        Returns:
            Formatted timestamp string
        """
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        secs = seconds % 60

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        else:
            return f"{minutes:02d}:{secs:02d}"

    async def _generate_markdown_content(
        self,
        video_metadata: VideoMetadata,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        table_of_contents: List[str],
        config: ExportConfig
    ) -> str:
        """Generate complete markdown content.

        Args:
            video_metadata: Video metadata
            executive_summary: Executive summary
            sections: Timestamped sections
            table_of_contents: TOC entries
            config: Export configuration

        Returns:
            Complete markdown document
        """
        markdown_lines = []

        # 1. Document header
        markdown_lines.extend([
            f"# {video_metadata.title}",
            "",
            f"**Channel:** {video_metadata.channel}  ",
            f"**Duration:** {self._format_timestamp(video_metadata.duration)}  ",
            f"**Video ID:** {video_metadata.video_id}  ",
            f"**Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  ",
            "",
            "---",
            ""
        ])

        # 2. Executive Summary
        if executive_summary and config.include_executive_summary:
            markdown_lines.extend([
                "## 📊 Executive Summary",
                "",
                executive_summary.overview,
                "",
                "### Key Metrics",
                ""
            ])

            for key, value in executive_summary.key_metrics.items():
                markdown_lines.append(f"- **{key.replace('_', ' ').title()}:** {value}")

            markdown_lines.extend(["", "### Action Items", ""])

            for item in executive_summary.action_items:
                markdown_lines.append(f"- {item}")

            if executive_summary.business_value:
                markdown_lines.extend([
                    "",
                    "### Business Value",
                    "",
                    executive_summary.business_value
                ])

            markdown_lines.extend(["", "---", ""])

        # 3. Table of Contents
        if table_of_contents and config.include_toc:
            markdown_lines.extend([
                "## 📋 Table of Contents",
                ""
            ])

            for i, entry in enumerate(table_of_contents, 1):
                markdown_lines.append(f"{i}. {entry}")

            markdown_lines.extend(["", "---", ""])

        # 4. Detailed Sections
        if sections and config.include_timestamps:
            markdown_lines.extend([
                "## 📝 Detailed Analysis",
                ""
            ])

            for section in sections:
                timestamp_formatted = self._format_timestamp(section.start_timestamp)

                # Section header with timestamp
                markdown_lines.extend([
                    f"### [{timestamp_formatted}] {section.title}",
                    "",
                    f"**🔗 [Jump to video]({section.youtube_link})**",
                    ""
                ])

                # Section summary
                if section.summary:
                    markdown_lines.extend([
                        "#### Summary",
                        "",
                        section.summary,
                        ""
                    ])

                # Key points
                if section.key_points:
                    markdown_lines.extend([
                        "#### Key Points",
                        ""
                    ])

                    for point in section.key_points:
                        markdown_lines.append(f"- {point}")

                    markdown_lines.append("")

                markdown_lines.extend(["---", ""])

        # 5. Footer
        markdown_lines.extend([
            "## 📄 Document Information",
            "",
            f"- **Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"- **Source:** [YouTube Video](https://youtube.com/watch?v={video_metadata.video_id})",
            f"- **Analysis Type:** Enhanced Export with Timestamps",
            f"- **Sections:** {len(sections)}",
            "",
            "*This analysis was generated using AI-powered video summarization technology.*"
        ])

        return "\n".join(markdown_lines)

    def _calculate_export_quality(
        self,
        executive_summary: Optional[ExecutiveSummary],
        sections: List[TimestampedSection],
        markdown_content: str
    ) -> float:
        """Calculate quality score for the export.

        Args:
            executive_summary: Executive summary
            sections: Timestamped sections
            markdown_content: Generated markdown

        Returns:
            Quality score between 0.0 and 1.0
        """
        quality_factors = []

        # Executive summary quality
        if executive_summary:
            exec_score = 0.0
            if len(executive_summary.overview) > 200:  # Substantial overview
                exec_score += 0.3
            if len(executive_summary.action_items) >= 3:  # Good actionable items
                exec_score += 0.2
            if executive_summary.business_value:  # Business value present
                exec_score += 0.3
            quality_factors.append(exec_score)

        # Sections quality
        if sections:
            section_score = 0.0
            avg_section_length = sum(len(s.content) for s in sections) / len(sections)
            if avg_section_length > 200:  # Substantial sections
                section_score += 0.3

            sections_with_summaries = sum(1 for s in sections if s.summary)
            if sections_with_summaries / len(sections) > 0.8:  # Most have summaries
                section_score += 0.4

            sections_with_points = sum(1 for s in sections if s.key_points)
            if sections_with_points / len(sections) > 0.5:  # Half have key points
                section_score += 0.3

            quality_factors.append(section_score)

        # Markdown quality
        markdown_score = 0.0
        if len(markdown_content) > 2000:  # Substantial content
            markdown_score += 0.4
        if "## " in markdown_content:  # Proper structure
            markdown_score += 0.3
        if "[" in markdown_content and "](" in markdown_content:  # Has links
            markdown_score += 0.3
        quality_factors.append(markdown_score)

        # Overall quality is average of factors
        if quality_factors:
            return round(sum(quality_factors) / len(quality_factors), 2)
        else:
            return 0.5  # Default middle score