youtube-summarizer/backend/services/timestamp_processor.py

495 lines
18 KiB
Python

"""Timestamp Processor for semantic section detection and navigation.
This service processes video transcripts to identify meaningful sections,
create timestamped navigation, and generate clickable YouTube links.
"""
import asyncio
import logging
import re
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass
from urllib.parse import urlparse, parse_qs
from ..services.deepseek_service import DeepSeekService
from ..core.exceptions import ServiceError
logger = logging.getLogger(__name__)
@dataclass
class TimestampedSection:
    """Represents a timestamped section of content."""
    index: int  # 1-based position of the section within the video
    title: str  # human-readable section title (AI-generated or "Section N" fallback)
    start_timestamp: int # seconds
    end_timestamp: int # seconds
    youtube_link: str  # deep link that starts playback at start_timestamp
    content: str  # transcript text covered by this section
    summary: str  # brief (1-2 sentence) AI summary of the section
    key_points: List[str]  # short takeaway bullets for the section
    confidence_score: float  # 0.0-1.0 estimate of analysis quality/coherence
@dataclass
class SectionDetectionResult:
    """Result of section detection process."""
    sections: List[TimestampedSection]  # detected sections, in playback order
    total_sections: int  # len(sections), duplicated for serialization convenience
    processing_time_seconds: float  # wall-clock duration of the detection run
    quality_score: float  # 0.0-1.0 aggregate heuristic score for the detection
    created_at: datetime  # naive local time when the result was produced
class TimestampProcessor:
    """Service for processing timestamps and detecting semantic sections."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize timestamp processor.

        Args:
            ai_service: AI service for content analysis; a fresh
                DeepSeekService is constructed when none is supplied.
        """
        self.ai_service = ai_service or DeepSeekService()

        # Tunables controlling how sections are detected and sized.
        self.min_section_duration = 30      # seconds
        self.max_sections = 15
        self.target_section_length = 180    # 3 minutes
        self.overlap_tolerance = 5          # seconds

        logger.info("TimestampProcessor initialized")
async def detect_semantic_sections(
self,
transcript_data: List[Dict[str, Any]],
video_url: str,
video_title: str = ""
) -> SectionDetectionResult:
"""Detect semantic sections from transcript data.
Args:
transcript_data: List of transcript segments with timestamps
video_url: YouTube video URL for link generation
video_title: Video title for context
Returns:
Section detection result
"""
start_time = datetime.now()
if not transcript_data or len(transcript_data) < 2:
raise ServiceError("Insufficient transcript data for section detection")
try:
# Prepare transcript text with timestamps
full_transcript = self._prepare_transcript_text(transcript_data)
# Detect section boundaries using AI
section_boundaries = await self._detect_section_boundaries(
full_transcript, video_title
)
# Create timestamped sections
sections = await self._create_timestamped_sections(
transcript_data, section_boundaries, video_url
)
# Calculate quality score
quality_score = self._calculate_quality_score(sections, transcript_data)
processing_time = (datetime.now() - start_time).total_seconds()
return SectionDetectionResult(
sections=sections,
total_sections=len(sections),
processing_time_seconds=processing_time,
quality_score=quality_score,
created_at=datetime.now()
)
except Exception as e:
logger.error(f"Error detecting semantic sections: {e}")
raise ServiceError(f"Section detection failed: {str(e)}")
def _prepare_transcript_text(self, transcript_data: List[Dict[str, Any]]) -> str:
"""Prepare transcript text with timestamp markers."""
transcript_lines = []
for segment in transcript_data:
timestamp = segment.get('start', 0)
text = segment.get('text', '').strip()
if text:
time_marker = self.seconds_to_timestamp(timestamp)
transcript_lines.append(f"[{time_marker}] {text}")
return '\n'.join(transcript_lines)
    async def _detect_section_boundaries(
        self,
        transcript_text: str,
        video_title: str
    ) -> List[Dict[str, Any]]:
        """Use AI to detect natural section boundaries.

        Args:
            transcript_text: Timestamp-annotated transcript ("[HH:MM:SS] text" lines).
            video_title: Video title, included in the prompt for context.

        Returns:
            List of boundary dicts with "title", "start_time_marker",
            "estimated_duration" and "key_topic" keys. Falls back to
            heuristic sampling when the AI reply is not valid JSON.
        """
        system_prompt = """You are an expert at identifying natural section breaks in video content.
Analyze the transcript and identify 5-10 meaningful sections that represent distinct topics, themes, or narrative segments.
Each section should:
- Have a clear, descriptive title
- Start and end at natural transition points
- Be long enough to contain meaningful content (at least 30 seconds)
- Represent a coherent topic or theme
Return ONLY valid JSON with this structure:
{
"sections": [
{
"title": "Section Title",
"start_time_marker": "[00:01:23]",
"estimated_duration": "2-3 minutes",
"key_topic": "Main topic of this section"
}
]
}"""
        # Limit transcript length for AI processing (keeps the prompt within
        # the model's budget; later content is simply not considered).
        limited_transcript = transcript_text[:6000]
        prompt = f"""Video Title: {video_title}
Transcript with timestamps:
{limited_transcript}
Identify natural section breaks and create meaningful section titles."""
        # Low temperature: boundary detection should be deterministic-ish.
        response = await self.ai_service.generate_response(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=800
        )
        try:
            import json
            result = json.loads(response)
            return result.get("sections", [])
        except json.JSONDecodeError:
            # Model ignored the JSON-only instruction; degrade gracefully.
            logger.warning("AI response was not valid JSON, using fallback sections")
            return self._create_fallback_sections(transcript_text)
def _create_fallback_sections(self, transcript_text: str) -> List[Dict[str, Any]]:
"""Create fallback sections if AI detection fails."""
lines = transcript_text.split('\n')
sections = []
# Create sections every 3-4 minutes based on timestamps
current_section = 1
for i, line in enumerate(lines[::20]): # Sample every 20th line
time_match = re.search(r'\[(\d{2}:\d{2}:\d{2})\]', line)
if time_match:
sections.append({
"title": f"Section {current_section}",
"start_time_marker": f"[{time_match.group(1)}]",
"estimated_duration": "3-4 minutes",
"key_topic": "Content analysis"
})
current_section += 1
if len(sections) >= 8: # Limit fallback sections
break
return sections
    async def _create_timestamped_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        section_boundaries: List[Dict[str, Any]],
        video_url: str
    ) -> List[TimestampedSection]:
        """Create detailed timestamped sections.

        Args:
            transcript_data: Raw transcript segments ('start' seconds + 'text').
            section_boundaries: Boundary dicts from AI detection or fallback.
            video_url: Source video URL used to build timestamped links.

        Returns:
            Fully populated sections. A boundary that fails to process is
            logged and skipped rather than aborting the whole run.
        """
        sections = []
        for i, boundary in enumerate(section_boundaries):
            try:
                # Extract start time from time marker
                start_time_marker = boundary.get("start_time_marker", "[00:00:00]")
                start_seconds = self.timestamp_to_seconds(start_time_marker)
                # Determine end time (next section start or video end)
                if i + 1 < len(section_boundaries):
                    # "[99:99:99]" parses to a huge second count, i.e. effectively
                    # "to the end" when the next boundary lacks a marker.
                    next_marker = section_boundaries[i + 1].get("start_time_marker", "[99:99:99]")
                    end_seconds = self.timestamp_to_seconds(next_marker)
                else:
                    # Last section goes to end of transcript: 30s past the final
                    # segment's start, since segment durations aren't tracked here.
                    end_seconds = max(seg.get('start', 0) for seg in transcript_data) + 30
                # Extract content for this section
                section_content = self._extract_section_content(
                    transcript_data, start_seconds, end_seconds
                )
                # Generate section summary and key points
                section_analysis = await self._analyze_section_content(
                    section_content, boundary.get("title", f"Section {i+1}")
                )
                # Create YouTube link with timestamp
                youtube_link = self._create_youtube_link(video_url, start_seconds)
                section = TimestampedSection(
                    index=i + 1,
                    title=boundary.get("title", f"Section {i+1}"),
                    start_timestamp=start_seconds,
                    end_timestamp=end_seconds,
                    youtube_link=youtube_link,
                    content=section_content,
                    summary=section_analysis.get("summary", ""),
                    key_points=section_analysis.get("key_points", []),
                    confidence_score=section_analysis.get("confidence_score", 0.7)
                )
                sections.append(section)
            except Exception as e:
                # Best-effort: one bad boundary should not sink the rest.
                logger.warning(f"Error creating section {i+1}: {e}")
                continue
        return sections
def _extract_section_content(
self,
transcript_data: List[Dict[str, Any]],
start_seconds: int,
end_seconds: int
) -> str:
"""Extract content for a specific time range."""
content_parts = []
for segment in transcript_data:
segment_start = segment.get('start', 0)
segment_text = segment.get('text', '').strip()
# Include segments that overlap with our section
if start_seconds <= segment_start <= end_seconds and segment_text:
content_parts.append(segment_text)
return ' '.join(content_parts) if content_parts else "Content not available"
    async def _analyze_section_content(
        self,
        content: str,
        section_title: str
    ) -> Dict[str, Any]:
        """Analyze section content to generate summary and key points.

        Args:
            content: Transcript text for the section.
            section_title: Title used for context and in canned fallbacks.

        Returns:
            Dict with "summary", "key_points" and "confidence_score". Canned
            values are returned for very short content or on any AI/parsing
            failure, so this never raises.
        """
        # Too little text to analyze meaningfully; skip the AI round-trip.
        if len(content) < 50:
            return {
                "summary": f"Brief content in {section_title}",
                "key_points": ["Content analysis"],
                "confidence_score": 0.5
            }
        system_prompt = """You are analyzing a section of video content.
Create:
- A brief summary (1-2 sentences) of what happens in this section
- 2-3 key points or takeaways
- Confidence score (0.0-1.0) based on content quality and coherence
Return ONLY valid JSON:
{
"summary": "Brief section summary",
"key_points": ["point1", "point2", "point3"],
"confidence_score": 0.8
}"""
        # Cap content at 1500 chars to keep the prompt within budget.
        prompt = f"""Section: {section_title}
Content: {content[:1500]}
Analyze this section and provide insights."""
        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.4,
                max_tokens=300
            )
            import json
            return json.loads(response)
        except Exception as e:
            # Broad catch is deliberate: analysis is best-effort decoration.
            logger.warning(f"Section analysis failed: {e}")
            return {
                "summary": f"Content analysis for {section_title}",
                "key_points": ["Key insights from section"],
                "confidence_score": 0.6
            }
def _create_youtube_link(self, video_url: str, timestamp_seconds: int) -> str:
"""Create YouTube link with timestamp parameter."""
try:
# Extract video ID from URL
parsed_url = urlparse(video_url)
if 'youtube.com' in parsed_url.netloc:
query_params = parse_qs(parsed_url.query)
video_id = query_params.get('v', [''])[0]
elif 'youtu.be' in parsed_url.netloc:
video_id = parsed_url.path.lstrip('/')
else:
return video_url # Return original if not a YouTube URL
if not video_id:
return video_url
# Create timestamped YouTube link
return f"https://www.youtube.com/watch?v={video_id}&t={timestamp_seconds}s"
except Exception as e:
logger.warning(f"Error creating YouTube link: {e}")
return video_url
def _calculate_quality_score(
self,
sections: List[TimestampedSection],
transcript_data: List[Dict[str, Any]]
) -> float:
"""Calculate overall quality score for section detection."""
if not sections:
return 0.0
# Quality factors
factors = []
# 1. Section count (optimal range: 5-12 sections)
section_count = len(sections)
if 5 <= section_count <= 12:
factors.append(1.0)
elif section_count < 5:
factors.append(0.6)
else:
factors.append(0.8)
# 2. Average section confidence
if sections:
avg_confidence = sum(s.confidence_score for s in sections) / len(sections)
factors.append(avg_confidence)
# 3. Content coverage (how much of transcript is covered)
total_transcript_duration = max(seg.get('start', 0) for seg in transcript_data)
covered_duration = sum(s.end_timestamp - s.start_timestamp for s in sections)
if total_transcript_duration > 0:
coverage_ratio = min(1.0, covered_duration / total_transcript_duration)
factors.append(coverage_ratio)
# 4. Section length distribution (not too short or too long)
section_lengths = [s.end_timestamp - s.start_timestamp for s in sections]
avg_length = sum(section_lengths) / len(section_lengths)
if 60 <= avg_length <= 300: # 1-5 minutes is good
factors.append(1.0)
else:
factors.append(0.7)
# Calculate weighted average
return sum(factors) / len(factors)
@staticmethod
def seconds_to_timestamp(seconds: int) -> str:
"""Convert seconds to HH:MM:SS format."""
hours = seconds // 3600
minutes = (seconds % 3600) // 60
secs = seconds % 60
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
@staticmethod
def timestamp_to_seconds(timestamp_str: str) -> int:
"""Convert HH:MM:SS or [HH:MM:SS] to seconds."""
# Remove brackets if present
timestamp_str = timestamp_str.strip('[]')
try:
parts = timestamp_str.split(':')
if len(parts) == 3:
hours, minutes, seconds = map(int, parts)
return hours * 3600 + minutes * 60 + seconds
elif len(parts) == 2:
minutes, seconds = map(int, parts)
return minutes * 60 + seconds
else:
return int(parts[0])
except ValueError:
logger.warning(f"Invalid timestamp format: {timestamp_str}")
return 0
async def generate_table_of_contents(
self,
sections: List[TimestampedSection]
) -> str:
"""Generate markdown table of contents with timestamp links."""
if not sections:
return "## Table of Contents\n\n*No sections detected*\n"
toc_lines = ["## Table of Contents\n"]
for section in sections:
timestamp_display = self.seconds_to_timestamp(section.start_timestamp)
# Create markdown link with timestamp
toc_line = f"- **[{timestamp_display}]({section.youtube_link})** - {section.title}"
if section.summary:
toc_line += f"\n *{section.summary}*"
toc_lines.append(toc_line)
return '\n'.join(toc_lines) + '\n'
async def generate_section_navigation(
self,
sections: List[TimestampedSection]
) -> Dict[str, Any]:
"""Generate navigation data for frontend use."""
navigation = {
"total_sections": len(sections),
"sections": []
}
for section in sections:
nav_item = {
"index": section.index,
"title": section.title,
"start_time": section.start_timestamp,
"timestamp_display": self.seconds_to_timestamp(section.start_timestamp),
"youtube_link": section.youtube_link,
"summary": section.summary,
"duration_seconds": section.end_timestamp - section.start_timestamp,
"confidence": section.confidence_score
}
navigation["sections"].append(nav_item)
return navigation
def get_processor_stats(self) -> Dict[str, Any]:
"""Get processor configuration and statistics."""
return {
"service_name": "TimestampProcessor",
"min_section_duration": self.min_section_duration,
"max_sections": self.max_sections,
"target_section_length": self.target_section_length,
"overlap_tolerance": self.overlap_tolerance
}