# youtube-summarizer/backend/services/transcript_chunker.py
"""Service for semantic chunking of video transcripts."""
import logging
import re
import hashlib
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

logger = logging.getLogger(__name__)

# Download required NLTK data if it is not already available
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')
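
# Note: recent NLTK releases (3.8.2+) load the sentence tokenizer from the
# 'punkt_tab' resource; if sent_tokenize still raises LookupError after the
# downloads above, an additional nltk.download('punkt_tab') may be needed
# (this depends on the installed NLTK version and is not enforced here).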


class TranscriptChunkerError(Exception):
    """Transcript chunking specific errors."""
    pass


class TranscriptChunker:
    """Service for intelligent chunking of video transcripts with semantic segmentation."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        min_chunk_size: int = 100,
        use_semantic_splitting: bool = True
    ):
        """Initialize transcript chunker.

        Args:
            chunk_size: Target size for chunks in characters
            chunk_overlap: Overlap between chunks in characters
            min_chunk_size: Minimum chunk size in characters
            use_semantic_splitting: Whether to use semantic boundaries
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.use_semantic_splitting = use_semantic_splitting

        # Initialize NLTK components
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            self.stop_words = set()
            logger.warning("NLTK stopwords not available, using empty set")

    def chunk_transcript(
        self,
        transcript: str,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Chunk transcript into semantic segments.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID
            transcript_metadata: Optional metadata about the transcript

        Returns:
            List of chunk dictionaries with content and metadata
        """
        try:
            transcript_length = len(transcript) if transcript else 0
            logger.info(f"Chunking transcript for video {video_id}, length: {transcript_length}")

            if not transcript or transcript_length < self.min_chunk_size:
                logger.warning(f"Transcript too short for chunking: {transcript_length} characters")
                return []

            # Parse transcript with timestamps if available
            transcript_entries = self._parse_transcript_with_timestamps(transcript)

            if self.use_semantic_splitting and transcript_entries:
                chunks = self._semantic_chunking(transcript_entries, video_id)
            else:
                # Fallback to simple text chunking
                chunks = self._simple_text_chunking(transcript, video_id)

            # Process chunks and add metadata
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                processed_chunk = self._process_chunk(chunk, i, video_id, transcript_metadata)
                processed_chunks.append(processed_chunk)

            logger.info(f"Created {len(processed_chunks)} chunks for video {video_id}")
            return processed_chunks
        except Exception as e:
            logger.error(f"Failed to chunk transcript: {e}")
            raise TranscriptChunkerError(f"Chunking failed: {e}") from e
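
    # The public entry point above runs a three-stage pipeline: parse timestamps
    # out of the raw text, split the entries into chunks (semantic or plain
    # sliding-window), then enrich each chunk with a hash, keywords and entities.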

    def _parse_transcript_with_timestamps(self, transcript: str) -> List[Dict[str, Any]]:
        """Parse transcript text to extract timestamps and content.

        Args:
            transcript: Raw transcript text

        Returns:
            List of transcript entries with timestamps
        """
        entries = []

        # Try to parse different timestamp formats
        patterns = [
            r'(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*-\s*(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*:\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'\[(\d{1,2}:\d{2}:\d{2})\]\s*(.+?)(?=\[\d{1,2}:\d{2}:\d{2}\]|\Z)',
            r'(\d{1,2}:\d{2}:\d{2})\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'(\d+\.\d+)s:\s*(.+?)(?=\d+\.\d+s:|\Z)'
        ]

        for pattern in patterns:
            # Materialize the iterator so the emptiness check is meaningful
            # (re.finditer returns a lazy iterator, which is always truthy).
            matches = list(re.finditer(pattern, transcript, re.DOTALL | re.MULTILINE))
            if matches:
                for match in matches:
                    try:
                        if len(match.groups()) == 3:  # Start, end, content
                            start_time = self._parse_timestamp(match.group(1))
                            end_time = self._parse_timestamp(match.group(2))
                            content = match.group(3).strip()
                        else:  # Timestamp, content
                            start_time = self._parse_timestamp(match.group(1))
                            end_time = None
                            content = match.group(2).strip()

                        if content:
                            entries.append({
                                'start_timestamp': start_time,
                                'end_timestamp': end_time,
                                'content': content
                            })
                    except ValueError:
                        continue

                if entries:
                    break

        # If no timestamps found, treat as plain text
        if not entries:
            entries = [{
                'start_timestamp': None,
                'end_timestamp': None,
                'content': transcript
            }]

        return entries
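
    # Illustrative (made-up) transcript lines matched by the patterns above,
    # one per format:
    #   00:00:05.000 - 00:00:09.500: Welcome to the channel.
    #   [00:00:05] Welcome to the channel.
    #   00:00:05 Welcome to the channel.
    #   5.0s: Welcome to the channel.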

    def _parse_timestamp(self, timestamp_str: str) -> Optional[float]:
        """Parse timestamp string to seconds.

        Args:
            timestamp_str: Timestamp in various formats

        Returns:
            Timestamp in seconds or None
        """
        try:
            if ':' in timestamp_str:
                # Format: HH:MM:SS or MM:SS
                parts = timestamp_str.split(':')
                if len(parts) == 3:
                    hours, minutes, seconds = parts
                    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
                elif len(parts) == 2:
                    minutes, seconds = parts
                    return int(minutes) * 60 + float(seconds)
                return None
            elif timestamp_str.endswith('s'):
                # Format: 123.45s
                return float(timestamp_str[:-1])
            else:
                return float(timestamp_str)
        except (ValueError, IndexError):
            return None
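
    # Worked examples of the conversion above (inputs are illustrative):
    #   "01:02:03.500" -> 1 * 3600 + 2 * 60 + 3.5 = 3723.5 seconds
    #   "12:34"        -> 12 * 60 + 34.0          = 754.0 seconds
    #   "123.45s"      -> 123.45 seconds
    #   "abc"          -> None (unparseable input is tolerated)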

    def _semantic_chunking(
        self,
        transcript_entries: List[Dict[str, Any]],
        video_id: str
    ) -> List[Dict[str, Any]]:
        """Chunk transcript entries along sentence boundaries within a size budget.

        Args:
            transcript_entries: List of transcript entries with timestamps
            video_id: YouTube video ID

        Returns:
            List of semantic chunks
        """
        chunks = []
        current_chunk = {
            'content': '',
            'start_timestamp': None,
            'end_timestamp': None,
            'entries': []
        }

        for entry in transcript_entries:
            # Tokenize content into sentences
            sentences = sent_tokenize(entry['content'])

            for sentence in sentences:
                if not sentence.strip():
                    continue

                # Check if adding this sentence would exceed the chunk size
                potential_content = current_chunk['content'] + ' ' + sentence if current_chunk['content'] else sentence

                if len(potential_content) > self.chunk_size and len(current_chunk['content']) > self.min_chunk_size:
                    # Finalize current chunk
                    if current_chunk['content']:
                        chunks.append(self._finalize_chunk(current_chunk))

                    # Start new chunk with overlap
                    overlap_content = self._get_overlap_content(current_chunk['content'], self.chunk_overlap)
                    current_chunk = {
                        'content': overlap_content + ' ' + sentence if overlap_content else sentence,
                        'start_timestamp': entry['start_timestamp'],
                        'end_timestamp': entry['end_timestamp'],
                        'entries': [entry]
                    }
                else:
                    # Add sentence to current chunk
                    current_chunk['content'] = potential_content
                    if current_chunk['start_timestamp'] is None:
                        current_chunk['start_timestamp'] = entry['start_timestamp']
                    current_chunk['end_timestamp'] = entry['end_timestamp']
                    if entry not in current_chunk['entries']:
                        current_chunk['entries'].append(entry)

        # Add final chunk
        if current_chunk['content'] and len(current_chunk['content']) >= self.min_chunk_size:
            chunks.append(self._finalize_chunk(current_chunk))

        return chunks
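
    # Example of the rollover behaviour above (illustrative numbers): with
    # chunk_size=1000 and chunk_overlap=200, once a chunk would grow past ~1000
    # characters it is finalized, and its last ~200 characters are repeated at
    # the start of the next chunk so context is not cut mid-thought.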

    def _simple_text_chunking(self, transcript: str, video_id: str) -> List[Dict[str, Any]]:
        """Simple sliding-window text chunking without semantic analysis.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID

        Returns:
            List of text chunks
        """
        chunks = []
        text = transcript.strip()
        start = 0
        chunk_index = 0

        while start < len(text):
            # Calculate chunk end
            end = min(start + self.chunk_size, len(text))

            # Try to break at a sentence boundary
            if end < len(text):
                # Look for sentence-ending punctuation near the cut point
                for i in range(end, max(start + self.min_chunk_size, end - 200), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break

            chunk_text = text[start:end].strip()
            if len(chunk_text) >= self.min_chunk_size:
                chunks.append({
                    'content': chunk_text,
                    'start_timestamp': None,
                    'end_timestamp': None,
                    'chunk_index': chunk_index
                })
                chunk_index += 1

            # Move start position back by the overlap, but always advance by at
            # least one character so pathological size/overlap settings cannot loop
            next_start = end - self.chunk_overlap if end < len(text) else end
            start = max(next_start, start + 1)

        return chunks
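
    # Illustrative window positions with the defaults (chunk_size=1000,
    # chunk_overlap=200): the first chunk covers roughly [0, 1000), the second
    # starts near 800, the third near 1600, and so on, with each boundary pulled
    # back to the nearest sentence-ending punctuation when one is found.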

    def _finalize_chunk(self, chunk_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Finalize chunk with metadata and cleanup.

        Args:
            chunk_dict: Raw chunk dictionary

        Returns:
            Processed chunk dictionary
        """
        content = chunk_dict['content'].strip()
        return {
            'content': content,
            'start_timestamp': chunk_dict['start_timestamp'],
            'end_timestamp': chunk_dict['end_timestamp'],
            'content_length': len(content),
            'word_count': len(word_tokenize(content)),
            'entries_count': len(chunk_dict.get('entries', [])),
            'keywords': self._extract_keywords(content),
            'entities': self._extract_entities(content)
        }

    def _get_overlap_content(self, content: str, overlap_size: int) -> str:
        """Get overlap content from the end of the current chunk.

        Args:
            content: Current chunk content
            overlap_size: Size of overlap in characters

        Returns:
            Overlap content
        """
        if len(content) <= overlap_size:
            return content

        overlap_start = len(content) - overlap_size

        # Try to start the overlap at a word boundary
        space_index = content.find(' ', overlap_start)
        if space_index != -1 and space_index < len(content) - overlap_size * 0.5:
            overlap_start = space_index + 1

        return content[overlap_start:].strip()
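
    # Worked example: for a 1000-character chunk and overlap_size=200, the overlap
    # nominally starts at index 800; if the first space at or after index 800 lies
    # before index 900 (so at least half the requested overlap remains), the
    # overlap is shifted forward to begin on that word boundary.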

    def _process_chunk(
        self,
        chunk: Dict[str, Any],
        chunk_index: int,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Process and enrich chunk with metadata.

        Args:
            chunk: Raw chunk dictionary
            chunk_index: Index of chunk in sequence
            video_id: YouTube video ID
            transcript_metadata: Optional transcript metadata

        Returns:
            Processed chunk with metadata
        """
        content = chunk['content']

        processed_chunk = {
            'video_id': video_id,
            'chunk_index': chunk_index,
            'chunk_type': 'transcript',
            'content': content,
            'content_length': len(content),
            'content_hash': hashlib.sha256(content.encode()).hexdigest(),
            'start_timestamp': chunk.get('start_timestamp'),
            'end_timestamp': chunk.get('end_timestamp'),
            'word_count': chunk.get('word_count', len(word_tokenize(content))),
            'keywords': chunk.get('keywords', []),
            'entities': chunk.get('entities', []),
            'created_at': datetime.now().isoformat()
        }

        # Add transcript metadata if available
        if transcript_metadata:
            processed_chunk['source_metadata'] = transcript_metadata

        return processed_chunk
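
    # Shape of a processed chunk (field values below are illustrative only;
    # 'source_metadata' is added when transcript metadata is supplied):
    #   {
    #     'video_id': 'dQw4w9WgXcQ', 'chunk_index': 0, 'chunk_type': 'transcript',
    #     'content': '...', 'content_length': 987, 'content_hash': '<sha256 hex>',
    #     'start_timestamp': 5.0, 'end_timestamp': 42.0, 'word_count': 180,
    #     'keywords': ['video', 'topic'], 'entities': [{'text': '12:30', 'type': 'TIME'}],
    #     'created_at': '2024-01-01T00:00:00'
    #   }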

    def _extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Extract keywords from text using simple word-frequency ranking.

        Args:
            text: Text content
            max_keywords: Maximum number of keywords

        Returns:
            List of keywords
        """
        try:
            # Tokenize, then drop stopwords, non-alphabetic tokens and very short words
            words = word_tokenize(text.lower())
            words = [word for word in words if word.isalpha() and word not in self.stop_words and len(word) > 2]

            # Count word frequencies
            word_freq = {}
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, freq in sorted_words[:max_keywords]]
        except Exception as e:
            logger.warning(f"Keyword extraction failed: {e}")
            return []

    def _extract_entities(self, text: str) -> List[Dict[str, str]]:
        """Extract named entities from text (basic pattern-based implementation).

        Args:
            text: Text content

        Returns:
            List of entity dictionaries
        """
        try:
            entities = []

            # Email addresses
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            emails = re.findall(email_pattern, text)
            for email in emails:
                entities.append({'text': email, 'type': 'EMAIL'})

            # URLs
            url_pattern = r'https?://[^\s]+'
            urls = re.findall(url_pattern, text)
            for url in urls:
                entities.append({'text': url, 'type': 'URL'})

            # Times of day
            time_pattern = r'\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?\b'
            times = re.findall(time_pattern, text)
            for time in times:
                entities.append({'text': time, 'type': 'TIME'})

            return entities[:20]  # Limit to 20 entities
        except Exception as e:
            logger.warning(f"Entity extraction failed: {e}")
            return []

    def get_chunking_stats(
        self,
        chunks: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Get statistics about the chunking process.

        Args:
            chunks: List of processed chunks

        Returns:
            Dictionary with chunking statistics
        """
        if not chunks:
            return {'total_chunks': 0}

        total_content_length = sum(chunk['content_length'] for chunk in chunks)
        total_words = sum(chunk['word_count'] for chunk in chunks)
        avg_chunk_size = total_content_length / len(chunks)

        # Count chunks with timestamps
        timestamped_chunks = sum(1 for chunk in chunks if chunk.get('start_timestamp') is not None)

        return {
            'total_chunks': len(chunks),
            'total_content_length': total_content_length,
            'total_words': total_words,
            'avg_chunk_size': round(avg_chunk_size, 2),
            'timestamped_chunks': timestamped_chunks,
            'timestamp_coverage': round(timestamped_chunks / len(chunks) * 100, 2),
            'min_chunk_size': min(chunk['content_length'] for chunk in chunks),
            'max_chunk_size': max(chunk['content_length'] for chunk in chunks)
        }
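

# Minimal usage sketch (not part of the service API): chunk a small made-up
# transcript and print the resulting statistics. The transcript text and video
# ID below are invented for illustration; real callers would pass transcript
# text fetched for an actual YouTube video.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_transcript = (
        "[00:00:05] Welcome back to the channel. Today we are going to walk "
        "through how semantic chunking works for long video transcripts. "
        "[00:00:20] First we parse the timestamps, then we split the text into "
        "sentence-aligned chunks, and finally we attach keywords and entities "
        "to every chunk so they can be indexed for retrieval."
    )

    # Small sizes so the demo produces more than one chunk
    chunker = TranscriptChunker(chunk_size=200, chunk_overlap=40, min_chunk_size=50)
    sample_chunks = chunker.chunk_transcript(sample_transcript, video_id="demo-video-id")

    for sample_chunk in sample_chunks:
        print(sample_chunk['chunk_index'], sample_chunk['start_timestamp'], sample_chunk['keywords'])
    print(chunker.get_chunking_stats(sample_chunks))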