"""Service for semantic chunking of video transcripts.""" import logging import re import hashlib from typing import List, Dict, Any, Optional, Tuple from datetime import datetime import json import nltk from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import stopwords from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np logger = logging.getLogger(__name__) # Download required NLTK data try: nltk.data.find('tokenizers/punkt') nltk.data.find('corpora/stopwords') except LookupError: nltk.download('punkt') nltk.download('stopwords') class TranscriptChunkerError(Exception): """Transcript chunking specific errors.""" pass class TranscriptChunker: """Service for intelligent chunking of video transcripts with semantic segmentation.""" def __init__( self, chunk_size: int = 1000, chunk_overlap: int = 200, min_chunk_size: int = 100, use_semantic_splitting: bool = True ): """Initialize transcript chunker. Args: chunk_size: Target size for chunks in characters chunk_overlap: Overlap between chunks in characters min_chunk_size: Minimum chunk size in characters use_semantic_splitting: Whether to use semantic boundaries """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.min_chunk_size = min_chunk_size self.use_semantic_splitting = use_semantic_splitting # Initialize NLTK components try: self.stop_words = set(stopwords.words('english')) except LookupError: self.stop_words = set() logger.warning("NLTK stopwords not available, using empty set") def chunk_transcript( self, transcript: str, video_id: str, transcript_metadata: Optional[Dict[str, Any]] = None ) -> List[Dict[str, Any]]: """Chunk transcript into semantic segments. Args: transcript: Full transcript text video_id: YouTube video ID transcript_metadata: Optional metadata about the transcript Returns: List of chunk dictionaries with content and metadata """ try: logger.info(f"Chunking transcript for video {video_id}, length: {len(transcript)}") if not transcript or len(transcript) < self.min_chunk_size: logger.warning(f"Transcript too short for chunking: {len(transcript)} characters") return [] # Parse transcript with timestamps if available transcript_entries = self._parse_transcript_with_timestamps(transcript) if self.use_semantic_splitting and transcript_entries: chunks = self._semantic_chunking(transcript_entries, video_id) else: # Fallback to simple text chunking chunks = self._simple_text_chunking(transcript, video_id) # Process chunks and add metadata processed_chunks = [] for i, chunk in enumerate(chunks): processed_chunk = self._process_chunk(chunk, i, video_id, transcript_metadata) processed_chunks.append(processed_chunk) logger.info(f"Created {len(processed_chunks)} chunks for video {video_id}") return processed_chunks except Exception as e: logger.error(f"Failed to chunk transcript: {e}") raise TranscriptChunkerError(f"Chunking failed: {e}") def _parse_transcript_with_timestamps(self, transcript: str) -> List[Dict[str, Any]]: """Parse transcript text to extract timestamps and content. 
    def _parse_transcript_with_timestamps(self, transcript: str) -> List[Dict[str, Any]]:
        """Parse transcript text to extract timestamps and content.

        Args:
            transcript: Raw transcript text

        Returns:
            List of transcript entries with timestamps
        """
        entries = []

        # Try to parse different timestamp formats
        patterns = [
            # "HH:MM:SS - HH:MM:SS: content"
            r'(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*-\s*(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*:\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            # "[HH:MM:SS] content"
            r'\[(\d{1,2}:\d{2}:\d{2})\]\s*(.+?)(?=\[\d{1,2}:\d{2}:\d{2}\]|\Z)',
            # "HH:MM:SS content"
            r'(\d{1,2}:\d{2}:\d{2})\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            # "123.45s: content"
            r'(\d+\.\d+)s:\s*(.+?)(?=\d+\.\d+s:|\Z)'
        ]

        for pattern in patterns:
            # Materialize the iterator; re.finditer itself is always truthy,
            # so the presence of matches is checked via `entries` below
            for match in re.finditer(pattern, transcript, re.DOTALL | re.MULTILINE):
                try:
                    if len(match.groups()) == 3:
                        # Start, end, content
                        start_time = self._parse_timestamp(match.group(1))
                        end_time = self._parse_timestamp(match.group(2))
                        content = match.group(3).strip()
                    else:
                        # Timestamp, content
                        start_time = self._parse_timestamp(match.group(1))
                        end_time = None
                        content = match.group(2).strip()

                    if content:
                        entries.append({
                            'start_timestamp': start_time,
                            'end_timestamp': end_time,
                            'content': content
                        })
                except ValueError:
                    continue

            if entries:
                break

        # If no timestamps were found, treat the whole text as one entry
        if not entries:
            entries = [{
                'start_timestamp': None,
                'end_timestamp': None,
                'content': transcript
            }]

        return entries

    def _parse_timestamp(self, timestamp_str: str) -> Optional[float]:
        """Parse timestamp string to seconds.

        Args:
            timestamp_str: Timestamp in various formats

        Returns:
            Timestamp in seconds or None
        """
        try:
            if ':' in timestamp_str:
                # Format: HH:MM:SS or MM:SS
                parts = timestamp_str.split(':')
                if len(parts) == 3:
                    hours, minutes, seconds = parts
                    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
                elif len(parts) == 2:
                    minutes, seconds = parts
                    return int(minutes) * 60 + float(seconds)
            elif timestamp_str.endswith('s'):
                # Format: 123.45s
                return float(timestamp_str[:-1])
            else:
                return float(timestamp_str)
        except (ValueError, IndexError):
            return None
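    # Illustrative conversions performed by _parse_timestamp:
    #   "01:02:03"   -> 3723.0   (1 h * 3600 + 2 min * 60 + 3 s)
    #   "02:30"      -> 150.0
    #   "90.5s"      -> 90.5
    #   "not-a-time" -> None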
    def _semantic_chunking(
        self,
        transcript_entries: List[Dict[str, Any]],
        video_id: str
    ) -> List[Dict[str, Any]]:
        """Perform semantic chunking along sentence boundaries.

        Args:
            transcript_entries: List of transcript entries with timestamps
            video_id: YouTube video ID

        Returns:
            List of semantic chunks
        """
        chunks = []
        current_chunk = {
            'content': '',
            'start_timestamp': None,
            'end_timestamp': None,
            'entries': []
        }

        for entry in transcript_entries:
            # Tokenize content into sentences
            sentences = sent_tokenize(entry['content'])

            for sentence in sentences:
                if not sentence.strip():
                    continue

                # Check if adding this sentence would exceed the chunk size
                potential_content = (
                    current_chunk['content'] + ' ' + sentence
                    if current_chunk['content'] else sentence
                )

                if (len(potential_content) > self.chunk_size
                        and len(current_chunk['content']) > self.min_chunk_size):
                    # Finalize current chunk
                    if current_chunk['content']:
                        chunks.append(self._finalize_chunk(current_chunk))

                    # Start a new chunk with overlap from the previous one
                    overlap_content = self._get_overlap_content(current_chunk['content'], self.chunk_overlap)
                    current_chunk = {
                        'content': overlap_content + ' ' + sentence if overlap_content else sentence,
                        'start_timestamp': entry['start_timestamp'],
                        'end_timestamp': entry['end_timestamp'],
                        'entries': [entry]
                    }
                else:
                    # Add sentence to current chunk
                    current_chunk['content'] = potential_content
                    if current_chunk['start_timestamp'] is None:
                        current_chunk['start_timestamp'] = entry['start_timestamp']
                    current_chunk['end_timestamp'] = entry['end_timestamp']
                    if entry not in current_chunk['entries']:
                        current_chunk['entries'].append(entry)

        # Add final chunk
        if current_chunk['content'] and len(current_chunk['content']) >= self.min_chunk_size:
            chunks.append(self._finalize_chunk(current_chunk))

        return chunks

    def _simple_text_chunking(self, transcript: str, video_id: str) -> List[Dict[str, Any]]:
        """Simple character-based chunking without semantic analysis.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID

        Returns:
            List of text chunks
        """
        chunks = []
        text = transcript.strip()
        start = 0
        chunk_index = 0

        while start < len(text):
            end = min(start + self.chunk_size, len(text))

            # Try to break at a sentence boundary, scanning backwards up to
            # 200 characters but never below the minimum chunk size
            if end < len(text):
                for i in range(end, max(start + self.min_chunk_size, end - 200), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break

            chunk_text = text[start:end].strip()
            if len(chunk_text) >= self.min_chunk_size:
                chunks.append({
                    'content': chunk_text,
                    'start_timestamp': None,
                    'end_timestamp': None,
                    'chunk_index': chunk_index
                })
                chunk_index += 1

            # Move start position with overlap, always advancing by at least
            # one character so a short chunk cannot cause an infinite loop
            start = max(end - self.chunk_overlap, start + 1) if end < len(text) else end

        return chunks

    def _finalize_chunk(self, chunk_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Finalize chunk with metadata and cleanup.

        Args:
            chunk_dict: Raw chunk dictionary

        Returns:
            Processed chunk dictionary
        """
        content = chunk_dict['content'].strip()

        return {
            'content': content,
            'start_timestamp': chunk_dict['start_timestamp'],
            'end_timestamp': chunk_dict['end_timestamp'],
            'content_length': len(content),
            'word_count': len(word_tokenize(content)),
            'entries_count': len(chunk_dict.get('entries', [])),
            'keywords': self._extract_keywords(content),
            'entities': self._extract_entities(content)
        }
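    # Shape of a finalized (pre-enrichment) chunk, with placeholder values
    # for illustration:
    #   {
    #       'content': 'So the key idea is ...',
    #       'start_timestamp': 12.0, 'end_timestamp': 47.5,
    #       'content_length': 842, 'word_count': 151, 'entries_count': 3,
    #       'keywords': ['idea', 'chunking', ...], 'entities': [...]
    #   }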
    def _get_overlap_content(self, content: str, overlap_size: int) -> str:
        """Get overlap content from the end of the current chunk.

        Args:
            content: Current chunk content
            overlap_size: Size of overlap in characters

        Returns:
            Overlap content
        """
        if len(content) <= overlap_size:
            return content

        overlap_start = len(content) - overlap_size

        # Try to start the overlap at a word boundary, as long as doing so
        # preserves at least half of the requested overlap
        space_index = content.find(' ', overlap_start)
        if space_index != -1 and space_index < len(content) - overlap_size * 0.5:
            overlap_start = space_index + 1

        return content[overlap_start:].strip()

    def _process_chunk(
        self,
        chunk: Dict[str, Any],
        chunk_index: int,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Process and enrich chunk with metadata.

        Args:
            chunk: Raw chunk dictionary
            chunk_index: Index of chunk in sequence
            video_id: YouTube video ID
            transcript_metadata: Optional transcript metadata

        Returns:
            Processed chunk with metadata
        """
        content = chunk['content']

        processed_chunk = {
            'video_id': video_id,
            'chunk_index': chunk_index,
            'chunk_type': 'transcript',
            'content': content,
            'content_length': len(content),
            'content_hash': hashlib.sha256(content.encode()).hexdigest(),
            'start_timestamp': chunk.get('start_timestamp'),
            'end_timestamp': chunk.get('end_timestamp'),
            'word_count': chunk.get('word_count', len(word_tokenize(content))),
            'keywords': chunk.get('keywords', []),
            'entities': chunk.get('entities', []),
            'created_at': datetime.now().isoformat()
        }

        # Add transcript metadata if available
        if transcript_metadata:
            processed_chunk['source_metadata'] = transcript_metadata

        return processed_chunk

    def _extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Extract keywords from text by word frequency.

        Args:
            text: Text content
            max_keywords: Maximum number of keywords

        Returns:
            List of keywords
        """
        try:
            # Keep alphabetic, non-stopword tokens longer than two characters
            words = word_tokenize(text.lower())
            words = [
                word for word in words
                if word.isalpha() and word not in self.stop_words and len(word) > 2
            ]

            # Count word frequencies
            word_freq = {}
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return the top keywords
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, freq in sorted_words[:max_keywords]]

        except Exception as e:
            logger.warning(f"Keyword extraction failed: {e}")
            return []

    def _extract_entities(self, text: str) -> List[Dict[str, str]]:
        """Extract named entities from text (basic pattern-based implementation).

        Args:
            text: Text content

        Returns:
            List of entity dictionaries
        """
        try:
            entities = []

            # Email addresses
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            for email in re.findall(email_pattern, text):
                entities.append({'text': email, 'type': 'EMAIL'})

            # URLs
            url_pattern = r'https?://[^\s]+'
            for url in re.findall(url_pattern, text):
                entities.append({'text': url, 'type': 'URL'})

            # Times of day (e.g. "10:30 AM")
            time_pattern = r'\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?\b'
            for time_str in re.findall(time_pattern, text):
                entities.append({'text': time_str, 'type': 'TIME'})

            return entities[:20]  # Limit to 20 entities

        except Exception as e:
            logger.warning(f"Entity extraction failed: {e}")
            return []
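    # Illustrative _extract_entities output for the text
    # "Email me at team@example.com or see https://example.com at 10:30 AM":
    #   [{'text': 'team@example.com', 'type': 'EMAIL'},
    #    {'text': 'https://example.com', 'type': 'URL'},
    #    {'text': '10:30 AM', 'type': 'TIME'}]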
    def get_chunking_stats(
        self,
        chunks: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Get statistics about the chunking process.

        Args:
            chunks: List of processed chunks

        Returns:
            Dictionary with chunking statistics
        """
        if not chunks:
            return {'total_chunks': 0}

        total_content_length = sum(chunk['content_length'] for chunk in chunks)
        total_words = sum(chunk['word_count'] for chunk in chunks)
        avg_chunk_size = total_content_length / len(chunks)

        # Count chunks with timestamps
        timestamped_chunks = sum(
            1 for chunk in chunks if chunk.get('start_timestamp') is not None
        )

        return {
            'total_chunks': len(chunks),
            'total_content_length': total_content_length,
            'total_words': total_words,
            'avg_chunk_size': round(avg_chunk_size, 2),
            'timestamped_chunks': timestamped_chunks,
            'timestamp_coverage': round(timestamped_chunks / len(chunks) * 100, 2),
            'min_chunk_size': min(chunk['content_length'] for chunk in chunks),
            'max_chunk_size': max(chunk['content_length'] for chunk in chunks)
        }
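

# A minimal smoke test when the module is run directly. The transcript below
# is synthetic placeholder data, not real output from any video.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample = (
        "[00:00:01] Welcome to this talk about retrieval pipelines. "
        "[00:00:12] First we split transcripts into overlapping chunks. "
        "[00:00:25] Then each chunk is enriched with keywords and entities."
    )

    chunker = TranscriptChunker(chunk_size=120, chunk_overlap=30, min_chunk_size=20)
    result = chunker.chunk_transcript(sample, video_id="demo-video-id")

    for c in result:
        print(c['chunk_index'], c['start_timestamp'], c['content'][:60])
    print(chunker.get_chunking_stats(result))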