# youtube-summarizer/backend/services/transcript_chunker.py
"""Service for semantic chunking of video transcripts."""
import logging
import re
import hashlib
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import json
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

logger = logging.getLogger(__name__)

# Download required NLTK data if it is not already available
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')
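
# Note: recent NLTK releases (3.8.2+) load the sentence tokenizer from the
# 'punkt_tab' resource; if sent_tokenize still raises LookupError after the
# downloads above, an additional nltk.download('punkt_tab') may be needed
# (this depends on the installed NLTK version and is not enforced here).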


class TranscriptChunkerError(Exception):
    """Transcript chunking specific errors."""
    pass


class TranscriptChunker:
    """Service for intelligent chunking of video transcripts with semantic segmentation."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        min_chunk_size: int = 100,
        use_semantic_splitting: bool = True
    ):
        """Initialize transcript chunker.

        Args:
            chunk_size: Target size for chunks in characters
            chunk_overlap: Overlap between chunks in characters
            min_chunk_size: Minimum chunk size in characters
            use_semantic_splitting: Whether to use semantic boundaries
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.use_semantic_splitting = use_semantic_splitting

        # Initialize NLTK components
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            self.stop_words = set()
            logger.warning("NLTK stopwords not available, using empty set")

    def chunk_transcript(
        self,
        transcript: str,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Chunk transcript into semantic segments.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID
            transcript_metadata: Optional metadata about the transcript

        Returns:
            List of chunk dictionaries with content and metadata
        """
        try:
            transcript_length = len(transcript) if transcript else 0
            logger.info(f"Chunking transcript for video {video_id}, length: {transcript_length}")

            if not transcript or transcript_length < self.min_chunk_size:
                logger.warning(f"Transcript too short for chunking: {transcript_length} characters")
                return []

            # Parse transcript with timestamps if available
            transcript_entries = self._parse_transcript_with_timestamps(transcript)

            if self.use_semantic_splitting and transcript_entries:
                chunks = self._semantic_chunking(transcript_entries, video_id)
            else:
                # Fallback to simple text chunking
                chunks = self._simple_text_chunking(transcript, video_id)

            # Process chunks and add metadata
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                processed_chunk = self._process_chunk(chunk, i, video_id, transcript_metadata)
                processed_chunks.append(processed_chunk)

            logger.info(f"Created {len(processed_chunks)} chunks for video {video_id}")
            return processed_chunks
        except Exception as e:
            logger.error(f"Failed to chunk transcript: {e}")
            raise TranscriptChunkerError(f"Chunking failed: {e}") from e
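
    # The public entry point above runs a three-stage pipeline: parse timestamps
    # out of the raw text, split the entries into chunks (semantic or plain
    # sliding-window), then enrich each chunk with a hash, keywords and entities.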

    def _parse_transcript_with_timestamps(self, transcript: str) -> List[Dict[str, Any]]:
        """Parse transcript text to extract timestamps and content.

        Args:
            transcript: Raw transcript text

        Returns:
            List of transcript entries with timestamps
        """
        entries = []

        # Try to parse different timestamp formats
        patterns = [
            r'(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*-\s*(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*:\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'\[(\d{1,2}:\d{2}:\d{2})\]\s*(.+?)(?=\[\d{1,2}:\d{2}:\d{2}\]|\Z)',
            r'(\d{1,2}:\d{2}:\d{2})\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'(\d+\.\d+)s:\s*(.+?)(?=\d+\.\d+s:|\Z)'
        ]

        for pattern in patterns:
            # Materialize the iterator so the emptiness check is meaningful
            # (re.finditer returns a lazy iterator, which is always truthy).
            matches = list(re.finditer(pattern, transcript, re.DOTALL | re.MULTILINE))
            if matches:
                for match in matches:
                    try:
                        if len(match.groups()) == 3:  # Start, end, content
                            start_time = self._parse_timestamp(match.group(1))
                            end_time = self._parse_timestamp(match.group(2))
                            content = match.group(3).strip()
                        else:  # Timestamp, content
                            start_time = self._parse_timestamp(match.group(1))
                            end_time = None
                            content = match.group(2).strip()

                        if content:
                            entries.append({
                                'start_timestamp': start_time,
                                'end_timestamp': end_time,
                                'content': content
                            })
                    except ValueError:
                        continue

                if entries:
                    break

        # If no timestamps found, treat as plain text
        if not entries:
            entries = [{
                'start_timestamp': None,
                'end_timestamp': None,
                'content': transcript
            }]

        return entries
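
    # Illustrative (made-up) transcript lines matched by the patterns above,
    # one per format:
    #   00:00:05.000 - 00:00:09.500: Welcome to the channel.
    #   [00:00:05] Welcome to the channel.
    #   00:00:05 Welcome to the channel.
    #   5.0s: Welcome to the channel.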

    def _parse_timestamp(self, timestamp_str: str) -> Optional[float]:
        """Parse timestamp string to seconds.

        Args:
            timestamp_str: Timestamp in various formats

        Returns:
            Timestamp in seconds or None
        """
        try:
            if ':' in timestamp_str:
                # Format: HH:MM:SS or MM:SS
                parts = timestamp_str.split(':')
                if len(parts) == 3:
                    hours, minutes, seconds = parts
                    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
                elif len(parts) == 2:
                    minutes, seconds = parts
                    return int(minutes) * 60 + float(seconds)
                return None
            elif timestamp_str.endswith('s'):
                # Format: 123.45s
                return float(timestamp_str[:-1])
            else:
                return float(timestamp_str)
        except (ValueError, IndexError):
            return None
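
    # Worked examples of the conversion above (inputs are illustrative):
    #   "01:02:03.500" -> 1 * 3600 + 2 * 60 + 3.5 = 3723.5 seconds
    #   "12:34"        -> 12 * 60 + 34.0          = 754.0 seconds
    #   "123.45s"      -> 123.45 seconds
    #   "abc"          -> None (unparseable input is tolerated)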

    def _semantic_chunking(
        self,
        transcript_entries: List[Dict[str, Any]],
        video_id: str
    ) -> List[Dict[str, Any]]:
        """Chunk transcript entries along sentence boundaries within a size budget.

        Args:
            transcript_entries: List of transcript entries with timestamps
            video_id: YouTube video ID

        Returns:
            List of semantic chunks
        """
        chunks = []
        current_chunk = {
            'content': '',
            'start_timestamp': None,
            'end_timestamp': None,
            'entries': []
        }

        for entry in transcript_entries:
            # Tokenize content into sentences
            sentences = sent_tokenize(entry['content'])

            for sentence in sentences:
                if not sentence.strip():
                    continue

                # Check if adding this sentence would exceed the chunk size
                potential_content = current_chunk['content'] + ' ' + sentence if current_chunk['content'] else sentence

                if len(potential_content) > self.chunk_size and len(current_chunk['content']) > self.min_chunk_size:
                    # Finalize current chunk
                    if current_chunk['content']:
                        chunks.append(self._finalize_chunk(current_chunk))

                    # Start new chunk with overlap
                    overlap_content = self._get_overlap_content(current_chunk['content'], self.chunk_overlap)
                    current_chunk = {
                        'content': overlap_content + ' ' + sentence if overlap_content else sentence,
                        'start_timestamp': entry['start_timestamp'],
                        'end_timestamp': entry['end_timestamp'],
                        'entries': [entry]
                    }
                else:
                    # Add sentence to current chunk
                    current_chunk['content'] = potential_content
                    if current_chunk['start_timestamp'] is None:
                        current_chunk['start_timestamp'] = entry['start_timestamp']
                    current_chunk['end_timestamp'] = entry['end_timestamp']
                    if entry not in current_chunk['entries']:
                        current_chunk['entries'].append(entry)

        # Add final chunk
        if current_chunk['content'] and len(current_chunk['content']) >= self.min_chunk_size:
            chunks.append(self._finalize_chunk(current_chunk))

        return chunks
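
    # Example of the rollover behaviour above (illustrative numbers): with
    # chunk_size=1000 and chunk_overlap=200, once a chunk would grow past ~1000
    # characters it is finalized, and its last ~200 characters are repeated at
    # the start of the next chunk so context is not cut mid-thought.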

    def _simple_text_chunking(self, transcript: str, video_id: str) -> List[Dict[str, Any]]:
        """Simple sliding-window text chunking without semantic analysis.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID

        Returns:
            List of text chunks
        """
        chunks = []
        text = transcript.strip()
        start = 0
        chunk_index = 0

        while start < len(text):
            # Calculate chunk end
            end = min(start + self.chunk_size, len(text))

            # Try to break at a sentence boundary
            if end < len(text):
                # Look for sentence-ending punctuation near the cut point
                for i in range(end, max(start + self.min_chunk_size, end - 200), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break

            chunk_text = text[start:end].strip()
            if len(chunk_text) >= self.min_chunk_size:
                chunks.append({
                    'content': chunk_text,
                    'start_timestamp': None,
                    'end_timestamp': None,
                    'chunk_index': chunk_index
                })
                chunk_index += 1

            # Move start position back by the overlap, but always advance by at
            # least one character so pathological size/overlap settings cannot loop
            next_start = end - self.chunk_overlap if end < len(text) else end
            start = max(next_start, start + 1)

        return chunks
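
    # Illustrative window positions with the defaults (chunk_size=1000,
    # chunk_overlap=200): the first chunk covers roughly [0, 1000), the second
    # starts near 800, the third near 1600, and so on, with each boundary pulled
    # back to the nearest sentence-ending punctuation when one is found.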

    def _finalize_chunk(self, chunk_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Finalize chunk with metadata and cleanup.

        Args:
            chunk_dict: Raw chunk dictionary

        Returns:
            Processed chunk dictionary
        """
        content = chunk_dict['content'].strip()
        return {
            'content': content,
            'start_timestamp': chunk_dict['start_timestamp'],
            'end_timestamp': chunk_dict['end_timestamp'],
            'content_length': len(content),
            'word_count': len(word_tokenize(content)),
            'entries_count': len(chunk_dict.get('entries', [])),
            'keywords': self._extract_keywords(content),
            'entities': self._extract_entities(content)
        }

    def _get_overlap_content(self, content: str, overlap_size: int) -> str:
        """Get overlap content from the end of the current chunk.

        Args:
            content: Current chunk content
            overlap_size: Size of overlap in characters

        Returns:
            Overlap content
        """
        if len(content) <= overlap_size:
            return content

        overlap_start = len(content) - overlap_size

        # Try to start the overlap at a word boundary
        space_index = content.find(' ', overlap_start)
        if space_index != -1 and space_index < len(content) - overlap_size * 0.5:
            overlap_start = space_index + 1

        return content[overlap_start:].strip()
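
    # Worked example: for a 1000-character chunk and overlap_size=200, the overlap
    # nominally starts at index 800; if the first space at or after index 800 lies
    # before index 900 (so at least half the requested overlap remains), the
    # overlap is shifted forward to begin on that word boundary.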

    def _process_chunk(
        self,
        chunk: Dict[str, Any],
        chunk_index: int,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Process and enrich chunk with metadata.

        Args:
            chunk: Raw chunk dictionary
            chunk_index: Index of chunk in sequence
            video_id: YouTube video ID
            transcript_metadata: Optional transcript metadata

        Returns:
            Processed chunk with metadata
        """
        content = chunk['content']

        processed_chunk = {
            'video_id': video_id,
            'chunk_index': chunk_index,
            'chunk_type': 'transcript',
            'content': content,
            'content_length': len(content),
            'content_hash': hashlib.sha256(content.encode()).hexdigest(),
            'start_timestamp': chunk.get('start_timestamp'),
            'end_timestamp': chunk.get('end_timestamp'),
            'word_count': chunk.get('word_count', len(word_tokenize(content))),
            'keywords': chunk.get('keywords', []),
            'entities': chunk.get('entities', []),
            'created_at': datetime.now().isoformat()
        }

        # Add transcript metadata if available
        if transcript_metadata:
            processed_chunk['source_metadata'] = transcript_metadata

        return processed_chunk
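
    # Shape of a processed chunk (field values below are illustrative only;
    # 'source_metadata' is added when transcript metadata is supplied):
    #   {
    #     'video_id': 'dQw4w9WgXcQ', 'chunk_index': 0, 'chunk_type': 'transcript',
    #     'content': '...', 'content_length': 987, 'content_hash': '<sha256 hex>',
    #     'start_timestamp': 5.0, 'end_timestamp': 42.0, 'word_count': 180,
    #     'keywords': ['video', 'topic'], 'entities': [{'text': '12:30', 'type': 'TIME'}],
    #     'created_at': '2024-01-01T00:00:00'
    #   }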

    def _extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Extract keywords from text using simple word-frequency ranking.

        Args:
            text: Text content
            max_keywords: Maximum number of keywords

        Returns:
            List of keywords
        """
        try:
            # Tokenize, then drop stopwords, non-alphabetic tokens and very short words
            words = word_tokenize(text.lower())
            words = [word for word in words if word.isalpha() and word not in self.stop_words and len(word) > 2]

            # Count word frequencies
            word_freq = {}
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, freq in sorted_words[:max_keywords]]
        except Exception as e:
            logger.warning(f"Keyword extraction failed: {e}")
            return []

    def _extract_entities(self, text: str) -> List[Dict[str, str]]:
        """Extract named entities from text (basic pattern-based implementation).

        Args:
            text: Text content

        Returns:
            List of entity dictionaries
        """
        try:
            entities = []

            # Email addresses
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            emails = re.findall(email_pattern, text)
            for email in emails:
                entities.append({'text': email, 'type': 'EMAIL'})

            # URLs
            url_pattern = r'https?://[^\s]+'
            urls = re.findall(url_pattern, text)
            for url in urls:
                entities.append({'text': url, 'type': 'URL'})

            # Times of day
            time_pattern = r'\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?\b'
            times = re.findall(time_pattern, text)
            for time in times:
                entities.append({'text': time, 'type': 'TIME'})

            return entities[:20]  # Limit to 20 entities
        except Exception as e:
            logger.warning(f"Entity extraction failed: {e}")
            return []

    def get_chunking_stats(
        self,
        chunks: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Get statistics about the chunking process.

        Args:
            chunks: List of processed chunks

        Returns:
            Dictionary with chunking statistics
        """
        if not chunks:
            return {'total_chunks': 0}

        total_content_length = sum(chunk['content_length'] for chunk in chunks)
        total_words = sum(chunk['word_count'] for chunk in chunks)
        avg_chunk_size = total_content_length / len(chunks)

        # Count chunks with timestamps
        timestamped_chunks = sum(1 for chunk in chunks if chunk.get('start_timestamp') is not None)

        return {
            'total_chunks': len(chunks),
            'total_content_length': total_content_length,
            'total_words': total_words,
            'avg_chunk_size': round(avg_chunk_size, 2),
            'timestamped_chunks': timestamped_chunks,
            'timestamp_coverage': round(timestamped_chunks / len(chunks) * 100, 2),
            'min_chunk_size': min(chunk['content_length'] for chunk in chunks),
            'max_chunk_size': max(chunk['content_length'] for chunk in chunks)
        }
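

# Minimal usage sketch (not part of the service API): chunk a small made-up
# transcript and print the resulting statistics. The transcript text and video
# ID below are invented for illustration; real callers would pass transcript
# text fetched for an actual YouTube video.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_transcript = (
        "[00:00:05] Welcome back to the channel. Today we are going to walk "
        "through how semantic chunking works for long video transcripts. "
        "[00:00:20] First we parse the timestamps, then we split the text into "
        "sentence-aligned chunks, and finally we attach keywords and entities "
        "to every chunk so they can be indexed for retrieval."
    )

    # Small sizes so the demo produces more than one chunk
    chunker = TranscriptChunker(chunk_size=200, chunk_overlap=40, min_chunk_size=50)
    sample_chunks = chunker.chunk_transcript(sample_transcript, video_id="demo-video-id")

    for sample_chunk in sample_chunks:
        print(sample_chunk['chunk_index'], sample_chunk['start_timestamp'], sample_chunk['keywords'])
    print(chunker.get_chunking_stats(sample_chunks))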