"""Service for semantic chunking of video transcripts."""
|
|
|
|
import logging
|
|
import re
|
|
import hashlib
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
import json
|
|
|
|
import nltk
|
|
from nltk.tokenize import sent_tokenize, word_tokenize
|
|
from nltk.corpus import stopwords
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')


class TranscriptChunkerError(Exception):
    """Transcript chunking specific errors."""
    pass


class TranscriptChunker:
    """Service for intelligent chunking of video transcripts with semantic segmentation."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        min_chunk_size: int = 100,
        use_semantic_splitting: bool = True
    ):
        """Initialize transcript chunker.

        Args:
            chunk_size: Target size for chunks in characters
            chunk_overlap: Overlap between chunks in characters
            min_chunk_size: Minimum chunk size in characters
            use_semantic_splitting: Whether to use semantic boundaries
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.use_semantic_splitting = use_semantic_splitting

        # Initialize NLTK components
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            self.stop_words = set()
            logger.warning("NLTK stopwords not available, using empty set")

    def chunk_transcript(
        self,
        transcript: str,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Chunk transcript into semantic segments.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID
            transcript_metadata: Optional metadata about the transcript

        Returns:
            List of chunk dictionaries with content and metadata
        """
        try:
            transcript_length = len(transcript) if transcript else 0
            logger.info(f"Chunking transcript for video {video_id}, length: {transcript_length}")

            if not transcript or transcript_length < self.min_chunk_size:
                logger.warning(f"Transcript too short for chunking: {transcript_length} characters")
                return []

            # Parse transcript with timestamps if available
            transcript_entries = self._parse_transcript_with_timestamps(transcript)

            if self.use_semantic_splitting and transcript_entries:
                chunks = self._semantic_chunking(transcript_entries, video_id)
            else:
                # Fallback to simple text chunking
                chunks = self._simple_text_chunking(transcript, video_id)

            # Process chunks and add metadata
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                processed_chunk = self._process_chunk(chunk, i, video_id, transcript_metadata)
                processed_chunks.append(processed_chunk)

            logger.info(f"Created {len(processed_chunks)} chunks for video {video_id}")
            return processed_chunks

        except Exception as e:
            logger.error(f"Failed to chunk transcript: {e}")
            raise TranscriptChunkerError(f"Chunking failed: {e}") from e

    def _parse_transcript_with_timestamps(self, transcript: str) -> List[Dict[str, Any]]:
        """Parse transcript text to extract timestamps and content.

        Args:
            transcript: Raw transcript text

        Returns:
            List of transcript entries with timestamps
        """
        entries = []

        # Try to parse different timestamp formats
        patterns = [
            r'(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*-\s*(\d{1,2}:\d{2}:\d{2}(?:\.\d{3})?)\s*:\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'\[(\d{1,2}:\d{2}:\d{2})\]\s*(.+?)(?=\[\d{1,2}:\d{2}:\d{2}\]|\Z)',
            r'(\d{1,2}:\d{2}:\d{2})\s*(.+?)(?=\d{1,2}:\d{2}:\d{2}|\Z)',
            r'(\d+\.\d+)s:\s*(.+?)(?=\d+\.\d+s:|\Z)'
        ]

        for pattern in patterns:
            # Iterate the match iterator directly; re.finditer always returns a
            # truthy iterator, so truth-testing it would not tell us anything.
            for match in re.finditer(pattern, transcript, re.DOTALL | re.MULTILINE):
                try:
                    if len(match.groups()) == 3:  # Start, end, content
                        start_time = self._parse_timestamp(match.group(1))
                        end_time = self._parse_timestamp(match.group(2))
                        content = match.group(3).strip()
                    else:  # Timestamp, content
                        start_time = self._parse_timestamp(match.group(1))
                        end_time = None
                        content = match.group(2).strip()

                    if content:
                        entries.append({
                            'start_timestamp': start_time,
                            'end_timestamp': end_time,
                            'content': content
                        })
                except ValueError:
                    continue

            if entries:
                break

        # If no timestamps found, treat as plain text
        if not entries:
            entries = [{
                'start_timestamp': None,
                'end_timestamp': None,
                'content': transcript
            }]

        return entries

    def _parse_timestamp(self, timestamp_str: str) -> Optional[float]:
        """Parse timestamp string to seconds.

        Args:
            timestamp_str: Timestamp in various formats

        Returns:
            Timestamp in seconds or None
        """
        try:
            if ':' in timestamp_str:
                # Format: HH:MM:SS or MM:SS
                parts = timestamp_str.split(':')
                if len(parts) == 3:
                    hours, minutes, seconds = parts
                    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
                elif len(parts) == 2:
                    minutes, seconds = parts
                    return int(minutes) * 60 + float(seconds)
            elif timestamp_str.endswith('s'):
                # Format: 123.45s
                return float(timestamp_str[:-1])
            else:
                return float(timestamp_str)
        except (ValueError, IndexError):
            return None

    def _semantic_chunking(
        self,
        transcript_entries: List[Dict[str, Any]],
        video_id: str
    ) -> List[Dict[str, Any]]:
        """Chunk transcript entries along sentence boundaries, respecting target size and overlap.

        Args:
            transcript_entries: List of transcript entries with timestamps
            video_id: YouTube video ID

        Returns:
            List of semantic chunks
        """
        chunks = []
        current_chunk = {
            'content': '',
            'start_timestamp': None,
            'end_timestamp': None,
            'entries': []
        }

        for entry in transcript_entries:
            # Tokenize content into sentences
            sentences = sent_tokenize(entry['content'])

            for sentence in sentences:
                if not sentence.strip():
                    continue

                # Check if adding this sentence would exceed chunk size
                potential_content = current_chunk['content'] + ' ' + sentence if current_chunk['content'] else sentence

                if len(potential_content) > self.chunk_size and len(current_chunk['content']) > self.min_chunk_size:
                    # Finalize current chunk
                    if current_chunk['content']:
                        chunks.append(self._finalize_chunk(current_chunk))

                    # Start new chunk with overlap
                    overlap_content = self._get_overlap_content(current_chunk['content'], self.chunk_overlap)
                    current_chunk = {
                        'content': overlap_content + ' ' + sentence if overlap_content else sentence,
                        'start_timestamp': entry['start_timestamp'],
                        'end_timestamp': entry['end_timestamp'],
                        'entries': [entry]
                    }
                else:
                    # Add sentence to current chunk
                    current_chunk['content'] = potential_content
                    if current_chunk['start_timestamp'] is None:
                        current_chunk['start_timestamp'] = entry['start_timestamp']
                    current_chunk['end_timestamp'] = entry['end_timestamp']
                    if entry not in current_chunk['entries']:
                        current_chunk['entries'].append(entry)

        # Add final chunk
        if current_chunk['content'] and len(current_chunk['content']) >= self.min_chunk_size:
            chunks.append(self._finalize_chunk(current_chunk))

        return chunks

    def _simple_text_chunking(self, transcript: str, video_id: str) -> List[Dict[str, Any]]:
        """Simple text chunking without semantic analysis.

        Args:
            transcript: Full transcript text
            video_id: YouTube video ID

        Returns:
            List of text chunks
        """
        chunks = []
        text = transcript.strip()

        start = 0
        chunk_index = 0

        while start < len(text):
            # Calculate chunk end
            end = min(start + self.chunk_size, len(text))

            # Try to break at sentence boundary
            if end < len(text):
                # Look for sentence ending punctuation
                for i in range(end, max(start + self.min_chunk_size, end - 200), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break

            chunk_text = text[start:end].strip()

            if len(chunk_text) >= self.min_chunk_size:
                chunks.append({
                    'content': chunk_text,
                    'start_timestamp': None,
                    'end_timestamp': None,
                    'chunk_index': chunk_index
                })
                chunk_index += 1

            # Move start position back by the overlap, but always advance so the loop terminates
            start = max(end - self.chunk_overlap, start + 1) if end < len(text) else end

        return chunks

    def _finalize_chunk(self, chunk_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Finalize chunk with metadata and cleanup.

        Args:
            chunk_dict: Raw chunk dictionary

        Returns:
            Processed chunk dictionary
        """
        content = chunk_dict['content'].strip()

        return {
            'content': content,
            'start_timestamp': chunk_dict['start_timestamp'],
            'end_timestamp': chunk_dict['end_timestamp'],
            'content_length': len(content),
            'word_count': len(word_tokenize(content)),
            'entries_count': len(chunk_dict.get('entries', [])),
            'keywords': self._extract_keywords(content),
            'entities': self._extract_entities(content)
        }

    def _get_overlap_content(self, content: str, overlap_size: int) -> str:
        """Get overlap content from the end of current chunk.

        Args:
            content: Current chunk content
            overlap_size: Size of overlap in characters

        Returns:
            Overlap content
        """
        if len(content) <= overlap_size:
            return content

        overlap_start = len(content) - overlap_size

        # Try to start overlap at word boundary
        space_index = content.find(' ', overlap_start)
        if space_index != -1 and space_index < len(content) - overlap_size * 0.5:
            overlap_start = space_index + 1

        return content[overlap_start:].strip()

    def _process_chunk(
        self,
        chunk: Dict[str, Any],
        chunk_index: int,
        video_id: str,
        transcript_metadata: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Process and enrich chunk with metadata.

        Args:
            chunk: Raw chunk dictionary
            chunk_index: Index of chunk in sequence
            video_id: YouTube video ID
            transcript_metadata: Optional transcript metadata

        Returns:
            Processed chunk with metadata
        """
        content = chunk['content']

        processed_chunk = {
            'video_id': video_id,
            'chunk_index': chunk_index,
            'chunk_type': 'transcript',
            'content': content,
            'content_length': len(content),
            'content_hash': hashlib.sha256(content.encode()).hexdigest(),
            'start_timestamp': chunk.get('start_timestamp'),
            'end_timestamp': chunk.get('end_timestamp'),
            'word_count': chunk.get('word_count', len(word_tokenize(content))),
            'keywords': chunk.get('keywords', []),
            'entities': chunk.get('entities', []),
            'created_at': datetime.now().isoformat()
        }

        # Add transcript metadata if available
        if transcript_metadata:
            processed_chunk['source_metadata'] = transcript_metadata

        return processed_chunk

    def _extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Extract keywords from text using word frequency.

        Args:
            text: Text content
            max_keywords: Maximum number of keywords

        Returns:
            List of keywords
        """
        try:
            # Simple keyword extraction using word frequency
            words = word_tokenize(text.lower())
            words = [word for word in words if word.isalpha() and word not in self.stop_words and len(word) > 2]

            # Count word frequencies
            word_freq = {}
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            return [word for word, freq in sorted_words[:max_keywords]]

        except Exception as e:
            logger.warning(f"Keyword extraction failed: {e}")
            return []

    def _extract_entities(self, text: str) -> List[Dict[str, str]]:
        """Extract named entities from text (basic implementation).

        Args:
            text: Text content

        Returns:
            List of entity dictionaries
        """
        try:
            # Simple pattern-based entity extraction
            entities = []

            # Email patterns
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            emails = re.findall(email_pattern, text)
            for email in emails:
                entities.append({'text': email, 'type': 'EMAIL'})

            # URL patterns
            url_pattern = r'https?://[^\s]+'
            urls = re.findall(url_pattern, text)
            for url in urls:
                entities.append({'text': url, 'type': 'URL'})

            # Time patterns
            time_pattern = r'\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM|am|pm)?\b'
            times = re.findall(time_pattern, text)
            for time in times:
                entities.append({'text': time, 'type': 'TIME'})

            return entities[:20]  # Limit to 20 entities

        except Exception as e:
            logger.warning(f"Entity extraction failed: {e}")
            return []

    def get_chunking_stats(
        self,
        chunks: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Get statistics about the chunking process.

        Args:
            chunks: List of processed chunks

        Returns:
            Dictionary with chunking statistics
        """
        if not chunks:
            return {'total_chunks': 0}

        total_content_length = sum(chunk['content_length'] for chunk in chunks)
        total_words = sum(chunk['word_count'] for chunk in chunks)
        avg_chunk_size = total_content_length / len(chunks)

        # Count chunks with timestamps
        timestamped_chunks = sum(1 for chunk in chunks if chunk.get('start_timestamp') is not None)

        return {
            'total_chunks': len(chunks),
            'total_content_length': total_content_length,
            'total_words': total_words,
            'avg_chunk_size': round(avg_chunk_size, 2),
            'timestamped_chunks': timestamped_chunks,
            'timestamp_coverage': round(timestamped_chunks / len(chunks) * 100, 2),
            'min_chunk_size': min(chunk['content_length'] for chunk in chunks),
            'max_chunk_size': max(chunk['content_length'] for chunk in chunks)
        }
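

# Minimal usage sketch (illustrative only, not part of the original service): the transcript
# string and "demo_video_id" below are hypothetical placeholders showing how the chunker is
# typically driven end to end. Running it requires the NLTK data downloaded at import time.
if __name__ == "__main__":
    sample_transcript = (
        "[00:00:05] Welcome to the video. Today we cover transcript chunking strategies. "
        "[00:00:45] First, timestamps are parsed out of the raw transcript text. "
        "[00:01:30] Then sentences are grouped into overlapping chunks with metadata."
    )

    # Small sizes so the short sample still produces at least one chunk.
    chunker = TranscriptChunker(chunk_size=200, chunk_overlap=50, min_chunk_size=40)
    demo_chunks = chunker.chunk_transcript(sample_transcript, video_id="demo_video_id")

    for demo_chunk in demo_chunks:
        print(demo_chunk['chunk_index'], demo_chunk['start_timestamp'], demo_chunk['content'][:60])
    print(chunker.get_chunking_stats(demo_chunks))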