youtube-summarizer/backend/services/timestamp_processor.py

495 lines
18 KiB
Python

"""Timestamp Processor for semantic section detection and navigation.
This service processes video transcripts to identify meaningful sections,
create timestamped navigation, and generate clickable YouTube links.
"""
import asyncio
import logging
import re
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass
from urllib.parse import urlparse, parse_qs
from ..services.deepseek_service import DeepSeekService
from ..core.exceptions import ServiceError
logger = logging.getLogger(__name__)
@dataclass
class TimestampedSection:
    """Represents a timestamped section of content."""
    index: int  # 1-based position of the section within the video
    title: str  # human-readable section title (AI-generated or "Section N" fallback)
    start_timestamp: int # seconds
    end_timestamp: int # seconds
    youtube_link: str  # deep link that starts playback at start_timestamp
    content: str  # transcript text covered by this section
    summary: str  # brief (1-2 sentence) AI summary of the section
    key_points: List[str]  # short takeaway bullets for the section
    confidence_score: float  # 0.0-1.0 estimate of analysis quality/coherence
@dataclass
class SectionDetectionResult:
    """Result of section detection process."""
    sections: List[TimestampedSection]  # detected sections, in playback order
    total_sections: int  # len(sections), duplicated for serialization convenience
    processing_time_seconds: float  # wall-clock duration of the detection run
    quality_score: float  # 0.0-1.0 aggregate heuristic score for the detection
    created_at: datetime  # naive local time when the result was produced
class TimestampProcessor:
    """Service for processing timestamps and detecting semantic sections."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize timestamp processor.

        Args:
            ai_service: AI service for content analysis; a fresh
                DeepSeekService is constructed when none is supplied.
        """
        self.ai_service = ai_service or DeepSeekService()

        # Tunables controlling how sections are detected and sized.
        self.min_section_duration = 30      # seconds
        self.max_sections = 15
        self.target_section_length = 180    # 3 minutes
        self.overlap_tolerance = 5          # seconds

        logger.info("TimestampProcessor initialized")
async def detect_semantic_sections(
self,
transcript_data: List[Dict[str, Any]],
video_url: str,
video_title: str = ""
) -> SectionDetectionResult:
"""Detect semantic sections from transcript data.
Args:
transcript_data: List of transcript segments with timestamps
video_url: YouTube video URL for link generation
video_title: Video title for context
Returns:
Section detection result
"""
start_time = datetime.now()
if not transcript_data or len(transcript_data) < 2:
raise ServiceError("Insufficient transcript data for section detection")
try:
# Prepare transcript text with timestamps
full_transcript = self._prepare_transcript_text(transcript_data)
# Detect section boundaries using AI
section_boundaries = await self._detect_section_boundaries(
full_transcript, video_title
)
# Create timestamped sections
sections = await self._create_timestamped_sections(
transcript_data, section_boundaries, video_url
)
# Calculate quality score
quality_score = self._calculate_quality_score(sections, transcript_data)
processing_time = (datetime.now() - start_time).total_seconds()
return SectionDetectionResult(
sections=sections,
total_sections=len(sections),
processing_time_seconds=processing_time,
quality_score=quality_score,
created_at=datetime.now()
)
except Exception as e:
logger.error(f"Error detecting semantic sections: {e}")
raise ServiceError(f"Section detection failed: {str(e)}")
def _prepare_transcript_text(self, transcript_data: List[Dict[str, Any]]) -> str:
"""Prepare transcript text with timestamp markers."""
transcript_lines = []
for segment in transcript_data:
timestamp = segment.get('start', 0)
text = segment.get('text', '').strip()
if text:
time_marker = self.seconds_to_timestamp(timestamp)
transcript_lines.append(f"[{time_marker}] {text}")
return '\n'.join(transcript_lines)
    async def _detect_section_boundaries(
        self,
        transcript_text: str,
        video_title: str
    ) -> List[Dict[str, Any]]:
        """Use AI to detect natural section boundaries.

        Args:
            transcript_text: Timestamp-annotated transcript ("[HH:MM:SS] text" lines).
            video_title: Video title, included in the prompt for context.

        Returns:
            List of boundary dicts with "title", "start_time_marker",
            "estimated_duration" and "key_topic" keys. Falls back to
            heuristic sampling when the AI reply is not valid JSON.
        """
        system_prompt = """You are an expert at identifying natural section breaks in video content.
Analyze the transcript and identify 5-10 meaningful sections that represent distinct topics, themes, or narrative segments.
Each section should:
- Have a clear, descriptive title
- Start and end at natural transition points
- Be long enough to contain meaningful content (at least 30 seconds)
- Represent a coherent topic or theme
Return ONLY valid JSON with this structure:
{
"sections": [
{
"title": "Section Title",
"start_time_marker": "[00:01:23]",
"estimated_duration": "2-3 minutes",
"key_topic": "Main topic of this section"
}
]
}"""
        # Limit transcript length for AI processing (keeps the prompt within
        # the model's budget; later content is simply not considered).
        limited_transcript = transcript_text[:6000]
        prompt = f"""Video Title: {video_title}
Transcript with timestamps:
{limited_transcript}
Identify natural section breaks and create meaningful section titles."""
        # Low temperature: boundary detection should be deterministic-ish.
        response = await self.ai_service.generate_response(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=800
        )
        try:
            import json
            result = json.loads(response)
            return result.get("sections", [])
        except json.JSONDecodeError:
            # Model ignored the JSON-only instruction; degrade gracefully.
            logger.warning("AI response was not valid JSON, using fallback sections")
            return self._create_fallback_sections(transcript_text)
def _create_fallback_sections(self, transcript_text: str) -> List[Dict[str, Any]]:
"""Create fallback sections if AI detection fails."""
lines = transcript_text.split('\n')
sections = []
# Create sections every 3-4 minutes based on timestamps
current_section = 1
for i, line in enumerate(lines[::20]): # Sample every 20th line
time_match = re.search(r'\[(\d{2}:\d{2}:\d{2})\]', line)
if time_match:
sections.append({
"title": f"Section {current_section}",
"start_time_marker": f"[{time_match.group(1)}]",
"estimated_duration": "3-4 minutes",
"key_topic": "Content analysis"
})
current_section += 1
if len(sections) >= 8: # Limit fallback sections
break
return sections
    async def _create_timestamped_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        section_boundaries: List[Dict[str, Any]],
        video_url: str
    ) -> List[TimestampedSection]:
        """Create detailed timestamped sections.

        Args:
            transcript_data: Raw transcript segments ('start' seconds + 'text').
            section_boundaries: Boundary dicts from AI detection or fallback.
            video_url: Source video URL used to build timestamped links.

        Returns:
            Fully populated sections. A boundary that fails to process is
            logged and skipped rather than aborting the whole run.
        """
        sections = []
        for i, boundary in enumerate(section_boundaries):
            try:
                # Extract start time from time marker
                start_time_marker = boundary.get("start_time_marker", "[00:00:00]")
                start_seconds = self.timestamp_to_seconds(start_time_marker)
                # Determine end time (next section start or video end)
                if i + 1 < len(section_boundaries):
                    # "[99:99:99]" parses to a huge second count, i.e. effectively
                    # "to the end" when the next boundary lacks a marker.
                    next_marker = section_boundaries[i + 1].get("start_time_marker", "[99:99:99]")
                    end_seconds = self.timestamp_to_seconds(next_marker)
                else:
                    # Last section goes to end of transcript: 30s past the final
                    # segment's start, since segment durations aren't tracked here.
                    end_seconds = max(seg.get('start', 0) for seg in transcript_data) + 30
                # Extract content for this section
                section_content = self._extract_section_content(
                    transcript_data, start_seconds, end_seconds
                )
                # Generate section summary and key points
                section_analysis = await self._analyze_section_content(
                    section_content, boundary.get("title", f"Section {i+1}")
                )
                # Create YouTube link with timestamp
                youtube_link = self._create_youtube_link(video_url, start_seconds)
                section = TimestampedSection(
                    index=i + 1,
                    title=boundary.get("title", f"Section {i+1}"),
                    start_timestamp=start_seconds,
                    end_timestamp=end_seconds,
                    youtube_link=youtube_link,
                    content=section_content,
                    summary=section_analysis.get("summary", ""),
                    key_points=section_analysis.get("key_points", []),
                    confidence_score=section_analysis.get("confidence_score", 0.7)
                )
                sections.append(section)
            except Exception as e:
                # Best-effort: one bad boundary should not sink the rest.
                logger.warning(f"Error creating section {i+1}: {e}")
                continue
        return sections
def _extract_section_content(
self,
transcript_data: List[Dict[str, Any]],
start_seconds: int,
end_seconds: int
) -> str:
"""Extract content for a specific time range."""
content_parts = []
for segment in transcript_data:
segment_start = segment.get('start', 0)
segment_text = segment.get('text', '').strip()
# Include segments that overlap with our section
if start_seconds <= segment_start <= end_seconds and segment_text:
content_parts.append(segment_text)
return ' '.join(content_parts) if content_parts else "Content not available"
    async def _analyze_section_content(
        self,
        content: str,
        section_title: str
    ) -> Dict[str, Any]:
        """Analyze section content to generate summary and key points.

        Args:
            content: Transcript text for the section.
            section_title: Title used for context and in canned fallbacks.

        Returns:
            Dict with "summary", "key_points" and "confidence_score". Canned
            values are returned for very short content or on any AI/parsing
            failure, so this never raises.
        """
        # Too little text to analyze meaningfully; skip the AI round-trip.
        if len(content) < 50:
            return {
                "summary": f"Brief content in {section_title}",
                "key_points": ["Content analysis"],
                "confidence_score": 0.5
            }
        system_prompt = """You are analyzing a section of video content.
Create:
- A brief summary (1-2 sentences) of what happens in this section
- 2-3 key points or takeaways
- Confidence score (0.0-1.0) based on content quality and coherence
Return ONLY valid JSON:
{
"summary": "Brief section summary",
"key_points": ["point1", "point2", "point3"],
"confidence_score": 0.8
}"""
        # Cap content at 1500 chars to keep the prompt within budget.
        prompt = f"""Section: {section_title}
Content: {content[:1500]}
Analyze this section and provide insights."""
        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.4,
                max_tokens=300
            )
            import json
            return json.loads(response)
        except Exception as e:
            # Broad catch is deliberate: analysis is best-effort decoration.
            logger.warning(f"Section analysis failed: {e}")
            return {
                "summary": f"Content analysis for {section_title}",
                "key_points": ["Key insights from section"],
                "confidence_score": 0.6
            }
def _create_youtube_link(self, video_url: str, timestamp_seconds: int) -> str:
"""Create YouTube link with timestamp parameter."""
try:
# Extract video ID from URL
parsed_url = urlparse(video_url)
if 'youtube.com' in parsed_url.netloc:
query_params = parse_qs(parsed_url.query)
video_id = query_params.get('v', [''])[0]
elif 'youtu.be' in parsed_url.netloc:
video_id = parsed_url.path.lstrip('/')
else:
return video_url # Return original if not a YouTube URL
if not video_id:
return video_url
# Create timestamped YouTube link
return f"https://www.youtube.com/watch?v={video_id}&t={timestamp_seconds}s"
except Exception as e:
logger.warning(f"Error creating YouTube link: {e}")
return video_url
def _calculate_quality_score(
self,
sections: List[TimestampedSection],
transcript_data: List[Dict[str, Any]]
) -> float:
"""Calculate overall quality score for section detection."""
if not sections:
return 0.0
# Quality factors
factors = []
# 1. Section count (optimal range: 5-12 sections)
section_count = len(sections)
if 5 <= section_count <= 12:
factors.append(1.0)
elif section_count < 5:
factors.append(0.6)
else:
factors.append(0.8)
# 2. Average section confidence
if sections:
avg_confidence = sum(s.confidence_score for s in sections) / len(sections)
factors.append(avg_confidence)
# 3. Content coverage (how much of transcript is covered)
total_transcript_duration = max(seg.get('start', 0) for seg in transcript_data)
covered_duration = sum(s.end_timestamp - s.start_timestamp for s in sections)
if total_transcript_duration > 0:
coverage_ratio = min(1.0, covered_duration / total_transcript_duration)
factors.append(coverage_ratio)
# 4. Section length distribution (not too short or too long)
section_lengths = [s.end_timestamp - s.start_timestamp for s in sections]
avg_length = sum(section_lengths) / len(section_lengths)
if 60 <= avg_length <= 300: # 1-5 minutes is good
factors.append(1.0)
else:
factors.append(0.7)
# Calculate weighted average
return sum(factors) / len(factors)
@staticmethod
def seconds_to_timestamp(seconds: int) -> str:
"""Convert seconds to HH:MM:SS format."""
hours = seconds // 3600
minutes = (seconds % 3600) // 60
secs = seconds % 60
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
@staticmethod
def timestamp_to_seconds(timestamp_str: str) -> int:
"""Convert HH:MM:SS or [HH:MM:SS] to seconds."""
# Remove brackets if present
timestamp_str = timestamp_str.strip('[]')
try:
parts = timestamp_str.split(':')
if len(parts) == 3:
hours, minutes, seconds = map(int, parts)
return hours * 3600 + minutes * 60 + seconds
elif len(parts) == 2:
minutes, seconds = map(int, parts)
return minutes * 60 + seconds
else:
return int(parts[0])
except ValueError:
logger.warning(f"Invalid timestamp format: {timestamp_str}")
return 0
async def generate_table_of_contents(
self,
sections: List[TimestampedSection]
) -> str:
"""Generate markdown table of contents with timestamp links."""
if not sections:
return "## Table of Contents\n\n*No sections detected*\n"
toc_lines = ["## Table of Contents\n"]
for section in sections:
timestamp_display = self.seconds_to_timestamp(section.start_timestamp)
# Create markdown link with timestamp
toc_line = f"- **[{timestamp_display}]({section.youtube_link})** - {section.title}"
if section.summary:
toc_line += f"\n *{section.summary}*"
toc_lines.append(toc_line)
return '\n'.join(toc_lines) + '\n'
async def generate_section_navigation(
self,
sections: List[TimestampedSection]
) -> Dict[str, Any]:
"""Generate navigation data for frontend use."""
navigation = {
"total_sections": len(sections),
"sections": []
}
for section in sections:
nav_item = {
"index": section.index,
"title": section.title,
"start_time": section.start_timestamp,
"timestamp_display": self.seconds_to_timestamp(section.start_timestamp),
"youtube_link": section.youtube_link,
"summary": section.summary,
"duration_seconds": section.end_timestamp - section.start_timestamp,
"confidence": section.confidence_score
}
navigation["sections"].append(nav_item)
return navigation
def get_processor_stats(self) -> Dict[str, Any]:
"""Get processor configuration and statistics."""
return {
"service_name": "TimestampProcessor",
"min_section_duration": self.min_section_duration,
"max_sections": self.max_sections,
"target_section_length": self.target_section_length,
"overlap_tolerance": self.overlap_tolerance
}