495 lines
18 KiB
Python
495 lines
18 KiB
Python
"""Timestamp Processor for semantic section detection and navigation.
|
|
|
|
This service processes video transcripts to identify meaningful sections,
|
|
create timestamped navigation, and generate clickable YouTube links.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urlparse, parse_qs
|
|
|
|
from ..services.deepseek_service import DeepSeekService
|
|
from ..core.exceptions import ServiceError
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class TimestampedSection:
    """Represents a timestamped section of content."""

    index: int  # 1-based position of the section within the video
    title: str  # human-readable section title (AI-suggested or fallback "Section N")
    start_timestamp: int  # seconds
    end_timestamp: int  # seconds
    youtube_link: str  # deep link into the video at start_timestamp (t= parameter)
    content: str  # transcript text whose segments fall inside this section
    summary: str  # 1-2 sentence summary produced by section analysis
    key_points: List[str]  # takeaways produced by section analysis
    confidence_score: float  # 0.0-1.0 coherence/quality estimate from analysis
|
|
|
|
|
|
@dataclass
class SectionDetectionResult:
    """Result of section detection process."""

    sections: List[TimestampedSection]  # sections in the order boundaries were produced
    total_sections: int  # convenience count; equals len(sections)
    processing_time_seconds: float  # wall-clock duration of the detection run
    quality_score: float  # aggregate 0.0-1.0 score (see _calculate_quality_score)
    created_at: datetime  # when the result was produced (naive local time)
|
|
|
|
|
|
class TimestampProcessor:
    """Service for processing timestamps and detecting semantic sections."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize timestamp processor.

        Args:
            ai_service: AI service for content analysis; a default
                DeepSeekService is constructed when none is supplied.
        """
        # Use the injected client when given, otherwise build a default one.
        self.ai_service = ai_service if ai_service else DeepSeekService()

        # Tunables governing how sections are detected and sized.
        self.min_section_duration = 30  # seconds; floor for a useful section
        self.max_sections = 15  # hard cap on sections per video
        self.target_section_length = 180  # preferred section span (3 minutes)
        self.overlap_tolerance = 5  # seconds of acceptable boundary overlap

        logger.info("TimestampProcessor initialized")
|
|
|
|
async def detect_semantic_sections(
|
|
self,
|
|
transcript_data: List[Dict[str, Any]],
|
|
video_url: str,
|
|
video_title: str = ""
|
|
) -> SectionDetectionResult:
|
|
"""Detect semantic sections from transcript data.
|
|
|
|
Args:
|
|
transcript_data: List of transcript segments with timestamps
|
|
video_url: YouTube video URL for link generation
|
|
video_title: Video title for context
|
|
|
|
Returns:
|
|
Section detection result
|
|
"""
|
|
start_time = datetime.now()
|
|
|
|
if not transcript_data or len(transcript_data) < 2:
|
|
raise ServiceError("Insufficient transcript data for section detection")
|
|
|
|
try:
|
|
# Prepare transcript text with timestamps
|
|
full_transcript = self._prepare_transcript_text(transcript_data)
|
|
|
|
# Detect section boundaries using AI
|
|
section_boundaries = await self._detect_section_boundaries(
|
|
full_transcript, video_title
|
|
)
|
|
|
|
# Create timestamped sections
|
|
sections = await self._create_timestamped_sections(
|
|
transcript_data, section_boundaries, video_url
|
|
)
|
|
|
|
# Calculate quality score
|
|
quality_score = self._calculate_quality_score(sections, transcript_data)
|
|
|
|
processing_time = (datetime.now() - start_time).total_seconds()
|
|
|
|
return SectionDetectionResult(
|
|
sections=sections,
|
|
total_sections=len(sections),
|
|
processing_time_seconds=processing_time,
|
|
quality_score=quality_score,
|
|
created_at=datetime.now()
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error detecting semantic sections: {e}")
|
|
raise ServiceError(f"Section detection failed: {str(e)}")
|
|
|
|
def _prepare_transcript_text(self, transcript_data: List[Dict[str, Any]]) -> str:
|
|
"""Prepare transcript text with timestamp markers."""
|
|
transcript_lines = []
|
|
|
|
for segment in transcript_data:
|
|
timestamp = segment.get('start', 0)
|
|
text = segment.get('text', '').strip()
|
|
|
|
if text:
|
|
time_marker = self.seconds_to_timestamp(timestamp)
|
|
transcript_lines.append(f"[{time_marker}] {text}")
|
|
|
|
return '\n'.join(transcript_lines)
|
|
|
|
    async def _detect_section_boundaries(
        self,
        transcript_text: str,
        video_title: str
    ) -> List[Dict[str, Any]]:
        """Use AI to detect natural section boundaries.

        Args:
            transcript_text: Timestamp-annotated transcript (one
                "[HH:MM:SS] text" line per segment).
            video_title: Video title included in the prompt for context.

        Returns:
            List of boundary dicts with "title", "start_time_marker",
            "estimated_duration" and "key_topic" keys; heuristic fallback
            sections when the AI response is not valid JSON.
        """

        system_prompt = """You are an expert at identifying natural section breaks in video content.

Analyze the transcript and identify 5-10 meaningful sections that represent distinct topics, themes, or narrative segments.

Each section should:
- Have a clear, descriptive title
- Start and end at natural transition points
- Be long enough to contain meaningful content (at least 30 seconds)
- Represent a coherent topic or theme

Return ONLY valid JSON with this structure:
{
    "sections": [
        {
            "title": "Section Title",
            "start_time_marker": "[00:01:23]",
            "estimated_duration": "2-3 minutes",
            "key_topic": "Main topic of this section"
        }
    ]
}"""

        # Limit transcript length for AI processing
        limited_transcript = transcript_text[:6000]

        prompt = f"""Video Title: {video_title}

Transcript with timestamps:
{limited_transcript}

Identify natural section breaks and create meaningful section titles."""

        # Low temperature favours deterministic, well-formed JSON output.
        response = await self.ai_service.generate_response(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=800
        )

        try:
            import json
            result = json.loads(response)
            return result.get("sections", [])
        except json.JSONDecodeError:
            # Degrade gracefully: time-based fallback sections beat failing.
            logger.warning("AI response was not valid JSON, using fallback sections")
            return self._create_fallback_sections(transcript_text)
|
|
|
|
def _create_fallback_sections(self, transcript_text: str) -> List[Dict[str, Any]]:
|
|
"""Create fallback sections if AI detection fails."""
|
|
lines = transcript_text.split('\n')
|
|
sections = []
|
|
|
|
# Create sections every 3-4 minutes based on timestamps
|
|
current_section = 1
|
|
for i, line in enumerate(lines[::20]): # Sample every 20th line
|
|
time_match = re.search(r'\[(\d{2}:\d{2}:\d{2})\]', line)
|
|
if time_match:
|
|
sections.append({
|
|
"title": f"Section {current_section}",
|
|
"start_time_marker": f"[{time_match.group(1)}]",
|
|
"estimated_duration": "3-4 minutes",
|
|
"key_topic": "Content analysis"
|
|
})
|
|
current_section += 1
|
|
|
|
if len(sections) >= 8: # Limit fallback sections
|
|
break
|
|
|
|
return sections
|
|
|
|
    async def _create_timestamped_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        section_boundaries: List[Dict[str, Any]],
        video_url: str
    ) -> List[TimestampedSection]:
        """Create detailed timestamped sections.

        Args:
            transcript_data: Raw transcript segments ('start'/'text' dicts).
            section_boundaries: Boundary dicts from _detect_section_boundaries.
            video_url: Source video URL used to build timestamped links.

        Returns:
            Fully populated TimestampedSection objects; a boundary that fails
            to process is logged and skipped so one bad entry cannot sink
            the whole batch.
        """

        sections = []

        for i, boundary in enumerate(section_boundaries):
            try:
                # Extract start time from time marker
                start_time_marker = boundary.get("start_time_marker", "[00:00:00]")
                start_seconds = self.timestamp_to_seconds(start_time_marker)

                # Determine end time (next section start or video end)
                if i + 1 < len(section_boundaries):
                    # "[99:99:99]" acts as an effectively-infinite sentinel
                    # end marker when the next boundary has no start marker.
                    next_marker = section_boundaries[i + 1].get("start_time_marker", "[99:99:99]")
                    end_seconds = self.timestamp_to_seconds(next_marker)
                else:
                    # Last section goes to end of transcript
                    # (+30s pads past the final segment's start time).
                    end_seconds = max(seg.get('start', 0) for seg in transcript_data) + 30

                # Extract content for this section
                section_content = self._extract_section_content(
                    transcript_data, start_seconds, end_seconds
                )

                # Generate section summary and key points
                section_analysis = await self._analyze_section_content(
                    section_content, boundary.get("title", f"Section {i+1}")
                )

                # Create YouTube link with timestamp
                youtube_link = self._create_youtube_link(video_url, start_seconds)

                section = TimestampedSection(
                    index=i + 1,
                    title=boundary.get("title", f"Section {i+1}"),
                    start_timestamp=start_seconds,
                    end_timestamp=end_seconds,
                    youtube_link=youtube_link,
                    content=section_content,
                    summary=section_analysis.get("summary", ""),
                    key_points=section_analysis.get("key_points", []),
                    confidence_score=section_analysis.get("confidence_score", 0.7)
                )

                sections.append(section)

            except Exception as e:
                # Skip just this boundary; the remaining sections still build.
                logger.warning(f"Error creating section {i+1}: {e}")
                continue

        return sections
|
|
|
|
def _extract_section_content(
|
|
self,
|
|
transcript_data: List[Dict[str, Any]],
|
|
start_seconds: int,
|
|
end_seconds: int
|
|
) -> str:
|
|
"""Extract content for a specific time range."""
|
|
|
|
content_parts = []
|
|
|
|
for segment in transcript_data:
|
|
segment_start = segment.get('start', 0)
|
|
segment_text = segment.get('text', '').strip()
|
|
|
|
# Include segments that overlap with our section
|
|
if start_seconds <= segment_start <= end_seconds and segment_text:
|
|
content_parts.append(segment_text)
|
|
|
|
return ' '.join(content_parts) if content_parts else "Content not available"
|
|
|
|
    async def _analyze_section_content(
        self,
        content: str,
        section_title: str
    ) -> Dict[str, Any]:
        """Analyze section content to generate summary and key points.

        Args:
            content: Transcript text belonging to the section.
            section_title: Title used in the prompt and fallback copy.

        Returns:
            Dict with "summary" (str), "key_points" (list of str) and
            "confidence_score" (float); stock fallback values are returned
            for very short content or when the AI call/parse fails.
        """

        # Too little text to analyze meaningfully - skip the AI round-trip.
        if len(content) < 50:
            return {
                "summary": f"Brief content in {section_title}",
                "key_points": ["Content analysis"],
                "confidence_score": 0.5
            }

        system_prompt = """You are analyzing a section of video content.

Create:
- A brief summary (1-2 sentences) of what happens in this section
- 2-3 key points or takeaways
- Confidence score (0.0-1.0) based on content quality and coherence

Return ONLY valid JSON:
{
    "summary": "Brief section summary",
    "key_points": ["point1", "point2", "point3"],
    "confidence_score": 0.8
}"""

        prompt = f"""Section: {section_title}

Content: {content[:1500]}

Analyze this section and provide insights."""

        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.4,
                max_tokens=300
            )

            import json
            return json.loads(response)

        except Exception as e:
            # Any failure (network, model, JSON parse) degrades to a stock
            # result instead of propagating; the caller keeps building.
            logger.warning(f"Section analysis failed: {e}")
            return {
                "summary": f"Content analysis for {section_title}",
                "key_points": ["Key insights from section"],
                "confidence_score": 0.6
            }
|
|
|
|
def _create_youtube_link(self, video_url: str, timestamp_seconds: int) -> str:
|
|
"""Create YouTube link with timestamp parameter."""
|
|
|
|
try:
|
|
# Extract video ID from URL
|
|
parsed_url = urlparse(video_url)
|
|
|
|
if 'youtube.com' in parsed_url.netloc:
|
|
query_params = parse_qs(parsed_url.query)
|
|
video_id = query_params.get('v', [''])[0]
|
|
elif 'youtu.be' in parsed_url.netloc:
|
|
video_id = parsed_url.path.lstrip('/')
|
|
else:
|
|
return video_url # Return original if not a YouTube URL
|
|
|
|
if not video_id:
|
|
return video_url
|
|
|
|
# Create timestamped YouTube link
|
|
return f"https://www.youtube.com/watch?v={video_id}&t={timestamp_seconds}s"
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Error creating YouTube link: {e}")
|
|
return video_url
|
|
|
|
def _calculate_quality_score(
|
|
self,
|
|
sections: List[TimestampedSection],
|
|
transcript_data: List[Dict[str, Any]]
|
|
) -> float:
|
|
"""Calculate overall quality score for section detection."""
|
|
|
|
if not sections:
|
|
return 0.0
|
|
|
|
# Quality factors
|
|
factors = []
|
|
|
|
# 1. Section count (optimal range: 5-12 sections)
|
|
section_count = len(sections)
|
|
if 5 <= section_count <= 12:
|
|
factors.append(1.0)
|
|
elif section_count < 5:
|
|
factors.append(0.6)
|
|
else:
|
|
factors.append(0.8)
|
|
|
|
# 2. Average section confidence
|
|
if sections:
|
|
avg_confidence = sum(s.confidence_score for s in sections) / len(sections)
|
|
factors.append(avg_confidence)
|
|
|
|
# 3. Content coverage (how much of transcript is covered)
|
|
total_transcript_duration = max(seg.get('start', 0) for seg in transcript_data)
|
|
covered_duration = sum(s.end_timestamp - s.start_timestamp for s in sections)
|
|
|
|
if total_transcript_duration > 0:
|
|
coverage_ratio = min(1.0, covered_duration / total_transcript_duration)
|
|
factors.append(coverage_ratio)
|
|
|
|
# 4. Section length distribution (not too short or too long)
|
|
section_lengths = [s.end_timestamp - s.start_timestamp for s in sections]
|
|
avg_length = sum(section_lengths) / len(section_lengths)
|
|
|
|
if 60 <= avg_length <= 300: # 1-5 minutes is good
|
|
factors.append(1.0)
|
|
else:
|
|
factors.append(0.7)
|
|
|
|
# Calculate weighted average
|
|
return sum(factors) / len(factors)
|
|
|
|
@staticmethod
|
|
def seconds_to_timestamp(seconds: int) -> str:
|
|
"""Convert seconds to HH:MM:SS format."""
|
|
hours = seconds // 3600
|
|
minutes = (seconds % 3600) // 60
|
|
secs = seconds % 60
|
|
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
|
|
|
|
@staticmethod
|
|
def timestamp_to_seconds(timestamp_str: str) -> int:
|
|
"""Convert HH:MM:SS or [HH:MM:SS] to seconds."""
|
|
# Remove brackets if present
|
|
timestamp_str = timestamp_str.strip('[]')
|
|
|
|
try:
|
|
parts = timestamp_str.split(':')
|
|
if len(parts) == 3:
|
|
hours, minutes, seconds = map(int, parts)
|
|
return hours * 3600 + minutes * 60 + seconds
|
|
elif len(parts) == 2:
|
|
minutes, seconds = map(int, parts)
|
|
return minutes * 60 + seconds
|
|
else:
|
|
return int(parts[0])
|
|
except ValueError:
|
|
logger.warning(f"Invalid timestamp format: {timestamp_str}")
|
|
return 0
|
|
|
|
async def generate_table_of_contents(
|
|
self,
|
|
sections: List[TimestampedSection]
|
|
) -> str:
|
|
"""Generate markdown table of contents with timestamp links."""
|
|
|
|
if not sections:
|
|
return "## Table of Contents\n\n*No sections detected*\n"
|
|
|
|
toc_lines = ["## Table of Contents\n"]
|
|
|
|
for section in sections:
|
|
timestamp_display = self.seconds_to_timestamp(section.start_timestamp)
|
|
|
|
# Create markdown link with timestamp
|
|
toc_line = f"- **[{timestamp_display}]({section.youtube_link})** - {section.title}"
|
|
|
|
if section.summary:
|
|
toc_line += f"\n *{section.summary}*"
|
|
|
|
toc_lines.append(toc_line)
|
|
|
|
return '\n'.join(toc_lines) + '\n'
|
|
|
|
async def generate_section_navigation(
|
|
self,
|
|
sections: List[TimestampedSection]
|
|
) -> Dict[str, Any]:
|
|
"""Generate navigation data for frontend use."""
|
|
|
|
navigation = {
|
|
"total_sections": len(sections),
|
|
"sections": []
|
|
}
|
|
|
|
for section in sections:
|
|
nav_item = {
|
|
"index": section.index,
|
|
"title": section.title,
|
|
"start_time": section.start_timestamp,
|
|
"timestamp_display": self.seconds_to_timestamp(section.start_timestamp),
|
|
"youtube_link": section.youtube_link,
|
|
"summary": section.summary,
|
|
"duration_seconds": section.end_timestamp - section.start_timestamp,
|
|
"confidence": section.confidence_score
|
|
}
|
|
navigation["sections"].append(nav_item)
|
|
|
|
return navigation
|
|
|
|
def get_processor_stats(self) -> Dict[str, Any]:
|
|
"""Get processor configuration and statistics."""
|
|
return {
|
|
"service_name": "TimestampProcessor",
|
|
"min_section_duration": self.min_section_duration,
|
|
"max_sections": self.max_sections,
|
|
"target_section_length": self.target_section_length,
|
|
"overlap_tolerance": self.overlap_tolerance
|
|
} |