"""Timestamp Processor for semantic section detection and navigation.

This service processes video transcripts to identify meaningful sections,
create timestamped navigation, and generate clickable YouTube links.
"""

import asyncio
import json
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import parse_qs, urlparse

from ..services.deepseek_service import DeepSeekService
from ..core.exceptions import ServiceError

logger = logging.getLogger(__name__)


@dataclass
class TimestampedSection:
    """Represents a timestamped section of content."""

    index: int  # 1-based position of the section within the video
    title: str
    start_timestamp: int  # seconds
    end_timestamp: int  # seconds
    youtube_link: str  # URL carrying a &t=<seconds>s jump parameter
    content: str  # transcript text covered by this section
    summary: str
    key_points: List[str]
    confidence_score: float  # 0.0-1.0 estimate from content analysis


@dataclass
class SectionDetectionResult:
    """Result of section detection process."""

    sections: List[TimestampedSection]
    total_sections: int
    processing_time_seconds: float
    quality_score: float  # 0.0-1.0 aggregate, see _calculate_quality_score
    created_at: datetime


class TimestampProcessor:
    """Service for processing timestamps and detecting semantic sections."""

    def __init__(self, ai_service: Optional[DeepSeekService] = None):
        """Initialize timestamp processor.

        Args:
            ai_service: AI service for content analysis; a default
                DeepSeekService is created when omitted.
        """
        self.ai_service = ai_service or DeepSeekService()

        # Section detection configuration (tunable heuristics).
        self.min_section_duration = 30  # seconds
        self.max_sections = 15
        self.target_section_length = 180  # 3 minutes
        self.overlap_tolerance = 5  # seconds

        logger.info("TimestampProcessor initialized")

    async def detect_semantic_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        video_url: str,
        video_title: str = ""
    ) -> SectionDetectionResult:
        """Detect semantic sections from transcript data.

        Args:
            transcript_data: List of transcript segments with timestamps
                (each segment is expected to carry 'start' and 'text' keys).
            video_url: YouTube video URL for link generation.
            video_title: Video title for context.

        Returns:
            Section detection result.

        Raises:
            ServiceError: If the transcript is too short or any stage of
                the detection pipeline fails.
        """
        start_time = datetime.now()

        if not transcript_data or len(transcript_data) < 2:
            raise ServiceError("Insufficient transcript data for section detection")

        try:
            # Prepare transcript text with timestamps
            full_transcript = self._prepare_transcript_text(transcript_data)

            # Detect section boundaries using AI
            section_boundaries = await self._detect_section_boundaries(
                full_transcript, video_title
            )

            # Create timestamped sections
            sections = await self._create_timestamped_sections(
                transcript_data, section_boundaries, video_url
            )

            # Calculate quality score
            quality_score = self._calculate_quality_score(sections, transcript_data)

            processing_time = (datetime.now() - start_time).total_seconds()

            return SectionDetectionResult(
                sections=sections,
                total_sections=len(sections),
                processing_time_seconds=processing_time,
                quality_score=quality_score,
                created_at=datetime.now()
            )
        except ServiceError:
            # Already a domain error — don't double-wrap it.
            raise
        except Exception as e:
            logger.error(f"Error detecting semantic sections: {e}")
            # Chain the original cause for easier debugging upstream.
            raise ServiceError(f"Section detection failed: {str(e)}") from e

    def _prepare_transcript_text(self, transcript_data: List[Dict[str, Any]]) -> str:
        """Prepare transcript text with timestamp markers.

        Produces one "[HH:MM:SS] text" line per non-empty segment.
        """
        transcript_lines = []
        for segment in transcript_data:
            timestamp = segment.get('start', 0)
            text = segment.get('text', '').strip()
            if text:
                time_marker = self.seconds_to_timestamp(timestamp)
                transcript_lines.append(f"[{time_marker}] {text}")
        return '\n'.join(transcript_lines)

    async def _detect_section_boundaries(
        self,
        transcript_text: str,
        video_title: str
    ) -> List[Dict[str, Any]]:
        """Use AI to detect natural section boundaries.

        Falls back to heuristic sectioning when the AI response is not
        valid JSON.
        """
        system_prompt = """You are an expert at identifying natural section breaks in video content. Analyze the transcript and identify 5-10 meaningful sections that represent distinct topics, themes, or narrative segments.

Each section should:
- Have a clear, descriptive title
- Start and end at natural transition points
- Be long enough to contain meaningful content (at least 30 seconds)
- Represent a coherent topic or theme

Return ONLY valid JSON with this structure:
{
    "sections": [
        {
            "title": "Section Title",
            "start_time_marker": "[00:01:23]",
            "estimated_duration": "2-3 minutes",
            "key_topic": "Main topic of this section"
        }
    ]
}"""

        # Limit transcript length for AI processing
        limited_transcript = transcript_text[:6000]

        prompt = f"""Video Title: {video_title}

Transcript with timestamps:
{limited_transcript}

Identify natural section breaks and create meaningful section titles."""

        response = await self.ai_service.generate_response(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,
            max_tokens=800
        )

        try:
            result = json.loads(response)
            return result.get("sections", [])
        except json.JSONDecodeError:
            logger.warning("AI response was not valid JSON, using fallback sections")
            return self._create_fallback_sections(transcript_text)

    def _create_fallback_sections(self, transcript_text: str) -> List[Dict[str, Any]]:
        """Create fallback sections if AI detection fails.

        Samples every 20th transcript line and turns each timestamp marker
        found into a generically-titled section (capped at 8).
        """
        lines = transcript_text.split('\n')
        sections = []

        # Create sections every 3-4 minutes based on timestamps
        current_section = 1
        for line in lines[::20]:  # Sample every 20th line
            time_match = re.search(r'\[(\d{2}:\d{2}:\d{2})\]', line)
            if time_match:
                sections.append({
                    "title": f"Section {current_section}",
                    "start_time_marker": f"[{time_match.group(1)}]",
                    "estimated_duration": "3-4 minutes",
                    "key_topic": "Content analysis"
                })
                current_section += 1
                if len(sections) >= 8:  # Limit fallback sections
                    break

        return sections

    async def _create_timestamped_sections(
        self,
        transcript_data: List[Dict[str, Any]],
        section_boundaries: List[Dict[str, Any]],
        video_url: str
    ) -> List[TimestampedSection]:
        """Create detailed timestamped sections.

        A boundary that fails to process is logged and skipped rather than
        aborting the whole batch.
        """
        sections = []

        for i, boundary in enumerate(section_boundaries):
            try:
                # Extract start time from time marker
                start_time_marker = boundary.get("start_time_marker", "[00:00:00]")
                start_seconds = self.timestamp_to_seconds(start_time_marker)

                # Determine end time (next section start or video end)
                if i + 1 < len(section_boundaries):
                    next_marker = section_boundaries[i + 1].get("start_time_marker", "[99:99:99]")
                    end_seconds = self.timestamp_to_seconds(next_marker)
                else:
                    # Last section goes to end of transcript; default= guards
                    # against an empty transcript when called directly.
                    end_seconds = max(
                        (seg.get('start', 0) for seg in transcript_data),
                        default=start_seconds
                    ) + 30

                # Extract content for this section
                section_content = self._extract_section_content(
                    transcript_data, start_seconds, end_seconds
                )

                # Generate section summary and key points
                section_analysis = await self._analyze_section_content(
                    section_content, boundary.get("title", f"Section {i+1}")
                )

                # Create YouTube link with timestamp
                youtube_link = self._create_youtube_link(video_url, start_seconds)

                section = TimestampedSection(
                    index=i + 1,
                    title=boundary.get("title", f"Section {i+1}"),
                    start_timestamp=start_seconds,
                    end_timestamp=end_seconds,
                    youtube_link=youtube_link,
                    content=section_content,
                    summary=section_analysis.get("summary", ""),
                    key_points=section_analysis.get("key_points", []),
                    confidence_score=section_analysis.get("confidence_score", 0.7)
                )
                sections.append(section)
            except Exception as e:
                logger.warning(f"Error creating section {i+1}: {e}")
                continue

        return sections

    def _extract_section_content(
        self,
        transcript_data: List[Dict[str, Any]],
        start_seconds: int,
        end_seconds: int
    ) -> str:
        """Extract content for a specific time range.

        Boundaries are inclusive on both ends, so a segment starting
        exactly at a section boundary appears in both adjacent sections.
        """
        content_parts = []
        for segment in transcript_data:
            segment_start = segment.get('start', 0)
            segment_text = segment.get('text', '').strip()
            # Include segments that overlap with our section
            if start_seconds <= segment_start <= end_seconds and segment_text:
                content_parts.append(segment_text)
        return ' '.join(content_parts) if content_parts else "Content not available"

    async def _analyze_section_content(
        self,
        content: str,
        section_title: str
    ) -> Dict[str, Any]:
        """Analyze section content to generate summary and key points.

        Returns a dict with 'summary', 'key_points' and 'confidence_score';
        short content and AI failures both yield canned defaults.
        """
        if len(content) < 50:
            return {
                "summary": f"Brief content in {section_title}",
                "key_points": ["Content analysis"],
                "confidence_score": 0.5
            }

        system_prompt = """You are analyzing a section of video content. Create:
- A brief summary (1-2 sentences) of what happens in this section
- 2-3 key points or takeaways
- Confidence score (0.0-1.0) based on content quality and coherence

Return ONLY valid JSON:
{
    "summary": "Brief section summary",
    "key_points": ["point1", "point2", "point3"],
    "confidence_score": 0.8
}"""

        prompt = f"""Section: {section_title}

Content: {content[:1500]}

Analyze this section and provide insights."""

        try:
            response = await self.ai_service.generate_response(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=0.4,
                max_tokens=300
            )
            return json.loads(response)
        except Exception as e:
            logger.warning(f"Section analysis failed: {e}")
            return {
                "summary": f"Content analysis for {section_title}",
                "key_points": ["Key insights from section"],
                "confidence_score": 0.6
            }

    def _create_youtube_link(self, video_url: str, timestamp_seconds: int) -> str:
        """Create YouTube link with timestamp parameter.

        Non-YouTube URLs, or YouTube URLs without a recoverable video id,
        are returned unchanged.
        """
        try:
            # Extract video ID from URL
            parsed_url = urlparse(video_url)
            if 'youtube.com' in parsed_url.netloc:
                query_params = parse_qs(parsed_url.query)
                video_id = query_params.get('v', [''])[0]
            elif 'youtu.be' in parsed_url.netloc:
                video_id = parsed_url.path.lstrip('/')
            else:
                return video_url  # Return original if not a YouTube URL

            if not video_id:
                return video_url

            # Create timestamped YouTube link
            return f"https://www.youtube.com/watch?v={video_id}&t={timestamp_seconds}s"
        except Exception as e:
            logger.warning(f"Error creating YouTube link: {e}")
            return video_url

    def _calculate_quality_score(
        self,
        sections: List[TimestampedSection],
        transcript_data: List[Dict[str, Any]]
    ) -> float:
        """Calculate overall quality score for section detection.

        Averages up to four heuristic factors: section count, mean
        confidence, transcript coverage, and average section length.
        """
        if not sections:
            return 0.0

        # Quality factors
        factors = []

        # 1. Section count (optimal range: 5-12 sections)
        section_count = len(sections)
        if 5 <= section_count <= 12:
            factors.append(1.0)
        elif section_count < 5:
            factors.append(0.6)
        else:
            factors.append(0.8)

        # 2. Average section confidence
        if sections:
            avg_confidence = sum(s.confidence_score for s in sections) / len(sections)
            factors.append(avg_confidence)

        # 3. Content coverage (how much of transcript is covered);
        # default= guards against an empty transcript.
        total_transcript_duration = max(
            (seg.get('start', 0) for seg in transcript_data), default=0
        )
        covered_duration = sum(s.end_timestamp - s.start_timestamp for s in sections)
        if total_transcript_duration > 0:
            coverage_ratio = min(1.0, covered_duration / total_transcript_duration)
            factors.append(coverage_ratio)

        # 4. Section length distribution (not too short or too long)
        section_lengths = [s.end_timestamp - s.start_timestamp for s in sections]
        avg_length = sum(section_lengths) / len(section_lengths)
        if 60 <= avg_length <= 300:  # 1-5 minutes is good
            factors.append(1.0)
        else:
            factors.append(0.7)

        # Calculate weighted average
        return sum(factors) / len(factors)

    @staticmethod
    def seconds_to_timestamp(seconds: float) -> str:
        """Convert seconds to HH:MM:SS format.

        Accepts floats (transcript 'start' values are often fractional) by
        truncating to whole seconds; the previous int-only formatting
        raised ValueError on float input.
        """
        seconds = int(seconds)
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        secs = seconds % 60
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"

    @staticmethod
    def timestamp_to_seconds(timestamp_str: str) -> int:
        """Convert HH:MM:SS or [HH:MM:SS] to seconds.

        Also accepts MM:SS and bare seconds; malformed input yields 0.
        """
        # Remove brackets if present
        timestamp_str = timestamp_str.strip('[]')
        try:
            parts = timestamp_str.split(':')
            if len(parts) == 3:
                hours, minutes, seconds = map(int, parts)
                return hours * 3600 + minutes * 60 + seconds
            elif len(parts) == 2:
                minutes, seconds = map(int, parts)
                return minutes * 60 + seconds
            else:
                return int(parts[0])
        except ValueError:
            logger.warning(f"Invalid timestamp format: {timestamp_str}")
            return 0

    async def generate_table_of_contents(
        self,
        sections: List[TimestampedSection]
    ) -> str:
        """Generate markdown table of contents with timestamp links."""
        if not sections:
            return "## Table of Contents\n\n*No sections detected*\n"

        toc_lines = ["## Table of Contents\n"]
        for section in sections:
            timestamp_display = self.seconds_to_timestamp(section.start_timestamp)
            # Create markdown link with timestamp
            toc_line = f"- **[{timestamp_display}]({section.youtube_link})** - {section.title}"
            if section.summary:
                toc_line += f"\n  *{section.summary}*"
            toc_lines.append(toc_line)

        return '\n'.join(toc_lines) + '\n'

    async def generate_section_navigation(
        self,
        sections: List[TimestampedSection]
    ) -> Dict[str, Any]:
        """Generate navigation data for frontend use."""
        navigation = {
            "total_sections": len(sections),
            "sections": []
        }

        for section in sections:
            nav_item = {
                "index": section.index,
                "title": section.title,
                "start_time": section.start_timestamp,
                "timestamp_display": self.seconds_to_timestamp(section.start_timestamp),
                "youtube_link": section.youtube_link,
                "summary": section.summary,
                "duration_seconds": section.end_timestamp - section.start_timestamp,
                "confidence": section.confidence_score
            }
            navigation["sections"].append(nav_item)

        return navigation

    def get_processor_stats(self) -> Dict[str, Any]:
        """Get processor configuration and statistics."""
        return {
            "service_name": "TimestampProcessor",
            "min_section_duration": self.min_section_duration,
            "max_sections": self.max_sections,
            "target_section_length": self.target_section_length,
            "overlap_tolerance": self.overlap_tolerance
        }