youtube-summarizer/backend/services/transcript_processor.py

import re
from typing import Any, Dict, List

from backend.models.transcript import TranscriptChunk, TranscriptSegment


class TranscriptProcessor:
    """Process and clean transcript data for AI consumption."""

    def __init__(self):
        # Characters to overlap between chunks. Currently unused:
        # chunk_transcript overlaps by one sentence instead.
        self.chunk_overlap = 100

    def clean_transcript(self, raw_transcript: str) -> str:
        """
        Clean and format raw transcript data.

        Args:
            raw_transcript: Raw transcript text

        Returns:
            Cleaned transcript text
        """
        if not raw_transcript:
            return ""

        # Collapse runs of whitespace into single spaces
        cleaned = re.sub(r'\s+', ' ', raw_transcript)

        # Fix common speech-recognition errors. Ambiguous words such as
        # "were"/"we're" and "its"/"it's" are deliberately left alone,
        # since blindly rewriting them corrupts valid text.
        replacements = {
            ' i ': ' I ',
            ' im ': " I'm ",
            ' dont ': " don't ",
            ' wont ': " won't ",
            ' cant ': " can't ",
            ' youre ': " you're ",
            ' theyre ': " they're ",
        }
        for old, new in replacements.items():
            cleaned = cleaned.replace(old, new)

        # Heuristic: a lowercase letter followed by a capitalized word is
        # treated as an unpunctuated sentence boundary
        cleaned = re.sub(r'([a-z])(\s+)([A-Z])', r'\1.\2\3', cleaned)

        # Remove leading/trailing whitespace
        cleaned = cleaned.strip()

        # Collapse repeated punctuation (e.g. "!!" -> "!")
        cleaned = re.sub(r'([.!?])\1+', r'\1', cleaned)

        return cleaned

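    # Example of the full cleaning pipeline (traceable through the rules above):
    #   "okay so  lets begin i dont think youre ready The first step is setup"
    #   -> "okay so lets begin. I don't think you're ready. The first step is setup"
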
    def chunk_transcript(self, transcript: str,
                         max_tokens: int = 3000) -> List[TranscriptChunk]:
        """
        Split transcript into manageable chunks for AI processing.

        Args:
            transcript: Full transcript text
            max_tokens: Maximum tokens per chunk (approximate)

        Returns:
            List of transcript chunks
        """
        if not transcript:
            return []

        # Approximate tokens as words * 1.3, so the word budget is tokens / 1.3
        max_words = int(max_tokens / 1.3)

        sentences = self._split_into_sentences(transcript)
        chunks = []
        current_chunk = []
        current_word_count = 0
        chunk_index = 0

        for sentence in sentences:
            sentence_words = len(sentence.split())

            # If adding this sentence would exceed the limit, close the chunk
            if current_word_count + sentence_words > max_words and current_chunk:
                chunk_text = ' '.join(current_chunk)
                chunks.append(TranscriptChunk(
                    chunk_index=chunk_index,
                    text=chunk_text,
                    token_count=int(len(chunk_text.split()) * 1.3)
                ))
                # Start the next chunk with a one-sentence overlap so context
                # carries across chunk boundaries
                current_chunk = [current_chunk[-1]]
                current_word_count = len(current_chunk[0].split())
                chunk_index += 1

            current_chunk.append(sentence)
            current_word_count += sentence_words

        # Add the final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append(TranscriptChunk(
                chunk_index=chunk_index,
                text=chunk_text,
                token_count=int(len(chunk_text.split()) * 1.3)
            ))

        return chunks

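    # Worked example: with the default max_tokens=3000 the word budget is
    # int(3000 / 1.3) == 2307 words, so a 10,000-word transcript yields
    # roughly five chunks once the one-sentence overlaps are counted.
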
    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences on terminal punctuation."""
        # Simple regex splitting; a library such as NLTK would be more robust
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return [s.strip() for s in sentences if s.strip()]

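    # Example: "Hi there. How are you? Great!"
    #   -> ["Hi there.", "How are you?", "Great!"]
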
    def extract_key_moments(self, segments: List[TranscriptSegment],
                            min_duration: float = 5.0) -> List[Dict[str, Any]]:
        """
        Extract key moments from timestamped segments.

        Args:
            segments: List of transcript segments with timestamps
            min_duration: Minimum duration (in seconds) for a key moment

        Returns:
            List of key moments with timestamps
        """
        if not segments:
            return []

        key_moments = []
        current_moment = {
            "text": "",
            "start": 0.0,
            "end": 0.0
        }

        for segment in segments:
            # If this segment starts a new topic (simple heuristic)
            if self._is_topic_transition(segment.text):
                if current_moment["text"] and \
                        (current_moment["end"] - current_moment["start"]) >= min_duration:
                    key_moments.append(current_moment)
                current_moment = {
                    "text": segment.text,
                    "start": segment.start,
                    "end": segment.end
                }
            else:
                # Continue the current moment
                if not current_moment["text"]:
                    current_moment["start"] = segment.start
                    current_moment["text"] = segment.text
                else:
                    current_moment["text"] += " " + segment.text
                current_moment["end"] = segment.end

        # Add the final moment
        if current_moment["text"] and \
                (current_moment["end"] - current_moment["start"]) >= min_duration:
            key_moments.append(current_moment)

        return key_moments

    def _is_topic_transition(self, text: str) -> bool:
        """Detect if text indicates a topic transition."""
        # Phrases must be lowercase to match the lowercased text
        transition_phrases = [
            "first", "second", "third", "next", "now", "let's",
            "moving on", "another", "finally", "in conclusion",
            "to summarize", "let me", "i want to", "the key"
        ]
        text_lower = text.lower()
        # Match on word boundaries so "now" does not fire inside "know", etc.
        return any(
            re.search(r'\b' + re.escape(phrase) + r'\b', text_lower)
            for phrase in transition_phrases
        )

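    # Example: "Now let's move on to deployment" and "finally, the results"
    # both register as transitions; "I know this works" does not.
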
    def format_with_timestamps(self, segments: List[TranscriptSegment]) -> str:
        """
        Format transcript with timestamps.

        Args:
            segments: List of transcript segments

        Returns:
            Formatted transcript with one "[MM:SS] text" line per segment
        """
        if not segments:
            return ""

        formatted_lines = []
        for segment in segments:
            timestamp = self._format_timestamp(segment.start)
            formatted_lines.append(f"[{timestamp}] {segment.text}")

        return "\n".join(formatted_lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format seconds as MM:SS, or HH:MM:SS when an hour or longer."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        else:
            return f"{minutes:02d}:{secs:02d}"

    def estimate_summary_length(self, transcript: str,
                                compression_ratio: float = 0.2) -> int:
        """
        Estimate an appropriate summary length for a transcript.

        Args:
            transcript: Full transcript text
            compression_ratio: Target compression ratio (0.2 = 20% of original)

        Returns:
            Estimated summary word count
        """
        word_count = len(transcript.split())
        target_words = int(word_count * compression_ratio)

        # Clamp to reasonable bounds
        min_words = 100
        max_words = 1000
        return max(min_words, min(target_words, max_words))

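    # Worked examples at the default 0.2 ratio: 2,000 words -> 400;
    # 50 words -> 10, clamped up to 100; 20,000 words -> 4,000, clamped
    # down to 1,000.
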
    def detect_language(self, text: str) -> str:
        """
        Simple language detection (mock implementation).

        Args:
            text: Text to analyze

        Returns:
            Language code (e.g. 'en', 'es', 'fr')
        """
        # Mock implementation: always returns English.
        # In production, use langdetect or a similar library.
        return "en"

    def extract_topics(self, transcript: str, max_topics: int = 5) -> List[str]:
        """
        Extract main topics from transcript (mock implementation).

        Args:
            transcript: Full transcript text
            max_topics: Maximum number of topics to extract

        Returns:
            List of topic strings
        """
        topics = []

        # Mock implementation: treat multi-word capitalized phrases as topics
        pattern = r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*'
        matches = re.findall(pattern, transcript)

        # Keep phrases of two or more words, deduplicated in order of appearance
        seen = set()
        for match in matches:
            if len(match.split()) >= 2 and match not in seen:
                topics.append(match)
                seen.add(match)
            if len(topics) >= max_topics:
                break

        # If not enough topics were found, pad with generic placeholders
        if len(topics) < 3:
            topics.extend(["Technology", "Innovation", "Development"][:3 - len(topics)])

        return topics

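
# Minimal usage sketch, assuming TranscriptSegment is constructed as
# TranscriptSegment(text=..., start=..., end=...); check
# backend/models/transcript.py for the actual field names and signature.
if __name__ == "__main__":
    processor = TranscriptProcessor()

    raw = "hey everyone  welcome back i dont have much time today Lets get started"
    print(processor.clean_transcript(raw))

    # Hypothetical segments for illustration
    segments = [
        TranscriptSegment(text="Welcome to the channel", start=0.0, end=3.0),
        TranscriptSegment(text="First, let's look at the basics", start=3.0, end=9.5),
        TranscriptSegment(text="Moving on to the demo", start=9.5, end=15.0),
    ]
    print(processor.format_with_timestamps(segments))
    print(processor.extract_key_moments(segments, min_duration=5.0))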