# trax/tests/test_domain_enhancement.py
"""Test domain-specific enhancement pipeline.
Tests the specialized enhancement workflows for different domains,
including technical terminology enhancement, medical vocabulary optimization,
academic citation handling, and domain-specific quality metrics.
"""
import pytest
from unittest.mock import Mock, AsyncMock
from src.services.domain_enhancement import (
DomainEnhancementPipeline,
DomainEnhancementConfig,
DomainType,
EnhancementResult
)
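# For orientation, a rough sketch of the surface these tests exercise, inferred
# from the assertions below (the authoritative definitions live in
# src.services.domain_enhancement and may differ in detail):
#
#   DomainType                      - enum: GENERAL, TECHNICAL, MEDICAL, ACADEMIC, LEGAL
#   DomainEnhancementConfig(domain) - per-domain toggles and quality thresholds
#   DomainEnhancementPipeline(enhancement_service=...)
#       .enhance_content(text, domain=None) -> EnhancementResult
#   EnhancementResult               - original/enhanced text, domain, confidence_score,
#                                     improvements, terminology_corrections,
#                                     quality_metrics, processing_time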
class TestDomainEnhancementPipeline:
"""Test the domain-specific enhancement pipeline."""
@pytest.fixture
def mock_enhancement_service(self):
"""Create a mock enhancement service."""
service = Mock()
service.enhance_transcript = AsyncMock()
return service
@pytest.fixture
def pipeline(self, mock_enhancement_service):
"""Create a DomainEnhancementPipeline instance."""
return DomainEnhancementPipeline(enhancement_service=mock_enhancement_service)
@pytest.fixture
def sample_texts(self):
"""Sample texts for different domains."""
return {
DomainType.TECHNICAL: "The algorithm implements a singleton pattern for thread safety in the software system",
DomainType.MEDICAL: "Patient presents with symptoms of hypertension and requires treatment for myocardial infarction",
DomainType.ACADEMIC: "Research study analysis shows hypothesis testing methodology with literature review",
DomainType.LEGAL: "Contract agreement compliance with law regulation and legal jurisdiction",
DomainType.GENERAL: "This is a general conversation about various topics and interests"
}
def test_initialization(self, pipeline):
"""Test pipeline initialization."""
assert pipeline.enhancement_service is not None
assert pipeline.domain_detector is not None
assert len(pipeline.strategies) == 5 # All domain types
assert len(pipeline.quality_metrics) == 5 # All domain types
def test_domain_type_enum(self):
"""Test domain type enumeration."""
assert DomainType.GENERAL.value == "general"
assert DomainType.TECHNICAL.value == "technical"
assert DomainType.MEDICAL.value == "medical"
assert DomainType.ACADEMIC.value == "academic"
assert DomainType.LEGAL.value == "legal"
def test_domain_enhancement_config(self):
"""Test domain enhancement configuration."""
config = DomainEnhancementConfig(domain=DomainType.TECHNICAL)
assert config.domain == DomainType.TECHNICAL
assert config.enable_terminology_enhancement is True
assert config.enable_citation_handling is True
assert config.enable_formatting_optimization is True
assert config.quality_threshold == 0.8
assert config.max_enhancement_iterations == 2
assert config.technical_jargon_threshold == 0.7
assert config.medical_terminology_threshold == 0.8
assert config.academic_citation_threshold == 0.75
assert config.legal_precision_threshold == 0.85
@pytest.mark.asyncio
async def test_enhance_content_with_specified_domain(self, pipeline, sample_texts):
"""Test content enhancement with specified domain."""
text = sample_texts[DomainType.TECHNICAL]
# Mock the enhancement service response
pipeline.enhancement_service.enhance_transcript.return_value = {
"enhanced_text": "The **algorithm** implements a `singleton pattern` for thread safety in the **software system**"
}
result = await pipeline.enhance_content(text, domain=DomainType.TECHNICAL)
assert isinstance(result, EnhancementResult)
assert result.original_text == text
assert result.domain == DomainType.TECHNICAL
assert result.confidence_score > 0
assert len(result.improvements) > 0
assert len(result.quality_metrics) > 0
assert result.processing_time > 0
@pytest.mark.asyncio
async def test_enhance_content_auto_detect_domain(self, pipeline, sample_texts):
"""Test content enhancement with automatic domain detection."""
text = sample_texts[DomainType.MEDICAL]
# Mock the enhancement service response
pipeline.enhancement_service.enhance_transcript.return_value = {
"enhanced_text": "**Patient** presents with symptoms of **hypertension** and requires treatment for **myocardial infarction**"
}
result = await pipeline.enhance_content(text)
assert isinstance(result, EnhancementResult)
assert result.domain in [DomainType.MEDICAL, DomainType.GENERAL] # May fall back to general
assert result.confidence_score > 0
@pytest.mark.asyncio
async def test_enhance_technical_content(self, pipeline, sample_texts):
"""Test technical content enhancement."""
text = sample_texts[DomainType.TECHNICAL]
# Mock the enhancement service response
pipeline.enhancement_service.enhance_transcript.return_value = {
"enhanced_text": "The **algorithm** implements a `singleton pattern` for thread safety in the **software system**"
}
config = DomainEnhancementConfig(domain=DomainType.TECHNICAL)
enhanced_text, improvements, corrections = await pipeline._enhance_technical_content(text, config)
assert enhanced_text != text
assert len(improvements) > 0
assert "Applied technical formatting standards" in improvements
@pytest.mark.asyncio
async def test_enhance_medical_content(self, pipeline, sample_texts):
"""Test medical content enhancement."""
text = sample_texts[DomainType.MEDICAL]
# Mock the enhancement service response
pipeline.enhancement_service.enhance_transcript.return_value = {
"enhanced_text": "**Patient** presents with symptoms of **hypertension** and requires treatment for **myocardial infarction**"
}
config = DomainEnhancementConfig(domain=DomainType.MEDICAL)
enhanced_text, improvements, corrections = await pipeline._enhance_medical_content(text, config)
assert enhanced_text != text
assert len(improvements) > 0
assert "Applied medical documentation standards" in improvements
@pytest.mark.asyncio
async def test_enhance_academic_content(self, pipeline, sample_texts):
"""Test academic content enhancement."""
text = sample_texts[DomainType.ACADEMIC]
# Mock the enhancement service responses
pipeline.enhancement_service.enhance_transcript.side_effect = [
{"enhanced_text": "Research study analysis shows hypothesis testing methodology with literature review"},
{"enhanced_text": "**Research** **study** **analysis** shows **hypothesis** testing **methodology** with **literature** review"}
]
config = DomainEnhancementConfig(domain=DomainType.ACADEMIC)
enhanced_text, improvements, corrections = await pipeline._enhance_academic_content(text, config)
assert enhanced_text != text
assert len(improvements) > 0
assert "Applied academic formatting standards" in improvements
@pytest.mark.asyncio
async def test_enhance_legal_content(self, pipeline, sample_texts):
"""Test legal content enhancement."""
text = sample_texts[DomainType.LEGAL]
# Mock the enhancement service response
pipeline.enhancement_service.enhance_transcript.return_value = {
"enhanced_text": "**Contract** **agreement** compliance with **law** **regulation** and **legal** **jurisdiction**"
}
config = DomainEnhancementConfig(domain=DomainType.LEGAL)
enhanced_text, improvements, corrections = await pipeline._enhance_legal_content(text, config)
assert enhanced_text != text
assert len(improvements) > 0
assert "Applied legal precision standards" in improvements
def test_optimize_technical_formatting(self, pipeline):
"""Test technical formatting optimization."""
text = "The code function method class uses file path C:\\temp\\file.txt and version v1.2.3"
enhanced = pipeline._optimize_technical_formatting(text)
# Check that technical terms are formatted
assert "`code`" in enhanced
assert "`function`" in enhanced
assert "`method`" in enhanced
assert "`class`" in enhanced
assert "`C:\\temp\\file.txt`" in enhanced
assert "**v1.2.3**" in enhanced
def test_apply_medical_formatting(self, pipeline):
"""Test medical formatting application."""
text = "Patient takes aspirin and ibuprofen with blood pressure 120/80 mmHg and heart rate 72 bpm"
enhanced = pipeline._apply_medical_formatting(text)
# Check that medical terms are formatted
assert "**aspirin**" in enhanced
assert "**ibuprofen**" in enhanced
assert "`120/80 mmHg`" in enhanced
assert "`72 bpm`" in enhanced
def test_apply_academic_formatting(self, pipeline):
"""Test academic formatting application."""
text = "Research shows et al. findings ibid. and op. cit. references with Figure 1 and Table 2"
enhanced = pipeline._apply_academic_formatting(text)
# Check that academic terms are formatted
assert "*et al.*" in enhanced
assert "*ibid.*" in enhanced
assert "*op. cit.*" in enhanced
assert "**Figure 1**" in enhanced
assert "**Table 2**" in enhanced
def test_optimize_legal_precision(self, pipeline):
"""Test legal precision optimization."""
text = "The contract shall must may hereby whereas therefore be executed"
enhanced = pipeline._optimize_legal_precision(text)
# Check that legal terms are emphasized
assert "**shall**" in enhanced
assert "**must**" in enhanced
assert "**may**" in enhanced
assert "**hereby**" in enhanced
assert "**whereas**" in enhanced
assert "**therefore**" in enhanced
def test_identify_technical_corrections(self, pipeline):
"""Test technical terminology correction identification."""
original = "The python free code uses my sequel database"
enhanced = "The Python 3 code uses MySQL database"
corrections = pipeline._identify_technical_corrections(original, enhanced)
assert len(corrections) > 0
assert any("python free" in corr and "Python 3" in corr for corr in corrections)
assert any("my sequel" in corr and "MySQL" in corr for corr in corrections)
def test_identify_medical_corrections(self, pipeline):
"""Test medical terminology correction identification."""
original = "Patient has hippa compliance issues and takes prozack"
enhanced = "Patient has HIPAA compliance issues and takes Prozac"
corrections = pipeline._identify_medical_corrections(original, enhanced)
assert len(corrections) > 0
assert any("hippa" in corr and "HIPAA" in corr for corr in corrections)
assert any("prozack" in corr and "Prozac" in corr for corr in corrections)
def test_identify_academic_corrections(self, pipeline):
"""Test academic terminology correction identification."""
original = "The research methodology hypothesis and literature review"
enhanced = "The **research** **methodology** **hypothesis** and **literature** review"
corrections = pipeline._identify_academic_corrections(original, enhanced)
# Note: This test may not find corrections if the original text already contains correct terms
# The identification depends on the specific correction patterns
assert isinstance(corrections, list)
def test_identify_legal_corrections(self, pipeline):
"""Test legal terminology correction identification."""
original = "The contract jurisdiction statute and compliance requirements"
enhanced = "The **contract** **jurisdiction** **statute** and **compliance** requirements"
corrections = pipeline._identify_legal_corrections(original, enhanced)
# Note: This test may not find corrections if the original text already contains correct terms
assert isinstance(corrections, list)
def test_calculate_technical_quality(self, pipeline):
"""Test technical content quality calculation."""
enhanced_text = "The `algorithm` implements a **v1.2.3** system with `code` and `function`"
original_text = "The algorithm implements a v1.2.3 system with code and function"
metrics = pipeline._calculate_technical_quality(enhanced_text, original_text)
assert 'technical_term_density' in metrics
assert 'code_reference_accuracy' in metrics
assert 'technical_precision' in metrics
assert all(0 <= value <= 1 for value in metrics.values())
def test_calculate_medical_quality(self, pipeline):
"""Test medical content quality calculation."""
enhanced_text = "**Patient** has **diagnosis** with `120/80 mmHg` and **treatment**"
original_text = "Patient has diagnosis with 120/80 mmHg and treatment"
metrics = pipeline._calculate_medical_quality(enhanced_text, original_text)
assert 'medical_terminology_accuracy' in metrics
assert 'formatting_compliance' in metrics
assert 'medical_precision' in metrics
assert all(0 <= value <= 1 for value in metrics.values())
def test_calculate_academic_quality(self, pipeline):
"""Test academic content quality calculation."""
enhanced_text = "**Research** *et al.* shows **hypothesis** and **Figure 1**"
original_text = "Research et al. shows hypothesis and Figure 1"
metrics = pipeline._calculate_academic_quality(enhanced_text, original_text)
assert 'citation_handling' in metrics
assert 'academic_terminology' in metrics
assert 'academic_quality' in metrics
assert all(0 <= value <= 1 for value in metrics.values())
def test_calculate_legal_quality(self, pipeline):
"""Test legal content quality calculation."""
enhanced_text = "**Contract** **agreement** with `reference` and **legal** terms"
original_text = "Contract agreement with reference and legal terms"
metrics = pipeline._calculate_legal_quality(enhanced_text, original_text)
assert 'legal_terminology_precision' in metrics
assert 'legal_formatting' in metrics
assert 'legal_quality' in metrics
assert all(0 <= value <= 1 for value in metrics.values())
def test_calculate_general_quality(self, pipeline):
"""Test general content quality calculation."""
enhanced_text = "This is a general conversation. It has proper punctuation!"
original_text = "This is a general conversation It has proper punctuation"
metrics = pipeline._calculate_general_quality(enhanced_text, original_text)
assert 'length_ratio' in metrics
assert 'punctuation_improvement' in metrics
assert 'general_quality' in metrics
assert all(0 <= value <= 1 for value in metrics.values())
def test_calculate_confidence_score(self, pipeline):
"""Test confidence score calculation."""
quality_metrics = {
'technical_precision': 0.8,
'medical_precision': 0.9,
'academic_quality': 0.7,
'legal_quality': 0.85,
'general_quality': 0.75
}
confidence = pipeline._calculate_confidence_score(quality_metrics)
assert 0 <= confidence <= 1
assert confidence > 0.7 # Should be high with good metrics
def test_calculate_confidence_score_empty_metrics(self, pipeline):
"""Test confidence score calculation with empty metrics."""
confidence = pipeline._calculate_confidence_score({})
assert confidence == 0.0
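# Together these two tests pin down the assumed contract of
# _calculate_confidence_score: an empty metrics dict yields 0.0, while strong
# per-domain scores (0.7-0.9 above) aggregate to something above 0.7,
# consistent with a simple average or similar weighting.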
@pytest.mark.asyncio
async def test_enhancement_service_failure_handling(self, pipeline, sample_texts):
"""Test handling of enhancement service failures."""
text = sample_texts[DomainType.TECHNICAL]
# Mock enhancement service to raise an exception
pipeline.enhancement_service.enhance_transcript.side_effect = Exception("Service unavailable")
config = DomainEnhancementConfig(domain=DomainType.TECHNICAL)
enhanced_text, improvements, corrections = await pipeline._enhance_technical_content(text, config)
# Should fall back to original text for terminology enhancement
# But formatting optimization may still be applied
assert len(corrections) == 0 # No terminology corrections
# Note: Formatting may still be applied even if enhancement service fails
@pytest.mark.asyncio
async def test_domain_specific_configuration(self, pipeline, sample_texts):
"""Test domain-specific configuration options."""
text = sample_texts[DomainType.TECHNICAL]
# Create config with disabled terminology enhancement
config = DomainEnhancementConfig(
domain=DomainType.TECHNICAL,
enable_terminology_enhancement=False,
enable_formatting_optimization=True
)
enhanced_text, improvements, corrections = await pipeline._enhance_technical_content(text, config)
# Should skip terminology enhancement but apply formatting
assert "Applied technical formatting standards" in improvements
assert len(corrections) == 0 # No terminology corrections
def test_enhancement_result_structure(self):
"""Test EnhancementResult data structure."""
result = EnhancementResult(
original_text="Original text",
enhanced_text="Enhanced text",
domain=DomainType.TECHNICAL,
confidence_score=0.85,
improvements=["Improved formatting"],
terminology_corrections=["Corrected term"],
quality_metrics={"technical_precision": 0.8},
processing_time=1.5
)
assert result.original_text == "Original text"
assert result.enhanced_text == "Enhanced text"
assert result.domain == DomainType.TECHNICAL
assert result.confidence_score == 0.85
assert len(result.improvements) == 1
assert len(result.terminology_corrections) == 1
assert len(result.quality_metrics) == 1
assert result.processing_time == 1.5
class TestDomainEnhancementIntegration:
"""Test integration of domain enhancement with the pipeline."""
@pytest.mark.asyncio
async def test_end_to_end_technical_enhancement(self):
"""Test end-to-end technical content enhancement."""
from src.services.domain_enhancement import DomainEnhancementPipeline
# Create pipeline with mock service
mock_service = Mock()
mock_service.enhance_transcript = AsyncMock(return_value={
"enhanced_text": "The **algorithm** implements a `singleton pattern` for thread safety"
})
pipeline = DomainEnhancementPipeline(enhancement_service=mock_service)
text = "The algorithm implements a singleton pattern for thread safety"
result = await pipeline.enhance_content(text, domain=DomainType.TECHNICAL)
assert result.domain == DomainType.TECHNICAL
assert result.confidence_score > 0
assert len(result.improvements) > 0
assert "Applied technical formatting standards" in result.improvements
@pytest.mark.asyncio
async def test_domain_switching(self):
"""Test switching between different domains."""
from src.services.domain_enhancement import DomainEnhancementPipeline
mock_service = Mock()
mock_service.enhance_transcript = AsyncMock(return_value={
"enhanced_text": "Enhanced content"
})
pipeline = DomainEnhancementPipeline(enhancement_service=mock_service)
# Test different domains
domains = [DomainType.TECHNICAL, DomainType.MEDICAL, DomainType.ACADEMIC]
for domain in domains:
result = await pipeline.enhance_content("Test content", domain=domain)
assert result.domain == domain
# Confidence score may be 0 if no domain-specific terms are detected
# This is expected behavior for generic content
assert result.confidence_score >= 0
if __name__ == "__main__":
pytest.main([__file__, "-v"])