""" Unit tests for WordDetector and related classes. """ import pytest import json from pathlib import Path from unittest.mock import Mock, patch, mock_open from src.core.word_detector import ( Severity, DetectedWord, WordList, WordDetector ) class TestSeverity: """Test Severity enum.""" def test_severity_values(self): """Test severity values are correct.""" assert Severity.LOW.value == 1 assert Severity.MEDIUM.value == 2 assert Severity.HIGH.value == 3 assert Severity.EXTREME.value == 4 def test_from_string(self): """Test creating severity from string.""" assert Severity.from_string('low') == Severity.LOW assert Severity.from_string('LOW') == Severity.LOW assert Severity.from_string('medium') == Severity.MEDIUM assert Severity.from_string('high') == Severity.HIGH assert Severity.from_string('extreme') == Severity.EXTREME # Unknown values should default to MEDIUM assert Severity.from_string('unknown') == Severity.MEDIUM assert Severity.from_string('') == Severity.MEDIUM def test_severity_ordering(self): """Test severity levels can be compared.""" assert Severity.LOW.value < Severity.MEDIUM.value assert Severity.MEDIUM.value < Severity.HIGH.value assert Severity.HIGH.value < Severity.EXTREME.value class TestDetectedWord: """Test DetectedWord dataclass.""" def test_basic_creation(self): """Test creating a DetectedWord.""" word = DetectedWord( word="badword", original="BadWord", start=5.0, end=6.0, severity=Severity.HIGH, confidence=0.95 ) assert word.word == "badword" assert word.original == "BadWord" assert word.start == 5.0 assert word.end == 6.0 assert word.severity == Severity.HIGH assert word.confidence == 0.95 assert word.context == "" def test_duration_property(self): """Test duration calculation.""" word = DetectedWord( word="test", original="test", start=2.5, end=4.0, severity=Severity.LOW, confidence=1.0 ) assert word.duration == 1.5 def test_to_dict(self): """Test converting to dictionary.""" word = DetectedWord( word="test", original="TEST", start=1.0, end=2.5, severity=Severity.MEDIUM, confidence=0.85, context="this is a [test] word" ) data = word.to_dict() assert data['word'] == "test" assert data['original'] == "TEST" assert data['start'] == 1.0 assert data['end'] == 2.5 assert data['duration'] == 1.5 assert data['severity'] == "MEDIUM" assert data['confidence'] == 0.85 assert data['context'] == "this is a [test] word" class TestWordList: """Test WordList class.""" def test_initialization(self): """Test WordList initialization.""" word_list = WordList() # Should have some default words loaded assert len(word_list) > 0 assert isinstance(word_list.words, dict) assert isinstance(word_list.patterns, dict) assert isinstance(word_list.variations, dict) def test_add_word(self): """Test adding words to the list.""" word_list = WordList() initial_count = len(word_list) # Add word with string severity word_list.add_word("testword", "high") assert "testword" in word_list.words assert word_list.words["testword"] == Severity.HIGH # Add word with Severity enum word_list.add_word("another", Severity.LOW) assert "another" in word_list.words assert word_list.words["another"] == Severity.LOW assert len(word_list) == initial_count + 2 def test_add_word_variations(self): """Test that adding a word creates variations.""" word_list = WordList() word_list.add_word("test", Severity.MEDIUM) # Should create plural variation assert "tests" in word_list.variations assert word_list.variations["tests"] == "test" def test_remove_word(self): """Test removing words from the list.""" word_list = WordList() 
word_list.add_word("removeme", Severity.LOW) # Verify word was added assert "removeme" in word_list.words # Remove the word removed = word_list.remove_word("removeme") assert removed is True assert "removeme" not in word_list.words # Try removing non-existent word removed = word_list.remove_word("nonexistent") assert removed is False def test_contains(self): """Test checking if word is in list.""" word_list = WordList() word_list.add_word("contained", Severity.MEDIUM) assert "contained" in word_list assert "CONTAINED" in word_list # Case insensitive assert " contained " in word_list # Strips whitespace assert "notcontained" not in word_list def test_load_from_json_file(self, temp_dir): """Test loading word list from JSON file.""" # Create test JSON file test_data = { "word1": "LOW", "word2": "HIGH", "word3": "EXTREME" } json_file = temp_dir / "test_words.json" with open(json_file, 'w') as f: json.dump(test_data, f) word_list = WordList() initial_count = len(word_list) word_list.load_from_file(json_file) assert "word1" in word_list.words assert word_list.words["word1"] == Severity.LOW assert "word2" in word_list.words assert word_list.words["word2"] == Severity.HIGH assert "word3" in word_list.words assert word_list.words["word3"] == Severity.EXTREME assert len(word_list) == initial_count + 3 def test_load_from_csv_file(self, temp_dir): """Test loading word list from CSV file.""" # Create test CSV file csv_content = """word,severity testword1,low testword2,medium testword3,high""" csv_file = temp_dir / "test_words.csv" csv_file.write_text(csv_content) word_list = WordList() initial_count = len(word_list) word_list.load_from_file(csv_file) assert "testword1" in word_list.words assert word_list.words["testword1"] == Severity.LOW assert "testword2" in word_list.words assert word_list.words["testword2"] == Severity.MEDIUM assert "testword3" in word_list.words assert word_list.words["testword3"] == Severity.HIGH assert len(word_list) == initial_count + 3 def test_load_from_text_file(self, temp_dir): """Test loading word list from plain text file.""" # Create test text file text_content = """word1 word2 # This is a comment word3 """ text_file = temp_dir / "test_words.txt" text_file.write_text(text_content) word_list = WordList() initial_count = len(word_list) word_list.load_from_file(text_file) assert "word1" in word_list.words assert "word2" in word_list.words assert "word3" in word_list.words # Comment should be ignored assert "# This is a comment" not in word_list.words assert len(word_list) == initial_count + 3 def test_load_nonexistent_file(self): """Test loading from non-existent file.""" word_list = WordList() with pytest.raises(FileNotFoundError): word_list.load_from_file("nonexistent.json") def test_save_to_json_file(self, temp_dir): """Test saving word list to JSON file.""" word_list = WordList() word_list.add_word("save1", Severity.LOW) word_list.add_word("save2", Severity.HIGH) json_file = temp_dir / "saved_words.json" word_list.save_to_file(json_file) assert json_file.exists() # Load and verify with open(json_file, 'r') as f: data = json.load(f) assert "save1" in data assert "save2" in data assert data["save1"] == "LOW" assert data["save2"] == "HIGH" def test_save_to_csv_file(self, temp_dir): """Test saving word list to CSV file.""" word_list = WordList() word_list.add_word("csv1", Severity.MEDIUM) word_list.add_word("csv2", Severity.EXTREME) csv_file = temp_dir / "saved_words.csv" word_list.save_to_file(csv_file) assert csv_file.exists() # Verify content content = 

    def test_save_to_csv_file(self, temp_dir):
        """Test saving word list to CSV file."""
        word_list = WordList()
        word_list.add_word("csv1", Severity.MEDIUM)
        word_list.add_word("csv2", Severity.EXTREME)

        csv_file = temp_dir / "saved_words.csv"
        word_list.save_to_file(csv_file)

        assert csv_file.exists()

        # Verify content
        content = csv_file.read_text()
        assert "csv1,medium" in content
        assert "csv2,extreme" in content
        assert "word,severity" in content  # Header


class TestWordDetector:
    """Test WordDetector class."""

    def test_initialization_default(self):
        """Test detector initialization with defaults."""
        detector = WordDetector()

        assert detector.word_list is not None
        assert detector.min_confidence == 0.7
        assert detector.check_variations is True
        assert detector.context_window == 5

    def test_initialization_custom(self):
        """Test detector initialization with custom parameters."""
        word_list = WordList()
        detector = WordDetector(
            word_list=word_list,
            min_confidence=0.8,
            check_variations=False,
            context_window=3
        )

        assert detector.word_list == word_list
        assert detector.min_confidence == 0.8
        assert detector.check_variations is False
        assert detector.context_window == 3

    def test_detect_direct_match(self):
        """Test detecting direct word matches."""
        word_list = WordList()
        word_list.add_word("badword", Severity.HIGH)
        detector = WordDetector(word_list=word_list)

        # Mock transcription result
        mock_word = Mock()
        mock_word.text = "badword"
        mock_word.start = 5.0
        mock_word.end = 6.0

        mock_transcription = Mock()
        mock_transcription.words = [mock_word]

        detected = detector.detect(mock_transcription, include_context=False)

        assert len(detected) == 1
        assert detected[0].word == "badword"
        assert detected[0].original == "badword"
        assert detected[0].start == 5.0
        assert detected[0].end == 6.0
        assert detected[0].severity == Severity.HIGH
        assert detected[0].confidence == 1.0

    def test_detect_case_insensitive(self):
        """Test case-insensitive detection."""
        word_list = WordList()
        word_list.add_word("badword", Severity.MEDIUM)
        detector = WordDetector(word_list=word_list)

        # Mock transcription with uppercase word
        mock_word = Mock()
        mock_word.text = "BADWORD"
        mock_word.start = 2.0
        mock_word.end = 3.0

        mock_transcription = Mock()
        mock_transcription.words = [mock_word]

        detected = detector.detect(mock_transcription, include_context=False)

        assert len(detected) == 1
        assert detected[0].word == "badword"  # Normalized
        assert detected[0].original == "BADWORD"  # Original preserved

    def test_detect_with_context(self):
        """Test detection with context extraction."""
        word_list = WordList()
        word_list.add_word("explicit", Severity.MEDIUM)
        detector = WordDetector(word_list=word_list, context_window=2)

        # Mock transcription with multiple words
        words = []
        word_texts = ["this", "is", "explicit", "content", "here"]
        for i, text in enumerate(word_texts):
            word = Mock()
            word.text = text
            word.start = float(i)
            word.end = float(i + 1)
            words.append(word)

        mock_transcription = Mock()
        mock_transcription.words = words

        detected = detector.detect(mock_transcription, include_context=True)

        assert len(detected) == 1
        assert detected[0].word == "explicit"
        assert detected[0].context == "this is [explicit] content here"

    def test_detect_variations(self):
        """Test detection of word variations."""
        word_list = WordList()
        word_list.add_word("test", Severity.LOW)  # This should create "tests" variation
        detector = WordDetector(word_list=word_list, check_variations=True)

        # Mock transcription with variation
        mock_word = Mock()
        mock_word.text = "tests"
        mock_word.start = 1.0
        mock_word.end = 2.0

        mock_transcription = Mock()
        mock_transcription.words = [mock_word]

        detected = detector.detect(mock_transcription, include_context=False)

        assert len(detected) == 1
        assert detected[0].word == "test"  # Base word
        assert detected[0].original == "tests"  # Original variation
        assert detected[0].confidence == 0.95  # Variation confidence
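
    # Illustrative sketch, not from the original suite: the direct-match flow
    # from test_detect_direct_match parametrized over all four severities. It
    # assumes only behavior already exercised above (add_word, detect, and the
    # DetectedWord.severity field) and uses the same Mock-based stubs.
    @pytest.mark.parametrize("severity", [
        Severity.LOW, Severity.MEDIUM, Severity.HIGH, Severity.EXTREME
    ])
    def test_detect_direct_match_each_severity(self, severity):
        """Sketch: a direct match should carry the severity it was registered with."""
        word_list = WordList()
        word_list.add_word("badword", severity)
        detector = WordDetector(word_list=word_list)

        mock_word = Mock()
        mock_word.text = "badword"
        mock_word.start = 0.0
        mock_word.end = 1.0

        mock_transcription = Mock()
        mock_transcription.words = [mock_word]

        detected = detector.detect(mock_transcription, include_context=False)

        assert len(detected) == 1
        assert detected[0].severity == severity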

    def test_detect_no_variations(self):
        """Test detection with variations disabled."""
        word_list = WordList()
        word_list.add_word("test", Severity.LOW)
        detector = WordDetector(word_list=word_list, check_variations=False)

        # Mock transcription with variation that shouldn't match
        mock_word = Mock()
        mock_word.text = "tests"
        mock_word.start = 1.0
        mock_word.end = 2.0

        mock_transcription = Mock()
        mock_transcription.words = [mock_word]

        detected = detector.detect(mock_transcription, include_context=False)

        assert len(detected) == 0

    def test_check_variations_known(self):
        """Test checking known variations."""
        word_list = WordList()
        word_list.add_word("base", Severity.MEDIUM)
        word_list.variations["bases"] = "base"  # Manually add variation
        detector = WordDetector(word_list=word_list)

        match, confidence = detector._check_variations("bases")

        assert match == "bases"
        assert confidence == 0.95

    def test_check_variations_fuzzy(self):
        """Test fuzzy matching for variations."""
        word_list = WordList()
        word_list.add_word("hello", Severity.LOW)
        detector = WordDetector(word_list=word_list, min_confidence=0.8)

        # Test similar word
        match, confidence = detector._check_variations("helo")  # Missing 'l'

        if match:
            # Fuzzy matching might or might not match depending on similarity
            assert confidence >= 0.8

    def test_get_context_boundary(self):
        """Test context extraction at boundaries."""
        detector = WordDetector(context_window=2)

        # Create mock words
        word_texts = ["a", "b", "target", "d", "e"]
        words = []
        for text in word_texts:
            word = Mock()
            word.text = text
            words.append(word)

        # Test target at beginning
        context = detector._get_context(words, 0)
        assert context == "[a] b target"

        # Test target at end
        context = detector._get_context(words, 4)
        assert context == "target d [e]"

        # Test target in middle
        context = detector._get_context(words, 2)
        assert context == "a b [target] d e"

    def test_filter_by_severity(self):
        """Test filtering detected words by severity."""
        detector = WordDetector()

        # Create detected words with different severities
        detected_words = [
            DetectedWord("low", "low", 1.0, 2.0, Severity.LOW, 1.0),
            DetectedWord("med", "med", 3.0, 4.0, Severity.MEDIUM, 1.0),
            DetectedWord("high", "high", 5.0, 6.0, Severity.HIGH, 1.0),
            DetectedWord("extreme", "extreme", 7.0, 8.0, Severity.EXTREME, 1.0)
        ]

        # Filter by MEDIUM and above
        filtered = detector.filter_by_severity(detected_words, Severity.MEDIUM)

        assert len(filtered) == 3  # MEDIUM, HIGH, EXTREME
        severities = [w.severity for w in filtered]
        assert Severity.LOW not in severities
        assert Severity.MEDIUM in severities
        assert Severity.HIGH in severities
        assert Severity.EXTREME in severities

    def test_get_statistics_empty(self):
        """Test statistics for empty detection results."""
        detector = WordDetector()

        stats = detector.get_statistics([])

        assert stats['total_count'] == 0
        assert stats['unique_words'] == 0
        assert stats['by_severity'] == {}
        assert stats['most_common'] == []

    def test_get_statistics_with_words(self):
        """Test statistics for detection results."""
        detector = WordDetector()

        detected_words = [
            DetectedWord("word1", "word1", 1.0, 2.0, Severity.HIGH, 0.9),
            DetectedWord("word1", "word1", 3.0, 4.0, Severity.HIGH, 0.8),
            DetectedWord("word2", "word2", 5.0, 6.0, Severity.MEDIUM, 0.95),
            DetectedWord("word3", "word3", 7.0, 8.0, Severity.LOW, 1.0)
        ]

        stats = detector.get_statistics(detected_words)

        assert stats['total_count'] == 4
        assert stats['unique_words'] == 3
        assert stats['by_severity']['HIGH'] == 2
        assert stats['by_severity']['MEDIUM'] == 1
        assert stats['by_severity']['LOW'] == 1
        assert stats['most_common'][0] == ('word1', 2)  # Most frequent
        # approx() guards against float summation-order differences in the implementation
        assert stats['average_confidence'] == pytest.approx((0.9 + 0.8 + 0.95 + 1.0) / 4)


if __name__ == '__main__':
    pytest.main([__file__, '-v'])