"""
|
|
Unit tests for WordDetector and related classes.
|
|
"""

import json
from pathlib import Path
from unittest.mock import Mock, patch, mock_open

import pytest

from src.core.word_detector import (
    Severity,
    DetectedWord,
    WordList,
    WordDetector
)


class TestSeverity:
|
|
"""Test Severity enum."""
|
|
|
|
def test_severity_values(self):
|
|
"""Test severity values are correct."""
|
|
assert Severity.LOW.value == 1
|
|
assert Severity.MEDIUM.value == 2
|
|
assert Severity.HIGH.value == 3
|
|
assert Severity.EXTREME.value == 4
|
|
|
|
def test_from_string(self):
|
|
"""Test creating severity from string."""
|
|
assert Severity.from_string('low') == Severity.LOW
|
|
assert Severity.from_string('LOW') == Severity.LOW
|
|
assert Severity.from_string('medium') == Severity.MEDIUM
|
|
assert Severity.from_string('high') == Severity.HIGH
|
|
assert Severity.from_string('extreme') == Severity.EXTREME
|
|
|
|
# Unknown values should default to MEDIUM
|
|
assert Severity.from_string('unknown') == Severity.MEDIUM
|
|
assert Severity.from_string('') == Severity.MEDIUM
|
|
|
|
def test_severity_ordering(self):
|
|
"""Test severity levels can be compared."""
|
|
assert Severity.LOW.value < Severity.MEDIUM.value
|
|
assert Severity.MEDIUM.value < Severity.HIGH.value
|
|
assert Severity.HIGH.value < Severity.EXTREME.value
|
|
|
|
|
|
class TestDetectedWord:
|
|
"""Test DetectedWord dataclass."""
|
|
|
|
def test_basic_creation(self):
|
|
"""Test creating a DetectedWord."""
|
|
word = DetectedWord(
|
|
word="badword",
|
|
original="BadWord",
|
|
start=5.0,
|
|
end=6.0,
|
|
severity=Severity.HIGH,
|
|
confidence=0.95
|
|
)
|
|
|
|
assert word.word == "badword"
|
|
assert word.original == "BadWord"
|
|
assert word.start == 5.0
|
|
assert word.end == 6.0
|
|
assert word.severity == Severity.HIGH
|
|
assert word.confidence == 0.95
|
|
assert word.context == ""
|
|
|
|
def test_duration_property(self):
|
|
"""Test duration calculation."""
|
|
word = DetectedWord(
|
|
word="test",
|
|
original="test",
|
|
start=2.5,
|
|
end=4.0,
|
|
severity=Severity.LOW,
|
|
confidence=1.0
|
|
)
|
|
|
|
assert word.duration == 1.5
|
|
|
|
def test_to_dict(self):
|
|
"""Test converting to dictionary."""
|
|
word = DetectedWord(
|
|
word="test",
|
|
original="TEST",
|
|
start=1.0,
|
|
end=2.5,
|
|
severity=Severity.MEDIUM,
|
|
confidence=0.85,
|
|
context="this is a [test] word"
|
|
)
|
|
|
|
data = word.to_dict()
|
|
|
|
assert data['word'] == "test"
|
|
assert data['original'] == "TEST"
|
|
assert data['start'] == 1.0
|
|
assert data['end'] == 2.5
|
|
assert data['duration'] == 1.5
|
|
assert data['severity'] == "MEDIUM"
|
|
assert data['confidence'] == 0.85
|
|
assert data['context'] == "this is a [test] word"
|
|
|
|
|
|
class TestWordList:
|
|
"""Test WordList class."""
|
|
|
|
def test_initialization(self):
|
|
"""Test WordList initialization."""
|
|
word_list = WordList()
|
|
|
|
# Should have some default words loaded
|
|
assert len(word_list) > 0
|
|
assert isinstance(word_list.words, dict)
|
|
assert isinstance(word_list.patterns, dict)
|
|
assert isinstance(word_list.variations, dict)
|
|
|
|
def test_add_word(self):
|
|
"""Test adding words to the list."""
|
|
word_list = WordList()
|
|
initial_count = len(word_list)
|
|
|
|
# Add word with string severity
|
|
word_list.add_word("testword", "high")
|
|
assert "testword" in word_list.words
|
|
assert word_list.words["testword"] == Severity.HIGH
|
|
|
|
# Add word with Severity enum
|
|
word_list.add_word("another", Severity.LOW)
|
|
assert "another" in word_list.words
|
|
assert word_list.words["another"] == Severity.LOW
|
|
|
|
assert len(word_list) == initial_count + 2
|
|
|
|
def test_add_word_variations(self):
|
|
"""Test that adding a word creates variations."""
|
|
word_list = WordList()
|
|
word_list.add_word("test", Severity.MEDIUM)
|
|
|
|
# Should create plural variation
|
|
assert "tests" in word_list.variations
|
|
assert word_list.variations["tests"] == "test"
|
|
|
|
def test_remove_word(self):
|
|
"""Test removing words from the list."""
|
|
word_list = WordList()
|
|
word_list.add_word("removeme", Severity.LOW)
|
|
|
|
# Verify word was added
|
|
assert "removeme" in word_list.words
|
|
|
|
# Remove the word
|
|
removed = word_list.remove_word("removeme")
|
|
assert removed is True
|
|
assert "removeme" not in word_list.words
|
|
|
|
# Try removing non-existent word
|
|
removed = word_list.remove_word("nonexistent")
|
|
assert removed is False
|
|
|
|
def test_contains(self):
|
|
"""Test checking if word is in list."""
|
|
word_list = WordList()
|
|
word_list.add_word("contained", Severity.MEDIUM)
|
|
|
|
assert "contained" in word_list
|
|
assert "CONTAINED" in word_list # Case insensitive
|
|
assert " contained " in word_list # Strips whitespace
|
|
assert "notcontained" not in word_list
|
|
|
|
def test_load_from_json_file(self, temp_dir):
|
|
"""Test loading word list from JSON file."""
|
|
# Create test JSON file
|
|
test_data = {
|
|
"word1": "LOW",
|
|
"word2": "HIGH",
|
|
"word3": "EXTREME"
|
|
}
|
|
|
|
json_file = temp_dir / "test_words.json"
|
|
with open(json_file, 'w') as f:
|
|
json.dump(test_data, f)
|
|
|
|
word_list = WordList()
|
|
initial_count = len(word_list)
|
|
|
|
word_list.load_from_file(json_file)
|
|
|
|
assert "word1" in word_list.words
|
|
assert word_list.words["word1"] == Severity.LOW
|
|
assert "word2" in word_list.words
|
|
assert word_list.words["word2"] == Severity.HIGH
|
|
assert "word3" in word_list.words
|
|
assert word_list.words["word3"] == Severity.EXTREME
|
|
assert len(word_list) == initial_count + 3
|
|
|
|
def test_load_from_csv_file(self, temp_dir):
|
|
"""Test loading word list from CSV file."""
|
|
# Create test CSV file
|
|
csv_content = """word,severity
|
|
testword1,low
|
|
testword2,medium
|
|
testword3,high"""
|
|
|
|
csv_file = temp_dir / "test_words.csv"
|
|
csv_file.write_text(csv_content)
|
|
|
|
word_list = WordList()
|
|
initial_count = len(word_list)
|
|
|
|
word_list.load_from_file(csv_file)
|
|
|
|
assert "testword1" in word_list.words
|
|
assert word_list.words["testword1"] == Severity.LOW
|
|
assert "testword2" in word_list.words
|
|
assert word_list.words["testword2"] == Severity.MEDIUM
|
|
assert "testword3" in word_list.words
|
|
assert word_list.words["testword3"] == Severity.HIGH
|
|
assert len(word_list) == initial_count + 3
|
|
|
|
def test_load_from_text_file(self, temp_dir):
|
|
"""Test loading word list from plain text file."""
|
|
# Create test text file
|
|
text_content = """word1
|
|
word2
|
|
# This is a comment
|
|
word3
|
|
"""
|
|
|
|
text_file = temp_dir / "test_words.txt"
|
|
text_file.write_text(text_content)
|
|
|
|
word_list = WordList()
|
|
initial_count = len(word_list)
|
|
|
|
word_list.load_from_file(text_file)
|
|
|
|
assert "word1" in word_list.words
|
|
assert "word2" in word_list.words
|
|
assert "word3" in word_list.words
|
|
# Comment should be ignored
|
|
assert "# This is a comment" not in word_list.words
|
|
assert len(word_list) == initial_count + 3
|
|
|
|
def test_load_nonexistent_file(self):
|
|
"""Test loading from non-existent file."""
|
|
word_list = WordList()
|
|
|
|
with pytest.raises(FileNotFoundError):
|
|
word_list.load_from_file("nonexistent.json")
|
|
|
|
def test_save_to_json_file(self, temp_dir):
|
|
"""Test saving word list to JSON file."""
|
|
word_list = WordList()
|
|
word_list.add_word("save1", Severity.LOW)
|
|
word_list.add_word("save2", Severity.HIGH)
|
|
|
|
json_file = temp_dir / "saved_words.json"
|
|
word_list.save_to_file(json_file)
|
|
|
|
assert json_file.exists()
|
|
|
|
# Load and verify
|
|
with open(json_file, 'r') as f:
|
|
data = json.load(f)
|
|
|
|
assert "save1" in data
|
|
assert "save2" in data
|
|
assert data["save1"] == "LOW"
|
|
assert data["save2"] == "HIGH"
|
|
|
|
def test_save_to_csv_file(self, temp_dir):
|
|
"""Test saving word list to CSV file."""
|
|
word_list = WordList()
|
|
word_list.add_word("csv1", Severity.MEDIUM)
|
|
word_list.add_word("csv2", Severity.EXTREME)
|
|
|
|
csv_file = temp_dir / "saved_words.csv"
|
|
word_list.save_to_file(csv_file)
|
|
|
|
assert csv_file.exists()
|
|
|
|
# Verify content
|
|
content = csv_file.read_text()
|
|
assert "csv1,medium" in content
|
|
assert "csv2,extreme" in content
|
|
assert "word,severity" in content # Header
|
|
|
|
|
|
class TestWordDetector:
|
|
"""Test WordDetector class."""
|
|
|
|
def test_initialization_default(self):
|
|
"""Test detector initialization with defaults."""
|
|
detector = WordDetector()
|
|
|
|
assert detector.word_list is not None
|
|
assert detector.min_confidence == 0.7
|
|
assert detector.check_variations is True
|
|
assert detector.context_window == 5
|
|
|
|
def test_initialization_custom(self):
|
|
"""Test detector initialization with custom parameters."""
|
|
word_list = WordList()
|
|
detector = WordDetector(
|
|
word_list=word_list,
|
|
min_confidence=0.8,
|
|
check_variations=False,
|
|
context_window=3
|
|
)
|
|
|
|
assert detector.word_list == word_list
|
|
assert detector.min_confidence == 0.8
|
|
assert detector.check_variations is False
|
|
assert detector.context_window == 3
|
|
|
|
def test_detect_direct_match(self):
|
|
"""Test detecting direct word matches."""
|
|
word_list = WordList()
|
|
word_list.add_word("badword", Severity.HIGH)
|
|
|
|
detector = WordDetector(word_list=word_list)
|
|
|
|
# Mock transcription result
|
|
mock_word = Mock()
|
|
mock_word.text = "badword"
|
|
mock_word.start = 5.0
|
|
mock_word.end = 6.0
|
|
|
|
mock_transcription = Mock()
|
|
mock_transcription.words = [mock_word]
|
|
|
|
detected = detector.detect(mock_transcription, include_context=False)
|
|
|
|
assert len(detected) == 1
|
|
assert detected[0].word == "badword"
|
|
assert detected[0].original == "badword"
|
|
assert detected[0].start == 5.0
|
|
assert detected[0].end == 6.0
|
|
assert detected[0].severity == Severity.HIGH
|
|
assert detected[0].confidence == 1.0
|
|
|
|
def test_detect_case_insensitive(self):
|
|
"""Test case-insensitive detection."""
|
|
word_list = WordList()
|
|
word_list.add_word("badword", Severity.MEDIUM)
|
|
|
|
detector = WordDetector(word_list=word_list)
|
|
|
|
# Mock transcription with uppercase word
|
|
mock_word = Mock()
|
|
mock_word.text = "BADWORD"
|
|
mock_word.start = 2.0
|
|
mock_word.end = 3.0
|
|
|
|
mock_transcription = Mock()
|
|
mock_transcription.words = [mock_word]
|
|
|
|
detected = detector.detect(mock_transcription, include_context=False)
|
|
|
|
assert len(detected) == 1
|
|
assert detected[0].word == "badword" # Normalized
|
|
assert detected[0].original == "BADWORD" # Original preserved
|
|
|
|
def test_detect_with_context(self):
|
|
"""Test detection with context extraction."""
|
|
word_list = WordList()
|
|
word_list.add_word("explicit", Severity.MEDIUM)
|
|
|
|
detector = WordDetector(word_list=word_list, context_window=2)
|
|
|
|
# Mock transcription with multiple words
|
|
words = []
|
|
word_texts = ["this", "is", "explicit", "content", "here"]
|
|
for i, text in enumerate(word_texts):
|
|
word = Mock()
|
|
word.text = text
|
|
word.start = float(i)
|
|
word.end = float(i + 1)
|
|
words.append(word)
|
|
|
|
mock_transcription = Mock()
|
|
mock_transcription.words = words
|
|
|
|
detected = detector.detect(mock_transcription, include_context=True)
|
|
|
|
assert len(detected) == 1
|
|
assert detected[0].word == "explicit"
|
|
assert detected[0].context == "this is [explicit] content here"
|
|
|
|
def test_detect_variations(self):
|
|
"""Test detection of word variations."""
|
|
word_list = WordList()
|
|
word_list.add_word("test", Severity.LOW)
|
|
# This should create "tests" variation
|
|
|
|
detector = WordDetector(word_list=word_list, check_variations=True)
|
|
|
|
# Mock transcription with variation
|
|
mock_word = Mock()
|
|
mock_word.text = "tests"
|
|
mock_word.start = 1.0
|
|
mock_word.end = 2.0
|
|
|
|
mock_transcription = Mock()
|
|
mock_transcription.words = [mock_word]
|
|
|
|
detected = detector.detect(mock_transcription, include_context=False)
|
|
|
|
assert len(detected) == 1
|
|
assert detected[0].word == "test" # Base word
|
|
assert detected[0].original == "tests" # Original variation
|
|
assert detected[0].confidence == 0.95 # Variation confidence
|
|
|
|
def test_detect_no_variations(self):
|
|
"""Test detection with variations disabled."""
|
|
word_list = WordList()
|
|
word_list.add_word("test", Severity.LOW)
|
|
|
|
detector = WordDetector(word_list=word_list, check_variations=False)
|
|
|
|
# Mock transcription with variation that shouldn't match
|
|
mock_word = Mock()
|
|
mock_word.text = "tests"
|
|
mock_word.start = 1.0
|
|
mock_word.end = 2.0
|
|
|
|
mock_transcription = Mock()
|
|
mock_transcription.words = [mock_word]
|
|
|
|
detected = detector.detect(mock_transcription, include_context=False)
|
|
|
|
assert len(detected) == 0
|
|
|
|
def test_check_variations_known(self):
|
|
"""Test checking known variations."""
|
|
word_list = WordList()
|
|
word_list.add_word("base", Severity.MEDIUM)
|
|
word_list.variations["bases"] = "base" # Manually add variation
|
|
|
|
detector = WordDetector(word_list=word_list)
|
|
|
|
match, confidence = detector._check_variations("bases")
|
|
|
|
assert match == "bases"
|
|
assert confidence == 0.95
|
|
|
|
def test_check_variations_fuzzy(self):
|
|
"""Test fuzzy matching for variations."""
|
|
word_list = WordList()
|
|
word_list.add_word("hello", Severity.LOW)
|
|
|
|
detector = WordDetector(word_list=word_list, min_confidence=0.8)
|
|
|
|
# Test similar word
|
|
match, confidence = detector._check_variations("helo") # Missing 'l'
|
|
|
|
if match: # Fuzzy matching might or might not match depending on similarity
|
|
assert confidence >= 0.8
|
|
|
|
def test_get_context_boundary(self):
|
|
"""Test context extraction at boundaries."""
|
|
detector = WordDetector(context_window=2)
|
|
|
|
# Create mock words
|
|
word_texts = ["a", "b", "target", "d", "e"]
|
|
words = []
|
|
for text in word_texts:
|
|
word = Mock()
|
|
word.text = text
|
|
words.append(word)
|
|
|
|
# Test target at beginning
|
|
context = detector._get_context(words, 0)
|
|
assert context == "[a] b target"
|
|
|
|
# Test target at end
|
|
context = detector._get_context(words, 4)
|
|
assert context == "target d [e]"
|
|
|
|
# Test target in middle
|
|
context = detector._get_context(words, 2)
|
|
assert context == "a b [target] d e"
|
|
|
|
def test_filter_by_severity(self):
|
|
"""Test filtering detected words by severity."""
|
|
detector = WordDetector()
|
|
|
|
# Create detected words with different severities
|
|
detected_words = [
|
|
DetectedWord("low", "low", 1.0, 2.0, Severity.LOW, 1.0),
|
|
DetectedWord("med", "med", 3.0, 4.0, Severity.MEDIUM, 1.0),
|
|
DetectedWord("high", "high", 5.0, 6.0, Severity.HIGH, 1.0),
|
|
DetectedWord("extreme", "extreme", 7.0, 8.0, Severity.EXTREME, 1.0)
|
|
]
|
|
|
|
# Filter by MEDIUM and above
|
|
filtered = detector.filter_by_severity(detected_words, Severity.MEDIUM)
|
|
|
|
assert len(filtered) == 3 # MEDIUM, HIGH, EXTREME
|
|
severities = [w.severity for w in filtered]
|
|
assert Severity.LOW not in severities
|
|
assert Severity.MEDIUM in severities
|
|
assert Severity.HIGH in severities
|
|
assert Severity.EXTREME in severities
|
|
|
|
def test_get_statistics_empty(self):
|
|
"""Test statistics for empty detection results."""
|
|
detector = WordDetector()
|
|
|
|
stats = detector.get_statistics([])
|
|
|
|
assert stats['total_count'] == 0
|
|
assert stats['unique_words'] == 0
|
|
assert stats['by_severity'] == {}
|
|
assert stats['most_common'] == []
|
|
|
|
def test_get_statistics_with_words(self):
|
|
"""Test statistics for detection results."""
|
|
detector = WordDetector()
|
|
|
|
detected_words = [
|
|
DetectedWord("word1", "word1", 1.0, 2.0, Severity.HIGH, 0.9),
|
|
DetectedWord("word1", "word1", 3.0, 4.0, Severity.HIGH, 0.8),
|
|
DetectedWord("word2", "word2", 5.0, 6.0, Severity.MEDIUM, 0.95),
|
|
DetectedWord("word3", "word3", 7.0, 8.0, Severity.LOW, 1.0)
|
|
]
|
|
|
|
stats = detector.get_statistics(detected_words)
|
|
|
|
assert stats['total_count'] == 4
|
|
assert stats['unique_words'] == 3
|
|
assert stats['by_severity']['HIGH'] == 2
|
|
assert stats['by_severity']['MEDIUM'] == 1
|
|
assert stats['by_severity']['LOW'] == 1
|
|
assert stats['most_common'][0] == ('word1', 2) # Most frequent
|
|
assert stats['average_confidence'] == (0.9 + 0.8 + 0.95 + 1.0) / 4
|
|
|
|
|
|
if __name__ == '__main__':
|
|
pytest.main([__file__, '-v']) |