-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_edit_distance_python_impl.py
More file actions
381 lines (295 loc) · 13.7 KB
/
test_edit_distance_python_impl.py
File metadata and controls
381 lines (295 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
"""Tests for the Python implementation of edit distance algorithms.
These tests specifically target the Python fallback implementations
to ensure coverage when Cython is not available.
"""
from unittest.mock import patch
# Import with Cython disabled to test Python implementations
with patch.dict("sys.modules", {"myspellchecker.utils.edit_distance_c": None}):
# Force reload to use Python implementations
import importlib
import myspellchecker.algorithms.distance.edit_distance as ed_module
importlib.reload(ed_module)
from myspellchecker.algorithms.distance.edit_distance import (
MEDIAL_CONFUSIONS,
MYANMAR_CONSONANTS,
MYANMAR_MEDIALS,
_are_medial_confusions,
damerau_levenshtein_distance,
levenshtein_distance,
myanmar_syllable_edit_distance,
tokenize_myanmar_syllable_units,
weighted_damerau_levenshtein_distance,
)
class TestLevenshteinDistance:
"""Tests for levenshtein_distance function."""
def test_identical_strings(self):
"""Identical strings have distance 0."""
assert levenshtein_distance("hello", "hello") == 0
assert levenshtein_distance("", "") == 0
def test_empty_strings(self):
"""Empty string distance equals length of other string."""
assert levenshtein_distance("", "hello") == 5
assert levenshtein_distance("hello", "") == 5
def test_single_insertion(self):
"""Single character insertion has distance 1."""
assert levenshtein_distance("cat", "cats") == 1
assert levenshtein_distance("at", "cat") == 1
def test_single_deletion(self):
"""Single character deletion has distance 1."""
assert levenshtein_distance("cats", "cat") == 1
assert levenshtein_distance("cat", "at") == 1
def test_single_substitution(self):
"""Single character substitution has distance 1."""
assert levenshtein_distance("cat", "car") == 1
assert levenshtein_distance("cat", "bat") == 1
def test_multiple_operations(self):
"""Multiple operations have cumulative distance."""
assert levenshtein_distance("kitten", "sitting") == 3
assert levenshtein_distance("saturday", "sunday") == 3
def test_case_sensitivity(self):
"""Distance is case-sensitive."""
assert levenshtein_distance("Hello", "hello") == 1
def test_myanmar_text(self):
"""Test with Myanmar characters."""
assert levenshtein_distance("ကာ", "ကာ") == 0
assert levenshtein_distance("ကာ", "ခါ") == 2 # Both chars different
class TestDamerauLevenshteinDistance:
"""Tests for damerau_levenshtein_distance function."""
def test_identical_strings(self):
"""Identical strings have distance 0."""
assert damerau_levenshtein_distance("hello", "hello") == 0
def test_empty_strings(self):
"""Empty string distance equals length of other string."""
assert damerau_levenshtein_distance("", "hello") == 5
assert damerau_levenshtein_distance("hello", "") == 5
def test_single_transposition(self):
"""Adjacent character transposition has distance 1."""
assert damerau_levenshtein_distance("ab", "ba") == 1
assert damerau_levenshtein_distance("abc", "bac") == 1
def test_transposition_vs_two_substitutions(self):
"""Transposition is cheaper than two substitutions."""
# Without transposition: ca -> ac would be 2 substitutions
# With transposition: ca -> ac is 1 transposition
assert damerau_levenshtein_distance("ca", "ac") == 1
def test_insertion_deletion_substitution(self):
"""Standard edit operations work correctly."""
assert damerau_levenshtein_distance("cat", "cats") == 1
assert damerau_levenshtein_distance("cats", "cat") == 1
assert damerau_levenshtein_distance("cat", "car") == 1
def test_complex_edits(self):
"""Complex edit sequences."""
assert damerau_levenshtein_distance("kitten", "sitting") == 3
def test_myanmar_text(self):
"""Test with Myanmar characters."""
assert damerau_levenshtein_distance("ကာ", "ကာ") == 0
class TestWeightedDamerauLevenshteinDistance:
"""Tests for weighted_damerau_levenshtein_distance function."""
def test_identical_strings(self):
"""Identical strings have distance 0."""
dist = weighted_damerau_levenshtein_distance("hello", "hello")
assert dist == 0.0
def test_empty_strings(self):
"""Empty string distance equals length of other string."""
assert weighted_damerau_levenshtein_distance("", "hello") == 5.0
assert weighted_damerau_levenshtein_distance("hello", "") == 5.0
def test_standard_substitution(self):
"""Standard substitution has cost 1.0."""
dist = weighted_damerau_levenshtein_distance("a", "b")
assert dist == 1.0
def test_visual_similar_weight(self):
"""Visually similar characters have reduced cost."""
# Visual similar pairs get visual_weight instead of 1.0
dist = weighted_damerau_levenshtein_distance(
"\u1000",
"\u1001", # Similar Myanmar consonants if in VISUAL_SIMILAR
keyboard_weight=0.5,
visual_weight=0.3,
)
# Result depends on VISUAL_SIMILAR mapping
assert 0.0 <= dist <= 1.0
def test_custom_weights(self):
"""Test custom weight parameters."""
dist1 = weighted_damerau_levenshtein_distance(
"abc",
"abd",
keyboard_weight=0.2,
visual_weight=0.3,
)
dist2 = weighted_damerau_levenshtein_distance(
"abc",
"abd",
keyboard_weight=0.8,
visual_weight=0.9,
)
# Both should work, actual values depend on char similarities
assert isinstance(dist1, float)
assert isinstance(dist2, float)
def test_transposition(self):
"""Transposition has default cost 0.7."""
dist = weighted_damerau_levenshtein_distance("ab", "ba")
assert dist == 0.7 # Default transposition_weight is 0.7
class TestTokenizeMyanmarSyllableUnits:
"""Tests for tokenize_myanmar_syllable_units function."""
def test_empty_string(self):
"""Empty string returns empty list."""
assert tokenize_myanmar_syllable_units("") == []
def test_single_consonant(self):
"""Single consonant is one unit."""
assert tokenize_myanmar_syllable_units("က") == ["က"]
def test_consonant_with_medials(self):
"""Consonant + medials grouped together."""
# Ka + Ya-pin
result = tokenize_myanmar_syllable_units("ကျ")
assert result == ["ကျ"]
# Ka + Ya-yit
result = tokenize_myanmar_syllable_units("ကြ")
assert result == ["ကြ"]
# Ka + multiple medials
result = tokenize_myanmar_syllable_units("ကျွ")
assert result == ["ကျွ"]
def test_vowels_separate(self):
"""Vowels are separate units."""
# Ka + Aa vowel
result = tokenize_myanmar_syllable_units("ကာ")
assert result == ["က", "ာ"]
def test_mixed_text(self):
"""Mixed consonants, medials, and vowels."""
# Mya + n
result = tokenize_myanmar_syllable_units("မြန်")
assert len(result) >= 3
assert result[0] == "မြ" # Ma + Ra grouped
def test_non_myanmar_chars(self):
"""Non-Myanmar characters are separate units."""
result = tokenize_myanmar_syllable_units("abc")
assert result == ["a", "b", "c"]
def test_consecutive_consonants(self):
"""Consecutive consonants without medials."""
result = tokenize_myanmar_syllable_units("ကခ")
assert result == ["က", "ခ"]
class TestAreMedialConfusions:
"""Tests for _are_medial_confusions function."""
def test_identical_units(self):
"""Identical units are not confusions."""
assert not _are_medial_confusions("ကျ", "ကျ")
def test_different_base_consonant(self):
"""Different base consonants are not confusions."""
assert not _are_medial_confusions("ကျ", "ချ")
def test_short_units(self):
"""Units shorter than 2 chars are not confusions."""
assert not _are_medial_confusions("က", "ခ")
assert not _are_medial_confusions("က", "ကျ")
def test_ya_ra_confusion(self):
"""Ya-pin vs Ya-yit is a confusion pair."""
# Ka + Ya vs Ka + Ra
result = _are_medial_confusions("ကျ", "ကြ")
assert result is True
def test_wa_ha_confusion(self):
"""Wa-hswe vs Ha-htoe is a confusion pair."""
# Ka + Wa vs Ka + Ha
result = _are_medial_confusions("ကွ", "ကှ")
assert result is True
def test_non_confusion_medials(self):
"""Non-confusion medial pairs."""
# Ka + Ya vs Ka + Wa - not a standard confusion pair
result = _are_medial_confusions("ကျ", "ကွ")
assert result is False
class TestMyanmarSyllableEditDistance:
"""Tests for myanmar_syllable_edit_distance function."""
def test_identical_strings(self):
"""Identical strings have distance (0, 0.0)."""
int_dist, weighted_dist = myanmar_syllable_edit_distance("ကာ", "ကာ")
assert int_dist == 0
assert weighted_dist == 0.0
def test_empty_strings(self):
"""Empty string distance equals unit count of other string."""
int_dist, weighted_dist = myanmar_syllable_edit_distance("", "ကာ")
# "ကာ" tokenizes to ["က", "ာ"] = 2 units
assert int_dist == 2
assert weighted_dist == 2.0
int_dist2, weighted_dist2 = myanmar_syllable_edit_distance("ကာ", "")
assert int_dist2 == 2
assert weighted_dist2 == 2.0
def test_medial_confusion_reduced_weight(self):
"""Medial confusion has reduced weighted cost."""
# မျ vs မြ - medial confusion
s1 = "မျ"
s2 = "မြ"
int_dist, weighted_dist = myanmar_syllable_edit_distance(s1, s2)
# Integer distance is 1 (one substitution)
assert int_dist == 1
# Weighted distance is 0.5 (reduced cost for medial confusion)
assert weighted_dist == 0.5
def test_non_confusion_substitution(self):
"""Non-confusion substitution has full cost."""
s1 = "ကာ" # Ka + Aa
s2 = "ခါ" # Kha + Aa
int_dist, weighted_dist = myanmar_syllable_edit_distance(s1, s2)
# Different consonants = 1 substitution
# Note: tokenization affects this
assert int_dist >= 1
assert weighted_dist >= 1.0
def test_transposition(self):
"""Transposition of units."""
# Two-unit strings with transposition
s1 = "ကခ" # Ka, Kha
s2 = "ခက" # Kha, Ka
int_dist, weighted_dist = myanmar_syllable_edit_distance(s1, s2)
# Transposition cost is 1
assert int_dist == 1
assert weighted_dist == 1.0
class TestMedialConfusionConstants:
"""Tests for medial confusion constants."""
def test_medial_confusions_defined(self):
"""MEDIAL_CONFUSIONS contains expected pairs."""
assert isinstance(MEDIAL_CONFUSIONS, set)
assert len(MEDIAL_CONFUSIONS) > 0
def test_myanmar_medials_defined(self):
"""MYANMAR_MEDIALS contains expected characters."""
assert isinstance(MYANMAR_MEDIALS, (set, frozenset))
assert "\u103b" in MYANMAR_MEDIALS # Ya-pin
assert "\u103c" in MYANMAR_MEDIALS # Ya-yit
assert "\u103d" in MYANMAR_MEDIALS # Wa-hswe
assert "\u103e" in MYANMAR_MEDIALS # Ha-htoe
def test_myanmar_consonants_defined(self):
"""MYANMAR_CONSONANTS contains expected characters."""
assert isinstance(MYANMAR_CONSONANTS, (set, frozenset))
assert "\u1000" in MYANMAR_CONSONANTS # Ka
assert "\u1001" in MYANMAR_CONSONANTS # Kha
class TestCaching:
"""Tests for LRU caching of edit distance functions."""
def test_levenshtein_cache(self):
"""Levenshtein distance results are cached."""
# Clear cache first
levenshtein_distance.cache_clear()
# First call
levenshtein_distance("test", "test")
info1 = levenshtein_distance.cache_info()
# Second call with same args
levenshtein_distance("test", "test")
info2 = levenshtein_distance.cache_info()
# Hits should increase
assert info2.hits > info1.hits
def test_damerau_cache(self):
"""Damerau-Levenshtein distance results are cached."""
damerau_levenshtein_distance.cache_clear()
damerau_levenshtein_distance("test", "test")
info1 = damerau_levenshtein_distance.cache_info()
damerau_levenshtein_distance("test", "test")
info2 = damerau_levenshtein_distance.cache_info()
assert info2.hits > info1.hits
def test_weighted_cache(self):
"""Weighted Damerau-Levenshtein distance results are cached."""
weighted_damerau_levenshtein_distance.cache_clear()
weighted_damerau_levenshtein_distance("test", "test")
info1 = weighted_damerau_levenshtein_distance.cache_info()
weighted_damerau_levenshtein_distance("test", "test")
info2 = weighted_damerau_levenshtein_distance.cache_info()
assert info2.hits > info1.hits
def test_myanmar_syllable_cache(self):
"""Myanmar syllable edit distance results are cached."""
myanmar_syllable_edit_distance.cache_clear()
myanmar_syllable_edit_distance("ကာ", "ကာ")
info1 = myanmar_syllable_edit_distance.cache_info()
myanmar_syllable_edit_distance("ကာ", "ကာ")
info2 = myanmar_syllable_edit_distance.cache_info()
assert info2.hits > info1.hits