# Source: myspellchecker/tests/test_benchmark_integrity.py (thettwe/myspellchecker, develop branch)

"""Benchmark annotation integrity checks.

Codifies invariants for benchmark entries so annotation bugs are caught at
CI time. Specifically guards against strict single-edit error subtypes
(e.g. aukmyit_confusion, consonant_substitution) being saddled with
multi-token gold corrections.

Two checks:

1. ``test_single_token_subtypes_have_single_token_spans`` — every error whose
   subtype is in ``SINGLE_TOKEN_SUBTYPES`` must have a span that does NOT
   contain whitespace, AND a ``gold_correction`` that does NOT contain
   whitespace.
2. ``test_span_matches_erroneous_text`` — every expected error's
   ``input[span.start:span.end]`` must equal ``erroneous_text``. Prevents
   off-by-one span bugs that would silently mis-measure detection TP.

The subtype list is deliberately narrow: only strict single-edit subtypes
(aukmyit, consonant substitution, visarga, asat, tone, medial, kinzi,
stacking). Homophone and compound confusions are EXCLUDED because they can
legitimately span multiple tokens.
"""

from __future__ import annotations

from pathlib import Path

import yaml

# Benchmark YAML, located relative to this file:
# <parent of this file's directory>/benchmarks/myspellchecker_benchmark.yaml
BENCHMARK_PATH = Path(__file__).parent.parent.joinpath(
    "benchmarks",
    "myspellchecker_benchmark.yaml",
)

# Error subtypes treated as strict single-edit: the gold correction is
# expected to differ from the erroneous text by one character-level edit
# (substitution, insertion, deletion, or diacritic change), so a multi-token
# gold correction does not make sense for them.
#
# Deliberately left out because they may legitimately be multi-token:
#   - homophone_confusion   (whole-word replacements)
#   - compound_confusion    (compounds can span syllables/tokens)
#   - loan_word_misspelling (loanwords vary in structure)
#   - non_word_typo         (arbitrary misspellings)
#   - real_word_confusion   (word-for-word)
#   - register_mismatch, particle_misuse, word_boundary, word_order,
#     verb_tense_agreement, classifier_error, collocation_error
#     (structural/grammatical rather than single-edit)
#
# NOTE(review): the per-entry notes below label ့ as "visarga"; Unicode names
# U+1037 (့) DOT BELOW and U+1038 (း) VISARGA — confirm which mark the
# *_visarga subtypes are meant to cover.
SINGLE_TOKEN_SUBTYPES: frozenset[str] = frozenset(
    (
        # Diacritic / tone marks
        "aukmyit_confusion",  # visarga (့) drop
        "visarga_confusion",  # visarga diacritic confusion
        "missing_visarga",  # ့ required but absent
        "asat_confusion",  # asat (်) position
        "missing_asat",  # ် required but absent
        "tone_confusion",  # tone mark swap
        "tone_mark_error",  # tone diacritic mis-placement
        # Consonants, medials, stacking
        "consonant_substitution",  # e.g., aspiration pair ထ ↔ တ
        "medial_confusion",  # medial ြ ↔ ျ
        "vowel_medial_substitution",  # single-edit vowel+medial swap
        "kinzi_confusion",  # င်္ stacking
        "stacking_error",  # consonant stack
    )
)


def _load_benchmark() -> dict:
    """Read the benchmark YAML at ``BENCHMARK_PATH`` and return the parsed data.

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``.
    """
    with BENCHMARK_PATH.open(encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed


def _contains_whitespace(text: str) -> bool:
    """Return True if *text* contains any character for which ``str.isspace`` is true."""
    return any(ch.isspace() for ch in text)


def test_single_token_subtypes_have_single_token_spans() -> None:
    """Single-edit error subtypes must not span multiple whitespace-separated tokens.

    For every expected error whose ``error_subtype`` is in
    ``SINGLE_TOKEN_SUBTYPES``, neither ``input[span.start:span.end]`` nor
    ``gold_correction`` may contain whitespace. Errors with other subtypes are
    skipped.

    Catches the bug class where an error annotated as (e.g.) aukmyit_confusion
    has a multi-token gold correction that the token-level homophone/word
    strategy can never match at top-1.

    All violations are collected and reported together in a single assertion
    failure rather than stopping at the first one.
    """
    benchmark = _load_benchmark()
    violations: list[str] = []
    for sentence in benchmark.get("sentences", []):
        sid = sentence["id"]
        input_text = sentence.get("input", "")
        for err in sentence.get("expected_errors", []):
            subtype = err.get("error_subtype", "")
            if subtype not in SINGLE_TOKEN_SUBTYPES:
                continue
            span = err.get("span", {})
            start = span.get("start", 0)
            end = span.get("end", 0)
            span_text = input_text[start:end]
            gold = err.get("gold_correction", "")
            # Check for any whitespace (tab, newline, NBSP, ...), not just the
            # ASCII space: the contract and the messages below say
            # "whitespace", and any of these would split the text into
            # multiple tokens.
            if _contains_whitespace(span_text):
                violations.append(
                    f"{sid}/{err.get('error_id', '?')}: subtype={subtype!r} "
                    f"span contains whitespace: {span_text!r}"
                )
            if _contains_whitespace(gold):
                violations.append(
                    f"{sid}/{err.get('error_id', '?')}: subtype={subtype!r} "
                    f"gold_correction contains whitespace: {gold!r}"
                )
    assert not violations, "Benchmark annotation violations:\n  " + "\n  ".join(violations)


def test_span_matches_erroneous_text() -> None:
    """Check that each annotated span slices out exactly its ``erroneous_text``.

    For every entry in a sentence's ``expected_errors``, the substring
    ``input[span.start:span.end]`` is compared with ``erroneous_text``. Any
    difference (for example an off-by-one span) is recorded, and all recorded
    mismatches are reported together in one assertion failure.
    """
    mismatches: list[str] = []
    for entry in _load_benchmark().get("sentences", []):
        sentence_id = entry["id"]
        text = entry.get("input", "")
        for expected in entry.get("expected_errors", []):
            bounds = expected.get("span", {})
            covered = text[bounds.get("start", 0) : bounds.get("end", 0)]
            annotated = expected.get("erroneous_text", "")
            if covered == annotated:
                continue
            mismatches.append(
                f"{sentence_id}/{expected.get('error_id', '?')}: "
                f"span text {covered!r} != erroneous_text {annotated!r}"
            )
    assert not mismatches, "Span/erroneous_text mismatch:\n  " + "\n  ".join(mismatches)