Adding weighted WER feature.

RuABraun · RuABraun · commit 6509757b4dce · 2025-11-14T10:27:40.000-05:00
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,3 @@
 include LICENSE README.md requirements.txt
 recursive-include libs *.*
+recursive-include texterrors/data *
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ Features:
 - Metrics by group (for example speaker)
 - Comparing two hypothesis files to reference
 - Oracle WER
+- **NEW** Weighted WER (English only)
 - Sorting most common errors by frequency or count
 - Measuring performance on keywords
 - Measuring OOV-CER (see [https://arxiv.org/abs/2107.08091](https://arxiv.org/abs/2107.08091) )
@@ -89,6 +90,7 @@ This results in a WER of 83.3\% because of the extra insertion and deletion. And
 
 Recent changes:  
 
+- 11.11.25 Weighted WER for English
 - 26.02.25 Faster alignment, better multihyp support, fixed multihyp bug.
 - 22.06.22 refactored internals to make them simpler, character aware alignment is off by default, added more explanations
 - 20.05.22 fixed bug missing regex dependency
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ termcolor
 Levenshtein
 regex
 pytest
+importlib_resources
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 import setuptools
 import sys
 
-__version__ = "1.0.10"
+__version__ = "1.0.11"
 
 
 class get_pybind_include(object):
@@ -101,5 +101,7 @@ def get_requires():
     entry_points={'console_scripts': ['texterrors=texterrors.texterrors:cli']},
     install_requires=get_requires(),
     setup_requires=['pybind11'],
-    python_requires='>=3.6'
+    python_requires='>=3.6',
+    package_data={"texterrors": ["data/wordlist"]},
+    include_package_data=True,
 )
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -117,6 +117,21 @@ def test_oov_cer():
     assert err / cnt == 0., err / cnt
 
 
+def test_weighted_wer():
+    reflines = ['1 my name is john doe']
+    hyplines = ['1 my name is joe doe']
+    refs = create_inp(reflines)
+    hyps = create_inp(hyplines)
+    buffer = io.StringIO()
+    texterrors.process_output(refs, hyps, buffer, 'A', 'B',weighted_wer=True, skip_detailed=True)
+    output = buffer.getvalue()
+    ref ="""WER: 20.0 (ins 0, del 0, sub 1 / 5)
+SER: 100.0
+Weighted WER: 28.3
+"""
+    assert output == ref, show_diff(output, ref)
+
+
 def test_seq_distance():
     a, b = 'a b', 'a b'
     d = texterrors.seq_distance(StringVector(a.split()), StringVector(b.split()))
diff --git a/texterrors/texterrors.py b/texterrors/texterrors.py
@@ -13,6 +13,8 @@
 from texterrors_align import StringVector
 from loguru import logger
 from termcolor import colored
+from importlib.resources import files, as_file
+
 
 OOV_SYM = '<unk>'
 CPP_WORDS_CONTAINER = True
@@ -691,7 +693,7 @@ def process_multiple_outputs(ref_utts, hypa_utts, hypb_utts, fh, num_top_errors,
 def process_output(ref_utts, hyp_utts, fh, ref_file, hyp_file, cer=False, num_top_errors=10, oov_set=None, debug=False,
                   use_chardiff=True, isctm=False, skip_detailed=False,
                   keywords=None, utt_group_map=None, oracle_wer=False,
-                  freq_sort=False, nocolor=False, insert_tok='<eps>', terminal_width=None):
+                  freq_sort=False, nocolor=False, insert_tok='<eps>', terminal_width=None, weighted_wer=False):
  
     if terminal_width is None:
         terminal_width, _ = shutil.get_terminal_size()
@@ -744,6 +746,40 @@ def process_output(ref_utts, hyp_utts, fh, ref_file, hyp_file, cer=False, num_to
     fh.write(f'WER: {100.*wer:.1f} (ins {ins_count}, del {del_count}, sub {sub_count} / {error_stats.total_count})'
              f'\nSER: {100.*error_stats.utt_wrong / len(error_stats.utts):.1f}\n')
 
+    if weighted_wer:
+        words = []
+        probs = []
+        
+        wordlist_resource= files('texterrors') / 'data' / 'wordlist'
+        with as_file(wordlist_resource) as wordlist_path:
+            with open(wordlist_path) as fh_wordlist:
+                for line in fh_wordlist:
+                    word, prob = line.strip().split()
+                    words.append(word)
+                    probs.append(float(prob))
+        probs = -np.log(np.array(probs))
+        minscore, maxscore = probs[100], probs[-1]
+        probs[:100] = minscore
+        word2weight = {}
+        maxweight = 0.
+        for word, prob in zip(words, probs):
+            word2weight[word] = max((prob - minscore) / (maxscore - minscore), 1e-2)
+            maxweight = max(maxweight, word2weight[word])
+
+        num = 0
+        for word, cnt in error_stats.subs.items():
+            ref_w, hyp_w = word.split('>')
+            weight = (word2weight.get(ref_w, maxweight) + word2weight.get(hyp_w, maxweight)) / 2.
+            num += weight * cnt
+        for word, cnt in error_stats.ins.items():
+            num += word2weight.get(word, maxweight) * cnt
+        for word, cnt in error_stats.dels.items():
+            num += word2weight.get(word, maxweight) * cnt
+        denom = sum(word2weight.get(word, maxweight) * cnt for word, cnt in error_stats.word_counts.items())
+
+        weighted_wer = num / denom
+        fh.write(f'Weighted WER: {100.*weighted_wer:.1f}\n')
+
     if cer:
         cer = error_stats.char_error_count / float(error_stats.char_count)
         fh.write(f'CER: {100.*cer:.1f} ({error_stats.char_error_count} / {error_stats.char_count})\n')
@@ -785,7 +821,8 @@ def main(
     utt_group_map_f: ('Should be a file which maps uttids to group, WER will be output per group.', 'option', '') = '',
     usecolor: ('Show detailed output with color (use less -R). Red/white is reference, Green/white model output.', 'flag', 'c')=False,
     num_top_errors: ('Number of errors to show per type in detailed output.', 'option')=10,
-    second_hyp_f: ('Will compare outputs between two hypothesis files.', 'option')=''
+    second_hyp_f: ('Will compare outputs between two hypothesis files.', 'option')='',
+    weighted_wer: ('Use weighted WER, will weight the errors by word frequency.', 'flag', None) = False,
     ):
 
     logger.remove()
@@ -820,7 +857,8 @@ def main(
         process_output(ref_utts, hyp_utts, fh, cer=cer, debug=debug, oov_set=oov_set,
                      ref_file=ref_file, hyp_file=hyp_file, use_chardiff=use_chardiff, skip_detailed=skip_detailed,
                      keywords=keywords, utt_group_map=utt_group_map, freq_sort=freq_sort,
-                     isctm=isctm, oracle_wer=oracle_wer, nocolor=not usecolor, num_top_errors=num_top_errors)
+                     isctm=isctm, oracle_wer=oracle_wer, nocolor=not usecolor, num_top_errors=num_top_errors,
+                     weighted_wer=weighted_wer)
     else:
         ref_utts = read_ref_file(ref_file, isark)
         hyp_uttsa = read_hyp_file(hyp_file, isark, False)

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`include LICENSE README.md requirements.txt`
`2`	`2`	`recursive-include libs *.*`
	`3`	`+recursive-include texterrors/data *`