# wer_analysis/preprocessing.py (gong-io/wer_analysis, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
import string
import nltk
from nltk.corpus import stopwords

# Module-level Porter stemmer instance; ewer_normalization_func calls ps.stem() on each word.
ps = nltk.stem.PorterStemmer()
# NLTK's English stop-word list held as a set, used for membership tests in
# ewer_normalization_func.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# when this module is imported - confirm for deployment environments.
stopwords_set = set(stopwords.words('english'))


def replace_umlaute(str):
    """
    Transliterates German umlauts and the sharp s into ASCII digraphs.
    :param str: The text to process (the parameter name shadows the builtin ``str`` inside this function)
    :return: The input text with each umlaut / sharp s replaced by its two-letter ASCII form
    """
    # Keyed by code point so the whole mapping is applied in one translate() pass.
    # None of the replacement strings contains a mapped character, so a single
    # pass gives the same result as replacing the characters one after another.
    transliteration = {
        0x00dc: 'UE',  # capital U with diaeresis
        0x00c4: 'AE',  # capital A with diaeresis
        0x00d6: 'OE',  # capital O with diaeresis
        0x00fc: 'ue',  # small u with diaeresis
        0x00e4: 'ae',  # small a with diaeresis
        0x00f6: 'oe',  # small o with diaeresis
        0x00df: 'ss',  # sharp s
    }
    return str.translate(transliteration)


# Matches bracketed alphanumeric annotations such as "[laughter]".
_ANNOTATION_RE = re.compile(r"\[[a-zA-Z0-9]*\]")

# Deletes sentence punctuation (. , ! ?) from the normalized output only.
_SENTENCE_PUNCTUATION = str.maketrans('', '', '.,!?')

# Hesitation sounds that are dropped when they occur as space-delimited tokens.
_FILLER_WORDS = ('uh', 'oh', 'em', 'um', 'ah', 'uhum', 'mmhmm', 'uhm', 'ahm', 'ähm', 'äh', 'ähmm', 'ähh')

# Ordered (pattern, replacement) pairs. They are applied one after another, so
# the order is significant (e.g. "'em " is expanded before the filler " em "
# is removed).
_REPLACEMENTS = (
    # Symbols that are always removed
    ('…', ''),
    (':', ''),
    (';', ''),
    ('-', ''),
    ('_', ''),

    # Contractions and colloquial forms
    ("'ve ", ' have '),
    ("'re ", ' are '),
    ("'m ", ' am '),
    ("'em ", ' them '),
    (" he's ", ' he is '),
    ("'d ", ' would '),  # trailing space keeps the following word separate
    ("n't ", ' not '),
    (" y'", ' you '),
    ("'ll ", ' will '),
    (" kinda ", ' kind of '),
    (" gonna ", ' going to '),
    (" wanna ", ' want to '),
    (" dunno ", ' do not know '),
    (" because ", ' cause '),
    (" ma'am ", ' madam '),

    (' & ', ' and '),
    (" <unk> ", ' '),
) + tuple((' ' + filler + ' ', ' ') for filler in _FILLER_WORDS)


def preprocessing_normalization_func(text_in, ignore_caps=True, hide_punc=True):
    """
    Normalizes a transcript: strips bracketed annotations, replaces newlines with spaces,
    expands contractions / colloquialisms, removes filler words and punctuation.
    The result always starts with an extra leading space.
    Patterns ending in a space do not match a word at the very end of the text, because
    only a leading space is added.
    :param text_in: The text to normalize
    :param ignore_caps: If True, the normalized text is lowercased
    :param hide_punc: If True, only the normalized text is returned; if False, a tuple is returned
    :return: The normalized text, or ``(normalized_text, text_with_punctuation)`` when ``hide_punc``
        is False. The second element still contains ``. , ! ?`` and is never lowercased.
    """
    # Remove annotations (e.g. "[laughter]")
    text_in = _ANNOTATION_RE.sub('', text_in)

    # Newlines become spaces; the leading space lets the space-delimited
    # patterns match the first word of the text as well.
    text_in = ' ' + text_in.replace('\r', ' ').replace('\n', ' ')

    for pattern, substitute in _REPLACEMENTS:
        text_in = text_in.replace(pattern, substitute)

    # Remove zero-width spaces
    text_in = text_in.replace('\u200b', '')

    # Snapshot taken before sentence punctuation is stripped and before lowercasing
    text_in_punc = text_in
    text_in = text_in.translate(_SENTENCE_PUNCTUATION)

    if ignore_caps:
        text_in = text_in.lower()

    if hide_punc:
        return text_in
    return text_in, text_in_punc


def ewer_normalization_func(txt):
    """
    Reduces a text to a compact normalized form: removes ``. ? ,``, drops English stop words,
    transliterates umlauts, stems every remaining word and finally removes all spaces, so the
    stems are concatenated into one string.
    Stop-word matching is case-sensitive (words are compared as given against ``stopwords_set``).
    :param txt: The text to normalize
    :return: The concatenated stems of the non-stop-words in the text
    """
    txt = txt.replace('.', '').replace('?', '').replace(',', '')  # remove punctuation
    # Join the kept words back into ONE string: replace_umlaute() and the
    # split() below both need a str, not a list of words.
    txt = ' '.join(w for w in txt.split() if w not in stopwords_set)  # remove stopwords
    txt = replace_umlaute(txt)
    txt = ' '.join(ps.stem(w) for w in txt.split())  # apply stemming
    txt = txt.replace(' ', '')  # concatenate the stems
    return txt


def remove_punctuation(text):
    """
    Deletes every character listed in ``string.punctuation`` (the ASCII punctuation symbols)
    from the text.
    :param text: The text to process
    :return: The input text without the punctuation symbols
    """
    # Mapping a code point to None tells translate() to drop that character.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletions)