-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocessing.py
More file actions
113 lines (91 loc) · 2.81 KB
/
preprocessing.py
File metadata and controls
113 lines (91 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
import string
import nltk
from nltk.corpus import stopwords
# Shared Porter stemmer instance; ewer_normalization_func stems every word with it.
ps = nltk.stem.PorterStemmer()
# English stop words from the NLTK corpus, stored as a set so the membership
# test in ewer_normalization_func is cheap.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed
# locally at import time - confirm against the deployment setup.
stopwords_set = set(stopwords.words('english'))
def replace_umlaute(str):
    """
    Transliterates German umlauts and the sharp s into their ASCII digraphs
    (e.g. "ü" -> "ue", "Ä" -> "AE", "ß" -> "ss").

    :param str: The text to process
    :return: The input text with every umlaut / sharp s replaced
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept because
    # it is part of the signature callers may use by keyword.
    ascii_digraphs = {
        '\u00dc': 'UE',
        '\u00c4': 'AE',
        '\u00d6': 'OE',
        '\u00fc': 'ue',
        '\u00e4': 'ae',
        '\u00f6': 'oe',
        '\u00df': 'ss',
    }
    # One pass over the text: translate() expects code points as keys.
    code_point_table = {ord(char): digraph for char, digraph in ascii_digraphs.items()}
    return str.translate(code_point_table)
# Sentence punctuation that is removed only from the punctuation-free output.
_PUNCTUATION = ('.', ',', '!', '?')

# Filler words that are dropped when they stand alone between two spaces.
_FILLERS = ('uh', 'oh', 'em', 'um', 'ah', 'uhum', 'mmhmm', 'uhm', 'ahm', 'ähm', 'äh', 'ähmm', 'ähh')

# Ordered (search, replacement) pairs. They are applied one after another, so
# the order matters (e.g. "'em " is expanded before the filler " em " is dropped).
_REPLACEMENTS = (
    ('…', ''),
    (':', ''),
    (';', ''),
    ('-', ''),
    ('_', ''),
    ("'ve ", ' have '),
    ("'re ", ' are '),
    ("'m ", ' am '),
    ("'em ", ' them '),
    (" he's ", ' he is '),
    # Fixed: the replacement used to be ' would' (no trailing space), which
    # glued the following word onto it ("I'd like" -> "I wouldlike").
    ("'d ", ' would '),
    ("n't ", ' not '),
    (" y'", ' you '),
    ("'ll ", ' will '),
    (" kinda ", ' kind of '),
    (" gonna ", ' going to '),
    (" wanna ", ' want to '),
    (" dunno ", ' do not know '),
    (" because ", ' cause '),
    (" ma'am ", ' madam '),
    (' & ', ' and '),
    (" <unk> ", ' '),
) + tuple((' ' + filler + ' ', ' ') for filler in _FILLERS)

# Bracketed alphanumeric annotations such as "[laughter]". Raw string so the
# backslashes reach the regex engine without invalid-escape warnings.
_ANNOTATION_RE = re.compile(r"\[[a-zA-Z0-9]*\]")


def preprocessing_normalization_func(text_in, ignore_caps=True, hide_punc=True):
    """
    Normalizes a transcript string for comparison.

    The steps are applied in this order:
      1. remove bracketed annotations (e.g. "[laughter]"),
      2. replace carriage returns / newlines with spaces and prepend one space
         so that space-delimited patterns can match the first word,
      3. apply the ordered replacements (symbols, contractions, colloquialisms,
         "<unk>" and standalone filler words),
      4. remove zero-width spaces (U+200B),
      5. remove the punctuation marks ". , ! ?",
      6. optionally lowercase.

    Most patterns require a trailing space, and only a leading space is added,
    so a pattern at the very end of the text is left unchanged. The prepended
    leading space is kept in the returned text.

    :param text_in: The text to normalize
    :param ignore_caps: If True, the punctuation-free text is lowercased
    :param hide_punc: If True, only the punctuation-free text is returned;
        otherwise a tuple is returned that also holds the punctuated variant
    :return: The normalized text, or ``(normalized_text, punctuated_text)`` when
        ``hide_punc`` is False. The punctuated variant is taken after step 4
        and is never lowercased.
    """
    text_in = _ANNOTATION_RE.sub('', text_in)
    text_in = ' ' + text_in.replace('\r', ' ').replace('\n', ' ')

    for search, replacement in _REPLACEMENTS:
        text_in = text_in.replace(search, replacement)

    text_in = text_in.replace('\u200b', '')

    # Snapshot before punctuation removal and lowercasing.
    text_in_punc = text_in

    for mark in _PUNCTUATION:
        text_in = text_in.replace(mark, '')

    if ignore_caps:
        text_in = text_in.lower()

    if hide_punc:
        return text_in
    return text_in, text_in_punc
def ewer_normalization_func(txt):
    """
    Normalizes a text by removing punctuation and stop words, transliterating
    umlauts, stemming every remaining word and finally removing all spaces.

    :param txt: The text to process
    :return: The normalized text as a single string without spaces
    """
    txt = txt.replace('.', '').replace('?', '').replace(',', '')  # remove punctuation
    # Remove stopwords. Fixed: this step used to build a *list* of
    # character-spaced words (' '.join(w) per word), which made the following
    # replace_umlaute() call fail with AttributeError. The surviving words are
    # now joined back into one string.
    txt = ' '.join(w for w in txt.split() if w not in stopwords_set)
    txt = replace_umlaute(txt)
    txt = ' '.join(ps.stem(w) for w in txt.split())  # apply stemming
    # NOTE(review): this strips every space, concatenating all stems. It may
    # originally have been meant to collapse double spaces - confirm with the
    # consumers of this function before changing it.
    txt = txt.replace(' ', '')
    return txt
def remove_punctuation(text):
    """
    Deletes every ASCII punctuation character (the characters listed in
    ``string.punctuation``) from the given text.

    :param text: The text to process
    :return: The input text without the punctuation symbols
    """
    # Mapping a code point to None tells translate() to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    stripped = text.translate(deletion_table)
    return stripped