Skip to content

Commit 0850235

Browse files
committed
Change the text-to-phoneme conversion for Mandarin Chinese
1 parent bf0a59f commit 0850235

File tree

2 files changed

+50
-10
lines changed

2 files changed

+50
-10
lines changed

Preprocessing/TextFrontend.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import sys
66

77
import torch
8+
from dragonmapper.transcriptions import pinyin_to_ipa
89
from phonemizer.backend import EspeakBackend
910
from pypinyin import pinyin
1011

@@ -245,21 +246,57 @@ def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction
245246
# expand abbreviations
246247
utt = self.expand_abbreviations(text)
247248
# phonemize
248-
phones = self.phonemizer_backend.phonemize([utt], strip=True)[0] # To use a different phonemizer, this is the only line that needs to be exchanged
249+
if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
250+
phones = pinyin_to_ipa(utt)
251+
else:
252+
phones = self.phonemizer_backend.phonemize([utt], strip=True)[0] # To use a different phonemizer, this is the only line that needs to be exchanged
249253

250254
# Unfortunately, tonal languages don't agree on how tones are written: most
251255
# of them use different tones, denoted by different numbering
252256
# systems. At this point in the script, we attempt to unify
253257
# them all to the tone notation of the IPA standard.
254258
if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
259+
"""
260+
handling for the espeak use-case
261+
255262
phones = phones.replace(".", "") # no idea why espeak puts dots everywhere for Chinese
256-
phones = phones.replace('1', "˥")
257-
phones = phones.replace('2', "˧˥")
258-
phones = phones.replace('ɜ', "˨˩") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
259-
phones = phones.replace('3', "˨˩") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
260-
phones = phones.replace('4', "˦˩")
261-
phones = phones.replace('5', "˧")
262-
phones = phones.replace('0', "˧")
263+
264+
# fix for a bug in espeak that ignores the second target in multi-target tones in mandarin, as proposed by GitHub user @GodEase
265+
phones = ' '.join([re.sub(r'[1-5ɜ]', u[-1], p) for p, u in zip(phones.split(), utt.split())])
266+
267+
# unfortunately the pypinyin package gives us only the unique characters instead of the base characters with modifiers,
268+
# so we have to do every vowel separately ̌ ́ ̄ ̀
269+
270+
# handle flat tone
271+
phones = phones.replace("ā", "˥")
272+
phones = phones.replace("ē", "˥")
273+
phones = phones.replace("ī", "˥")
274+
phones = phones.replace("ō", "˥")
275+
phones = phones.replace("ū", "˥")
276+
phones = phones.replace("ǖ", "˥")
277+
# handle rising tone
278+
phones = phones.replace("á", "˧˥")
279+
phones = phones.replace("é", "˧˥")
280+
phones = phones.replace("í", "˧˥")
281+
phones = phones.replace("ó", "˧˥")
282+
phones = phones.replace("ú", "˧˥")
283+
phones = phones.replace("ǘ", "˧˥")
284+
# handle dip tone
285+
phones = phones.replace("ǎ", "˨˩˦")
286+
phones = phones.replace("ĕ", "˨˩˦")
287+
phones = phones.replace("ǐ", "˨˩˦")
288+
phones = phones.replace("ǒ", "˨˩˦")
289+
phones = phones.replace("ǔ", "˨˩˦")
290+
phones = phones.replace("ǚ", "˨˩˦")
291+
# handle falling tone
292+
phones = phones.replace("à", "˥˩")
293+
phones = phones.replace("è", "˥˩")
294+
phones = phones.replace("ì", "˥˩")
295+
phones = phones.replace("ò", "˥˩")
296+
phones = phones.replace("ù", "˥˩")
297+
phones = phones.replace("ǜ", "˥˩")
298+
"""
299+
263300
if self.g2p_lang == "vi":
264301
phones = phones.replace('1', "˧")
265302
phones = phones.replace('2', "˨˩")
@@ -279,6 +316,7 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
279316
replacements = [
280317
# punctuation in languages with non-latin script
281318
("。", "."),
319+
(",", ","),
282320
("【", '"'),
283321
("】", '"'),
284322
("、", ","),
@@ -288,6 +326,8 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
288326
("“", '"'),
289327
("”", '"'),
290328
("؛", ","),
329+
("《", '"'),
330+
("》", '"'),
291331
# latin script punctuation
292332
("/", " "),
293333
("—", ""),
@@ -456,8 +496,8 @@ def get_language_id(language):
456496

457497
tf = ArticulatoryCombinedTextFrontend(language="cmn")
458498
tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
459-
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
460-
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
499+
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
500+
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
461501

462502
tf = ArticulatoryCombinedTextFrontend(language="vi")
463503
tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)

requirements.txt

42 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)