 import sys
 
 import torch
+from dragonmapper.transcriptions import pinyin_to_ipa
 from phonemizer.backend import EspeakBackend
 from pypinyin import pinyin
 
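The new import routes Mandarin through dragonmapper's pinyin-to-IPA conversion instead of espeak. A minimal standalone sketch of that path, assuming dragonmapper accepts the accented pinyin that pypinyin produces (which the call `pinyin_to_ipa(utt)` in the hunk below suggests); variable names are illustrative and the exact IPA output depends on the installed dragonmapper version:

    from dragonmapper.transcriptions import pinyin_to_ipa
    from pypinyin import pinyin

    hanzi = "你好"
    # pypinyin returns tone-marked syllables, e.g. [['nǐ'], ['hǎo']]
    pinyin_text = " ".join(syllable[0] for syllable in pinyin(hanzi))
    # convert the pinyin string to IPA, replacing the former espeak path for Mandarin
    print(pinyin_to_ipa(pinyin_text))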
@@ -245,21 +246,57 @@ def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction
         # expand abbreviations
         utt = self.expand_abbreviations(text)
         # phonemize
-        phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
+        if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
+            phones = pinyin_to_ipa(utt)
+        else:
+            phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
 
         # Unfortunately tonal languages don't agree on the tone, most tonal
         # languages use different tones denoted by different numbering
         # systems. At this point in the script, it is attempted to unify
         # them all to the tones in the IPA standard.
         if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
+            """
+            handling for the espeak use-case
+
             phones = phones.replace(".", "")  # no idea why espeak puts dots everywhere for Chinese
-            phones = phones.replace('1', "˥")
-            phones = phones.replace('2', "˧˥")
-            phones = phones.replace('ɜ', "˨˩")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
-            phones = phones.replace('3', "˨˩")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
-            phones = phones.replace('4', "˦˩")
-            phones = phones.replace('5', "˧")
-            phones = phones.replace('0', "˧")
+
+            # fix for a bug in espeak that ignores the second target in multi-target tones in mandarin, as proposed by GitHub user @GodEase
+            phones = ' '.join([re.sub(r'[1-5ɜ]', u[-1], p) for p, u in zip(phones.split(), utt.split())])
+
+            # unfortunately the pypinyin package gives us only the unique characters instead of the base characters with modifiers,
+            # so we have to do every vowel separately ̌ ́ ̄ ̀
+
+            # handle flat tone
+            phones = phones.replace("ā", "˥")
+            phones = phones.replace("ē", "˥")
+            phones = phones.replace("ī", "˥")
+            phones = phones.replace("ō", "˥")
+            phones = phones.replace("ū", "˥")
+            phones = phones.replace("ǖ", "˥")
+            # handle rising tone
+            phones = phones.replace("á", "˧˥")
+            phones = phones.replace("é", "˧˥")
+            phones = phones.replace("í", "˧˥")
+            phones = phones.replace("ó", "˧˥")
+            phones = phones.replace("ú", "˧˥")
+            phones = phones.replace("ǘ", "˧˥")
+            # handle dip tone
+            phones = phones.replace("ǎ", "˨˩˦")
+            phones = phones.replace("ě", "˨˩˦")
+            phones = phones.replace("ǐ", "˨˩˦")
+            phones = phones.replace("ǒ", "˨˩˦")
+            phones = phones.replace("ǔ", "˨˩˦")
+            phones = phones.replace("ǚ", "˨˩˦")
+            # handle falling tone
+            phones = phones.replace("à", "˥˩")
+            phones = phones.replace("è", "˥˩")
+            phones = phones.replace("ì", "˥˩")
+            phones = phones.replace("ò", "˥˩")
+            phones = phones.replace("ù", "˥˩")
+            phones = phones.replace("ǜ", "˥˩")
+            """
+
         if self.g2p_lang == "vi":
             phones = phones.replace('1', "˧")
             phones = phones.replace('2', "˨˩")
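The replace() chain parked in the docstring above maps each tone-marked pinyin vowel onto an IPA tone-letter sequence (tone 1 ˥, tone 2 ˧˥, tone 3 ˨˩˦, tone 4 ˥˩). A standalone sketch of the same mapping expressed as a translation table, purely illustrative and not part of the PR (names are made up):

    # Illustrative only: collapse the per-vowel replace() calls into one table.
    # Like the docstring code, this swaps the accented vowel itself for the tone
    # letters, since pypinyin emits precomposed characters rather than
    # base vowel + combining diacritic.
    PINYIN_TONE_TO_IPA = {
        "āēīōūǖ": "˥",    # tone 1, flat
        "áéíóúǘ": "˧˥",   # tone 2, rising
        "ǎěǐǒǔǚ": "˨˩˦",  # tone 3, dipping
        "àèìòùǜ": "˥˩",   # tone 4, falling
    }

    def mark_tones(phones: str) -> str:
        table = {ord(ch): ipa for chars, ipa in PINYIN_TONE_TO_IPA.items() for ch in chars}
        return phones.translate(table)

    print(mark_tones("mā má mǎ mà"))  # -> m˥ m˧˥ m˨˩˦ m˥˩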
@@ -279,6 +316,7 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
         replacements = [
             # punctuation in languages with non-latin script
             ("。", "."),
+            ("，", ","),
             ("【", '"'),
             ("】", '"'),
             ("、", ","),
@@ -288,6 +326,8 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
             ("“", '"'),
             ("”", '"'),
             ("؛", ","),
+            ("《", '"'),
+            ("》", '"'),
             # latin script punctuation
             ("/", " "),
             ("—", ""),
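The new pairs extend the table that rewrites CJK punctuation into the Latin equivalents the rest of the pipeline expects: the full-width comma becomes ",", and the 《》 title brackets become plain double quotes. A small standalone sketch of how such an (old, new) table is presumably applied with str.replace, using only pairs visible in this diff:

    # Excerpt of the replacement table from the diff; not the full list.
    replacements = [("。", "."), ("，", ","), ("、", ","), ("《", '"'), ("》", '"')]

    def normalize_punctuation(text: str) -> str:
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    print(normalize_punctuation("《悯农》 锄禾日当午，汗滴禾下土。"))
    # -> "悯农" 锄禾日当午,汗滴禾下土.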
@@ -456,8 +496,8 @@ def get_language_id(language):
 
     tf = ArticulatoryCombinedTextFrontend(language="cmn")
     tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
-    tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
-    tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
+    tf.string_to_tensor("李绅 《悯农》 锄禾日当午， 汗滴禾下土。 谁知盘中餐， 粒粒皆辛苦。", view=True)
+    tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
 
     tf = ArticulatoryCombinedTextFrontend(language="vi")
     tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)