From 996781b2d1328fc103df7b49b94a145d92c99007 Mon Sep 17 00:00:00 2001 From: Sharuk Abdul Kareem Shahul Hameed Date: Wed, 3 Sep 2025 14:23:38 +0800 Subject: [PATCH] Fix: added custom time normalizer rules --- malaya/normalizer/rules.py | 156 ++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/malaya/normalizer/rules.py b/malaya/normalizer/rules.py index cd73614f..dca40173 100644 --- a/malaya/normalizer/rules.py +++ b/malaya/normalizer/rules.py @@ -239,6 +239,7 @@ def normalize( normalize_telephone: bool = True, normalize_date: bool = True, normalize_time: bool = True, + time_to_words: bool = False, normalize_emoji: bool = True, normalize_elongated: bool = True, normalize_hingga: bool = True, @@ -303,6 +304,10 @@ def normalize( if True, `pukul 22.30` -> `pukul sepuluh tiga puluh minit malam`. if True, `12:10 AM` -> `pukul dua belas sepuluh minit pagi`. if False, `pukul 2.30` -> `'02:00:00'` + time_to_words: bool, optional (default=False) + if True, converts time to words without adding prefixes or suffixes. + `2:15 p.m.` -> `two fifteen p m` + `12:30 AM` -> `twelve thirty a m` normalize_emoji: bool, (default=True) if True, `🔥` -> `emoji api` Load from `malaya.preprocessing.demoji`. @@ -1038,64 +1043,107 @@ def normalize( ): s = f'index: {index}, word: {word}, condition time' logger.debug(s) - word = word_lower - word = re.sub(r'[ ]+', ' ', word).strip() - if normalize_in_english: - prefix = 'at ' - else: - prefix = 'pukul ' - try: - s = f'index: {index}, word: {word}, parsing time' + + if time_to_words: + s = f'index: {index}, word: {word}, condition time_to_words' logger.debug(s) - parsed = parse_time_string(word) - if len(parsed): - parsed = parsed[0] - word = parsed.strftime('%H:%M:%S') - hour, minute, second = word.split(':') - if normalize_time: - hour = parsed.strftime('%I') - hour = hour.lstrip('0') - if parsed.hour < 12: - if normalize_in_english: - period = 'morning' - else: - period = 'pagi' - elif parsed.hour < 19: - if normalize_in_english: - period = 'evening' - else: - period = 'petang' + + try: + parsed = parse_time_string(word_lower) + if len(parsed): + parsed = parsed[0] + hour = parsed.strftime('%I').lstrip('0') # 12-hour format without leading zero + minute = parsed.strftime('%M') + + hour_words = cardinal(hour, english=normalize_in_english) + if int(minute) > 0: + minute_words = cardinal(minute, english=normalize_in_english) else: - if normalize_in_english: - period = 'night' + minute_words = '' + + am_pm = '' + original_lower = word.lower() + if 'a.m.' in original_lower: + am_pm = 'a m' + elif 'p.m.' in original_lower: + am_pm = 'p m' + elif 'am' in original_lower: + am_pm = 'a m' + elif 'pm' in original_lower: + am_pm = 'p m' + + parts = [hour_words] + if minute_words: + parts.append(minute_words) + if am_pm: + parts.append(am_pm) + + word = ' '.join(parts) + else: + pass + except Exception as e: + logger.warning(f'time_to_words parsing failed: {e}') + pass + else: + word = word_lower + word = re.sub(r'[ ]+', ' ', word).strip() + if normalize_in_english: + prefix = 'at ' + else: + prefix = 'pukul ' + try: + s = f'index: {index}, word: {word}, parsing time' + logger.debug(s) + parsed = parse_time_string(word) + if len(parsed): + parsed = parsed[0] + word = parsed.strftime('%H:%M:%S') + hour, minute, second = word.split(':') + if normalize_time: + hour = parsed.strftime('%I') + hour = hour.lstrip('0') + if parsed.hour < 12: + if normalize_in_english: + period = 'morning' + else: + period = 'pagi' + elif parsed.hour < 19: + if normalize_in_english: + period = 'evening' + else: + period = 'petang' else: - period = 'malam' - hour = cardinal(hour, english=normalize_in_english) - if int(minute) > 0: - minute = cardinal(minute, english=normalize_in_english) - if normalize_in_english: - end = 'minute' + if normalize_in_english: + period = 'night' + else: + period = 'malam' + hour = cardinal(hour, english=normalize_in_english) + if int(minute) > 0: + minute = cardinal(minute, english=normalize_in_english) + if normalize_in_english: + end = 'minute' + else: + end = 'minit' + minute = f'{minute} {end}' else: - end = 'minit' - minute = f'{minute} {end}' - else: - minute = '' - if int(second) > 0: - second = cardinal(second, english=normalize_in_english) - second = f'{second} saat' + minute = '' + if int(second) > 0: + second = cardinal(second, english=normalize_in_english) + second = f'{second} saat' + else: + second = '' + word = f'{prefix}{hour} {minute} {second} {period}' else: - second = '' - word = f'{prefix}{hour} {minute} {second} {period}' - else: - pukul = f'{prefix}{hour}' - if int(minute) > 0: - pukul = f'{pukul}.{minute}' - if int(second) > 0: - pukul = f'{pukul}:{second}' - word = pukul - word = re.sub(r'[ ]+', ' ', word).strip() - except Exception as e: - logger.warning(str(e)) + pukul = f'{prefix}{hour}' + if int(minute) > 0: + pukul = f'{pukul}.{minute}' + if int(second) > 0: + pukul = f'{pukul}:{second}' + word = pukul + word = re.sub(r'[ ]+', ' ', word).strip() + except Exception as e: + logger.warning(str(e)) + result.append(word) index += 1 continue @@ -1354,4 +1402,4 @@ def load( raise ValueError('stemmer must have `stem_word` method') tokenizer = Tokenizer(**kwargs).tokenize - return Normalizer(tokenizer=tokenizer, speller=speller, stemmer=stemmer) + return Normalizer(tokenizer=tokenizer, speller=speller, stemmer=stemmer) \ No newline at end of file