Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 102 additions & 54 deletions malaya/normalizer/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def normalize(
normalize_telephone: bool = True,
normalize_date: bool = True,
normalize_time: bool = True,
time_to_words: bool = False,
normalize_emoji: bool = True,
normalize_elongated: bool = True,
normalize_hingga: bool = True,
Expand Down Expand Up @@ -303,6 +304,10 @@ def normalize(
if True, `pukul 22.30` -> `pukul sepuluh tiga puluh minit malam`.
if True, `12:10 AM` -> `pukul dua belas sepuluh minit pagi`.
if False, `pukul 2.30` -> `pukul 02.30` (prefix kept, digits left as zero-padded numbers).
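time_to_words: bool, optional (default=False)
if True, converts a detected time to words without adding prefixes or suffixes
(no `pukul` / `at`, no `minit` / `minute`, no period of day such as `pagi` / `morning`).
Only the hour and minute are spoken; zero minutes and any seconds are omitted.
An `a m` / `p m` marker is appended only when the input itself contains one.
When True, this formatting is used instead of the `normalize_time` formatting.
Number words follow `normalize_in_english`; the examples below assume `normalize_in_english=True`.
`2:15 p.m.` -> `two fifteen p m`
`12:30 AM` -> `twelve thirty a m`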
normalize_emoji: bool, (default=True)
if True, `🔥` -> `emoji api`
Load from `malaya.preprocessing.demoji`.
Expand Down Expand Up @@ -1038,64 +1043,107 @@ def normalize(
):
s = f'index: {index}, word: {word}, condition time'
logger.debug(s)
word = word_lower
word = re.sub(r'[ ]+', ' ', word).strip()
if normalize_in_english:
prefix = 'at '
else:
prefix = 'pukul '
try:
s = f'index: {index}, word: {word}, parsing time'

if time_to_words:
s = f'index: {index}, word: {word}, condition time_to_words'
logger.debug(s)
parsed = parse_time_string(word)
if len(parsed):
parsed = parsed[0]
word = parsed.strftime('%H:%M:%S')
hour, minute, second = word.split(':')
if normalize_time:
hour = parsed.strftime('%I')
hour = hour.lstrip('0')
if parsed.hour < 12:
if normalize_in_english:
period = 'morning'
else:
period = 'pagi'
elif parsed.hour < 19:
if normalize_in_english:
period = 'evening'
else:
period = 'petang'

try:
parsed = parse_time_string(word_lower)
if len(parsed):
parsed = parsed[0]
hour = parsed.strftime('%I').lstrip('0') # 12-hour format without leading zero
minute = parsed.strftime('%M')

hour_words = cardinal(hour, english=normalize_in_english)
if int(minute) > 0:
minute_words = cardinal(minute, english=normalize_in_english)
else:
if normalize_in_english:
period = 'night'
minute_words = ''

am_pm = ''
original_lower = word.lower()
if 'a.m.' in original_lower:
am_pm = 'a m'
elif 'p.m.' in original_lower:
am_pm = 'p m'
elif 'am' in original_lower:
am_pm = 'a m'
elif 'pm' in original_lower:
am_pm = 'p m'

parts = [hour_words]
if minute_words:
parts.append(minute_words)
if am_pm:
parts.append(am_pm)

word = ' '.join(parts)
else:
pass
except Exception as e:
logger.warning(f'time_to_words parsing failed: {e}')
pass
else:
word = word_lower
word = re.sub(r'[ ]+', ' ', word).strip()
if normalize_in_english:
prefix = 'at '
else:
prefix = 'pukul '
try:
s = f'index: {index}, word: {word}, parsing time'
logger.debug(s)
parsed = parse_time_string(word)
if len(parsed):
parsed = parsed[0]
word = parsed.strftime('%H:%M:%S')
hour, minute, second = word.split(':')
if normalize_time:
hour = parsed.strftime('%I')
hour = hour.lstrip('0')
if parsed.hour < 12:
if normalize_in_english:
period = 'morning'
else:
period = 'pagi'
elif parsed.hour < 19:
if normalize_in_english:
period = 'evening'
else:
period = 'petang'
else:
period = 'malam'
hour = cardinal(hour, english=normalize_in_english)
if int(minute) > 0:
minute = cardinal(minute, english=normalize_in_english)
if normalize_in_english:
end = 'minute'
if normalize_in_english:
period = 'night'
else:
period = 'malam'
hour = cardinal(hour, english=normalize_in_english)
if int(minute) > 0:
minute = cardinal(minute, english=normalize_in_english)
if normalize_in_english:
end = 'minute'
else:
end = 'minit'
minute = f'{minute} {end}'
else:
end = 'minit'
minute = f'{minute} {end}'
else:
minute = ''
if int(second) > 0:
second = cardinal(second, english=normalize_in_english)
second = f'{second} saat'
minute = ''
if int(second) > 0:
second = cardinal(second, english=normalize_in_english)
second = f'{second} saat'
else:
second = ''
word = f'{prefix}{hour} {minute} {second} {period}'
else:
second = ''
word = f'{prefix}{hour} {minute} {second} {period}'
else:
pukul = f'{prefix}{hour}'
if int(minute) > 0:
pukul = f'{pukul}.{minute}'
if int(second) > 0:
pukul = f'{pukul}:{second}'
word = pukul
word = re.sub(r'[ ]+', ' ', word).strip()
except Exception as e:
logger.warning(str(e))
pukul = f'{prefix}{hour}'
if int(minute) > 0:
pukul = f'{pukul}.{minute}'
if int(second) > 0:
pukul = f'{pukul}:{second}'
word = pukul
word = re.sub(r'[ ]+', ' ', word).strip()
except Exception as e:
logger.warning(str(e))

result.append(word)
index += 1
continue
Expand Down Expand Up @@ -1354,4 +1402,4 @@ def load(
raise ValueError('stemmer must have `stem_word` method')

tokenizer = Tokenizer(**kwargs).tokenize
return Normalizer(tokenizer=tokenizer, speller=speller, stemmer=stemmer)
return Normalizer(tokenizer=tokenizer, speller=speller, stemmer=stemmer)