Skip to content

Commit 0c51fc9

Browse files
author
Varun Gumma
committed
optimizations
1 parent 342a7e9 commit 0c51fc9

File tree

16 files changed

+2691
-2160
lines changed

16 files changed

+2691
-2160
lines changed

indicnlp/normalize/indic_normalize.py

Lines changed: 374 additions & 523 deletions
Large diffs are not rendered by default.

indicnlp/script/english_script.py

Lines changed: 87 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,14 @@
99
import os
1010
import pandas as pd
1111
import numpy as np
12+
from typing import Tuple, Optional
1213

1314
from indicnlp import common
1415

15-
1616
#### Maps from ARPABET to Internal Id
1717
ARPABET_ID_MAP = {}
1818
ID_ARPABET_MAP = {}
1919

20-
2120
###
2221
# Phonetic Information about script characters
2322
###
@@ -70,7 +69,6 @@
7069
"vowel_roundness": [36, 38],
7170
}
7271

73-
7472
####
7573
# Indexes into the Phonetic Vector
7674
####
@@ -90,83 +88,116 @@
9088
## TBD
9189
SCRIPT_RANGE_END = 0x0D2E
9290

# Shared all-zeros "invalid" phonetic vector, precomputed once at import time.
# NOTE(review): relies on PHONETIC_VECTOR_LENGTH being defined earlier in this
# module — confirm; init() rebuilds this array if the CSV yields another length.
INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
INVALID_VECTOR.flags.writeable = False  # guard the shared array against accidental mutation
9394

9495
def init():
    """
    Load English phonetic data and the ARPABET phoneme <-> id mappings.

    To be called by the library loader; do not call it in your program.
    Populates ENGLISH_PHONETIC_DATA / ENGLISH_PHONETIC_VECTORS, sets
    PHONETIC_VECTOR_LENGTH, rebuilds INVALID_VECTOR to match, and fills
    ARPABET_ID_MAP / ID_ARPABET_MAP.
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET, INVALID_VECTOR

    # Resolve the resources directory once instead of calling
    # common.get_resources_path() per file.
    script_dir = os.path.join(common.get_resources_path(), "script")
    phonetic_data_path = os.path.join(script_dir, "english_script_phonetic_data.csv")
    arpabet_list_path = os.path.join(script_dir, "english_arpabet_list.csv")

    # Load phonetic data
    ENGLISH_PHONETIC_DATA = pd.read_csv(phonetic_data_path, encoding="utf-8")
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    # Rebuild INVALID_VECTOR in case PHONETIC_VECTOR_LENGTH changed.
    # (INVALID_VECTOR is already listed in the global statement above; a
    # second `global INVALID_VECTOR` declaration here would be redundant.)
    INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
    INVALID_VECTOR.flags.writeable = False  # shared constant; keep read-only

    ### Load mapping from ARPABET representation of phoneme to internal ID.
    # dtype=str + keep_default_na=False: phoneme symbols must stay literal
    # strings; pandas would otherwise coerce NA-like tokens to float NaN,
    # unlike the plain line-by-line reader this replaced.
    arpabet_df = pd.read_csv(
        arpabet_list_path,
        header=None,
        names=["phoneme"],
        encoding="utf-8",
        dtype=str,
        keep_default_na=False,
    )
    phonemes = arpabet_df["phoneme"].str.strip().tolist()
    ARPABET_ID_MAP.update({phoneme: idx for idx, phoneme in enumerate(phonemes)})
    ID_ARPABET_MAP.update({idx: phoneme for idx, phoneme in enumerate(phonemes)})
def phoneme_to_offset(ph: str) -> int:
    """
    Map an ARPABET phoneme to its internal id.

    Returns -1 when the phoneme is not in the mapping.
    """
    try:
        return ARPABET_ID_MAP[ph]
    except KeyError:
        return -1
139127

def offset_to_phoneme(ph_id: int) -> str:
    """
    Map an internal id back to its ARPABET phoneme.

    Returns an empty string when the id is not in the mapping.
    """
    try:
        return ID_ARPABET_MAP[ph_id]
    except KeyError:
        return ""
142134

135+
def phoneme_to_enc(ph: str) -> Optional[str]:
136+
"""
137+
Convert a phoneme to its encoded character.
138+
Returns None if phoneme is invalid or out of range.
139+
"""
140+
offset = phoneme_to_offset(ph)
141+
if offset == -1:
142+
return None
143+
try:
144+
return chr(SCRIPT_RANGE_START + offset)
145+
except ValueError:
146+
return None
147+
148+
def enc_to_phoneme(ph: str) -> str:
    """
    Decode an internal-script character back to its ARPABET phoneme.

    Returns an empty string when the character's offset falls outside the
    script range.
    """
    offset = enc_to_offset(ph)
    if not in_range(offset):
        return ""
    return offset_to_phoneme(offset)
143157

def enc_to_offset(c: str) -> int:
    """Offset of character *c* relative to the start of the internal script range."""
    code_point = ord(c)
    return code_point - SCRIPT_RANGE_START
146163

def in_range(offset: int) -> bool:
    """
    Return True iff *offset* indexes a character inside the script range.

    Offsets are zero-based relative to SCRIPT_RANGE_START (see enc_to_offset),
    so a valid offset satisfies 0 <= offset < SCRIPT_RANGE_END - SCRIPT_RANGE_START.
    The previous check compared SCRIPT_RANGE_START + offset against 0, which
    wrongly accepted negative offsets (characters below the script range) all
    the way down to -SCRIPT_RANGE_START.
    """
    return 0 <= offset < (SCRIPT_RANGE_END - SCRIPT_RANGE_START)
def get_phonetic_info(lang: str) -> Tuple[Optional[pd.DataFrame], Optional[np.ndarray]]:
    """
    Return the phonetic data table and feature vectors for *lang*.

    Only English ("en", case-insensitive) is supported; any other language
    yields (None, None).
    """
    if lang.lower() != "en":
        return (None, None)
    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
159178

def invalid_vector() -> np.ndarray:
    """Return the shared, read-only all-zeros vector used for invalid phonemes."""
    return INVALID_VECTOR
160184

def get_phonetic_feature_vector(p: str, lang: str) -> np.ndarray:
    """
    Return the phonetic feature vector for encoded character *p* in *lang*.

    Returns the shared INVALID_VECTOR when the character is outside the
    script range, the language is unsupported, the offset has no row in the
    phonetic table, or the row is flagged as having no valid representation.
    """
    offset = enc_to_offset(p)

    if not in_range(offset):
        return INVALID_VECTOR

    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    # Bound-check both ends explicitly: the old `offset >= len(...)` test let
    # a negative offset through, and Python's negative indexing would then
    # silently return a row from the END of the table instead of failing.
    if phonetic_data is None or not (0 <= offset < len(phonetic_data)):
        return INVALID_VECTOR

    if phonetic_data.at[offset, "Valid Vector Representation"] == 0:
        return INVALID_VECTOR

    return phonetic_vectors[offset]

0 commit comments

Comments (0)