@@ -9,15 +9,14 @@
 import os
 import pandas as pd
 import numpy as np
+from typing import Tuple, Optional

 from indicnlp import common

-
 #### Maps from ARPABET to Internal Id
 ARPABET_ID_MAP = {}
 ID_ARPABET_MAP = {}

-
 ###
 # Phonetic Information about script characters
 ###
@@ -70,7 +69,6 @@
     "vowel_roundness": [36, 38],
 }

-
 ####
 # Indexes into the Phonetic Vector
 ####
@@ -90,83 +88,113 @@
 ## TBD
 SCRIPT_RANGE_END = 0x0D2E

+# Precompute invalid_vector as a global constant
+INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
+INVALID_VECTOR.flags.writeable = False

 def init():
     """
     To be called by library loader, do not call it in your program
     """
+    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET, INVALID_VECTOR

-    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
-
-    ENGLISH_PHONETIC_DATA = pd.read_csv(
-        os.path.join(
-            common.get_resources_path(), "script", "english_script_phonetic_data.csv"
-        ),
-        encoding="utf-8",
-    )
-
-    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
-        :, PHONETIC_VECTOR_START_OFFSET:
-    ].values
+    phonetic_data_path = os.path.join(common.get_resources_path(), "script", "english_script_phonetic_data.csv")
+    arpabet_list_path = os.path.join(common.get_resources_path(), "script", "english_arpabet_list.csv")

+    # Load phonetic data
+    ENGLISH_PHONETIC_DATA = pd.read_csv(phonetic_data_path, encoding="utf-8")
+    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
     PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

-    ### Load mapping from ARPABET representation of phoneme to internal ID
-    global ARPABET_ID_MAP, ID_ARPABET_MAP
-
-    with open(
-        os.path.join(common.get_resources_path(), "script", "english_arpabet_list.csv"),
-        "r",
-        encoding="utf-8",
-    ) as infile:
-        for ph_id, name in enumerate(iter(infile)):
-            name = name.strip()
-            ARPABET_ID_MAP[name] = ph_id
-            ID_ARPABET_MAP[ph_id] = name
-
-
-def phoneme_to_offset(ph):
-    return ARPABET_ID_MAP[ph]
+    # Recompute INVALID_VECTOR in case PHONETIC_VECTOR_LENGTH has changed (declared global above)
+    INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
+    INVALID_VECTOR.flags.writeable = False

+    ### Load mapping from ARPABET representation of phoneme to internal ID
+    # keep_default_na=False stops pandas from reading entries such as "NA" as NaN
+    arpabet_df = pd.read_csv(arpabet_list_path, header=None, names=["phoneme"], encoding="utf-8", dtype=str, keep_default_na=False)
+    phonemes = arpabet_df["phoneme"].str.strip().tolist()
+    ARPABET_ID_MAP.update({phoneme: idx for idx, phoneme in enumerate(phonemes)})
+    ID_ARPABET_MAP.update({idx: phoneme for idx, phoneme in enumerate(phonemes)})

-def offset_to_phoneme(ph_id):
-    return ID_ARPABET_MAP[ph_id]
-
-
-def phoneme_to_enc(ph):
-    return chr(SCRIPT_RANGE_START + phoneme_to_offset(ph))
-
+def phoneme_to_offset(ph: str) -> int:
+    """
+    Convert a phoneme to its internal offset.
+    Returns -1 if phoneme is not found.
+    """
+    return ARPABET_ID_MAP.get(ph, -1)

-def enc_to_phoneme(ph):
-    return offset_to_phoneme(enc_to_offset(ph))
+def offset_to_phoneme(ph_id: int) -> str:
+    """
+    Convert an internal offset to its phoneme.
+    Returns an empty string if ID is not found.
+    """
+    return ID_ARPABET_MAP.get(ph_id, "")

+def phoneme_to_enc(ph: str) -> Optional[str]:
+    """
+    Convert a phoneme to its encoded character.
+    Returns None if phoneme is invalid or out of range.
+    """
+    offset = phoneme_to_offset(ph)
+    if offset == -1 or not in_range(offset):
+        return None
+    return chr(SCRIPT_RANGE_START + offset)
+
+def enc_to_phoneme(ph: str) -> str:
+    """
+    Convert an encoded character to its phoneme.
+    Returns an empty string if character is out of range.
+    """
+    offset = enc_to_offset(ph)
+    if in_range(offset):
+        return offset_to_phoneme(offset)
+    return ""

-def enc_to_offset(c):
+def enc_to_offset(c: str) -> int:
+    """
+    Convert an encoded character to its internal offset.
+    """
     return ord(c) - SCRIPT_RANGE_START

+def in_range(offset: int) -> bool:
+    """
+    Check if the offset is within the valid script range.
+    """
+    # offset is relative to SCRIPT_RANGE_START, so test against the range width
+    return 0 <= offset < (SCRIPT_RANGE_END - SCRIPT_RANGE_START)

-def in_range(offset):
-    return offset >= SCRIPT_RANGE_START and offset < SCRIPT_RANGE_END
-
-
-def get_phonetic_info(lang):
-    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
-
-
-def invalid_vector():
-    ## TODO: check if np datatype is correct?
-    return np.array([0] * PHONETIC_VECTOR_LENGTH)
+def get_phonetic_info(lang: str) -> Tuple[Optional[pd.DataFrame], Optional[np.ndarray]]:
+    """
+    Get phonetic data and vectors for a given language.
+    Currently supports English ('en').
+    """
+    if lang.lower() == "en":
+        return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
+    return (None, None)

+def invalid_vector() -> np.ndarray:
+    """
+    Return an invalid phonetic vector (all zeros).
+    """
+    return INVALID_VECTOR

-def get_phonetic_feature_vector(p, lang):
+def get_phonetic_feature_vector(p: str, lang: str) -> np.ndarray:
+    """
+    Get the phonetic feature vector for an encoded character and language.
+    Returns the invalid vector if the character is out of range or invalid.
+    """
     offset = enc_to_offset(p)

     if not in_range(offset):
-        return invalid_vector()
+        return INVALID_VECTOR

     phonetic_data, phonetic_vectors = get_phonetic_info(lang)

-    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
-        return invalid_vector()
+    if phonetic_data is None or offset >= len(phonetic_data):
+        return INVALID_VECTOR
+
+    if phonetic_data.at[offset, "Valid Vector Representation"] == 0:
+        return INVALID_VECTOR

     return phonetic_vectors[offset]
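For reviewers who want to exercise the patched API, here is a minimal usage sketch. It assumes the patched file is `indicnlp/script/english_script.py` in the Indic NLP Library and that a local checkout of the `indic_nlp_resources` data exists; the resource path below is a placeholder.

```python
# Usage sketch, not part of the patch. Assumes the patched module is
# indicnlp.script.english_script and that indic_nlp_resources is available
# locally; the resource path is a placeholder.
from indicnlp import common, loader
from indicnlp.script import english_script

common.set_resources_path("/path/to/indic_nlp_resources")  # placeholder
loader.load()  # the library loader calls init() on the script modules

# Round-trip an ARPABET phoneme through the private script range.
enc = english_script.phoneme_to_enc("AA")
assert enc is not None
assert english_script.enc_to_phoneme(enc) == "AA"

# Unknown phonemes now return sentinels instead of raising KeyError.
assert english_script.phoneme_to_offset("XX") == -1
assert english_script.phoneme_to_enc("XX") is None

# Out-of-range characters map to the shared read-only invalid vector.
vec = english_script.get_phonetic_feature_vector("?", "en")
assert not vec.any() and not vec.flags.writeable
```

The sentinel checks cover the main behavioural change in this patch: lookups no longer raise KeyError, and invalid inputs map onto the shared zero vector instead of allocating a new array per call.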