@@ -9,15 +9,14 @@
 import os
 import pandas as pd
 import numpy as np
+from typing import Tuple, Optional

 from indicnlp import common

-
 #### Maps from ARPABET to Internal Id
 ARPABET_ID_MAP = {}
 ID_ARPABET_MAP = {}

-
 ###
 # Phonetic Information about script characters
 ###
@@ -70,7 +69,6 @@
     "vowel_roundness": [36, 38],
 }

-
 ####
 # Indexes into the Phonetic Vector
 ####
@@ -90,83 +88,113 @@
 ## TBD
 SCRIPT_RANGE_END = 0x0D2E

+# Precompute invalid_vector as a global constant
+INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
+INVALID_VECTOR.flags.writeable = False

 def init():
     """
     To be called by library loader, do not call it in your program
     """
+    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET, INVALID_VECTOR

-    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
-
-    ENGLISH_PHONETIC_DATA = pd.read_csv(
-        os.path.join(
-            common.get_resources_path(), "script", "english_script_phonetic_data.csv"
-        ),
-        encoding="utf-8",
-    )
-
-    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
-        :, PHONETIC_VECTOR_START_OFFSET:
-    ].values
+    phonetic_data_path = os.path.join(common.get_resources_path(), "script", "english_script_phonetic_data.csv")
+    arpabet_list_path = os.path.join(common.get_resources_path(), "script", "english_arpabet_list.csv")

+    # Load phonetic data
+    ENGLISH_PHONETIC_DATA = pd.read_csv(phonetic_data_path, encoding="utf-8")
+    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
     PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

-    ### Load mapping from ARPABET representation of phoneme to internal ID
-    global ARPABET_ID_MAP, ID_ARPABET_MAP
-
-    with open(
-        os.path.join(common.get_resources_path(), "script", "english_arpabet_list.csv"),
-        "r",
-        encoding="utf-8",
-    ) as infile:
-        for ph_id, name in enumerate(iter(infile)):
-            name = name.strip()
-            ARPABET_ID_MAP[name] = ph_id
-            ID_ARPABET_MAP[ph_id] = name
-
-
-def phoneme_to_offset(ph):
-    return ARPABET_ID_MAP[ph]
+    # Recompute INVALID_VECTOR in case PHONETIC_VECTOR_LENGTH has changed (declared global above)
+    INVALID_VECTOR = np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
+    INVALID_VECTOR.flags.writeable = False

+    ### Load mapping from ARPABET representation of phoneme to internal ID
+    # keep_default_na=False stops pandas from reading entries such as "NA" as NaN
+    arpabet_df = pd.read_csv(arpabet_list_path, header=None, names=["phoneme"], encoding="utf-8", dtype=str, keep_default_na=False)
+    phonemes = arpabet_df["phoneme"].str.strip().tolist()
+    ARPABET_ID_MAP.update({phoneme: idx for idx, phoneme in enumerate(phonemes)})
+    ID_ARPABET_MAP.update({idx: phoneme for idx, phoneme in enumerate(phonemes)})

-def offset_to_phoneme(ph_id):
-    return ID_ARPABET_MAP[ph_id]
-
-
-def phoneme_to_enc(ph):
-    return chr(SCRIPT_RANGE_START + phoneme_to_offset(ph))
-
+def phoneme_to_offset(ph: str) -> int:
+    """
+    Convert a phoneme to its internal offset.
+    Returns -1 if phoneme is not found.
+    """
+    return ARPABET_ID_MAP.get(ph, -1)

-def enc_to_phoneme(ph):
-    return offset_to_phoneme(enc_to_offset(ph))
+def offset_to_phoneme(ph_id: int) -> str:
+    """
+    Convert an internal offset to its phoneme.
+    Returns an empty string if ID is not found.
+    """
+    return ID_ARPABET_MAP.get(ph_id, "")

+def phoneme_to_enc(ph: str) -> Optional[str]:
+    """
+    Convert a phoneme to its encoded character.
+    Returns None if phoneme is invalid or out of range.
+    """
+    offset = phoneme_to_offset(ph)
+    if offset == -1 or not in_range(offset):
+        return None
+    return chr(SCRIPT_RANGE_START + offset)
+
+def enc_to_phoneme(ph: str) -> str:
+    """
+    Convert an encoded character to its phoneme.
+    Returns an empty string if character is out of range.
+    """
+    offset = enc_to_offset(ph)
+    if in_range(offset):
+        return offset_to_phoneme(offset)
+    return ""

-def enc_to_offset(c):
+def enc_to_offset(c: str) -> int:
+    """
+    Convert an encoded character to its internal offset.
+    """
     return ord(c) - SCRIPT_RANGE_START

+def in_range(offset: int) -> bool:
+    """
+    Check if the offset is within the valid script range.
+    """
+    # offset is relative to SCRIPT_RANGE_START, so test against the range width
+    return 0 <= offset < (SCRIPT_RANGE_END - SCRIPT_RANGE_START)

-def in_range(offset):
-    return offset >= SCRIPT_RANGE_START and offset < SCRIPT_RANGE_END
-
-
-def get_phonetic_info(lang):
-    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
-
-
-def invalid_vector():
-    ## TODO: check if np datatype is correct?
-    return np.array([0] * PHONETIC_VECTOR_LENGTH)
+def get_phonetic_info(lang: str) -> Tuple[Optional[pd.DataFrame], Optional[np.ndarray]]:
+    """
+    Get phonetic data and vectors for a given language.
+    Currently supports English ('en').
+    """
+    if lang.lower() == "en":
+        return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
+    return (None, None)

+def invalid_vector() -> np.ndarray:
+    """
+    Return an invalid phonetic vector (all zeros).
+    """
+    return INVALID_VECTOR

-def get_phonetic_feature_vector(p, lang):
+def get_phonetic_feature_vector(p: str, lang: str) -> np.ndarray:
+    """
+    Get the phonetic feature vector for an encoded character and language.
+    Returns the invalid vector if the character is out of range or invalid.
+    """
     offset = enc_to_offset(p)

     if not in_range(offset):
-        return invalid_vector()
+        return INVALID_VECTOR

     phonetic_data, phonetic_vectors = get_phonetic_info(lang)

-    if phonetic_data.iloc[offset]["Valid Vector Representation"] == 0:
-        return invalid_vector()
+    if phonetic_data is None or offset >= len(phonetic_data):
+        return INVALID_VECTOR
+
+    if phonetic_data.at[offset, "Valid Vector Representation"] == 0:
+        return INVALID_VECTOR

     return phonetic_vectors[offset]
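For reviewers who want to exercise the patched API, here is a minimal usage sketch. It assumes the patched file is `indicnlp/script/english_script.py` in the Indic NLP Library and that a local checkout of the `indic_nlp_resources` data exists; the resource path below is a placeholder.

```python
# Usage sketch, not part of the patch. Assumes the patched module is
# indicnlp.script.english_script and that indic_nlp_resources is available
# locally; the resource path is a placeholder.
from indicnlp import common, loader
from indicnlp.script import english_script

common.set_resources_path("/path/to/indic_nlp_resources")  # placeholder
loader.load()  # the library loader calls init() on the script modules

# Round-trip an ARPABET phoneme through the private script range.
enc = english_script.phoneme_to_enc("AA")
assert enc is not None
assert english_script.enc_to_phoneme(enc) == "AA"

# Unknown phonemes now return sentinels instead of raising KeyError.
assert english_script.phoneme_to_offset("XX") == -1
assert english_script.phoneme_to_enc("XX") is None

# Out-of-range characters map to the shared read-only invalid vector.
vec = english_script.get_phonetic_feature_vector("?", "en")
assert not vec.any() and not vec.flags.writeable
```

The sentinel checks cover the main behavioural change in this patch: lookups no longer raise KeyError, and invalid inputs map onto the shared zero vector instead of allocating a new array per call.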