Skip to content

Commit f1878f1

Browse files
Use uroman python module (#76)
Co-authored-by: Mahmoud Ashraf <[email protected]>
1 parent 201276a commit f1878f1

File tree

5 files changed

+17
-44
lines changed

5 files changed

+17
-44
lines changed

.gitmodules

Lines changed: 0 additions & 3 deletions
This file was deleted.

ctc_forced_aligner/text_utils.py

Lines changed: 15 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
import os
21
import re
3-
import subprocess
42
import unicodedata
53

64
import numpy as np
75

6+
from uroman import Uroman
7+
88
from .norm_config import norm_config
99

10-
UROMAN_PATH = os.path.join(os.path.dirname(__file__), "uroman", "bin")
10+
uroman_instance = Uroman()
1111

1212

1313
def text_normalize(
@@ -148,41 +148,20 @@ def normalize_uroman(text):
148148
return text.strip()
149149

150150

151-
def get_uroman_tokens(norm_transcripts, iso=None):
152-
input_text = "\n".join(norm_transcripts) + "\n"
153-
154-
assert os.path.exists(os.path.join(UROMAN_PATH, "uroman.pl")), "uroman not found"
155-
156-
assert not subprocess.call(
157-
["perl", "--version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
158-
), (
159-
"Please ensure that a valid perl installation exists,"
160-
" you can verify by running `perl --version` in your terminal"
161-
)
162-
163-
cmd = ["perl", os.path.join(UROMAN_PATH, "uroman.pl")]
164-
if iso in special_isos_uroman:
165-
cmd.extend(["-l", iso])
166-
167-
result = subprocess.run(
168-
cmd,
169-
input=input_text,
170-
text=True,
171-
capture_output=True,
172-
check=True,
173-
encoding="utf-8",
174-
)
175-
output_text = result.stdout
176-
177-
outtexts = []
178-
for line in output_text.splitlines():
179-
line = " ".join(line.strip())
180-
line = re.sub(r"\s+", " ", line).strip()
181-
outtexts.append(line)
151+
def get_uroman_tokens(norm_transcripts: list[str], iso=None):
152+
outtexts = [
153+
uroman_instance.romanize_string(transcript, lcode=iso)
154+
for transcript in norm_transcripts
155+
]
182156

183-
assert len(outtexts) == len(norm_transcripts)
157+
uromans = []
158+
for ot in outtexts:
159+
ot = " ".join(ot.strip())
160+
ot = re.sub(r"\s+", " ", ot).strip()
161+
normalized = normalize_uroman(ot)
162+
uromans.append(normalized)
184163

185-
uromans = [normalize_uroman(ot) for ot in outtexts]
164+
assert len(uromans) == len(norm_transcripts)
186165

187166
return uromans
188167

ctc_forced_aligner/uroman

Lines changed: 0 additions & 1 deletion
This file was deleted.

pyproject.toml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,6 @@ ctc-forced-aligner = "ctc_forced_aligner.align:cli"
2727
[tool.setuptools.package-data]
2828
"*" = [
2929
"punctuations.lst",
30-
"uroman/bin/**/*.*",
31-
"uroman/data/**/*.*",
32-
"uroman/lib/**/*.*",
3330
]
3431

3532
[tool.flake8]
@@ -38,4 +35,4 @@ ignore = ["E203", "W503"]
3835

3936
[tool.isort]
4037
profile = "black"
41-
lines_between_types = 1
38+
lines_between_types = 1

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
uroman
12
nltk
23
torch
34
torchaudio

0 commit comments

Comments
 (0)