MahmoudAshraf97
diff --git a/.gitmodules b/.gitmodules
Lines changed: 0 additions & 3 deletions
diff --git a/ctc_forced_aligner/text_utils.py b/ctc_forced_aligner/text_utils.py
Lines changed: 15 additions & 36 deletions
diff --git a/ctc_forced_aligner/uroman b/ctc_forced_aligner/uroman
Lines changed: 0 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 4 deletions
diff --git a/requirements.txt b/requirements.txt
Lines changed: 1 addition & 0 deletions
@@ -1,13 +1,13 @@
-import os
 import re
-import subprocess
 import unicodedata
 
 import numpy as np
 
+from uroman import Uroman
+
 from .norm_config import norm_config
 
-UROMAN_PATH = os.path.join(os.path.dirname(__file__), "uroman", "bin")
+uroman_instance = Uroman()
 
 
 def text_normalize(
@@ -148,41 +148,20 @@ def normalize_uroman(text):
     return text.strip()
 
 
-def get_uroman_tokens(norm_transcripts, iso=None):
-    input_text = "\n".join(norm_transcripts) + "\n"
-
-    assert os.path.exists(os.path.join(UROMAN_PATH, "uroman.pl")), "uroman not found"
-
-    assert not subprocess.call(
-        ["perl", "--version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
-    ), (
-        "Please ensure that a valid perl installation exists,"
-        " you can verify by running `perl --version` in your terminal"
-    )
-
-    cmd = ["perl", os.path.join(UROMAN_PATH, "uroman.pl")]
-    if iso in special_isos_uroman:
-        cmd.extend(["-l", iso])
-
-    result = subprocess.run(
-        cmd,
-        input=input_text,
-        text=True,
-        capture_output=True,
-        check=True,
-        encoding="utf-8",
-    )
-    output_text = result.stdout
-
-    outtexts = []
-    for line in output_text.splitlines():
-        line = " ".join(line.strip())
-        line = re.sub(r"\s+", " ", line).strip()
-        outtexts.append(line)
+def get_uroman_tokens(norm_transcripts: list[str], iso=None):
+    outtexts = [
+        uroman_instance.romanize_string(transcript, lcode=iso)
+        for transcript in norm_transcripts
+    ]
 
-    assert len(outtexts) == len(norm_transcripts)
+    uromans = []
+    for ot in outtexts:
+        ot = " ".join(ot.strip())
+        ot = re.sub(r"\s+", " ", ot).strip()
+        normalized = normalize_uroman(ot)
+        uromans.append(normalized)
 
-    uromans = [normalize_uroman(ot) for ot in outtexts]
+    assert len(uromans) == len(norm_transcripts)
 
     return uromans
 
 
@@ -27,9 +27,6 @@ ctc-forced-aligner = "ctc_forced_aligner.align:cli"
 [tool.setuptools.package-data]
 "*" = [
     "punctuations.lst",
-    "uroman/bin/**/*.*",
-    "uroman/data/**/*.*",
-    "uroman/lib/**/*.*",
 ]
 
 [tool.flake8]
@@ -38,4 +35,4 @@ ignore = ["E203", "W503"]
 
 [tool.isort]
 profile = "black"
-lines_between_types = 1
+lines_between_types = 1
@@ -1,3 +1,4 @@
+uroman
 nltk
 torch
 torchaudio
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+uroman`
`1`	`2`	`nltk`
`2`	`3`	`torch`
`3`	`4`	`torchaudio`