Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion g2p/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""

Basic init file for g2p module

The main entry points for the g2p module are:
- make_g2p() to create a mapper from one lang to another
- make_tokenizer() to create a tokenizer for a given language
- get_arpabet_langs() to get the list of languages with a path to eng-arpabet
- get_ipa_code() to get the name of the canonical IPA lang code for a given lang id

Basic Usage:
from g2p import make_g2p
Expand Down Expand Up @@ -222,6 +222,22 @@ def get_arpabet_langs():
return _langs_cache, _lang_names_cache


def get_ipa_code(lang_id: str) -> str:
    """Map a lang ID from get_arpabet_langs()[0] to its IPA language code.

    You can import this function from g2p if you set your dependency to g2p as
    g2p>2.3.1, but if you want to remain compatible with older versions of g2p,
    it is safe to copy it into your code instead. This function has been
    confirmed to work for all published versions of g2p>=0.2, and we commit to
    keep it working unchanged for all future versions of g2p."""
    # Local import keeps this function self-contained so clients can copy it
    # verbatim into their own code.
    from g2p.mappings.langs import LANGS_NETWORK

    candidate = lang_id + "-ipa"
    if candidate in LANGS_NETWORK.nodes:
        return candidate
    # Otherwise fall back to the base language code (text before the first
    # "-") plus "-ipa", e.g. "sal-apa" -> "sal-ipa".
    return lang_id.partition("-")[0] + "-ipa"


def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
"""Make the tokenizer for input in language in_lang

Expand Down Expand Up @@ -254,6 +270,7 @@ def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
"NoPath",
"Token",
"get_arpabet_langs",
"get_ipa_code",
"make_g2p",
"make_tokenizer",
"tokenize_and_map",
Expand Down
4 changes: 4 additions & 0 deletions g2p/mappings/langs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def get_available_mappings(langs: dict) -> list:
return mappings_available


# Inadvertently part of the g2p programmatic API because this is not available for
# import elsewhere. Don't change this! The following code must always work:
# from g2p.mappings.langs import LANGS_NETWORK
# nodes: Collection[str] = LANGS_NETWORK.nodes
LANGS_NETWORK = load_network()
# Making private because it should be imported from g2p.mappings instead
_LANGS = load_langs()
Expand Down
3 changes: 2 additions & 1 deletion g2p/mappings/langs/network_lite.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import deque
from typing import (
Any,
Collection,
Deque,
Dict,
Generic,
Expand Down Expand Up @@ -58,7 +59,7 @@ def add_edges_from(self, edges: Iterable[Tuple[T, T]]):
self.add_edge(u, v)

    @property  # read-only
    def nodes(self) -> Collection[T]:
        """Return a read-only, set-like view of the graph's nodes.

        Backed by the adjacency dict's keys view, so membership tests
        ("x in graph.nodes") are O(1) and the view tracks later mutations.
        """
        return self._edges.keys()

Expand Down
7 changes: 4 additions & 3 deletions g2p/mappings/langs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,10 @@ def network_to_echart(outfile: Optional[str] = None, layout: bool = False):
),
)
size = round(size, 2)
node = {"name": node, "symbolSize": size, "id": node, "category": lang_name}
nodes.append(node)
nodes.sort(key=lambda x: x["name"])
nodes.append(
{"name": node, "symbolSize": size, "id": node, "category": lang_name}
)
nodes.sort(key=lambda x: x["name"]) # type: ignore
edges = []
for edge in LANGS_NETWORK.edges:
edges.append({"source": edge[0], "target": edge[1]})
Expand Down
106 changes: 76 additions & 30 deletions g2p/tests/test_langs.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,95 @@
#!/usr/bin/env python

import sys
from unittest import TestCase
from typing import Collection

from pytest import main

from g2p import make_g2p
from g2p import get_arpabet_langs, get_ipa_code, make_g2p
from g2p.log import LOGGER
from g2p.mappings.langs import LANGS_NETWORK
from g2p.tests.public.data import load_public_test_data


def test_io() -> None:
    """Basic test for individual lookup tables.

    Test files (in g2p/tests/public/data) are either .csv, .psv, or
    .tsv files, the only difference being the delimiter used (comma,
    pipe, or tab).

    Each line in the test files consists of SOURCE,TARGET,INPUT,OUTPUT"""
    langs_to_test = load_public_test_data()

    # Go through each language declared in the test data set.
    # Instead of asserting immediately, we go through all the cases first, so
    # that running test_langs.py prints all the errors at once, to help
    # debugging a given g2p mapping. Then we assert once at the end, to make
    # pytest register the failure.
    error_count = 0
    error_prefix = "test_langs.py: mapping error"
    for test in langs_to_test:
        transducer = make_g2p(test[0], test[1])
        output_string = transducer(test[2]).output_string.strip()
        if output_string != test[3].strip():
            LOGGER.error(
                f"{error_prefix} for {test[-1]}: {test[2]} from {test[0]} to {test[1]} should be {test[3]}, got {output_string}"
            )
            error_count += 1

    assert (
        error_count == 0
    ), f'g2p mapping errors found, look for "{error_prefix}" above for detail.'
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_io() above is not changed except for outdenting it from a TestCase method to a pytest-style test function.



def test_ipa_heuristic(subtests) -> None:
    """Make sure we have a reliable heuristic for finding the IPA code for all langs.

    In EveryVoice, we want to be able to assume that a simple heuristic works to find
    the IPA language code for a given language code, so let's exercise this heuristic
    here and thus make sure it will always work.

    The first heuristic was lang_id + "-ipa" was the IPA code, but that breaks with
    sal-apa -> sal-ipa and oji-syl -> oji-ipa.
    A mostly correct heuristic is lang_id.split("-",1)[0]+"-ipa", but this fails for
    iku-sro -> iku-sro-ipa, since iku-ipa exists but there is no path from iku-sro
    to iku-ipa.
    So the correct heuristic is:
     1) try lang_id + "-ipa" and use it if it is in LANGS_NETWORK.nodes
     2) otherwise use lang_id.split("-",1)[0] + "-ipa"
    Sigh..."""

    def locked_get_ipa_code(lang_id: str) -> str:
        # Prevent inadvertent changes to g2p.get_ipa_code with this locked test copy,
        # including this deep import which we promise will keep working.
        from g2p.mappings.langs import LANGS_NETWORK

        if lang_id + "-ipa" in LANGS_NETWORK.nodes:
            return lang_id + "-ipa"
        else:
            return lang_id.split("-", 1)[0] + "-ipa"

    # Make sure client code can assume "lang_id in nodes" will work
    nodes: Collection[str] = LANGS_NETWORK.nodes
    assert isinstance(nodes, Collection)

    langs, _ = get_arpabet_langs()

    # For every real lang, the public get_ipa_code must agree with the locked
    # copy, name an actual node, and be reachable from the lang in the network.
    for lang in langs:
        with subtests.test(lang=lang):
            ipa_code = get_ipa_code(lang)
            assert ipa_code == locked_get_ipa_code(lang)
            assert ipa_code in LANGS_NETWORK.nodes
            assert LANGS_NETWORK.has_path(lang, ipa_code)

    # The heuristic must also behave predictably for lang IDs g2p does not
    # know about: fall back to base-lang + "-ipa".
    for hypothetical_lang, ref_ipa_code in (
        ("ll-foo", "ll-ipa"),
        ("lll-bar", "lll-ipa"),
        ("lang-foo", "lang-ipa"),
        ("language-bar", "language-ipa"),
        ("lang", "lang-ipa"),
        ("lll-foo-bar-baz", "lll-ipa"),
    ):
        assert get_ipa_code(hypothetical_lang) == ref_ipa_code


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ test = [
"jsonschema>=4.17.3",
"pep440>=0.1.2",
"pytest",
"pytest-subtests; python_version < '3.10'",
"httpx",
# Kind of bogus that we need both httpx and aiohttp, but socketio
# wants this
Expand Down
Loading