Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion g2p/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""

Basic init file for g2p module

The main entry points for the g2p module are:
- make_g2p() to create a mapper from one lang to another
- make_tokenizer() to create a tokenizer for a given language
- get_arpabet_langs() to get the list of languages with a path to eng-arpabet
- get_ipa_code() to get the name of the canonical IPA lang code for a given lang id

Basic Usage:
from g2p import make_g2p
Expand Down Expand Up @@ -222,6 +222,22 @@ def get_arpabet_langs():
return _langs_cache, _lang_names_cache


def get_ipa_code(lang_id: str) -> str:
    """Map a lang ID from get_arpabet_langs()[0] to its IPA language code.

    You can import this function from g2p if you set your dependency to g2p as
    g2p>2.3.1, but if you want to remain compatible with older versions of g2p,
    it is safe to copy it into your code instead. This function has been
    confirmed to work for all published versions of g2p>=0.2, and we commit to
    keep it working unchanged for all future versions of g2p."""
    # Local import keeps this function self-contained so clients can copy it
    # verbatim into their own code.
    from g2p.mappings.langs import LANGS_NETWORK

    candidate = lang_id + "-ipa"
    if candidate in LANGS_NETWORK.nodes:
        return candidate
    # Otherwise fall back to the base language code (text before the first
    # "-") plus "-ipa", e.g. "sal-apa" -> "sal-ipa".
    return lang_id.partition("-")[0] + "-ipa"


def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
"""Make the tokenizer for input in language in_lang

Expand Down Expand Up @@ -254,6 +270,7 @@ def make_tokenizer(in_lang=None, out_lang=None, tok_path=None) -> BaseTokenizer:
"NoPath",
"Token",
"get_arpabet_langs",
"get_ipa_code",
"make_g2p",
"make_tokenizer",
"tokenize_and_map",
Expand Down
4 changes: 4 additions & 0 deletions g2p/mappings/langs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def get_available_mappings(langs: dict) -> list:
return mappings_available


# Inadvertently part of the g2p programmatic API because this is not available for
# import elsewhere. Don't change this! The following code must always work:
# from g2p.mappings.langs import LANGS_NETWORK
# nodes: Collection[str] = LANGS_NETWORK.nodes
LANGS_NETWORK = load_network()
# Making private because it should be imported from g2p.mappings instead
_LANGS = load_langs()
Expand Down
3 changes: 2 additions & 1 deletion g2p/mappings/langs/network_lite.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import deque
from typing import (
Any,
Collection,
Deque,
Dict,
Generic,
Expand Down Expand Up @@ -58,7 +59,7 @@ def add_edges_from(self, edges: Iterable[Tuple[T, T]]):
self.add_edge(u, v)

    @property  # read-only
    def nodes(self) -> Collection[T]:
        """Return a read-only, set-like view of the graph's nodes.

        Backed by the adjacency dict's keys view, so membership tests
        ("x in graph.nodes") are O(1) and the view tracks later mutations.
        """
        return self._edges.keys()

Expand Down
7 changes: 4 additions & 3 deletions g2p/mappings/langs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,10 @@ def network_to_echart(outfile: Optional[str] = None, layout: bool = False):
),
)
size = round(size, 2)
node = {"name": node, "symbolSize": size, "id": node, "category": lang_name}
nodes.append(node)
nodes.sort(key=lambda x: x["name"])
nodes.append(
{"name": node, "symbolSize": size, "id": node, "category": lang_name}
)
nodes.sort(key=lambda x: x["name"]) # type: ignore
edges = []
for edge in LANGS_NETWORK.edges:
edges.append({"source": edge[0], "target": edge[1]})
Expand Down
106 changes: 76 additions & 30 deletions g2p/tests/test_langs.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,95 @@
#!/usr/bin/env python

import sys
from unittest import TestCase
from typing import Collection

from pytest import main

from g2p import make_g2p
from g2p import get_arpabet_langs, get_ipa_code, make_g2p
from g2p.log import LOGGER
from g2p.mappings.langs import LANGS_NETWORK
from g2p.tests.public.data import load_public_test_data


def test_io() -> None:
    """Basic test for individual lookup tables.

    Test files (in g2p/tests/public/data) are either .csv, .psv, or
    .tsv files, the only difference being the delimiter used (comma,
    pipe, or tab).

    Each line in the test files consists of SOURCE,TARGET,INPUT,OUTPUT"""
    langs_to_test = load_public_test_data()

    # Go through each language declared in the test data set.
    # Instead of asserting immediately, we go through all the cases first, so
    # that running test_langs.py prints all the errors at once, to help
    # debugging a given g2p mapping. Then we assert once at the end, to make
    # pytest register the failure.
    error_count = 0
    error_prefix = "test_langs.py: mapping error"
    for test in langs_to_test:
        transducer = make_g2p(test[0], test[1])
        output_string = transducer(test[2]).output_string.strip()
        if output_string != test[3].strip():
            LOGGER.error(
                f"{error_prefix} for {test[-1]}: {test[2]} from {test[0]} to {test[1]} should be {test[3]}, got {output_string}"
            )
            error_count += 1

    assert (
        error_count == 0
    ), f'g2p mapping errors found, look for "{error_prefix}" above for detail.'
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_io() above is not changed except for outdenting it from a TestCase method to a pytest-style test function.



def test_ipa_heuristic(subtests) -> None:
    """Make sure we have a reliable heuristic for finding the IPA code for all langs.

    In EveryVoice, we want to be able to assume that a simple heuristic works to find
    the IPA language code for a given language code, so let's exercise this heuristic
    here and thus make sure it will always work.

    The first heuristic was lang_id + "-ipa" was the IPA code, but that breaks with
    sal-apa -> sal-ipa and oji-syl -> oji-ipa.
    A mostly correct heuristic is lang_id.split("-",1)[0]+"-ipa", but this fails for
    iku-sro -> iku-sro-ipa, since iku-ipa exists but there is no path from iku-sro
    to iku-ipa.
    So the correct heuristic is:
     1) try lang_id + "-ipa" and use it if it is in LANGS_NETWORK.nodes
     2) otherwise use lang_id.split("-",1)[0] + "-ipa"
    Sigh..."""

    def locked_get_ipa_code(lang_id: str) -> str:
        # Prevent inadvertent changes to g2p.get_ipa_code with this locked test copy,
        # including this deep import which we promise will keep working.
        from g2p.mappings.langs import LANGS_NETWORK

        if lang_id + "-ipa" in LANGS_NETWORK.nodes:
            return lang_id + "-ipa"
        else:
            return lang_id.split("-", 1)[0] + "-ipa"

    # Make sure client code can assume "lang_id in nodes" will work
    nodes: Collection[str] = LANGS_NETWORK.nodes
    assert isinstance(nodes, Collection)

    langs, _ = get_arpabet_langs()

    # For every real lang, the public get_ipa_code must agree with the locked
    # copy, name an actual node, and be reachable from the lang in the network.
    for lang in langs:
        with subtests.test(lang=lang):
            ipa_code = get_ipa_code(lang)
            assert ipa_code == locked_get_ipa_code(lang)
            assert ipa_code in LANGS_NETWORK.nodes
            assert LANGS_NETWORK.has_path(lang, ipa_code)

    # The heuristic must also behave predictably for lang IDs g2p does not
    # know about: fall back to base-lang + "-ipa".
    for hypothetical_lang, ref_ipa_code in (
        ("ll-foo", "ll-ipa"),
        ("lll-bar", "lll-ipa"),
        ("lang-foo", "lang-ipa"),
        ("language-bar", "language-ipa"),
        ("lang", "lang-ipa"),
        ("lll-foo-bar-baz", "lll-ipa"),
    ):
        assert get_ipa_code(hypothetical_lang) == ref_ipa_code


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ test = [
"jsonschema>=4.17.3",
"pep440>=0.1.2",
"pytest",
"pytest-subtests; python_version < '3.10'",
"httpx",
# Kind of bogus that we need both httpx and aiohttp, but socketio
# wants this
Expand Down
Loading