Skip to content

Check for alignable pairs in compute_alignment_scores #762

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions silnlp/alignment/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

from ..common.corpus import tokenize_corpus, write_corpus
from ..common.corpus import load_corpus, tokenize_corpus, write_corpus
from ..common.environment import SIL_NLP_ENV
from .config import get_aligner
from .lexicon import Lexicon
Expand Down Expand Up @@ -61,7 +61,7 @@ def compute_alignment_score(
prob = max(direct_prob, inverse_prob)
probs.append(prob)
else:
LOGGER.warn(
LOGGER.warning(
f"No pairs in alignment! src >>{src_sentence}<< trg >>{trg_sentence}<< alignment >>{alignment}<<"
)

Expand Down Expand Up @@ -90,6 +90,13 @@ def add_alignment_scores(corpus: pd.DataFrame, aligner_id: str = "fast_align") -
def compute_alignment_scores(
src_input_path: Path, trg_input_path: Path, aligner_id: str = "fast_align", sym_align_path: Path = None
) -> List[float]:
# Check for alignable pairs
src_sents = list(load_corpus(src_input_path))
trg_sents = list(load_corpus(trg_input_path))
if not any(len(s) > 0 and len(t) > 0 for s, t in zip(src_sents, trg_sents)):
LOGGER.warning("No pairs to align.")
return [0 for _ in src_sents]

with tempfile.TemporaryDirectory() as td:
temp_dir = Path(td)
src_tok_output_path = temp_dir / "tokenize-src-output.txt"
Expand All @@ -114,9 +121,11 @@ def compute_alignment_scores(
inverse_lexicon = None

scores: List[float] = []
with src_tok_output_path.open("r", encoding="utf-8") as src_tok_output_file, trg_tok_output_path.open(
"r", encoding="utf-8"
) as trg_tok_output_file, sym_align_path.open("r", encoding="utf-8") as sym_align_file:
with (
src_tok_output_path.open("r", encoding="utf-8") as src_tok_output_file,
trg_tok_output_path.open("r", encoding="utf-8") as trg_tok_output_file,
sym_align_path.open("r", encoding="utf-8") as sym_align_file,
):
for src_sentence, trg_sentence, alignment in zip(src_tok_output_file, trg_tok_output_file, sym_align_file):
if src_sentence.strip() == "" or trg_sentence.strip() == "":
scores.append(-1)
Expand Down