From 0b8ae450a5cb3fc9c3911fb525da42760cdd3992 Mon Sep 17 00:00:00 2001 From: Isaac Schifferer Date: Tue, 24 Jun 2025 04:50:50 +0000 Subject: [PATCH] Check for alignable pairs in compute_alignment_scores --- silnlp/alignment/utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/silnlp/alignment/utils.py b/silnlp/alignment/utils.py index 1c0b3c37..a01855ff 100644 --- a/silnlp/alignment/utils.py +++ b/silnlp/alignment/utils.py @@ -6,7 +6,7 @@ import pandas as pd -from ..common.corpus import tokenize_corpus, write_corpus +from ..common.corpus import load_corpus, tokenize_corpus, write_corpus from ..common.environment import SIL_NLP_ENV from .config import get_aligner from .lexicon import Lexicon @@ -61,7 +61,7 @@ def compute_alignment_score( prob = max(direct_prob, inverse_prob) probs.append(prob) else: - LOGGER.warn( + LOGGER.warning( f"No pairs in alignment! src >>{src_sentence}<< trg >>{trg_sentence}<< alignment >>{alignment}<<" ) @@ -90,6 +90,13 @@ def add_alignment_scores(corpus: pd.DataFrame, aligner_id: str = "fast_align") - def compute_alignment_scores( src_input_path: Path, trg_input_path: Path, aligner_id: str = "fast_align", sym_align_path: Path = None ) -> List[float]: + # Check for alignable pairs + src_sents = list(load_corpus(src_input_path)) + trg_sents = list(load_corpus(trg_input_path)) + if not any(len(s) > 0 and len(t) > 0 for s, t in zip(src_sents, trg_sents)): + LOGGER.warning("No pairs to align.") + return [0 for _ in src_sents] + with tempfile.TemporaryDirectory() as td: temp_dir = Path(td) src_tok_output_path = temp_dir / "tokenize-src-output.txt" @@ -114,9 +121,11 @@ def compute_alignment_scores( inverse_lexicon = None scores: List[float] = [] - with src_tok_output_path.open("r", encoding="utf-8") as src_tok_output_file, trg_tok_output_path.open( - "r", encoding="utf-8" - ) as trg_tok_output_file, sym_align_path.open("r", encoding="utf-8") as sym_align_file: + with ( + src_tok_output_path.open("r", encoding="utf-8") as src_tok_output_file, + trg_tok_output_path.open("r", encoding="utf-8") as trg_tok_output_file, + sym_align_path.open("r", encoding="utf-8") as sym_align_file, + ): for src_sentence, trg_sentence, alignment in zip(src_tok_output_file, trg_tok_output_file, sym_align_file): if src_sentence.strip() == "" or trg_sentence.strip() == "": scores.append(-1)