Skip to content

Commit 5b28d16

Browse files
committed
Use src_lang/tgt_lang for missing characters since it operates on the same examples
1 parent 3b2e71e commit 5b28d16

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

machine/translation/huggingface/hugging_face_nmt_model_trainer.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -211,10 +211,10 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
211 211
# using unofficially supported behavior to set the normalizer
212 212
lang_codes = []
213 213
tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer # type: ignore
214-
if self._add_unk_src_tokens and self._src_lang is not None:
215-
lang_codes.append(self._src_lang)
216-
if self._add_unk_tgt_tokens and self._tgt_lang is not None:
217-
lang_codes.append(self._tgt_lang)
214+
if self._add_unk_src_tokens:
215+
lang_codes.append(src_lang)
216+
if self._add_unk_tgt_tokens:
217+
lang_codes.append(tgt_lang)
218 218
missing_tokens = find_missing_characters(tokenizer, train_dataset, lang_codes)
219 219
if missing_tokens:
220 220
tokenizer = add_tokens(tokenizer, missing_tokens)

0 commit comments

Comments
 (0)