
Commit 55c1652 ("simplify")

1 parent 798c29f

2 files changed (+10, -6 lines)


src/transformers/tokenization_utils_base.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -2099,13 +2099,12 @@ def from_pretrained(
             template = template.removesuffix(".jinja")
             vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

-        # Find the first file matching pattern
         remote_files = list_repo_files(pretrained_model_name_or_path)
-        vocab_files["vocab_file"] += "|tekken.json|tokenizer.model.*"
-        for file_name in remote_files:
-            if re.search(vocab_files["vocab_file"], file_name):
-                vocab_files["vocab_file"] = file_name
-                break
+        if not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
+            # mistral tokenizer names are different, but we can still convert them if
+            # mistral common is not there
+            other_pattern = "tekken.json|tokenizer.model.*"
+            vocab_files["vocab_file"] = re.search(other_pattern, "".join(remote_files)).group()

         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
```
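The replaced loop matched the extended `vocab_file` pattern against each remote file name in turn; the new branch instead checks the joined file listing once, and only falls back to the Mistral-style names (`tekken.json`, `tokenizer.model.*`) when no standard tokenizer file matches. A minimal standalone sketch of that fallback, assuming `vocab_files["tokenizer_file"]` holds the usual `tokenizer.json` pattern and using an invented repo listing:

```python
import re

remote_files = ["config.json", "params.json", "tekken.json"]  # invented listing

tokenizer_file_pattern = "tokenizer.json"          # assumed standard pattern
mistral_pattern = "tekken.json|tokenizer.model.*"  # Mistral-style names

# Search the concatenated names once instead of iterating file by file.
joined = "".join(remote_files)
if not re.search(tokenizer_file_pattern, joined):
    vocab_file = re.search(mistral_pattern, joined).group()
    print(vocab_file)  # -> "tekken.json"
```

One caveat worth noting: `re.search(...)` returns `None` when neither Mistral-style name is present, so the chained `.group()` in the commit would raise `AttributeError` in that case.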
```diff
@@ -2425,6 +2424,9 @@ def _from_pretrained(
                 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                 " fine-tuned or trained."
             )
+        if tokenizer.vocab_size > 100000:
+            # Try to catch mistral tokenizers.
+            pass  # TODO
         return tokenizer

     @staticmethod
```
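The second hunk is only a placeholder: the `vocab_size > 100000` branch currently does nothing. Purely as an illustration of the heuristic the TODO gestures at (the helper name and warning text below are invented, not part of the commit):

```python
import logging

logger = logging.getLogger(__name__)

def maybe_flag_mistral(tokenizer) -> bool:
    """Invented helper: flag tokenizers whose vocabulary is suspiciously
    large, which is one way a Mistral tokenizer could be caught after
    loading."""
    if tokenizer.vocab_size > 100000:
        logger.warning(
            "vocab_size=%d exceeds 100000; this may be a Mistral tokenizer.",
            tokenizer.vocab_size,
        )
        return True
    return False
```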

src/transformers/tokenization_utils_fast.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -207,6 +207,8 @@ def __init__(self, *args, **kwargs):
         if tokens:
             self.add_tokens(tokens)

+        # there was an issue with mistral models where the pre_tokenizer's regex pattern
+        # is not correct. Here we try to fix it.
         try:
             pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
             if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
```
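The new comment sits just above the existing `add_prefix_space` fix-up, which deserializes the backend pre-tokenizer's state and rebuilds it when the flag disagrees. A small self-contained sketch of that read-modify-replace round trip with the `tokenizers` library, using a directly constructed `ByteLevel` instance rather than one loaded from a model:

```python
import json

from tokenizers import pre_tokenizers

# Build a ByteLevel pre-tokenizer directly; no model download needed.
pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)

# __getstate__() serializes the pre-tokenizer to JSON, e.g.
# {"type": "ByteLevel", "add_prefix_space": false, ...}
state = json.loads(pre_tok.__getstate__())

# Read-modify-replace: look up the class by its "type" tag, flip the flag,
# and rebuild. The code below the new comment applies this same pattern to
# self.backend_tokenizer.pre_tokenizer.
pre_tok_class = getattr(pre_tokenizers, state.pop("type"))
state["add_prefix_space"] = True
fixed = pre_tok_class(**state)
assert json.loads(fixed.__getstate__())["add_prefix_space"] is True
```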
