
Commit 55c1652 ("simplify")

1 parent 798c29f

2 files changed (+10, -6 lines)


src/transformers/tokenization_utils_base.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -2099,13 +2099,12 @@ def from_pretrained(
             template = template.removesuffix(".jinja")
             vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

-        # Find the first file matching pattern
         remote_files = list_repo_files(pretrained_model_name_or_path)
-        vocab_files["vocab_file"] += "|tekken.json|tokenizer.model.*"
-        for file_name in remote_files:
-            if re.search(vocab_files["vocab_file"], file_name):
-                vocab_files["vocab_file"] = file_name
-                break
+        if not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
+            # mistral tokenizer names are different, but we can still convert them if
+            # mistral common is not there
+            other_pattern = "tekken.json|tokenizer.model.*"
+            vocab_files["vocab_file"] = re.search(other_pattern, "".join(remote_files)).group()

         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
```
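The replaced loop matched the extended `vocab_file` pattern against each remote file name in turn; the new branch instead checks the joined file listing once, and only falls back to the Mistral-style names (`tekken.json`, `tokenizer.model.*`) when no standard tokenizer file matches. A minimal standalone sketch of that fallback, assuming `vocab_files["tokenizer_file"]` holds the usual `tokenizer.json` pattern and using an invented repo listing:

```python
import re

remote_files = ["config.json", "params.json", "tekken.json"]  # invented listing

tokenizer_file_pattern = "tokenizer.json"          # assumed standard pattern
mistral_pattern = "tekken.json|tokenizer.model.*"  # Mistral-style names

# Search the concatenated names once instead of iterating file by file.
joined = "".join(remote_files)
if not re.search(tokenizer_file_pattern, joined):
    vocab_file = re.search(mistral_pattern, joined).group()
    print(vocab_file)  # -> "tekken.json"
```

One caveat worth noting: `re.search(...)` returns `None` when neither Mistral-style name is present, so the chained `.group()` in the commit would raise `AttributeError` in that case.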
```diff
@@ -2425,6 +2424,9 @@ def _from_pretrained(
                 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                 " fine-tuned or trained."
             )
+        if tokenizer.vocab_size > 100000:
+            # Try to catch mistral tokenizers.
+            pass  # TODO
         return tokenizer

     @staticmethod
```
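The second hunk is only a placeholder: the `vocab_size > 100000` branch currently does nothing. Purely as an illustration of the heuristic the TODO gestures at (the helper name and warning text below are invented, not part of the commit):

```python
import logging

logger = logging.getLogger(__name__)

def maybe_flag_mistral(tokenizer) -> bool:
    """Invented helper: flag tokenizers whose vocabulary is suspiciously
    large, which is one way a Mistral tokenizer could be caught after
    loading."""
    if tokenizer.vocab_size > 100000:
        logger.warning(
            "vocab_size=%d exceeds 100000; this may be a Mistral tokenizer.",
            tokenizer.vocab_size,
        )
        return True
    return False
```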

src/transformers/tokenization_utils_fast.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -207,6 +207,8 @@ def __init__(self, *args, **kwargs):
         if tokens:
             self.add_tokens(tokens)

+        # there was an issue with mistral models where the pre_tokenizer's regex pattern
+        # is not correct. Here we try to fix it.
         try:
             pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
             if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
```
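The new comment sits just above the existing `add_prefix_space` fix-up, which deserializes the backend pre-tokenizer's state and rebuilds it when the flag disagrees. A small self-contained sketch of that read-modify-replace round trip with the `tokenizers` library, using a directly constructed `ByteLevel` instance rather than one loaded from a model:

```python
import json

from tokenizers import pre_tokenizers

# Build a ByteLevel pre-tokenizer directly; no model download needed.
pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)

# __getstate__() serializes the pre-tokenizer to JSON, e.g.
# {"type": "ByteLevel", "add_prefix_space": false, ...}
state = json.loads(pre_tok.__getstate__())

# Read-modify-replace: look up the class by its "type" tag, flip the flag,
# and rebuild. The code below the new comment applies this same pattern to
# self.backend_tokenizer.pre_tokenizer.
pre_tok_class = getattr(pre_tokenizers, state.pop("type"))
state["add_prefix_space"] = True
fixed = pre_tok_class(**state)
assert json.loads(fixed.__getstate__())["add_prefix_space"] is True
```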
