
Commit 6063c8a

hf: add custom tokenizer example

1 parent d1d2ce5
File tree

6 files changed: +281 additions, -0 deletions

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from tokenizer.tokenization_custom import CustomTokenizer

# Create tokenizer
tokenizer = CustomTokenizer(vocab_size=100)

# Set auto_map as an attribute
tokenizer.auto_map = {
    "AutoTokenizer": ["tokenization_custom.CustomTokenizer", None]
}
tokenizer.save_pretrained("tokenizer")

print("Tokenizer saved to ./tokenizer")
print(f"Vocab size: {len(tokenizer)}")
print(f"Special tokens: {tokenizer.all_special_tokens}")
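As a quick sanity check (not part of the commit), the directory written by save_pretrained can be listed to see the generated files. This sketch assumes the script above was run from the repository root, so everything lands in ./tokenizer next to tokenization_custom.py:

from pathlib import Path

# save_pretrained typically writes special_tokens_map.json, tokenizer_config.json,
# and (via save_vocabulary) vocab.json; tokenization_custom.py already lives here
# and is what trust_remote_code loading will import.
for path in sorted(Path("tokenizer").iterdir()):
    print(path.name)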
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)

print("✓ Successfully loaded with AutoTokenizer!")
print(f"Type: {type(tokenizer)}")
print(f"Vocab size: {len(tokenizer)}")
print()

# Test encoding
text = "Hello, World!"
encoded = tokenizer.encode(text)
print(f"Text: '{text}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{tokenizer.decode(encoded)}'")
print()

# Test special tokens
print(f"PAD token: {tokenizer.pad_token} (id: {tokenizer.pad_token_id})")
print(f"UNK token: {tokenizer.unk_token} (id: {tokenizer.unk_token_id})")
print(f"BOS token: {tokenizer.bos_token} (id: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")
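Because pad_token is defined, the standard batched __call__ API should also work with this tokenizer. A minimal sketch (not part of the commit), assuming tokenizer was loaded as in the script above:

# Batch-encode with padding; the shorter sequence is padded with the [PAD] id (0)
batch = tokenizer(["Hi!", "Hello, World!"], padding=True)
print(batch["input_ids"])
print(batch["attention_mask"])  # 1 for real tokens, 0 for padded positions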
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
{
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "pad_token": "[PAD]",
  "unk_token": "[UNK]"
}
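This special_tokens_map.json is what save_pretrained writes from the special tokens the tokenizer was constructed with; the same mapping is available at runtime. A one-line sketch, assuming a tokenizer loaded as in the script above:

# Mirrors the JSON above (key order may differ between transformers versions)
print(tokenizer.special_tokens_map)
# e.g. {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}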
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
from transformers import PreTrainedTokenizer
from typing import List, Optional


class CustomTokenizer(PreTrainedTokenizer):

    def __init__(self, vocab_size=100, auto_map=None, **kwargs):
        print(f"Initializing CustomTokenizer with vocab_size={vocab_size} and kwargs={kwargs}")
        # Defaults if not provided via kwargs (e.g., when loading from config)
        default_pad = "[PAD]"
        default_unk = "[UNK]"
        default_bos = "[BOS]"
        default_eos = "[EOS]"

        # Pull possibly-present values from kwargs FIRST to avoid duplicates
        pad_token = kwargs.pop("pad_token", default_pad)
        unk_token = kwargs.pop("unk_token", default_unk)
        bos_token = kwargs.pop("bos_token", default_bos)
        eos_token = kwargs.pop("eos_token", default_eos)

        self._vocab_size_value = vocab_size
        self._pad_token = pad_token
        self._unk_token = unk_token
        self._bos_token = bos_token
        self._eos_token = eos_token

        # Build vocab: special tokens + printable ASCII chars
        special_tokens = [self._pad_token, self._unk_token, self._bos_token, self._eos_token]
        print(f"{special_tokens=}")
        chars = [chr(i) for i in range(32, 127)]
        print(chars)
        vocab_list = special_tokens + chars[: max(0, vocab_size - len(special_tokens))]
        print(f"{vocab_list=}")

        self._vocab = {tok: idx for idx, tok in enumerate(vocab_list)}
        self._reverse_vocab = {idx: tok for tok, idx in self._vocab.items()}

        # Optional: advertise class name (helps some HF versions)
        self.tokenizer_class = self.__class__.__name__

        super().__init__(
            pad_token=self._pad_token,
            unk_token=self._unk_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            auto_map=auto_map,
            **kwargs,  # now safe: no duplicate special-token keys left
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)  # Return actual vocab length

    def get_vocab(self):
        return self._vocab.copy()

    # This is what decides how to split text into tokens.
    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    # This function will get called when tokenizing a string. It is what allows
    # the tokenizer to convert from token to ID.
    def _convert_token_to_id(self, token: str) -> int:
        # _vocab is a dict mapping token to ID, and we are specifying a default
        # value if the token is not found which is the ID for the unk_token.
        return self._vocab.get(token, self._vocab[self._unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        return self._reverse_vocab.get(index, self._unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        import os
        import json

        if filename_prefix is None:
            filename_prefix = ""

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
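A quick round trip shows how the pieces fit together: _tokenize splits text into single characters, _convert_token_to_id looks each one up in the vocab built in __init__ and falls back to [UNK] for anything outside printable ASCII, and convert_tokens_to_string joins the characters back. A minimal sketch (not part of the commit), importing the module the same way the save script does:

from tokenizer.tokenization_custom import CustomTokenizer

tok = CustomTokenizer(vocab_size=100)
print(tok._tokenize("Hi!"))                        # ['H', 'i', '!']: character-level split
print(tok.convert_tokens_to_ids(["H", "i", "!"]))  # ids looked up in the vocab
print(tok.convert_tokens_to_ids(["é"]))            # outside printable ASCII, falls back to the [UNK] id (1)
print(tok.decode(tok.encode("Hi!")))               # 'Hi!': round trip through encode/decode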
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[BOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[EOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_custom.CustomTokenizer",
      null
    ]
  },
  "bos_token": "[BOS]",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "tokenizer_class": "CustomTokenizer",
  "unk_token": "[UNK]"
}
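The auto_map entry is what lets AutoTokenizer.from_pretrained(..., trust_remote_code=True) locate the class: the list holds the slow-tokenizer class path first and the fast-tokenizer class path second, which is null here because no fast (Rust-backed) implementation is provided. A small sketch to inspect it (assumption: the config was saved to ./tokenizer as in the scripts above):

import json

with open("tokenizer/tokenizer_config.json", encoding="utf-8") as f:
    config = json.load(f)

# ["tokenization_custom.CustomTokenizer", None]: slow class first, fast class second
print(config["auto_map"]["AutoTokenizer"])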
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
{
  "[PAD]": 0,
  "[UNK]": 1,
  "[BOS]": 2,
  "[EOS]": 3,
  " ": 4,
  "!": 5,
  "\"": 6,
  "#": 7,
  "$": 8,
  "%": 9,
  "&": 10,
  "'": 11,
  "(": 12,
  ")": 13,
  "*": 14,
  "+": 15,
  ",": 16,
  "-": 17,
  ".": 18,
  "/": 19,
  "0": 20,
  "1": 21,
  "2": 22,
  "3": 23,
  "4": 24,
  "5": 25,
  "6": 26,
  "7": 27,
  "8": 28,
  "9": 29,
  ":": 30,
  ";": 31,
  "<": 32,
  "=": 33,
  ">": 34,
  "?": 35,
  "@": 36,
  "A": 37,
  "B": 38,
  "C": 39,
  "D": 40,
  "E": 41,
  "F": 42,
  "G": 43,
  "H": 44,
  "I": 45,
  "J": 46,
  "K": 47,
  "L": 48,
  "M": 49,
  "N": 50,
  "O": 51,
  "P": 52,
  "Q": 53,
  "R": 54,
  "S": 55,
  "T": 56,
  "U": 57,
  "V": 58,
  "W": 59,
  "X": 60,
  "Y": 61,
  "Z": 62,
  "[": 63,
  "\\": 64,
  "]": 65,
  "^": 66,
  "_": 67,
  "`": 68,
  "a": 69,
  "b": 70,
  "c": 71,
  "d": 72,
  "e": 73,
  "f": 74,
  "g": 75,
  "h": 76,
  "i": 77,
  "j": 78,
  "k": 79,
  "l": 80,
  "m": 81,
  "n": 82,
  "o": 83,
  "p": 84,
  "q": 85,
  "r": 86,
  "s": 87,
  "t": 88,
  "u": 89,
  "v": 90,
  "w": 91,
  "x": 92,
  "y": 93,
  "z": 94,
  "{": 95,
  "|": 96,
  "}": 97,
  "~": 98
}
