
Commit 6063c8a

hf: add custom tokenizer example

1 parent d1d2ce5
File tree

6 files changed: +281 additions, -0 deletions

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from tokenizer.tokenization_custom import CustomTokenizer

# Create tokenizer
tokenizer = CustomTokenizer(vocab_size=100)

# Set auto_map as an attribute
tokenizer.auto_map = {
    "AutoTokenizer": ["tokenization_custom.CustomTokenizer", None]
}
tokenizer.save_pretrained("tokenizer")

print("Tokenizer saved to ./tokenizer")
print(f"Vocab size: {len(tokenizer)}")
print(f"Special tokens: {tokenizer.all_special_tokens}")
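As a quick sanity check (not part of the commit), the directory written by save_pretrained can be listed to see the generated files. This sketch assumes the script above was run from the repository root, so everything lands in ./tokenizer next to tokenization_custom.py:

from pathlib import Path

# save_pretrained typically writes special_tokens_map.json, tokenizer_config.json,
# and (via save_vocabulary) vocab.json; tokenization_custom.py already lives here
# and is what trust_remote_code loading will import.
for path in sorted(Path("tokenizer").iterdir()):
    print(path.name)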
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer", trust_remote_code=True)

print("✓ Successfully loaded with AutoTokenizer!")
print(f"Type: {type(tokenizer)}")
print(f"Vocab size: {len(tokenizer)}")
print()

# Test encoding
text = "Hello, World!"
encoded = tokenizer.encode(text)
print(f"Text: '{text}'")
print(f"Encoded: {encoded}")
print(f"Decoded: '{tokenizer.decode(encoded)}'")
print()

# Test special tokens
print(f"PAD token: {tokenizer.pad_token} (id: {tokenizer.pad_token_id})")
print(f"UNK token: {tokenizer.unk_token} (id: {tokenizer.unk_token_id})")
print(f"BOS token: {tokenizer.bos_token} (id: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")
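Because pad_token is defined, the standard batched __call__ API should also work with this tokenizer. A minimal sketch (not part of the commit), assuming tokenizer was loaded as in the script above:

# Batch-encode with padding; the shorter sequence is padded with the [PAD] id (0)
batch = tokenizer(["Hi!", "Hello, World!"], padding=True)
print(batch["input_ids"])
print(batch["attention_mask"])  # 1 for real tokens, 0 for padded positions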
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
{
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "pad_token": "[PAD]",
  "unk_token": "[UNK]"
}
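This special_tokens_map.json is what save_pretrained writes from the special tokens the tokenizer was constructed with; the same mapping is available at runtime. A one-line sketch, assuming a tokenizer loaded as in the script above:

# Mirrors the JSON above (key order may differ between transformers versions)
print(tokenizer.special_tokens_map)
# e.g. {'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}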
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
from transformers import PreTrainedTokenizer
from typing import List, Optional


class CustomTokenizer(PreTrainedTokenizer):

    def __init__(self, vocab_size=100, auto_map=None, **kwargs):
        print(f"Initializing CustomTokenizer with vocab_size={vocab_size} and kwargs={kwargs}")
        # Defaults if not provided via kwargs (e.g., when loading from config)
        default_pad = "[PAD]"
        default_unk = "[UNK]"
        default_bos = "[BOS]"
        default_eos = "[EOS]"

        # Pull possibly-present values from kwargs FIRST to avoid duplicates
        pad_token = kwargs.pop("pad_token", default_pad)
        unk_token = kwargs.pop("unk_token", default_unk)
        bos_token = kwargs.pop("bos_token", default_bos)
        eos_token = kwargs.pop("eos_token", default_eos)

        self._vocab_size_value = vocab_size
        self._pad_token = pad_token
        self._unk_token = unk_token
        self._bos_token = bos_token
        self._eos_token = eos_token

        # Build vocab: special tokens + printable ASCII chars
        special_tokens = [self._pad_token, self._unk_token, self._bos_token, self._eos_token]
        print(f"{special_tokens=}")
        chars = [chr(i) for i in range(32, 127)]
        print(chars)
        vocab_list = special_tokens + chars[: max(0, vocab_size - len(special_tokens))]
        print(f"{vocab_list=}")

        self._vocab = {tok: idx for idx, tok in enumerate(vocab_list)}
        self._reverse_vocab = {idx: tok for tok, idx in self._vocab.items()}

        # Optional: advertise class name (helps some HF versions)
        self.tokenizer_class = self.__class__.__name__

        super().__init__(
            pad_token=self._pad_token,
            unk_token=self._unk_token,
            bos_token=self._bos_token,
            eos_token=self._eos_token,
            auto_map=auto_map,
            **kwargs,  # now safe: no duplicate special-token keys left
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab)  # Return actual vocab length

    def get_vocab(self):
        return self._vocab.copy()

    # This is what decides how to split text into tokens.
    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    # This function will get called when tokenizing a string. It is what allows
    # the tokenizer to convert from token to ID.
    def _convert_token_to_id(self, token: str) -> int:
        # _vocab is a dict mapping token to ID, and we are specifying a default
        # value if the token is not found which is the ID for the unk_token.
        return self._vocab.get(token, self._vocab[self._unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        return self._reverse_vocab.get(index, self._unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        import os
        import json

        if filename_prefix is None:
            filename_prefix = ""

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)
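A quick round trip shows how the pieces fit together: _tokenize splits text into single characters, _convert_token_to_id looks each one up in the vocab built in __init__ and falls back to [UNK] for anything outside printable ASCII, and convert_tokens_to_string joins the characters back. A minimal sketch (not part of the commit), importing the module the same way the save script does:

from tokenizer.tokenization_custom import CustomTokenizer

tok = CustomTokenizer(vocab_size=100)
print(tok._tokenize("Hi!"))                        # ['H', 'i', '!']: character-level split
print(tok.convert_tokens_to_ids(["H", "i", "!"]))  # ids looked up in the vocab
print(tok.convert_tokens_to_ids(["é"]))            # outside printable ASCII, falls back to the [UNK] id (1)
print(tok.decode(tok.encode("Hi!")))               # 'Hi!': round trip through encode/decode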
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[BOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[EOS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_custom.CustomTokenizer",
      null
    ]
  },
  "bos_token": "[BOS]",
  "clean_up_tokenization_spaces": false,
  "eos_token": "[EOS]",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "tokenizer_class": "CustomTokenizer",
  "unk_token": "[UNK]"
}
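The auto_map entry is what lets AutoTokenizer.from_pretrained(..., trust_remote_code=True) locate the class: the list holds the slow-tokenizer class path first and the fast-tokenizer class path second, which is null here because no fast (Rust-backed) implementation is provided. A small sketch to inspect it (assumption: the config was saved to ./tokenizer as in the scripts above):

import json

with open("tokenizer/tokenizer_config.json", encoding="utf-8") as f:
    config = json.load(f)

# ["tokenization_custom.CustomTokenizer", None]: slow class first, fast class second
print(config["auto_map"]["AutoTokenizer"])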
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
{
  "[PAD]": 0,
  "[UNK]": 1,
  "[BOS]": 2,
  "[EOS]": 3,
  " ": 4,
  "!": 5,
  "\"": 6,
  "#": 7,
  "$": 8,
  "%": 9,
  "&": 10,
  "'": 11,
  "(": 12,
  ")": 13,
  "*": 14,
  "+": 15,
  ",": 16,
  "-": 17,
  ".": 18,
  "/": 19,
  "0": 20,
  "1": 21,
  "2": 22,
  "3": 23,
  "4": 24,
  "5": 25,
  "6": 26,
  "7": 27,
  "8": 28,
  "9": 29,
  ":": 30,
  ";": 31,
  "<": 32,
  "=": 33,
  ">": 34,
  "?": 35,
  "@": 36,
  "A": 37,
  "B": 38,
  "C": 39,
  "D": 40,
  "E": 41,
  "F": 42,
  "G": 43,
  "H": 44,
  "I": 45,
  "J": 46,
  "K": 47,
  "L": 48,
  "M": 49,
  "N": 50,
  "O": 51,
  "P": 52,
  "Q": 53,
  "R": 54,
  "S": 55,
  "T": 56,
  "U": 57,
  "V": 58,
  "W": 59,
  "X": 60,
  "Y": 61,
  "Z": 62,
  "[": 63,
  "\\": 64,
  "]": 65,
  "^": 66,
  "_": 67,
  "`": 68,
  "a": 69,
  "b": 70,
  "c": 71,
  "d": 72,
  "e": 73,
  "f": 74,
  "g": 75,
  "h": 76,
  "i": 77,
  "j": 78,
  "k": 79,
  "l": 80,
  "m": 81,
  "n": 82,
  "o": 83,
  "p": 84,
  "q": 85,
  "r": 86,
  "s": 87,
  "t": 88,
  "u": 89,
  "v": 90,
  "w": 91,
  "x": 92,
  "y": 93,
  "z": 94,
  "{": 95,
  "|": 96,
  "}": 97,
  "~": 98
}
