From 7cf18aa028f17bfb48ed43c17678534a31731480 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 27 May 2025 09:50:24 +0200 Subject: [PATCH 01/17] Add benchmark for deserializing large added vocab --- tokenizers/Cargo.toml | 4 ++ tokenizers/benches/added_vocab_deserialize.rs | 37 ++++++++++++++++ tokenizers/src/models/bpe/model.rs | 28 ++++++++++++ tokenizers/src/models/mod.rs | 10 +++++ tokenizers/src/models/unigram/model.rs | 19 ++++++++ tokenizers/src/models/wordlevel/mod.rs | 24 +++++++++++ tokenizers/src/models/wordpiece/mod.rs | 24 +++++++++++ tokenizers/src/tokenizer/added_vocabulary.rs | 43 ++++++++++--------- 8 files changed, 169 insertions(+), 20 deletions(-) create mode 100644 tokenizers/benches/added_vocab_deserialize.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index db56865d2..25dc7f2c8 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -41,6 +41,10 @@ name = "llama3" required-features = ["http"] harness = false +[[bench]] +name = "added_vocab_deserialize" +harness = false + [dependencies] rand = "0.8" onig = { version = "6.4", default-features = false, optional = true } diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs new file mode 100644 index 000000000..4286a84c4 --- /dev/null +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -0,0 +1,37 @@ +#[macro_use] +extern crate criterion; + +use criterion::{black_box, Criterion}; +use tokenizers::tokenizer::{AddedToken, Tokenizer}; +use tokenizers::models::wordlevel::WordLevel; +use std::collections::HashMap; + +fn serialized_tokenizer(size: usize) -> String { + // Create a very small model + let mut vocab = HashMap::new(); + vocab.insert("a".to_string(), 0); + let model = WordLevel::builder().vocab(vocab).unk_token("[UNK]".into()).build().unwrap(); + let mut tokenizer = Tokenizer::new(model); + // Add many tokens to the added vocabulary + let tokens: Vec<_> = (0..size) + .map(|i| AddedToken::from(format!("tok{i}"), false)) + .collect(); + tokenizer.add_tokens(&tokens); + serde_json::to_string(&tokenizer).unwrap() +} + +fn bench_deserialize(c: &mut Criterion) { + for &size in &[10_000usize, 100_000, 400_000] { + let json = serialized_tokenizer(size); + let label = format!("deserialize_added_vocab_{size}"); + c.bench_function(&label, |b| { + b.iter(|| { + let tok: Tokenizer = black_box(serde_json::from_str(&json).unwrap()); + black_box(tok); + }) + }); + } +} + +criterion_group!(benches, bench_deserialize); +criterion_main!(benches); diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs index 217c37e90..3cb0af35b 100644 --- a/tokenizers/src/models/bpe/model.rs +++ b/tokenizers/src/models/bpe/model.rs @@ -361,6 +361,34 @@ impl BPE { } } + /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs + /// after the current maximum ID. 
+ pub fn add_tokens(&mut self, tokens: &[String]) -> usize { + let mut added = 0; + let mut next_id = self + .vocab_r + .keys() + .copied() + .max() + .map_or(0, |max| max + 1); + + for token in tokens { + if self.vocab.contains_key(token) { + continue; + } + self.vocab.insert(token.clone(), next_id); + self.vocab_r.insert(next_id, token.clone()); + added += 1; + next_id += 1; + } + + if let Some(ref cache) = self.cache { + cache.clear(); + } + + added + } + pub fn get_vocab(&self) -> Vocab { self.vocab.clone() } diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index 3a3a91adc..9eb24b4c1 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -222,6 +222,16 @@ impl ModelWrapper { _ => (), } } + + /// Add tokens to the vocabulary of the underlying model. + pub fn add_tokens(&mut self, tokens: &[String]) -> usize { + match self { + Self::WordLevel(model) => model.add_tokens(tokens), + Self::WordPiece(model) => model.add_tokens(tokens), + Self::BPE(model) => model.add_tokens(tokens), + Self::Unigram(model) => model.add_tokens(tokens), + } + } } #[derive(Clone, Serialize, Deserialize)] diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index da4d631ce..7c2963438 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -157,6 +157,25 @@ impl Unigram { self.vocab.len() } + /// Add tokens to the vocabulary. New tokens receive a default score equal to + /// the current minimum score and are appended at the end of the vocab. + pub fn add_tokens(&mut self, tokens: &[String]) -> usize { + let mut added = 0; + let mut next_id = self.vocab.len() as u32; + for token in tokens { + if self.token_to_ids.contains_key(token) { + continue; + } + self.token_to_ids.insert(token.clone(), next_id); + self.trie.push(&token.as_bytes()); + self.vocab.push((token.clone(), self.min_score)); + added += 1; + next_id += 1; + } + self.cache.clear(); + added + } + pub(super) fn populate_nodes(&self, lattice: &mut Lattice) { let unk_score = self.min_score - K_UNK_PENALTY; diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs index 545db13a7..063898422 100644 --- a/tokenizers/src/models/wordlevel/mod.rs +++ b/tokenizers/src/models/wordlevel/mod.rs @@ -143,6 +143,30 @@ impl WordLevel { let vocab = WordLevel::read_file(vocab_path)?; Self::builder().vocab(vocab).unk_token(unk_token).build() } + + /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs + /// after the current maximum ID. + pub fn add_tokens(&mut self, tokens: &[String]) -> usize { + let mut added = 0; + let mut next_id = self + .vocab_r + .keys() + .copied() + .max() + .map_or(0, |max| max + 1); + + for token in tokens { + if self.vocab.contains_key(token) { + continue; + } + self.vocab.insert(token.clone(), next_id); + self.vocab_r.insert(next_id, token.clone()); + added += 1; + next_id += 1; + } + + added + } } impl Default for WordLevel { diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs index 0c63405c1..66c2b3b98 100644 --- a/tokenizers/src/models/wordpiece/mod.rs +++ b/tokenizers/src/models/wordpiece/mod.rs @@ -187,6 +187,30 @@ impl WordPiece { } wp } + + /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs + /// after the current maximum ID. 
+ pub fn add_tokens(&mut self, tokens: &[String]) -> usize { + let mut added = 0; + let mut next_id = self + .vocab_r + .keys() + .copied() + .max() + .map_or(0, |max| max + 1); + + for token in tokens { + if self.vocab.contains_key(token) { + continue; + } + self.vocab.insert(token.clone(), next_id); + self.vocab_r.insert(next_id, token.clone()); + added += 1; + next_id += 1; + } + + added + } } impl Model for WordPiece { diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index f988477be..952e69579 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -272,50 +272,53 @@ impl AddedVocabulary { } } - // Then we delegate to `add_tokens`, that will take care of refreshing added tokens too. let mut ignored = 0; + use std::collections::HashSet; + let mut existing: HashSet = self.added_tokens_map_r.values().cloned().collect(); + let mut next_id = self + .added_tokens_map_r + .keys() + .copied() + .max() + .map_or(model.get_vocab_size() as u32, |max| { + if max >= model.get_vocab_size() as u32 || model.get_vocab_size() == 0 { + max + 1 + } else { + model.get_vocab_size() as u32 + } + }); + for token in tokens { - if token.content.is_empty() || self.added_tokens_map_r.values().any(|val| val == token) - { + if token.content.is_empty() || existing.contains(token) { ignored += 1; continue; } - // If a token is already part of the vocabulary, we mark it as added + let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) { new_id } else { - self.added_tokens_map.values().cloned().max().map_or( - model.get_vocab_size() as u32, - |max| { - if (max >= model.get_vocab_size() as u32) || model.get_vocab_size() == 0 { - max + 1 - } else { - model.get_vocab_size() as u32 - } - }, - ) + let id = next_id; + next_id += 1; + id }; - // Make sure we modify the previous entry + self.added_tokens_map .entry(token.content.clone()) .and_modify(|old_id| *old_id = new_id) - .or_insert_with(|| new_id); - // Update the current revert operation + .or_insert(new_id); self.added_tokens_map_r .entry(new_id) .and_modify(|t| *t = token.clone()) .or_insert_with(|| token.clone()); - // Make sure to remove previous entry (if the token gets a new id) - // Finally add the token to the classic set if special if !self.special_tokens_set.contains(&token.content) { self.added_tokens.push(token.clone()); } + existing.insert(token.clone()); } self.refresh_added_tokens(model, normalizer); - // Return the number of added tokens tokens.len() - ignored } From 47c2e9fa86613aa519a147078310780b05f39af0 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 12:11:15 +0200 Subject: [PATCH 02/17] revert dumb stuff, isolate changes --- tokenizers/src/models/bpe/model.rs | 28 -------------------------- tokenizers/src/models/mod.rs | 10 --------- tokenizers/src/models/unigram/model.rs | 19 ----------------- tokenizers/src/models/wordlevel/mod.rs | 24 ---------------------- tokenizers/src/models/wordpiece/mod.rs | 24 ---------------------- 5 files changed, 105 deletions(-) diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs index 3cb0af35b..217c37e90 100644 --- a/tokenizers/src/models/bpe/model.rs +++ b/tokenizers/src/models/bpe/model.rs @@ -361,34 +361,6 @@ impl BPE { } } - /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs - /// after the current maximum ID. 
- pub fn add_tokens(&mut self, tokens: &[String]) -> usize { - let mut added = 0; - let mut next_id = self - .vocab_r - .keys() - .copied() - .max() - .map_or(0, |max| max + 1); - - for token in tokens { - if self.vocab.contains_key(token) { - continue; - } - self.vocab.insert(token.clone(), next_id); - self.vocab_r.insert(next_id, token.clone()); - added += 1; - next_id += 1; - } - - if let Some(ref cache) = self.cache { - cache.clear(); - } - - added - } - pub fn get_vocab(&self) -> Vocab { self.vocab.clone() } diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index 9eb24b4c1..3a3a91adc 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -222,16 +222,6 @@ impl ModelWrapper { _ => (), } } - - /// Add tokens to the vocabulary of the underlying model. - pub fn add_tokens(&mut self, tokens: &[String]) -> usize { - match self { - Self::WordLevel(model) => model.add_tokens(tokens), - Self::WordPiece(model) => model.add_tokens(tokens), - Self::BPE(model) => model.add_tokens(tokens), - Self::Unigram(model) => model.add_tokens(tokens), - } - } } #[derive(Clone, Serialize, Deserialize)] diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index 7c2963438..da4d631ce 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -157,25 +157,6 @@ impl Unigram { self.vocab.len() } - /// Add tokens to the vocabulary. New tokens receive a default score equal to - /// the current minimum score and are appended at the end of the vocab. - pub fn add_tokens(&mut self, tokens: &[String]) -> usize { - let mut added = 0; - let mut next_id = self.vocab.len() as u32; - for token in tokens { - if self.token_to_ids.contains_key(token) { - continue; - } - self.token_to_ids.insert(token.clone(), next_id); - self.trie.push(&token.as_bytes()); - self.vocab.push((token.clone(), self.min_score)); - added += 1; - next_id += 1; - } - self.cache.clear(); - added - } - pub(super) fn populate_nodes(&self, lattice: &mut Lattice) { let unk_score = self.min_score - K_UNK_PENALTY; diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs index 063898422..545db13a7 100644 --- a/tokenizers/src/models/wordlevel/mod.rs +++ b/tokenizers/src/models/wordlevel/mod.rs @@ -143,30 +143,6 @@ impl WordLevel { let vocab = WordLevel::read_file(vocab_path)?; Self::builder().vocab(vocab).unk_token(unk_token).build() } - - /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs - /// after the current maximum ID. - pub fn add_tokens(&mut self, tokens: &[String]) -> usize { - let mut added = 0; - let mut next_id = self - .vocab_r - .keys() - .copied() - .max() - .map_or(0, |max| max + 1); - - for token in tokens { - if self.vocab.contains_key(token) { - continue; - } - self.vocab.insert(token.clone(), next_id); - self.vocab_r.insert(next_id, token.clone()); - added += 1; - next_id += 1; - } - - added - } } impl Default for WordLevel { diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs index 66c2b3b98..0c63405c1 100644 --- a/tokenizers/src/models/wordpiece/mod.rs +++ b/tokenizers/src/models/wordpiece/mod.rs @@ -187,30 +187,6 @@ impl WordPiece { } wp } - - /// Add tokens to the vocabulary. New tokens will be assigned consecutive IDs - /// after the current maximum ID. 
- pub fn add_tokens(&mut self, tokens: &[String]) -> usize { - let mut added = 0; - let mut next_id = self - .vocab_r - .keys() - .copied() - .max() - .map_or(0, |max| max + 1); - - for token in tokens { - if self.vocab.contains_key(token) { - continue; - } - self.vocab.insert(token.clone(), next_id); - self.vocab_r.insert(next_id, token.clone()); - added += 1; - next_id += 1; - } - - added - } } impl Model for WordPiece { From a8f6a71ab0f2bff2ed48e74efd8f40f1838e5e5b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 12:13:01 +0200 Subject: [PATCH 03/17] try to only normalize once --- tokenizers/src/tokenizer/added_vocabulary.rs | 84 ++++++++++++-------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 952e69579..467c55f77 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -326,45 +326,65 @@ impl AddedVocabulary { /// /// We keep two different RegexSet, one that will take care of matching against the /// non-normalized string, and one matching against the normalized one. - fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) { - type TupleTokenId<'a> = (&'a AddedToken, u32); - let (normalized, non_normalized): (Vec, Vec) = self - .special_tokens +fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) { + type TupleTokenId<'a> = (&'a AddedToken, u32); + let (normalized, non_normalized): (Vec, Vec) = self + .special_tokens + .iter() + .chain(self.added_tokens.iter()) + .map(|token| { + ( + token, + self.token_to_id(&token.content, model) + .expect("Missing additional token"), + ) + }) + .partition(|(token, _)| token.normalized); + + // Build non-normalized trie + let (tokens, ids): (Vec<&AddedToken>, Vec) = non_normalized.into_iter().unzip(); + let trie = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(tokens.iter().map(|token| &token.content)) + .expect("Failed to build trie when refreshing tokens"); + self.split_trie = (trie, ids); + + // Build normalized trie + let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); + if let Some(n) = normalizer { + let delimiter = "\u{0000}"; + let joined = ntokens .iter() - .chain(self.added_tokens.iter()) - .map(|token| { - ( - token, - self.token_to_id(&token.content, model) - .expect("Missing additional token"), - ) - }) - .partition(|(token, _)| token.normalized); + .map(|token| token.content.as_str()) + .collect::>() + .join(delimiter); - let (tokens, ids): (Vec<&AddedToken>, Vec) = non_normalized.into_iter().unzip(); - let trie = AhoCorasickBuilder::new() - .match_kind(MatchKind::LeftmostLongest) - .build(tokens.iter().map(|token| &token.content)) - .expect("Failed to build tried when refreshing tokens"); - self.split_trie = (trie, ids); + let mut content = NormalizedString::from(joined); + n.normalize(&mut content).unwrap(); + let normalized_str = content.get(); + let split_normalized: Vec<&str> = normalized_str.split(delimiter).collect(); - let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); - let patterns: Vec<_> = ntokens - .iter() - .map(|token| { - let mut content = NormalizedString::from(token.content.as_ref()); - if let Some(n) = normalizer { - n.normalize(&mut content).unwrap(); - } - content - }) - .collect(); + assert_eq!( + split_normalized.len(), + ntokens.len(), + "Mismatch between normalized tokens and split results" + 
); + + let normalized_trie = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(split_normalized) + .expect("Failed to build trie when refreshing tokens (normalized)"); + self.split_normalized_trie = (normalized_trie, nids); + } else { + // Fallback: use raw content if no normalizer provided + let patterns: Vec<&str> = ntokens.iter().map(|token| token.content.as_str()).collect(); let normalized_trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) - .build(patterns.iter().map(|content| content.get())) - .expect("Failed to build tried when refreshing tokens (normalized)"); + .build(patterns) + .expect("Failed to build trie when refreshing tokens (normalized)"); self.split_normalized_trie = (normalized_trie, nids); } +} /// Find any AddedToken in the given sentence, using the provided MatchingSet. /// This method returns a list "splits", each of them being a pair of Offsets From 8ba1d202797ce608f1cb461679bc5bb8045df676 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 13:30:59 +0200 Subject: [PATCH 04/17] small improvement? --- tokenizers/src/tokenizer/added_vocabulary.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 467c55f77..8fb3efec7 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -347,7 +347,7 @@ fn refresh_added_tokens(&mut self, model: &impl Model, normalizer .match_kind(MatchKind::LeftmostLongest) .build(tokens.iter().map(|token| &token.content)) .expect("Failed to build trie when refreshing tokens"); - self.split_trie = (trie, ids); + self.split_trie = (trie.clone(), ids.clone()); // Build normalized trie let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); @@ -376,13 +376,7 @@ fn refresh_added_tokens(&mut self, model: &impl Model, normalizer .expect("Failed to build trie when refreshing tokens (normalized)"); self.split_normalized_trie = (normalized_trie, nids); } else { - // Fallback: use raw content if no normalizer provided - let patterns: Vec<&str> = ntokens.iter().map(|token| token.content.as_str()).collect(); - let normalized_trie = AhoCorasickBuilder::new() - .match_kind(MatchKind::LeftmostLongest) - .build(patterns) - .expect("Failed to build trie when refreshing tokens (normalized)"); - self.split_normalized_trie = (normalized_trie, nids); + self.split_normalized_trie = (trie, ids); // non normalized is the same } } From 6714ceb33845ca91311a1a42ec9088407e0e6af2 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 14:03:53 +0200 Subject: [PATCH 05/17] some updates --- tokenizers/benches/added_vocab_deserialize.rs | 66 ++++++++++++++----- tokenizers/src/normalizers/byte_level.rs | 2 + tokenizers/src/normalizers/mod.rs | 2 + tokenizers/src/tokenizer/added_vocabulary.rs | 41 ++++++------ tokenizers/src/tokenizer/mod.rs | 2 +- 5 files changed, 76 insertions(+), 37 deletions(-) diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs index 4286a84c4..296ca4d84 100644 --- a/tokenizers/benches/added_vocab_deserialize.rs +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -1,35 +1,71 @@ #[macro_use] extern crate criterion; -use criterion::{black_box, Criterion}; -use tokenizers::tokenizer::{AddedToken, Tokenizer}; -use tokenizers::models::wordlevel::WordLevel; use std::collections::HashMap; +use criterion::{black_box, Criterion}; +use 
tokenizers::{ + AddedToken, Normalizer, Tokenizer, + models::wordlevel::WordLevel, + normalizers::*, +}; -fn serialized_tokenizer(size: usize) -> String { - // Create a very small model +fn serialized_tokenizer>( + size: usize, + normalizer: Option, +) -> String { let mut vocab = HashMap::new(); vocab.insert("a".to_string(), 0); - let model = WordLevel::builder().vocab(vocab).unk_token("[UNK]".into()).build().unwrap(); + let model = WordLevel::builder() + .vocab(vocab) + .unk_token("[UNK]".into()) + .build() + .unwrap(); + let mut tokenizer = Tokenizer::new(model); - // Add many tokens to the added vocabulary let tokens: Vec<_> = (0..size) .map(|i| AddedToken::from(format!("tok{i}"), false)) .collect(); tokenizer.add_tokens(&tokens); + + if let Some(norm) = normalizer { + tokenizer.with_normalizer(Some(norm)); + } + serde_json::to_string(&tokenizer).unwrap() } fn bench_deserialize(c: &mut Criterion) { + let normalizers: Vec<(&str, Option NormalizerWrapper>)> = vec![ + ("none", None), + ("bert", Some(|| BertNormalizer::default().into())), + ("byte_level", Some(|| ByteLevel::default().into())), + ("lowercase", Some(|| Lowercase.into())), + ("nfc", Some(|| NFC.into())), + ("nfd", Some(|| NFD.into())), + ("nfkc", Some(|| NFKC.into())), + ("nfkd", Some(|| NFKD.into())), + ("nmt", Some(|| Nmt.into())), + ("strip", Some(|| Strip::new(true, true).into())), + ("replace", Some(|| Replace::new("a", "b").unwrap().into())), + ("prepend", Some(|| Prepend::new("pre_".to_string()).into())), + ]; + for &size in &[10_000usize, 100_000, 400_000] { - let json = serialized_tokenizer(size); - let label = format!("deserialize_added_vocab_{size}"); - c.bench_function(&label, |b| { - b.iter(|| { - let tok: Tokenizer = black_box(serde_json::from_str(&json).unwrap()); - black_box(tok); - }) - }); + for (norm_name, maybe_factory) in &normalizers { + let label = format!("deserialize_added_vocab_{}_norm_{}", size, norm_name); + + let json = match maybe_factory { + Some(factory) => serialized_tokenizer(size, Some(factory())), + None => serialized_tokenizer::(size, None), + }; + + c.bench_function(&label, |b| { + b.iter(|| { + let tok: Tokenizer = black_box(serde_json::from_str(&json).unwrap()); + black_box(tok); + }) + }); + } } } diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index ae8fecfb6..13697a129 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -8,6 +8,8 @@ use std::sync::LazyLock; #[macro_rules_attribute(impl_serde_type!)] pub struct ByteLevel; + + static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); impl Default for ByteLevel { diff --git a/tokenizers/src/normalizers/mod.rs b/tokenizers/src/normalizers/mod.rs index f400f13da..0f7bab6a0 100644 --- a/tokenizers/src/normalizers/mod.rs +++ b/tokenizers/src/normalizers/mod.rs @@ -38,6 +38,8 @@ pub enum NormalizerWrapper { ByteLevel(ByteLevel), } +unsafe impl Sync for NormalizerWrapper {} + impl<'de> Deserialize<'de> for NormalizerWrapper { fn deserialize(deserializer: D) -> std::result::Result where diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 6da0c622a..c2451c6a7 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -6,6 +6,7 @@ use regex::Regex; use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use rayon::prelude::*; /// Represent a token added by 
the user on top of the existing Model vocabulary. /// AddedToken can be configured to specify the behavior they should have in various situations @@ -164,6 +165,21 @@ pub struct AddedVocabulary { encode_special_tokens: bool, } + +fn normalize_token_contents( + n: &N, + ntokens: Vec<&AddedToken>, +) -> Vec { + ntokens + .par_iter() + .map(|token| { + let mut content = NormalizedString::from(token.content.as_ref()); + n.normalize(&mut content).expect("Normalization failed"); + content.get().to_string() // Convert once, reuse later + }) + .collect() +} + impl AddedVocabulary { pub fn new() -> Self { let trie = AhoCorasickBuilder::new() @@ -350,30 +366,13 @@ fn refresh_added_tokens(&mut self, model: &impl Model, normalizer self.split_trie = (trie.clone(), ids.clone()); // Build normalized trie - let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); if let Some(n) = normalizer { - let delimiter = "\u{0000}"; - let joined = ntokens - .iter() - .map(|token| token.content.as_str()) - .collect::>() - .join(delimiter); - - let mut content = NormalizedString::from(joined); - n.normalize(&mut content).unwrap(); - let normalized_str = content.get(); - let split_normalized: Vec<&str> = normalized_str.split(delimiter).collect(); - - assert_eq!( - split_normalized.len(), - ntokens.len(), - "Mismatch between normalized tokens and split results" - ); - + let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); + let patterns: Vec<_> =normalize_token_contents(n, ntokens); let normalized_trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) - .build(split_normalized) - .expect("Failed to build trie when refreshing tokens (normalized)"); + .build(patterns) + .expect("Failed to build tried when refreshing tokens (normalized)"); self.split_normalized_trie = (normalized_trie, nids); } else { self.split_normalized_trie = (trie, ids); // non normalized is the same diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index f4a136091..c8b756792 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -53,7 +53,7 @@ pub type Result = std::result::Result; pub type Offsets = (usize, usize); /// Takes care of pre-processing strings. 
-pub trait Normalizer { +pub trait Normalizer : Sync{ fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>; } From e07ecfc6e5d73ff6893643c2b384f4258d7a6c99 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 14:05:38 +0200 Subject: [PATCH 06/17] nit --- tokenizers/src/tokenizer/added_vocabulary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index c2451c6a7..3c718dea3 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -368,7 +368,7 @@ fn refresh_added_tokens(&mut self, model: &impl Model, normalizer // Build normalized trie if let Some(n) = normalizer { let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); - let patterns: Vec<_> =normalize_token_contents(n, ntokens); + let patterns: Vec<_> = normalize_token_contents(n, ntokens); let normalized_trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) .build(patterns) From 8849d71e9962da1f7da5369996b55667aa5b7bdd Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 14:09:52 +0200 Subject: [PATCH 07/17] fmt --- tokenizers/benches/added_vocab_deserialize.rs | 10 +-- tokenizers/src/normalizers/byte_level.rs | 2 - tokenizers/src/tokenizer/added_vocabulary.rs | 90 +++++++++---------- tokenizers/src/tokenizer/mod.rs | 2 +- 4 files changed, 46 insertions(+), 58 deletions(-) diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs index 296ca4d84..1eeee8046 100644 --- a/tokenizers/benches/added_vocab_deserialize.rs +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -1,13 +1,9 @@ #[macro_use] extern crate criterion; -use std::collections::HashMap; use criterion::{black_box, Criterion}; -use tokenizers::{ - AddedToken, Normalizer, Tokenizer, - models::wordlevel::WordLevel, - normalizers::*, -}; +use std::collections::HashMap; +use tokenizers::{models::wordlevel::WordLevel, normalizers::*, AddedToken, Normalizer, Tokenizer}; fn serialized_tokenizer>( size: usize, @@ -37,7 +33,6 @@ fn serialized_tokenizer>( fn bench_deserialize(c: &mut Criterion) { let normalizers: Vec<(&str, Option NormalizerWrapper>)> = vec![ ("none", None), - ("bert", Some(|| BertNormalizer::default().into())), ("byte_level", Some(|| ByteLevel::default().into())), ("lowercase", Some(|| Lowercase.into())), ("nfc", Some(|| NFC.into())), @@ -48,6 +43,7 @@ fn bench_deserialize(c: &mut Criterion) { ("strip", Some(|| Strip::new(true, true).into())), ("replace", Some(|| Replace::new("a", "b").unwrap().into())), ("prepend", Some(|| Prepend::new("pre_".to_string()).into())), + ("bert", Some(|| BertNormalizer::default().into())), ]; for &size in &[10_000usize, 100_000, 400_000] { diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index 13697a129..ae8fecfb6 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -8,8 +8,6 @@ use std::sync::LazyLock; #[macro_rules_attribute(impl_serde_type!)] pub struct ByteLevel; - - static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); impl Default for ByteLevel { diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 3c718dea3..fe599f09a 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -2,11 +2,11 @@ use super::{ normalizer::Range, 
Model, NormalizedString, Normalizer, Offsets, PreTokenizedString, Token, }; use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use rayon::prelude::*; use regex::Regex; use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use rayon::prelude::*; /// Represent a token added by the user on top of the existing Model vocabulary. /// AddedToken can be configured to specify the behavior they should have in various situations @@ -165,17 +165,13 @@ pub struct AddedVocabulary { encode_special_tokens: bool, } - -fn normalize_token_contents( - n: &N, - ntokens: Vec<&AddedToken>, -) -> Vec { +fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { ntokens .par_iter() .map(|token| { let mut content = NormalizedString::from(token.content.as_ref()); n.normalize(&mut content).expect("Normalization failed"); - content.get().to_string() // Convert once, reuse later + content.get().to_string() // Convert once, reuse later }) .collect() } @@ -291,18 +287,16 @@ impl AddedVocabulary { let mut ignored = 0; use std::collections::HashSet; let mut existing: HashSet = self.added_tokens_map_r.values().cloned().collect(); - let mut next_id = self - .added_tokens_map_r - .keys() - .copied() - .max() - .map_or(model.get_vocab_size() as u32, |max| { + let mut next_id = self.added_tokens_map_r.keys().copied().max().map_or( + model.get_vocab_size() as u32, + |max| { if max >= model.get_vocab_size() as u32 || model.get_vocab_size() == 0 { max + 1 } else { model.get_vocab_size() as u32 } - }); + }, + ); for token in tokens { if token.content.is_empty() || existing.contains(token) { @@ -342,42 +336,42 @@ impl AddedVocabulary { /// /// We keep two different RegexSet, one that will take care of matching against the /// non-normalized string, and one matching against the normalized one. 
-fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) { - type TupleTokenId<'a> = (&'a AddedToken, u32); - let (normalized, non_normalized): (Vec, Vec) = self - .special_tokens - .iter() - .chain(self.added_tokens.iter()) - .map(|token| { - ( - token, - self.token_to_id(&token.content, model) - .expect("Missing additional token"), - ) - }) - .partition(|(token, _)| token.normalized); - - // Build non-normalized trie - let (tokens, ids): (Vec<&AddedToken>, Vec) = non_normalized.into_iter().unzip(); - let trie = AhoCorasickBuilder::new() - .match_kind(MatchKind::LeftmostLongest) - .build(tokens.iter().map(|token| &token.content)) - .expect("Failed to build trie when refreshing tokens"); - self.split_trie = (trie.clone(), ids.clone()); - - // Build normalized trie - if let Some(n) = normalizer { - let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); - let patterns: Vec<_> = normalize_token_contents(n, ntokens); - let normalized_trie = AhoCorasickBuilder::new() + fn refresh_added_tokens(&mut self, model: &impl Model, normalizer: Option<&N>) { + type TupleTokenId<'a> = (&'a AddedToken, u32); + let (normalized, non_normalized): (Vec, Vec) = self + .special_tokens + .iter() + .chain(self.added_tokens.iter()) + .map(|token| { + ( + token, + self.token_to_id(&token.content, model) + .expect("Missing additional token"), + ) + }) + .partition(|(token, _)| token.normalized); + + // Build non-normalized trie + let (tokens, ids): (Vec<&AddedToken>, Vec) = non_normalized.into_iter().unzip(); + let trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) - .build(patterns) - .expect("Failed to build tried when refreshing tokens (normalized)"); - self.split_normalized_trie = (normalized_trie, nids); - } else { - self.split_normalized_trie = (trie, ids); // non normalized is the same + .build(tokens.iter().map(|token| &token.content)) + .expect("Failed to build trie when refreshing tokens"); + self.split_trie = (trie.clone(), ids.clone()); + + // Build normalized trie + if let Some(n) = normalizer { + let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); + let patterns: Vec<_> = normalize_token_contents(n, ntokens); + let normalized_trie = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(patterns) + .expect("Failed to build tried when refreshing tokens (normalized)"); + self.split_normalized_trie = (normalized_trie, nids); + } else { + self.split_normalized_trie = (trie, ids); // non normalized is the same + } } -} /// Find any AddedToken in the given sentence, using the provided MatchingSet. /// This method returns a list "splits", each of them being a pair of Offsets diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index c8b756792..14f60701d 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -53,7 +53,7 @@ pub type Result = std::result::Result; pub type Offsets = (usize, usize); /// Takes care of pre-processing strings. -pub trait Normalizer : Sync{ +pub trait Normalizer: Sync { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>; } From 5da668a9620debd56a44fb29210f29dd38b02dcd Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 14:45:52 +0200 Subject: [PATCH 08/17] normalized string are a fucking waste of time when you just want to add tokens to the vocab man.... 
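
The goal here is to skip NormalizedString's alignment bookkeeping when all we
need is the normalized text of an added token. A minimal standalone sketch of
the idea (illustrative only: `byte_level_fast` and the toy `bytes_char` below
are stand-ins for the crate's byte-level normalizer and its BYTES_CHAR table,
not the real API):

    // Byte-level normalization without offset tracking: map each input byte
    // to a char and collect straight into a String.
    fn byte_level_fast(input: &str) -> String {
        // Toy byte-to-char mapping; the real table remaps every byte to a
        // distinct printable character.
        fn bytes_char(b: u8) -> char {
            char::from_u32(0x100 + u32::from(b)).unwrap()
        }
        let mut out = String::with_capacity(input.len());
        for &b in input.as_bytes() {
            out.push(bytes_char(b));
        }
        out
    }

    fn main() {
        // Two input bytes become two output chars, without building the
        // per-character (char, isize) transformation list or an alignments
        // vector along the way.
        assert_eq!(byte_level_fast("ab").chars().count(), 2);
    }
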
--- tokenizers/src/normalizers/byte_level.rs | 25 +++++++++++++------- tokenizers/src/tokenizer/added_vocabulary.rs | 4 +--- tokenizers/src/tokenizer/mod.rs | 4 ++++ tokenizers/src/tokenizer/normalizer.rs | 18 ++++++++++++++ 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index ae8fecfb6..e4fa66604 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -31,25 +31,34 @@ impl Normalizer for ByteLevel { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { if !normalized.is_empty() { let s = normalized.get(); + let s_bytes = s.as_bytes(); let mut transformations: Vec<(char, isize)> = Vec::with_capacity(s.len()); let mut i = 0; for cur_char in s.chars() { let size = cur_char.len_utf8(); - let bytes = &s.as_bytes()[i..i + size]; + let bytes = &s_bytes[i..i + size]; i += size; - transformations.extend( - bytes - .iter() - .enumerate() - .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))), - ); + for (j, b) in bytes.iter().enumerate() { + transformations.push((BYTES_CHAR[b], if j > 0 { 1 } else { 0 })); + } } normalized.transform(transformations, 0); } - Ok(()) + Ok(()) + } + + /// Fast normalization: byte-to-char mapping without tracking positions + fn normalize_fast(&self, input: &str) -> String { + let mut out = String::with_capacity(input.len()); + for b in input.as_bytes() { + out.push(BYTES_CHAR[b]); + } + out } } +unsafe impl Sync for ByteLevel {} + #[cfg(test)] mod tests { diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index fe599f09a..18e2d90de 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -169,9 +169,7 @@ fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToke ntokens .par_iter() .map(|token| { - let mut content = NormalizedString::from(token.content.as_ref()); - n.normalize(&mut content).expect("Normalization failed"); - content.get().to_string() // Convert once, reuse later + n.normalize_fast(&token.content) }) .collect() } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 14f60701d..59adcb76c 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -55,6 +55,10 @@ pub type Offsets = (usize, usize); /// Takes care of pre-processing strings. pub trait Normalizer: Sync { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>; + fn normalize_fast(&self, normalized: &str) ->String { + // Default implementation just calls the normalizer + normalized.to_string() + } } /// The `PreTokenizer` is in charge of doing the pre-segmentation step. 
It splits the given string diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs index 0b8c519ea..819a47be5 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -304,6 +304,24 @@ impl NormalizedString { }) } + pub fn fast_transform(&mut self, dest: I) + where + I: IntoIterator, + { + let mut buf = String::with_capacity(self.normalized.len()); + for (c, _) in dest { + buf.push(c); + } + unsafe { + // This assumes you're not mutating in the middle of a UTF-8 char + self.normalized.as_mut_vec().clear(); + self.normalized.as_mut_vec().extend_from_slice(buf.as_bytes()); + } + + // Drop alignments if unused + self.alignments.clear(); + } + /// Applies transformations to the current normalized version of the string, /// while updating the alignments. /// This method expect an Iterator yielding each char of the new normalized string From 8e7ce86cb8a4a4444ac6d54a84bf2d007793a133 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:12:15 +0200 Subject: [PATCH 09/17] more attempts --- tokenizers/benches/added_vocab_deserialize.rs | 4 ++-- tokenizers/src/normalizers/byte_level.rs | 18 ++++++++++++---- tokenizers/src/tokenizer/added_vocabulary.rs | 21 ++++++++++++++----- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs index 1eeee8046..594fdadca 100644 --- a/tokenizers/benches/added_vocab_deserialize.rs +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -1,6 +1,6 @@ #[macro_use] extern crate criterion; - +use std::str::FromStr; use criterion::{black_box, Criterion}; use std::collections::HashMap; use tokenizers::{models::wordlevel::WordLevel, normalizers::*, AddedToken, Normalizer, Tokenizer}; @@ -57,7 +57,7 @@ fn bench_deserialize(c: &mut Criterion) { c.bench_function(&label, |b| { b.iter(|| { - let tok: Tokenizer = black_box(serde_json::from_str(&json).unwrap()); + let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap()); black_box(tok); }) }); diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index e4fa66604..7d64da915 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -3,12 +3,19 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::macro_rules_attribute; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use std::cell::RefCell; + + #[derive(Clone, Debug)] #[macro_rules_attribute(impl_serde_type!)] pub struct ByteLevel; -static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); +pub static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); + +thread_local! 
{ + static THREAD_BYTES_CHAR: RefCell> = RefCell::new(BYTES_CHAR.clone()); +} impl Default for ByteLevel { fn default() -> Self { @@ -49,11 +56,14 @@ impl Normalizer for ByteLevel { /// Fast normalization: byte-to-char mapping without tracking positions fn normalize_fast(&self, input: &str) -> String { - let mut out = String::with_capacity(input.len()); + THREAD_BYTES_CHAR.with(|map_cell| { + let map = map_cell.borrow(); + let mut out = String::with_capacity(input.len()); for b in input.as_bytes() { - out.push(BYTES_CHAR[b]); + out.push(map[b]); } - out + out + }) } } diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 18e2d90de..ca0942724 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -7,6 +7,7 @@ use regex::Regex; use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; +use rayon::ThreadPoolBuilder; /// Represent a token added by the user on top of the existing Model vocabulary. /// AddedToken can be configured to specify the behavior they should have in various situations @@ -165,13 +166,23 @@ pub struct AddedVocabulary { encode_special_tokens: bool, } + fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { + // let pool = ThreadPoolBuilder::new() + // .num_threads(24) + // .build() + // .expect("Failed to build custom Rayon thread pool"); + + // pool.install(|| { + // ntokens + // .par_iter() + // .map(|token| n.normalize_fast(&token.content)) + // .collect() + // }) ntokens - .par_iter() - .map(|token| { - n.normalize_fast(&token.content) - }) - .collect() + .iter() + .map(|token| n.normalize_fast(&token.content)) + .collect() } impl AddedVocabulary { From 948eeadf2e1ec0cfb69d1a99c87b40e517886d5b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:29:51 +0200 Subject: [PATCH 10/17] works --- tokenizers/benches/added_vocab_deserialize.rs | 4 +-- tokenizers/src/normalizers/byte_level.rs | 4 +-- tokenizers/src/tokenizer/added_vocabulary.rs | 33 +++++++++---------- tokenizers/src/tokenizer/mod.rs | 4 +-- tokenizers/src/tokenizer/normalizer.rs | 18 ---------- 5 files changed, 22 insertions(+), 41 deletions(-) diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs index 594fdadca..accaaebc9 100644 --- a/tokenizers/benches/added_vocab_deserialize.rs +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -1,6 +1,6 @@ #[macro_use] extern crate criterion; -use std::str::FromStr; +use std::{process::exit, str::FromStr}; use criterion::{black_box, Criterion}; use std::collections::HashMap; use tokenizers::{models::wordlevel::WordLevel, normalizers::*, AddedToken, Normalizer, Tokenizer}; @@ -54,7 +54,6 @@ fn bench_deserialize(c: &mut Criterion) { Some(factory) => serialized_tokenizer(size, Some(factory())), None => serialized_tokenizer::(size, None), }; - c.bench_function(&label, |b| { b.iter(|| { let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap()); @@ -62,6 +61,7 @@ fn bench_deserialize(c: &mut Criterion) { }) }); } + exit(0); } } diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index 7d64da915..36b570d89 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -55,8 +55,8 @@ impl Normalizer for ByteLevel { } /// Fast normalization: byte-to-char mapping without tracking positions - fn 
normalize_fast(&self, input: &str) -> String { - THREAD_BYTES_CHAR.with(|map_cell| { + fn normalize_fast(&self, input: String) -> String { + THREAD_BYTES_CHAR.with(|map_cell| { let map = map_cell.borrow(); let mut out = String::with_capacity(input.len()); for b in input.as_bytes() { diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index ca0942724..6b7e2a093 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -168,21 +168,21 @@ pub struct AddedVocabulary { fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { - // let pool = ThreadPoolBuilder::new() - // .num_threads(24) - // .build() - // .expect("Failed to build custom Rayon thread pool"); - - // pool.install(|| { - // ntokens - // .par_iter() + let pool = ThreadPoolBuilder::new() + .num_threads(24) + .build() + .expect("Failed to build custom Rayon thread pool"); + + pool.install(|| { + ntokens + .par_iter() + .map( |token| n.normalize_fast(token.content.to_owned())) + .collect() + }) + // ntokens + // .iter() // .map(|token| n.normalize_fast(&token.content)) - // .collect() - // }) - ntokens - .iter() - .map(|token| n.normalize_fast(&token.content)) - .collect() + // .collect() } impl AddedVocabulary { @@ -370,13 +370,12 @@ impl AddedVocabulary { // Build normalized trie if let Some(n) = normalizer { - let (ntokens, nids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); - let patterns: Vec<_> = normalize_token_contents(n, ntokens); + let patterns: Vec<_> = normalize_token_contents(n, tokens); let normalized_trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) .build(patterns) .expect("Failed to build tried when refreshing tokens (normalized)"); - self.split_normalized_trie = (normalized_trie, nids); + self.split_normalized_trie = (normalized_trie, ids); } else { self.split_normalized_trie = (trie, ids); // non normalized is the same } diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 59adcb76c..55072bae3 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -55,9 +55,9 @@ pub type Offsets = (usize, usize); /// Takes care of pre-processing strings. pub trait Normalizer: Sync { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>; - fn normalize_fast(&self, normalized: &str) ->String { + fn normalize_fast(&self, normalized: String) -> String { // Default implementation just calls the normalizer - normalized.to_string() + normalized } } diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs index 819a47be5..0b8c519ea 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -304,24 +304,6 @@ impl NormalizedString { }) } - pub fn fast_transform(&mut self, dest: I) - where - I: IntoIterator, - { - let mut buf = String::with_capacity(self.normalized.len()); - for (c, _) in dest { - buf.push(c); - } - unsafe { - // This assumes you're not mutating in the middle of a UTF-8 char - self.normalized.as_mut_vec().clear(); - self.normalized.as_mut_vec().extend_from_slice(buf.as_bytes()); - } - - // Drop alignments if unused - self.alignments.clear(); - } - /// Applies transformations to the current normalized version of the string, /// while updating the alignments. 
/// This method expect an Iterator yielding each char of the new normalized string From 43cef926445e77730b9a10582de98a9ab573048b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:33:30 +0200 Subject: [PATCH 11/17] let's fucking go, parity --- tokenizers/src/normalizers/byte_level.rs | 2 -- tokenizers/src/normalizers/mod.rs | 2 -- tokenizers/src/tokenizer/added_vocabulary.rs | 21 +++++--------------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index 36b570d89..2c15c33b4 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -67,8 +67,6 @@ impl Normalizer for ByteLevel { } } -unsafe impl Sync for ByteLevel {} - #[cfg(test)] mod tests { diff --git a/tokenizers/src/normalizers/mod.rs b/tokenizers/src/normalizers/mod.rs index 0f7bab6a0..f400f13da 100644 --- a/tokenizers/src/normalizers/mod.rs +++ b/tokenizers/src/normalizers/mod.rs @@ -38,8 +38,6 @@ pub enum NormalizerWrapper { ByteLevel(ByteLevel), } -unsafe impl Sync for NormalizerWrapper {} - impl<'de> Deserialize<'de> for NormalizerWrapper { fn deserialize(deserializer: D) -> std::result::Result where diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 6b7e2a093..0157cc5e2 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -167,22 +167,11 @@ pub struct AddedVocabulary { } -fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { - let pool = ThreadPoolBuilder::new() - .num_threads(24) - .build() - .expect("Failed to build custom Rayon thread pool"); - - pool.install(|| { - ntokens - .par_iter() - .map( |token| n.normalize_fast(token.content.to_owned())) - .collect() - }) - // ntokens - // .iter() - // .map(|token| n.normalize_fast(&token.content)) - // .collect() +fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { + ntokens + .iter() + .map(|token| n.normalize_fast(token.content.to_owned())) + .collect() } impl AddedVocabulary { From ae8a7b4d6e0bf74196e5fc37e67f6245f4c9a1ff Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:37:17 +0200 Subject: [PATCH 12/17] update --- tokenizers/src/normalizers/byte_level.rs | 2 +- tokenizers/src/tokenizer/added_vocabulary.rs | 2 +- tokenizers/src/tokenizer/mod.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index 2c15c33b4..eaf3f4a9f 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -55,7 +55,7 @@ impl Normalizer for ByteLevel { } /// Fast normalization: byte-to-char mapping without tracking positions - fn normalize_fast(&self, input: String) -> String { + fn normalize_fast(&self, input: &str) -> String { THREAD_BYTES_CHAR.with(|map_cell| { let map = map_cell.borrow(); let mut out = String::with_capacity(input.len()); diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 0157cc5e2..c34b1968b 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -170,7 +170,7 @@ pub struct AddedVocabulary { fn normalize_token_contents(n: &N, ntokens: Vec<&AddedToken>) -> Vec { ntokens .iter() - .map(|token| n.normalize_fast(token.content.to_owned())) + .map(|token| n.normalize_fast(&token.content)) .collect() 
} diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 55072bae3..ac563e2e9 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -55,9 +55,9 @@ pub type Offsets = (usize, usize); /// Takes care of pre-processing strings. pub trait Normalizer: Sync { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>; - fn normalize_fast(&self, normalized: String) -> String { + fn normalize_fast(&self, normalized: &str) -> String { // Default implementation just calls the normalizer - normalized + normalized.to_owned() } } From 44beeb71d36796fe3eda68d90cf95cb5969fe17c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:47:05 +0200 Subject: [PATCH 13/17] hahahhahaha --- tokenizers/benches/added_vocab_deserialize.rs | 22 +++++++++++++++---- tokenizers/src/tokenizer/added_vocabulary.rs | 1 + 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tokenizers/benches/added_vocab_deserialize.rs b/tokenizers/benches/added_vocab_deserialize.rs index accaaebc9..5693919c4 100644 --- a/tokenizers/benches/added_vocab_deserialize.rs +++ b/tokenizers/benches/added_vocab_deserialize.rs @@ -8,6 +8,7 @@ use tokenizers::{models::wordlevel::WordLevel, normalizers::*, AddedToken, Norma fn serialized_tokenizer>( size: usize, normalizer: Option, + special_tokens: bool, ) -> String { let mut vocab = HashMap::new(); vocab.insert("a".to_string(), 0); @@ -19,7 +20,7 @@ fn serialized_tokenizer>( let mut tokenizer = Tokenizer::new(model); let tokens: Vec<_> = (0..size) - .map(|i| AddedToken::from(format!("tok{i}"), false)) + .map(|i| AddedToken::from(format!("tok{i}"), special_tokens)) .collect(); tokenizer.add_tokens(&tokens); @@ -48,11 +49,24 @@ fn bench_deserialize(c: &mut Criterion) { for &size in &[10_000usize, 100_000, 400_000] { for (norm_name, maybe_factory) in &normalizers { - let label = format!("deserialize_added_vocab_{}_norm_{}", size, norm_name); + let label = format!("special tokens deserialize_added_vocab_{}_norm_{}", size, norm_name); let json = match maybe_factory { - Some(factory) => serialized_tokenizer(size, Some(factory())), - None => serialized_tokenizer::(size, None), + Some(factory) => serialized_tokenizer(size, Some(factory()), true), + None => serialized_tokenizer::(size, None, true), + }; + c.bench_function(&label, |b| { + b.iter(|| { + let tok: Tokenizer = black_box(Tokenizer::from_str(&json).unwrap()); + black_box(tok); + }) + }); + + let label = format!("non special deserialize_added_vocab_{}_norm_{}", size, norm_name); + + let json = match maybe_factory { + Some(factory) => serialized_tokenizer(size, Some(factory()), false), + None => serialized_tokenizer::(size, None, false), }; c.bench_function(&label, |b| { b.iter(|| { diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index c34b1968b..10ecb1fcc 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -359,6 +359,7 @@ impl AddedVocabulary { // Build normalized trie if let Some(n) = normalizer { + let (tokens, ids): (Vec<&AddedToken>, Vec) = normalized.into_iter().unzip(); let patterns: Vec<_> = normalize_token_contents(n, tokens); let normalized_trie = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) From e7f895419fb450b9d7e950707dc7029bb36aa898 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 27 May 2025 15:55:20 +0200 Subject: [PATCH 14/17] revert changes that are not actually even needed --- 
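
Note: the reverts below drop the normalize_fast/Sync/thread_local experiments
but keep the added-vocabulary bookkeeping introduced earlier in the series (a
HashSet of already-added token contents plus a running next_id), which is
where the deserialization speedup comes from. A rough sketch of that pattern,
with a plain String-to-u32 map standing in for the real AddedToken structures:

    use std::collections::{HashMap, HashSet};

    // Instead of scanning every existing entry for each new token (quadratic),
    // keep a set of known contents and a running id counter.
    fn add_tokens(map: &mut HashMap<String, u32>, tokens: &[String]) -> usize {
        let mut existing: HashSet<String> = map.keys().cloned().collect();
        let mut next_id = map.values().copied().max().map_or(0, |m| m + 1);
        let mut added = 0;
        for tok in tokens {
            if tok.is_empty() || existing.contains(tok) {
                continue;
            }
            map.insert(tok.clone(), next_id);
            existing.insert(tok.clone());
            next_id += 1;
            added += 1;
        }
        added
    }

    fn main() {
        let mut map = HashMap::new();
        let toks: Vec<String> = (0..3).map(|i| format!("tok{i}")).collect();
        assert_eq!(add_tokens(&mut map, &toks), 3);
        assert_eq!(add_tokens(&mut map, &toks), 0); // duplicates are skipped
    }
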
tokenizers/src/normalizers/byte_level.rs | 31 +++++--------------- tokenizers/src/tokenizer/added_vocabulary.rs | 30 +++++++++---------- tokenizers/src/tokenizer/mod.rs | 6 +--- 3 files changed, 24 insertions(+), 43 deletions(-) diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index eaf3f4a9f..ae89d74f2 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -3,7 +3,6 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::macro_rules_attribute; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use std::cell::RefCell; @@ -11,11 +10,7 @@ use std::cell::RefCell; #[macro_rules_attribute(impl_serde_type!)] pub struct ByteLevel; -pub static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); - -thread_local! { - static THREAD_BYTES_CHAR: RefCell> = RefCell::new(BYTES_CHAR.clone()); -} +static BYTES_CHAR: LazyLock> = LazyLock::new(bytes_char); impl Default for ByteLevel { fn default() -> Self { @@ -38,33 +33,23 @@ impl Normalizer for ByteLevel { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { if !normalized.is_empty() { let s = normalized.get(); - let s_bytes = s.as_bytes(); let mut transformations: Vec<(char, isize)> = Vec::with_capacity(s.len()); let mut i = 0; for cur_char in s.chars() { let size = cur_char.len_utf8(); - let bytes = &s_bytes[i..i + size]; + let bytes = &s.as_bytes()[i..i + size]; i += size; - for (j, b) in bytes.iter().enumerate() { - transformations.push((BYTES_CHAR[b], if j > 0 { 1 } else { 0 })); - } + transformations.extend( + bytes + .iter() + .enumerate() + .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))), + ); } normalized.transform(transformations, 0); } Ok(()) } - - /// Fast normalization: byte-to-char mapping without tracking positions - fn normalize_fast(&self, input: &str) -> String { - THREAD_BYTES_CHAR.with(|map_cell| { - let map = map_cell.borrow(); - let mut out = String::with_capacity(input.len()); - for b in input.as_bytes() { - out.push(map[b]); - } - out - }) - } } #[cfg(test)] diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 10ecb1fcc..8bd266284 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -2,12 +2,10 @@ use super::{ normalizer::Range, Model, NormalizedString, Normalizer, Offsets, PreTokenizedString, Token, }; use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use rayon::prelude::*; use regex::Regex; use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer}; use std::collections::{HashMap, HashSet}; use std::sync::LazyLock; -use rayon::ThreadPoolBuilder; /// Represent a token added by the user on top of the existing Model vocabulary. 
 /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -166,14 +164,6 @@ pub struct AddedVocabulary {
     encode_special_tokens: bool,
 }

-
-fn normalize_token_contents<N: Normalizer>(n: &N, ntokens: Vec<&AddedToken>) -> Vec<String> {
-    ntokens
-        .iter()
-        .map(|token| n.normalize_fast(&token.content))
-        .collect()
-}
-
 impl AddedVocabulary {
     pub fn new() -> Self {
         let trie = AhoCorasickBuilder::new()
@@ -283,7 +273,7 @@ impl AddedVocabulary {
         }
         let mut ignored = 0;
-        use std::collections::HashSet;
+        let mut existing: HashSet<AddedToken> = self.added_tokens_map_r.values().cloned().collect();
         let mut next_id = self.added_tokens_map_r.keys().copied().max().map_or(
             model.get_vocab_size() as u32,
@@ -318,7 +308,9 @@ impl AddedVocabulary {
                 .entry(new_id)
                 .and_modify(|t| *t = token.clone())
                 .or_insert_with(|| token.clone());
+            // Make sure to remove previous entry (if the token gets a new id)

+            // Finally add the token to the classic set if special
             if !self.special_tokens_set.contains(&token.content) {
                 self.added_tokens.push(token.clone());
             }
@@ -327,6 +319,7 @@ impl AddedVocabulary {
         self.refresh_added_tokens(model, normalizer);

+        // Return the number of added tokens
         tokens.len() - ignored
     }
@@ -359,13 +352,20 @@ impl AddedVocabulary {
         // Build normalized trie
         if let Some(n) = normalizer {
-            let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
-            let patterns: Vec<_> = normalize_token_contents(n, tokens);
+            let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
+            let patterns: Vec<_> = ntokens
+                .iter()
+                .map(|token| {
+                    let mut content = NormalizedString::from(token.content.as_ref());
+                    n.normalize(&mut content).unwrap();
+                    content
+                })
+                .collect();
             let normalized_trie = AhoCorasickBuilder::new()
                 .match_kind(MatchKind::LeftmostLongest)
-                .build(patterns)
+                .build(patterns.iter().map(|content| content.get()))
                 .expect("Failed to build tried when refreshing tokens (normalized)");
-            self.split_normalized_trie = (normalized_trie, ids);
+            self.split_normalized_trie = (normalized_trie, nids);
         } else {
             self.split_normalized_trie = (trie, ids); // non normalized is the same
         }
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index ac563e2e9..f4a136091 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -53,12 +53,8 @@ pub type Result<T> = std::result::Result<T, Error>;
 pub type Offsets = (usize, usize);

 /// Takes care of pre-processing strings.
-pub trait Normalizer: Sync {
+pub trait Normalizer {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
-    fn normalize_fast(&self, normalized: &str) -> String {
-        // Default implementation just calls the normalizer
-        normalized.to_owned()
-    }
 }

 /// The `PreTokenizer` is in charge of doing the pre-segmentation step. It splits the given string

From 8d49849d7c7a0738a2d1e303380f11632c5ae3d4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 27 May 2025 16:01:48 +0200
Subject: [PATCH 15/17] add a python test!

---
 bindings/python/benches/test_deserialize.py | 81 +++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 bindings/python/benches/test_deserialize.py

diff --git a/bindings/python/benches/test_deserialize.py b/bindings/python/benches/test_deserialize.py
new file mode 100644
index 000000000..2ad529a69
--- /dev/null
+++ b/bindings/python/benches/test_deserialize.py
@@ -0,0 +1,81 @@
+import json
+import timeit
+from tokenizers import Tokenizer, AddedToken
+from tokenizers.models import WordLevel
+from tokenizers.normalizers import (
+    ByteLevel, Lowercase, NFC, NFD, NFKC, NFKD, Nmt, Strip, Replace, Prepend, BertNormalizer
+)
+import pytest
+
+def build_tokenizer_json(size, normalizer=None, special_tokens=True):
+    # Build vocab and WordLevel model
+    vocab = {"a": 0}
+    model = WordLevel(vocab=vocab, unk_token="[UNK]")
+    tokenizer = Tokenizer(model)
+
+    # Add many tokens
+    tokens = [AddedToken(f"tok{i}", special=special_tokens) for i in range(size)]
+    tokenizer.add_tokens(tokens)
+
+    # Add normalizer if specified
+    if normalizer:
+        tokenizer.normalizer = normalizer
+
+    # Return serialized tokenizer JSON
+    return tokenizer.to_str()
+
+
+normalizer_factories = {
+    "none": None,
+    "byte_level": ByteLevel,
+    "lowercase": Lowercase,
+    "nfc": NFC,
+    "nfd": NFD,
+    "nfkc": NFKC,
+    "nfkd": NFKD,
+    "nmt": Nmt,
+    "strip": lambda: Strip(strip_left=True, strip_right=True),
+    "replace": lambda: Replace("a", "b"),
+    "prepend": lambda: Prepend("pre_"),
+    "bert": BertNormalizer
+}
+
+
+
+normalizer_factories = {
+    "none": None,
+    "byte_level": ByteLevel,
+    "lowercase": Lowercase,
+    "nfc": NFC,
+    "nfd": NFD,
+    "nfkc": NFKC,
+    "nfkd": NFKD,
+    "nmt": Nmt,
+    "strip": lambda: Strip(True, True),
+    "replace": lambda: Replace("a", "b"),
+    "prepend": lambda: Prepend("pre_"),
+    "bert": BertNormalizer,
+}
+
+
+@pytest.mark.parametrize("size", [10_000, 100_000])
+@pytest.mark.parametrize("special_tokens", [True, False])
+@pytest.mark.parametrize("norm_name,norm_factory", normalizer_factories.items())
+def test_tokenizer_deserialization(benchmark, size, special_tokens, norm_name, norm_factory):
+    """Benchmark Tokenizer.from_str deserialization with different vocab sizes and normalizers."""
+    normalizer = norm_factory() if norm_factory else None
+    tok_json = build_tokenizer_json(size, normalizer, special_tokens)
+
+    def deserialize():
+        tok = Tokenizer.from_str(tok_json)
+        _ = tok
+
+    benchmark.group = f"deserialize_{size}_{'special' if special_tokens else 'non_special'}"
+    benchmark.name = f"norm_{norm_name}"
+    benchmark(deserialize)
+
+# some example usage
+# pytest benches/test_deserialize.py --benchmark-enable
+# pytest test_deserialization_benchmark.py --benchmark-save=baseline
+# pytest test_deserialization_benchmark.py --benchmark-compare=baseline
+# pytest test_deserialization_benchmark.py --benchmark-compare=baseline --benchmark-save=baseline2
\ No newline at end of file

From d8f07fad5514274401e28b1729e45d86a04f6889 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 27 May 2025 16:04:26 +0200
Subject: [PATCH 16/17] use normalizer before come on

---
 bindings/python/benches/test_deserialize.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/bindings/python/benches/test_deserialize.py b/bindings/python/benches/test_deserialize.py
index 2ad529a69..369b4c8c7 100644
--- a/bindings/python/benches/test_deserialize.py
+++ b/bindings/python/benches/test_deserialize.py
@@ -11,16 +11,12 @@ def build_tokenizer_json(size, normalizer=None, special_tokens=True):
     # Build vocab and WordLevel model
     vocab = {"a": 0}
     model = WordLevel(vocab=vocab, unk_token="[UNK]")
-    tokenizer = Tokenizer(model)
-
-    # Add many tokens
-    tokens = [AddedToken(f"tok{i}", special=special_tokens) for i in range(size)]
-    tokenizer.add_tokens(tokens)
-    # Add normalizer if specified
+    tokenizer = Tokenizer(model)
     if normalizer:
         tokenizer.normalizer = normalizer
-
+    tokens = [AddedToken(f"tok{i}", special=special_tokens) for i in range(size)]
+    tokenizer.add_tokens(tokens)
     # Return serialized tokenizer JSON
     return tokenizer.to_str()

From f6df6035f167afb3800509c791fc06beb3935a22 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Tue, 27 May 2025 17:18:32 +0200
Subject: [PATCH 17/17] nit

---
 bindings/python/benches/test_deserialize.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/bindings/python/benches/test_deserialize.py b/bindings/python/benches/test_deserialize.py
index 369b4c8c7..d98d6d35c 100644
--- a/bindings/python/benches/test_deserialize.py
+++ b/bindings/python/benches/test_deserialize.py
@@ -38,25 +38,10 @@ def build_tokenizer_json(size, normalizer=None, special_tokens=True):
-normalizer_factories = {
-    "none": None,
-    "byte_level": ByteLevel,
-    "lowercase": Lowercase,
-    "nfc": NFC,
-    "nfd": NFD,
-    "nfkc": NFKC,
-    "nfkd": NFKD,
-    "nmt": Nmt,
-    "strip": lambda: Strip(True, True),
-    "replace": lambda: Replace("a", "b"),
-    "prepend": lambda: Prepend("pre_"),
-    "bert": BertNormalizer,
-}
-
-@pytest.mark.parametrize("size", [10_000, 100_000])
 @pytest.mark.parametrize("special_tokens", [True, False])
 @pytest.mark.parametrize("norm_name,norm_factory", normalizer_factories.items())
+@pytest.mark.parametrize("size", [10_000, 100_000])
 def test_tokenizer_deserialization(benchmark, size, special_tokens, norm_name, norm_factory):
     """Benchmark Tokenizer.from_str deserialization with different vocab sizes and normalizers."""
     normalizer = norm_factory() if norm_factory else None