diff --git a/bindings/node/Cargo.toml b/bindings/node/Cargo.toml index cf1e51e99..6e00f9d7c 100644 --- a/bindings/node/Cargo.toml +++ b/bindings/node/Cargo.toml @@ -14,6 +14,7 @@ napi = "2" napi-derive = "2" serde = { version = "1.0.163", features = ["derive"] } tokenizers = { path = "../../tokenizers/" } +ahash = { version = "0.8.11", features = ["serde"] } [build-dependencies] napi-build = "2" diff --git a/bindings/node/src/models.rs b/bindings/node/src/models.rs index a4138b91f..9ee7f60f7 100644 --- a/bindings/node/src/models.rs +++ b/bindings/node/src/models.rs @@ -1,6 +1,7 @@ use crate::arc_rwlock_serde; use crate::tasks::models::{BPEFromFilesTask, WordLevelFromFilesTask, WordPieceFromFilesTask}; use crate::trainers::Trainer; +use ahash::AHashMap; use napi::bindgen_prelude::*; use napi_derive::napi; use serde::{Deserialize, Serialize}; @@ -8,7 +9,7 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::{Arc, RwLock}; use tokenizers as tk; -use tokenizers::models::bpe::{BpeBuilder, Merges, Vocab}; +use tokenizers::models::bpe::{BpeBuilder, Merges}; use tokenizers::models::wordlevel::WordLevelBuilder; use tokenizers::models::wordpiece::WordPieceBuilder; @@ -44,8 +45,13 @@ impl Bpe { } #[napi(factory, ts_return_type = "Model")] - pub fn init(vocab: Vocab, merges: Merges, options: Option) -> Result { + pub fn init( + vocab: HashMap, + merges: Merges, + options: Option, + ) -> Result { let options = options.unwrap_or_default(); + let vocab: AHashMap<_, _> = vocab.into_iter().collect(); let mut builder = tk::models::bpe::BPE::builder().vocab_and_merges(vocab, merges); builder = options.apply_to_bpe_builder(builder); let model = builder @@ -206,10 +212,11 @@ pub struct WordPiece {} #[napi] impl WordPiece { #[napi(factory, ts_return_type = "Model")] - pub fn init(vocab: Vocab, options: Option) -> Result { + pub fn init(vocab: HashMap, options: Option) -> Result { let options = options.unwrap_or_default(); - let mut builder = tk::models::wordpiece::WordPiece::builder().vocab(vocab); + let mut builder = tk::models::wordpiece::WordPiece::builder() + .vocab(vocab.into_iter().collect::>()); builder = options.apply_to_wordpiece_builder(builder); let model = builder .build() @@ -263,9 +270,10 @@ pub struct WordLevel {} #[napi] impl WordLevel { #[napi(factory, ts_return_type = "Model")] - pub fn init(vocab: Vocab, options: Option) -> Result { + pub fn init(vocab: HashMap, options: Option) -> Result { let options = options.unwrap_or_default(); - let mut builder = tk::models::wordlevel::WordLevel::builder().vocab(vocab); + let mut builder = + tk::models::wordlevel::WordLevel::builder().vocab(vocab.into_iter().collect()); builder = options.apply_to_wordlevel_builder(builder); let model = builder .build() diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index c0f05ac6b..76f09604a 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -18,6 +18,7 @@ pyo3 = { version = "0.25", features = ["abi3", "abi3-py39", "py-clone"] } numpy = "0.25" ndarray = "0.16" itertools = "0.14" +ahash = { version = "0.8.11", features = ["serde"] } [dependencies.tokenizers] path = "../../tokenizers" diff --git a/bindings/python/benches/test_backtrack.py b/bindings/python/benches/test_backtrack.py new file mode 100644 index 000000000..0988d387c --- /dev/null +++ b/bindings/python/benches/test_backtrack.py @@ -0,0 +1,88 @@ +import os +import argparse +import datetime +from datasets import load_dataset +from tokenizers import Tokenizer +from typing import Tuple + 
+MODEL_ID = "meta-llama/Meta-Llama-3.1-8B" +DATASET = "facebook/xnli" +DATASET_CONFIG = "all_languages" +DEFAULT_THREADS = [2**i for i in range(8) if 2**i <= os.cpu_count()] + + +def format_byte_size(num_bytes: int) -> Tuple[str, str]: + """Convert bytes to a human-readable format (KB, MB, GB).""" + num_bytes_f = float(num_bytes) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if num_bytes_f < 1024: + return f"{num_bytes_f:.2f} {unit}", unit + num_bytes_f /= 1024 + return f"{num_bytes_f:.2f} PB", "PB" + + +def test(model: str, dataset: str, dataset_config: str): + dataset_xnli = load_dataset(dataset, dataset_config) + tokenizer = Tokenizer.from_pretrained(model) + tokenizer2 = Tokenizer.from_pretrained(model) + tokenizer2.enable_backtrack() + + for easy in ["1880", " cream"]: + encoded = tokenizer.encode(easy) + encoded2 = tokenizer2.encode(easy) + if encoded.ids != encoded2.ids: + import ipdb + + ipdb.set_trace() + assert encoded.ids == encoded2.ids + + sentences = [] + en_sentences = [] + for _i, item in enumerate(dataset_xnli["train"]): + # sentence = item["premise"]["en"] + # sentences.append(sentence) + for lang, sentence in item["premise"].items(): + if lang == "en": + en_sentences.append(sentence) + sentences.append(sentence) + sentences = en_sentences + sentences + + start = datetime.datetime.now() + encoded = tokenizer.encode_batch_fast(sentences) + print(f"Took {datetime.datetime.now() - start}") + + start = datetime.datetime.now() + encoded2 = tokenizer2.encode_batch_fast(sentences) + print(f"Took {datetime.datetime.now() - start}") + + assert len(encoded) == len(encoded2) + assert len(encoded) == len(sentences) + total = 0 + correct = 0 + for enc, enc2, sentence in zip(encoded, encoded2, sentences): + # if enc.ids != enc2.ids: + # print(enc.ids) + # print(enc2.ids) + if enc.ids == enc2.ids: + correct += 1 + total += 1 + assert enc.ids == enc2.ids, f"{enc.ids} != {enc2.ids} (Source: {sentence}" + print(f"{correct} / {total} ({correct / total * 100:.2f}%%)") + # print("All good !") + + +def main(): + parser = argparse.ArgumentParser( + prog="bench_tokenizer", + description="Getting a feel for speed when tokenizing", + ) + parser.add_argument("-m", "--model", default=MODEL_ID, type=str) + parser.add_argument("-d", "--dataset", default=DATASET, type=str) + parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str) + args = parser.parse_args() + test(args.model, args.dataset, args.dataset_config) + + +# Call the function to run the benchmark +if __name__ == "__main__": + main() diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 2f4dba825..81d5f4eb6 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -4,11 +4,12 @@ use std::sync::{Arc, RwLock}; use crate::token::PyToken; use crate::trainers::PyTrainer; +use ahash::AHashMap; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; use serde::{Deserialize, Serialize}; -use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE}; +use tk::models::bpe::{BpeBuilder, Merges, BPE}; use tk::models::unigram::Unigram; use tk::models::wordlevel::WordLevel; use tk::models::wordpiece::{WordPiece, WordPieceBuilder}; @@ -347,9 +348,10 @@ macro_rules! 
setter { #[derive(FromPyObject)] enum PyVocab { - Vocab(Vocab), + Vocab(HashMap), Filename(String), } + #[derive(FromPyObject)] enum PyMerges { Merges(Merges), @@ -454,6 +456,7 @@ impl PyBPE { if let (Some(vocab), Some(merges)) = (vocab, merges) { match (vocab, merges) { (PyVocab::Vocab(vocab), PyMerges::Merges(merges)) => { + let vocab: AHashMap<_, _> = vocab.into_iter().collect(); builder = builder.vocab_and_merges(vocab, merges); } (PyVocab::Filename(vocab_filename), PyMerges::Filename(merges_filename)) => { @@ -494,13 +497,15 @@ impl PyBPE { /// The vocabulary and merges loaded into memory #[staticmethod] #[pyo3(text_signature = "(self, vocab, merges)")] - fn read_file(vocab: &str, merges: &str) -> PyResult<(Vocab, Merges)> { - BPE::read_file(vocab, merges).map_err(|e| { + fn read_file(vocab: &str, merges: &str) -> PyResult<(HashMap, Merges)> { + let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| { exceptions::PyException::new_err(format!( "Error while reading vocab & merges files: {}", e )) - }) + })?; + let vocab = vocab.into_iter().collect(); + Ok((vocab, merges)) } /// Instantiate a BPE model from the given files. @@ -536,6 +541,7 @@ impl PyBPE { let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e)) })?; + let vocab = vocab.into_iter().collect(); Py::new( py, PyBPE::new( @@ -668,6 +674,7 @@ impl PyWordPiece { if let Some(vocab) = vocab { match vocab { PyVocab::Vocab(vocab) => { + let vocab: AHashMap<_, _> = vocab.into_iter().collect(); builder = builder.vocab(vocab); } PyVocab::Filename(vocab_filename) => { @@ -699,10 +706,11 @@ impl PyWordPiece { /// :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` #[staticmethod] #[pyo3(text_signature = "(vocab)")] - fn read_file(vocab: &str) -> PyResult { - WordPiece::read_file(vocab).map_err(|e| { + fn read_file(vocab: &str) -> PyResult> { + let vocab = WordPiece::read_file(vocab).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e)) - }) + })?; + Ok(vocab.into_iter().collect()) } /// Instantiate a WordPiece model from the given file @@ -734,6 +742,7 @@ impl PyWordPiece { let vocab = WordPiece::read_file(vocab).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e)) })?; + let vocab = vocab.into_iter().collect(); Py::new( py, PyWordPiece::new(py, Some(PyVocab::Vocab(vocab)), kwargs)?, @@ -778,6 +787,7 @@ impl PyWordLevel { if let Some(vocab) = vocab { match vocab { PyVocab::Vocab(vocab) => { + let vocab = vocab.into_iter().collect(); builder = builder.vocab(vocab); } PyVocab::Filename(vocab_filename) => { @@ -818,10 +828,12 @@ impl PyWordLevel { /// :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` #[staticmethod] #[pyo3(text_signature = "(vocab)")] - fn read_file(vocab: &str) -> PyResult { - WordLevel::read_file(vocab).map_err(|e| { + fn read_file(vocab: &str) -> PyResult> { + let vocab = WordLevel::read_file(vocab).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e)) - }) + })?; + let vocab: HashMap<_, _> = vocab.into_iter().collect(); + Ok(vocab) } /// Instantiate a WordLevel model from the given file @@ -853,6 +865,7 @@ impl PyWordLevel { let vocab = WordLevel::read_file(vocab).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading WordLevel file: {}", e)) })?; + let vocab = vocab.into_iter().collect(); Py::new( py, PyWordLevel::new(py, Some(PyVocab::Vocab(vocab)), 
unk_token)?, diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 73a0dbbe8..124319838 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1,6 +1,8 @@ use serde::Serialize; use std::collections::{hash_map::DefaultHasher, HashMap}; use std::hash::{Hash, Hasher}; +use tk::pre_tokenizers::byte_level::ByteLevel; +use tk::ModelWrapper; use numpy::{npyffi, PyArray1, PyArrayMethods}; use pyo3::class::basic::CompareOp; @@ -1118,6 +1120,19 @@ impl PyTokenizer { .into() }) } + /// + #[pyo3(signature = ())] + #[pyo3(text_signature = "(self)")] + fn enable_backtrack(&mut self) -> PyResult<()> { + // self.tokenizer.with_pre_tokenizer(None::); + let model = self.tokenizer.get_model(); + let mut model = model.model.write().unwrap(); + let ModelWrapper::BPE(ref mut model) = *model else { + todo!(); + }; + model.enable_backtrack(); + Ok(()) + } /// Decode the given list of ids back to a string /// diff --git a/bindings/python/test.py b/bindings/python/test.py new file mode 100644 index 000000000..931a2a353 --- /dev/null +++ b/bindings/python/test.py @@ -0,0 +1,313 @@ +import torch +from transformers import AutoModel +from transformers import AutoTokenizer +from faker import Faker +from huggingface_hub import hf_hub_download +import json + +# Create a Faker instance with Japanese locale +fake = Faker("ja_JP") + + +# Generate random Japanese text +def generate_random_japanese_text(): + return fake.text() + + +def move_to_cuda(sample): + if len(sample) == 0: + return {} + + def _move_to_cuda(maybe_tensor): + if torch.is_tensor(maybe_tensor): + return maybe_tensor.cuda(non_blocking=True) + elif isinstance(maybe_tensor, dict): + return {key: _move_to_cuda(value) for key, value in maybe_tensor.items()} + elif isinstance(maybe_tensor, list): + return [_move_to_cuda(x) for x in maybe_tensor] + elif isinstance(maybe_tensor, tuple): + return tuple([_move_to_cuda(x) for x in maybe_tensor]) + # elif isinstance(maybe_tensor, Mapping): + # return type(maybe_tensor)({k: _move_to_cuda(v) for k, v in maybe_tensor.items()}) + else: + return maybe_tensor + + return _move_to_cuda(sample) + + +def create_batch_dict(tokenizer, input_texts, max_length: int = 512): + return tokenizer( + input_texts, + max_length=max_length, + padding=True, + pad_to_multiple_of=8, + return_token_type_ids=False, + truncation=True, + return_tensors="pt", + ) + + +def pool(last_hidden_states, attention_mask, pool_type: str): + last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) + + if pool_type == "avg": + emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + elif pool_type == "weightedavg": # position-weighted mean pooling from SGPT (https://arxiv.org/abs/2202.08904) + attention_mask *= attention_mask.cumsum(dim=1) # [0,1,1,1,0,0] -> [0,1,2,3,0,0] + s = torch.sum(last_hidden * attention_mask.unsqueeze(-1).float(), dim=1) + d = attention_mask.sum(dim=1, keepdim=True).float() + emb = s / d + elif pool_type == "cls": + emb = last_hidden[:, 0] + elif pool_type == "last": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + emb = last_hidden[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden.shape[0] + emb = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths] + else: + raise ValueError(f"pool_type {pool_type} not supported") + + return emb + + +class KVEmbedding: + def __init__(self, device): + self.device = device + + # Load 
tokenizer and model from pretrained multilingual-e5-small + self.tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small") + self.model = AutoModel.from_pretrained("intfloat/multilingual-e5-small").to(self.device) + + self.model.eval() # Set model to evaluation mode + + def average_pool(self, last_hidden_states, attention_mask): + # Apply mask to hidden states, set masked positions to 0 + last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) + # Average the hidden states along the sequence dimension + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + + def embedding(self, l_transcription, batch_size=32): + # Tokenize input transcriptions + batch_dict = self.tokenizer( + l_transcription, + max_length=512, + padding=True, + truncation=True, + return_tensors="pt", + ).to(self.device) + + return batch_dict + + def _do_encode(self, input_texts): + encoded_embeds = [] + batch_size = 64 + for start_idx in range(0, len(input_texts), batch_size): + batch_input_texts = input_texts[start_idx : start_idx + batch_size] + + batch_dict = create_batch_dict(self.tokenizer, batch_input_texts) + # batch_dict = move_to_cuda(batch_dict) + return encoded_embeds + + +import random +from faker import Faker + +# # Lists of Japanese characters +hiragana = [ + "あ", + "い", + "う", + "え", + "お", + "か", + "き", + "く", + "け", + "こ", + "さ", + "し", + "す", + "せ", + "そ", + "た", + "ち", + "つ", + "て", + "と", + "な", + "に", + "ぬ", + "ね", + "の", + "は", + "ひ", + "ふ", + "へ", + "ほ", + "ま", + "み", + "む", + "め", + "も", + "や", + "ゆ", + "よ", + "ら", + "り", + "る", + "れ", + "ろ", + "わ", + "を", + "ん", +] +katakana = [ + "ア", + "イ", + "ウ", + "エ", + "オ", + "カ", + "キ", + "ク", + "ケ", + "コ", + "サ", + "シ", + "ス", + "セ", + "ソ", + "タ", + "チ", + "ツ", + "テ", + "ト", + "ナ", + "ニ", + "ヌ", + "ネ", + "ノ", + "ハ", + "ヒ", + "フ", + "ヘ", + "ホ", + "マ", + "ミ", + "ム", + "メ", + "モ", + "ヤ", + "ユ", + "ヨ", + "ラ", + "リ", + "ル", + "レ", + "ロ", + "ワ", + "ヲ", + "ン", +] +kanji = [ + "日", + "本", + "語", + "学", + "校", + "生", + "時", + "間", + "人", + "大", + "小", + "中", + "山", + "川", + "口", + "目", + "耳", + "手", + "足", + "力", + "男", + "女", + "子", + "父", + "母", +] + +# Combine all character sets +all_characters = hiragana + katakana + kanji + + +# Generate random Japanese text +def generate_random_japanese(length): + return "".join(random.choices(all_characters, k=length)) + + +def remove_invalid_characters(valid_chars, text): + """ + Removes all invalid characters from the given text, keeping only the characters present in char_dicts. + + Args: + char_dicts (dict): Dictionary of valid characters. + text (str): Input text string. + + Returns: + str: Text string with only valid characters. 
+ """ + # Convert dict keys to a set for faster lookup + filtered_text = "".join(c for c in text if c in valid_chars) + return filtered_text + + +if __name__ == "__main__": + from tqdm import tqdm + import psutil + + print("Start app ...") + filename = hf_hub_download("intfloat/multilingual-e5-small", "tokenizer.json") + with open(filename, "r") as file: + character_info = json.load(file) + character_dict = {} + print("Vocab is loading ...") + with tqdm(total=100, desc="cpu%", position=1) as cpubar, tqdm(total=100, desc="ram%", position=0) as rambar: + for data in character_info["model"]["vocab"]: + character_dict[data[0]] = data[1] + valid_chars = set(character_dict.keys()) + print("Start loading model") + kv_embedding = KVEmbedding("cpu") + print("Loading model: Done!!!") + for i in range(7500): + print(f"============{i}==============") + length = random.randint(600, 1000) + # print(length) + input_texts = [] + for s in range(length): + text_length = random.randint(1, 10000) + + random_text = generate_random_japanese(text_length) + + # before = len(random_text) + random_text = remove_invalid_characters(valid_chars, random_text) + # after = len(random_text) + # if after != before: + # print(before, after) + random_text = random_text[:450] + input_texts.append(random_text) + rambar.n = psutil.virtual_memory().percent + cpubar.n = psutil.cpu_percent() + rambar.refresh() + cpubar.refresh() + + filter_output = input_texts[:512] + + del input_texts + + # print(len(filter_output)) + + output = kv_embedding.embedding(filter_output) diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index b24715862..3e444e333 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -66,6 +66,11 @@ fancy-regex = { version = "0.14", optional = true} getrandom = { version = "0.3" } esaxx-rs = { version = "0.1.10", default-features = false, features=[]} monostate = "0.1.12" +ahash = { version = "0.8.11", features = ["serde"] } +dary_heap = { version = "0.3.6", features = ["serde"] } +compact_str = { version = "0.9", features = ["serde"] } +fnv = "1.0.7" +aneubeck-daachorse = "1.1.1" [features] default = ["progressbar", "onig", "esaxx_fast"] diff --git a/tokenizers/src/models/bpe/backtrack.rs b/tokenizers/src/models/bpe/backtrack.rs new file mode 100644 index 000000000..f419b6506 --- /dev/null +++ b/tokenizers/src/models/bpe/backtrack.rs @@ -0,0 +1,716 @@ +use crate::decoders::byte_level::CHAR_BYTES; +use crate::models::bpe::Pair; +use crate::pre_tokenizers::byte_level::ByteLevel; +use crate::pre_tokenizers::byte_level::BYTES_CHAR; +use crate::tokenizer::{Decoder, Result}; +use ahash::AHashMap; +use aneubeck_daachorse::DoubleArrayAhoCorasick; +use aneubeck_daachorse::DoubleArrayAhoCorasickBuilder; +use fnv::{FnvHashMap, FnvHasher}; +use itertools::Itertools; +use std::cmp::Reverse; +use std::collections::BinaryHeap; +use std::hash::{Hash, Hasher}; +use std::ops::Range; + +use super::MergeMap; +use super::Merges; +use super::Vocab; +use super::VocabR; + +/// Small helper to manage a bit field which supports predecessor and successor queries with a simple scan implementation. +/// This is sufficient for our use case, since two one bits will be at most 128 bits apart. +#[derive(Debug, Clone, PartialEq)] +pub(crate) struct BitField { + bitfield: Vec, +} + +impl BitField { + /// All bits are initialized to 1. 
+    pub(crate) fn new(bits: usize) -> Self {
+        Self {
+            bitfield: vec![u64::MAX; (bits + 63) / 64],
+        }
+    }
+
+    pub(crate) fn is_set(&self, bit: usize) -> bool {
+        let (word, bit) = (bit / 64, bit % 64);
+        self.bitfield[word] & (1 << bit) != 0
+    }
+
+    pub(crate) fn clear(&mut self, bit: usize) {
+        let (word, bit) = (bit / 64, bit % 64);
+        self.bitfield[word] &= !(1 << bit);
+    }
+
+    pub(crate) fn successor(&self, bit: usize) -> usize {
+        let (mut word_idx, bit_idx) = (bit / 64, bit % 64);
+        let word = self.bitfield[word_idx] >> bit_idx;
+        if word != 0 {
+            word.trailing_zeros() as usize + bit
+        } else {
+            loop {
+                word_idx += 1;
+                let word = self.bitfield[word_idx];
+                if word != 0 {
+                    break word.trailing_zeros() as usize + word_idx * 64;
+                }
+            }
+        }
+    }
+
+    pub(crate) fn predecessor(&self, bit: usize) -> usize {
+        let (mut word_idx, bit_idx) = (bit / 64, bit % 64);
+        let word = self.bitfield[word_idx] << (63 - bit_idx);
+        if word != 0 {
+            bit - word.leading_zeros() as usize
+        } else {
+            loop {
+                word_idx -= 1;
+                let word = self.bitfield[word_idx];
+                if word != 0 {
+                    break word_idx * 64 + 63 - word.leading_zeros() as usize;
+                }
+            }
+        }
+    }
+}
+
+/// This can be thought of as a lazy variation of the dynamic programming approach.
+/// It only computes those states which have to be visited in order to compute the tokenization
+/// for a given input text.
+/// It keeps track of visited states in a bitfield and only remembers the tokenization
+/// of the currently processed dynamic programming state.
+///
+/// The biggest downside of this approach is that the search for the longest leftmost match
+/// (the first token) has to be reset at every (backtracking) step, which is still a net win
+/// in practice compared to other approaches.
+#[derive(Clone, PartialEq)]
+pub struct BacktrackState<'a> {
+    pub(crate) text: &'a [u8],
+    pub(crate) tokens: Vec<u32>,        // roughly text.len() / 3 tokens are expected
+    pub(crate) next_token: Option<u32>, // longest leftmost match at `pos`, i.e. the first match of bpe.next_match(text)
+    pub(crate) pos: usize,              // current position in the text
+    pub(crate) bitfield: BitField,      // tracks the still-valid tokenization positions, keeping the runtime linear in the input length
+}
+
+impl<'a> BacktrackState<'a> {
+    pub(crate) fn new(text: &'a [u8], next_token: Option<u32>) -> Self {
+        Self::with_capacity(text, next_token, text.len() / 3)
+    }
+
+    pub(crate) fn with_capacity(text: &'a [u8], next_token: Option<u32>, cap: usize) -> Self {
+        Self {
+            text,
+            tokens: Vec::with_capacity(cap),
+            next_token,
+            pos: 0,
+            bitfield: BitField::new(text.len() + 1),
+        }
+    }
+
+    pub(crate) fn count(&self) -> usize {
+        self.tokens.len()
+    }
+
+    pub(crate) fn pos(&self) -> usize {
+        self.pos
+    }
+
+    pub(crate) fn last_token(&self) -> Option<u32> {
+        self.tokens.last().copied()
+    }
+
+    pub(crate) fn into_tokens(self) -> Vec<u32> {
+        self.tokens
+    }
+}
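The `BitField` only needs these simple scan-based queries because, as noted above, two set bits are at most 128 bits apart during encoding. As an editorial illustration (not part of this patch), the intended contract of the three queries is sketched below; the test name and values are made up, and since `BitField` is crate-private the sketch would have to live inside this module:

```rust
// Hypothetical in-module test sketch; BitField is crate-private in this patch.
#[test]
fn bitfield_scan_queries() {
    let mut bf = BitField::new(16); // all bits start out set
    bf.clear(5);
    bf.clear(6);
    assert!(bf.is_set(4));
    // successor(i): smallest set bit index >= i
    assert_eq!(bf.successor(5), 7);
    // predecessor(i): largest set bit index <= i
    assert_eq!(bf.predecessor(6), 4);
}
```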
+#[derive(PartialEq, Clone)]
+pub struct Backtrack {
+    /// All the decoded tokens concatenated into a single byte buffer;
+    /// used to build the Aho-Corasick searchers.
+    all_tokens: Vec<u8>,
+    /// Start index of each token in all_tokens.
+    /// The end is simply the next entry in this vector.
+    token_starts: Vec<u32>,
+    /// Mapping from hash of token to token id.
+    bytes_hash_to_token: FnvHashMap<u32, u32>,
+    /// The two tokens from which the token got merged.
+    /// If the token is an original one, the two entries point back to the token itself.
+    split_table: Vec<(u32, u32)>,
+    /// Mapping from a pair of tokens to a merged token if such a merged token exists.
+    pair_lookup: FnvHashMap<(u32, u32), u32>,
+    /// An Aho-Corasick automaton to find the next longest token in a byte sequence.
+    // #[serde(
+    //     serialize_with = "serialize_daac",
+    //     deserialize_with = "deserialize_daac"
+    // )]
+    longest_searcher: DoubleArrayAhoCorasick<u32>,
+    /// An Aho-Corasick automaton to find ALL tokens in a byte sequence.
+    // #[serde(
+    //     serialize_with = "serialize_daac",
+    //     deserialize_with = "deserialize_daac"
+    // )]
+    pub(crate) overlapping_searcher: DoubleArrayAhoCorasick<u32>,
+    /// An Aho-Corasick automaton to find ALL tokens in a byte sequence which is being processed in reverse order.
+    // #[serde(
+    //     serialize_with = "serialize_daac",
+    //     deserialize_with = "deserialize_daac"
+    // )]
+    pub(crate) overlapping_searcher_rev: DoubleArrayAhoCorasick<u32>,
+    /// Mapping from a token to the next longest prefix token.
+    /// This is in principle information represented by the Aho-Corasick automaton,
+    /// but we don't have efficient access to it and therefore store it here again.
+    /// If there is none, then the value is set to u32::MAX.
+    next_prefix_match: Vec<u32>,
+    /// Hash factor used to prevent hash collisions.
+    hash_factor: u64,
+    vocab: Vocab,
+    vocab_r: VocabR,
+    unk_token: Option<String>,
+    merges: MergeMap,
+}
+
+fn hash_bytes(bytes: &[u8], factor: u64) -> u32 {
+    let mut hasher = FnvHasher::default();
+    bytes.hash(&mut hasher);
+    // Note: we save 1/3 of the space for the hashmap by only using the most significant bits of the hash.
+    // To make them unique for the given tokens, we unfortunately have to add another multiplication.
+    ((hasher.finish().wrapping_mul(factor)) >> 32) as u32
+}
+
+// #[cfg(feature = "rand")]
+pub fn find_hash_factor_for_dictionary(tokens: impl IntoIterator<Item = Vec<u8>>) -> u64 {
+    use std::collections::HashSet;
+
+    use rand::Rng;
+
+    let all_tokens: Vec<Vec<u8>> = tokens.into_iter().collect();
+    let mut rnd = rand::rng();
+    loop {
+        let factor: u64 = rnd.random();
+        let mut seen = HashSet::new();
+        if all_tokens
+            .iter()
+            .all(|token| seen.insert(hash_bytes(token, factor)))
+        {
+            return factor;
+        }
+    }
+}
+
+impl Backtrack {
+    pub(crate) fn new(vocab: Vocab, merge_map: MergeMap) -> Self {
+        // let vocab_vec: Vec<_> = vocab
+        //     .into_iter()
+        //     .sorted_unstable_by(|a, b| a.1.cmp(&b.1))
+        //     .map(|(k, _v)| k.chars().map(|b| CHAR_BYTES[&b] as u8).collect::<Vec<u8>>())
+        //     .collect();
+        let mut merges: Vec<_> = merge_map.values().collect();
+        merges.sort();
+        let merge_vocab: Vec<u32> = merges
+            .into_iter()
+            .map(|(_rank, token_id)| *token_id)
+            .collect();
+
+        let vocab_r: AHashMap<_, _> = vocab.iter().map(|(k, v)| (v, k)).collect();
+        let mut tokens: Vec<_> = vocab
+            .clone()
+            .into_iter()
+            .flat_map(|(k, token_id)| {
+                if merge_vocab.contains(&token_id) {
+                    Some((token_id, k))
+                } else {
+                    None
+                }
+            })
+            .collect();
+        tokens.sort();
+        let mut tokens: Vec<_> = tokens.into_iter().map(|(_token_id, k)| k).collect();
+
+        let merge_vocab: Vec<String> = merge_vocab
+            .into_iter()
+            .map(|token_id| vocab_r[&token_id].clone())
+            .collect();
+        tokens.extend(merge_vocab);
+        let vocab_vec: Vec<_> = tokens.into_iter().map(|k| k.as_bytes().to_vec()).collect();
+
+        let hash_factor = find_hash_factor_for_dictionary(vocab_vec.clone());
+        let mut all_tokens = Vec::new();
+        let mut all_tokens_rev = Vec::new();
+        let mut token_starts = vec![0]; // The starting byte index of each token in all_tokens.
+ let mut bytes_hash_to_token = FnvHashMap::default(); + let tokens = vocab_vec; + for (i, token) in tokens.into_iter().enumerate() { + info!( + "token byte: {:?}, {i}", + ByteLevel::default() + .decode_chain(unsafe { vec![String::from_utf8_unchecked(token.clone())] }) + .unwrap() + ); + bytes_hash_to_token.insert(hash_bytes(&token, hash_factor), i as u32); + all_tokens_rev.extend(token.iter().copied().rev()); + all_tokens.extend(token); + token_starts.push(all_tokens.len() as u32); + } + assert_eq!( + bytes_hash_to_token.len() + 1, + token_starts.len(), + "Some tokens are not unique under the hash function!" + ); // TODO maybe this check is needed? + let longest_searcher = DoubleArrayAhoCorasickBuilder::new() + .match_kind(aneubeck_daachorse::MatchKind::LeftmostLongest) + .build(token_iter(&all_tokens, &token_starts)) + .expect("failed to build AhoCorasick"); + + let overlapping_searcher = + DoubleArrayAhoCorasick::::new(token_iter(&all_tokens, &token_starts)).expect(""); + let overlapping_searcher_rev = + DoubleArrayAhoCorasick::::new(token_iter(&all_tokens_rev, &token_starts)) + .expect(""); + + let next_prefix_match: Vec<_> = token_iter(&all_tokens, &token_starts) + .map(|token| { + next_match(&longest_searcher, &token[0..token.len() - 1]).unwrap_or(u32::MAX) + }) + .collect(); + + let vocab: AHashMap = token_iter(&all_tokens, &token_starts) + .enumerate() + .map(|(id, bytes)| { + ( + bytes.iter().map(|b| BYTES_CHAR[b]).collect::(), + id as u32, + ) + }) + .collect(); + + let vocab_r: AHashMap = token_iter(&all_tokens, &token_starts) + .enumerate() + .map(|(id, bytes)| { + ( + id as u32, + bytes.iter().map(|b| BYTES_CHAR[b]).collect::(), + ) + }) + .collect(); + + let mut split_table = vec![]; + let mut pair_lookup = FnvHashMap::default(); + let mut merge_map = AHashMap::new(); + + // // First option, use the input merge table. + // if let Some(ref merges) = merges { + // for (index, pair) in merges.into_iter().enumerate() { + // let token1 = &pair.0.clone(); + // let token2 = &pair.1.clone(); + // // TODO something is weird here + // if token1.len() ==1{ + // split_table.push((vocab[token1], vocab[token1])); + // } + // if token2.len() == 1 { + // split_table.push((vocab[token2], vocab[token2])); + // } + // let id1 = vocab[token1]; + // let id2 = vocab[token2]; + // let new_token = format!("{}{}", token1, &token2); + // let new_id = vocab + // .get(&new_token) + // .ok_or(Error::MergeTokenOutOfVocabulary(new_token)); + // if let Ok(id) = new_id { + // pair_lookup.insert((id1, id2), *id); + // split_table.push((id1, id2)); + // merge_map.insert(Pair::from((id1, id2)), (index as u32, *id)); + // } else { + // println!("Token not added?"); + // } + + // // TODO wrong + // } + // split_table.push((merges.len() as u32, merges.len() as u32)); + // } + // Second option, reverse engineer the merge/split table from the vocabulary. 
+ { + for (id, token) in token_iter(&all_tokens, &token_starts).enumerate() { + let mut id1 = next_prefix_match[id]; + while id1 != u32::MAX { + let rest = &token[token_range(&token_starts, id1).len()..]; + if let Some(id2) = find_token_by_bytes( + &all_tokens, + &token_starts, + &bytes_hash_to_token, + rest, + hash_factor, + ) { + if id1 < id as u32 + && id2 < id as u32 + && is_valid_token_pair(&pair_lookup, &split_table, id1, id2) + { + pair_lookup.insert((id1, id2), id as u32); + split_table.push((id1, id2)); + merge_map.insert(Pair::from((id1, id2)), (id as u32, id as u32)); + break; + } + } + id1 = next_prefix_match[id1 as usize]; + } + if id1 == u32::MAX { + split_table.push((id as u32, id as u32)); + } + } + }; + let bpe = Self { + all_tokens, + token_starts, + bytes_hash_to_token, + overlapping_searcher, + overlapping_searcher_rev, + longest_searcher, + next_prefix_match, + pair_lookup, + split_table, + hash_factor, + unk_token: None, + vocab, + vocab_r, + merges: merge_map, + }; + // A health checkup + for token_id in 0..bpe.num_tokens() as u32 { + let bytes = bpe.token_bytes(token_id); + let strs = bytes.iter().map(|b| char::from(*b)).collect::>(); + // println!("Encoding {bytes:?} into bitfield"); + let tokens = bpe.encode_via_bitfield(bytes); + assert_eq!( + tokens, + vec![token_id], + "token {token_id} with bytes {bytes:?} (tokens {strs:?} encodes to {tokens:?} instead of to itself" + ); + } + bpe + } + + fn bitfield_into_tokens(&self, bytes: &[u8], bitfield: BitField, count: usize) -> Vec { + let mut encoded = Vec::with_capacity(count); + let mut start = 0; + while start < bytes.len() { + let end = bitfield.successor(start + 1); + // println!("bitfield's successor {:?}", &bytes[start..end]); + let token = self + .find_token_by_bytes(&bytes[start..end]) + .expect(&format!( + "Could not convert bytes to tokens for bytes: [{:?}]", + bytes.into_iter().map(|b| BYTES_CHAR[b]).join("") + )); + encoded.push(token); + start = end; + } + encoded + } + + fn encode_into_bitfield(&self, bytes: &[u8]) -> (BitField, usize) { + // Reserve for every byte a bit in the bitfield. + let mut bitfield = BitField::new(bytes.len() + 1); + let mut heap = BinaryHeap::with_capacity(bytes.len() * 2); + heap.extend((0..bytes.len().saturating_sub(1)).filter_map(|i| { + self.find_token_by_bytes(&bytes[i..i + 2]) + .map(|e| Reverse((e, i as u32))) + })); + let mut count = bytes.len(); + while let Some(Reverse((token, start))) = heap.pop() { + let start = start as usize; + if !bitfield.is_set(start) { + continue; + } + let mid = bitfield.successor(start + 1); + if mid >= bytes.len() { + continue; + } + let end = bitfield.successor(mid + 1); + if self.token_len(token) != end - start { + continue; + } + bitfield.clear(mid); + count -= 1; + if end < bytes.len() { + let new_end = bitfield.successor(end + 1); + if let Some(e) = self.find_token_by_bytes(&bytes[start..new_end]) { + heap.push(Reverse((e, start as u32))); + } + } + if start > 0 { + let new_start = bitfield.predecessor(start - 1); + if let Some(e) = self.find_token_by_bytes(&bytes[new_start..end]) { + heap.push(Reverse((e, new_start as u32))); + } + } + } + (bitfield, count) + } + + pub fn encode_via_bitfield(&self, text: &[u8]) -> Vec { + let (bitfield, count) = self.encode_into_bitfield(text); + self.bitfield_into_tokens(text, bitfield, count) + } + + /// Return the number of tokens in this BPE dictionary. + pub fn num_tokens(&self) -> usize { + self.token_starts.len() - 1 + } + + /// Converts a token id into its corresponding token bytes. 
+ /// Panics if the token_id is not within the valid 0..num_tokens() range! + pub fn token_bytes(&self, token_id: u32) -> &[u8] { + token_bytes(&self.all_tokens, &self.token_starts, token_id) + } + + pub(crate) fn is_valid_token_pair(&self, token1: u32, token2: u32) -> bool { + is_valid_token_pair(&self.pair_lookup, &self.split_table, token1, token2) + } + + /// Returns the length of the decoded byte slice of a token. + pub fn token_len(&self, token_id: u32) -> usize { + token_range(&self.token_starts, token_id).len() + } + + /// Returns the first longest match in the provided text. + pub(crate) fn next_match(&self, text: &[u8]) -> Option { + next_match(&self.longest_searcher, text) + } + + /// Returns the next token which shares the longest prefix with the specified token. + pub(crate) fn next_prefix(&self, token_id: u32) -> Option { + let prefix = self.next_prefix_match[token_id as usize]; + if prefix == u32::MAX { + None + } else { + Some(prefix) + } + } + + fn find_token_by_bytes(&self, bytes: &[u8]) -> Option { + find_token_by_bytes( + &self.all_tokens, + &self.token_starts, + &self.bytes_hash_to_token, + bytes, + self.hash_factor, + ) + } + + /// Decode a sequence of tokens back to its original byte sequence. + /// Note: we don't return here a str, since not every token sequence corresponds to a valid + /// utf8 sequence. + pub fn decode_tokens(&self, tokens: &[u32]) -> Vec { + let mut text = vec![]; + for token in tokens { + text.extend(self.token_bytes(*token)); + } + text + } + + /// Computes for every prefix of the input text a corresponding last token. + pub(crate) fn encode_all_prefixes(&self, text: &[u8]) -> Vec { + let mut last_token = Vec::with_capacity(text.len()); + let mut state = self.overlapping_searcher.start_state(); + for (pos, c) in text.iter().enumerate() { + let (s, iter) = self.overlapping_searcher.consume(state, pos + 1, *c); + state = s; + for m in iter { + let new_token = m.value(); + let new_range = m.start()..m.end(); + assert_eq!(new_range.end, last_token.len() + 1); + if new_range.start == 0 { + last_token.push(new_token); + break; + } else { + let prev_token = unsafe { *last_token.get_unchecked(new_range.start - 1) }; + if self.is_valid_token_pair(prev_token, new_token) { + last_token.push(new_token); + break; + } + // println!("Finished encoding prefix") + } + } + } + last_token + } + + /// Counts the number tokens produced when encoding the text. 
+ pub fn count(&mut self, text: &[u8]) -> usize { + let mut enc = BacktrackState::new(text, None); + while self.step(&mut enc).is_some() {} + enc.count() + } + + pub fn encode_via_table(&self, text: &[u8]) -> Vec { + let last_token = self.encode_all_prefixes(text); + let mut encoded = Vec::with_capacity(text.len() / 3); + let mut pos = text.len(); + while pos > 0 { + let token = last_token[pos - 1]; + encoded.push(token); + pos -= self.token_len(token); + } + encoded.reverse(); + encoded + } + + pub fn encode_via_backtracking(&self, text: &[u8]) -> Vec { + let next_token = self.next_match(text); + let mut enc = BacktrackState::new(text, next_token); + while self.step(&mut enc).is_some() {} + enc.into_tokens() + } + + pub fn get_vocab(&self) -> Vocab { + self.vocab.clone() + } + + pub fn get_unk_token(&self) -> &Option { + &self.unk_token + } + + pub fn step(&self, backtrack_state: &mut BacktrackState) -> Option { + let mut token = backtrack_state.next_token?; + let last = backtrack_state.tokens.last().copied(); + loop { + // println!("in step, token: {last:?}, {token}"); + let token_len = self.token_len(token); + let end_pos = backtrack_state.pos + token_len; + if backtrack_state.bitfield.is_set(end_pos) + && last + .map(|last_token| self.is_valid_token_pair(last_token, token)) + .unwrap_or(true) + { + backtrack_state.tokens.push(token); + backtrack_state.pos = end_pos; + // In principle, we could in some cases reuse the leftmost longest match iterator. + // Especially when it has to look ahead, this could save scanning the input multiple times. + // But on average this seems to be slower due to the overhead of storing the iterator as part of the struct. + backtrack_state.next_token = self.next_match(&backtrack_state.text[end_pos..]); + break; + } else if let Some(shorter) = self.next_prefix(token) { + token = shorter; + } else { + // Clearing the bitfield when we pop tokens saves a little bit of work... + backtrack_state.bitfield.clear(backtrack_state.pos); + backtrack_state.tokens.pop(); + backtrack_state.pos -= last.map(|t| self.token_len(t)).unwrap_or(0); + backtrack_state.next_token = last; + break; + } + } + // println!("finished step, token: {last:?}, {token}"); + + backtrack_state.next_token + } +} + +// A helper function to iterate over the tokens in a byte sequence +fn token_iter<'a>(all_tokens: &'a [u8], token_starts: &'a [u32]) -> impl Iterator { + token_starts + .iter() + .tuple_windows() + .map(move |(start, end)| &all_tokens[*start as usize..*end as usize]) +} + +fn next_match(longest_searcher: &DoubleArrayAhoCorasick, text: &[u8]) -> Option { + longest_searcher + .leftmost_find_iter(text) + .map(|m| m.value()) + .next() +} + +fn is_valid_token_pair( + pair_lookup: &FnvHashMap<(u32, u32), u32>, + split_table: &[(u32, u32)], + mut token1: u32, + mut token2: u32, +) -> bool { + // Keep track of the maximum token which can still be chosen across the split point. + let mut limit = u32::MAX; + // println!("checking if {token1}, {token2} is a valid token_pair"); + loop { + // Check whether BPE would choose a different token pair across the split point. + // this is super super important + if let Some(combined) = pair_lookup.get(&(token1, token2)) { + if *combined < limit { + // println!("Done1"); + return false; + } + } + // Reverse the merge operation from BPE. 
+ + // println!("{:?}", split_table); + if token1 > token2 { + limit = token1; + token1 = unsafe { split_table.get_unchecked(token1 as usize).1 }; + if token1 == limit { + limit = token2 + 1; + token2 = unsafe { split_table.get_unchecked(token2 as usize).0 }; + if token2 + 1 == limit { + // println!("Done2"); + return true; + } + } + } else { + limit = token2 + 1; + token2 = unsafe { split_table.get_unchecked(token2 as usize).0 }; + if token2 + 1 == limit { + limit = token1; + token1 = unsafe { split_table.get_unchecked(token1 as usize).1 }; + if token1 == limit { + // println!("Done3"); + return true; + } + } + } + } +} + +fn token_range(token_starts: &[u32], token_id: u32) -> Range { + unsafe { + *token_starts.get_unchecked(token_id as usize) as usize + ..*token_starts.get_unchecked(token_id as usize + 1) as usize + } +} + +fn token_bytes<'a>(all_tokens: &'a [u8], token_starts: &[u32], token_id: u32) -> &'a [u8] { + &all_tokens[token_range(token_starts, token_id)] +} + +fn find_token_by_bytes( + all_tokens: &[u8], + token_starts: &[u32], + bytes_hash_to_token: &FnvHashMap, + bytes: &[u8], + hash_factor: u64, +) -> Option { + let hash = hash_bytes(bytes, hash_factor); + let token = *bytes_hash_to_token.get(&hash)?; + if token_bytes(all_tokens, token_starts, token) == bytes { + Some(token) + } else { + None + } +} + +/// Converts the merges strings (for example from `merges.txt` file) with the format +/// "{pair_a} {pair_b}" into the format expected by the BacktrackingBpe struct +pub(crate) fn convert_merges_to_hashmap>( + iter: I, + _vocab: &Vocab, +) -> Result { + let mut merges = vec![]; + + let lines = iter.filter(|l| !l.starts_with("#version")); + for (rank, line) in lines.enumerate() { + let parts = line.split(' ').collect::>(); + if parts.len() != 2 { + return Err(super::Error::BadMerges(rank + 1).into()); + } + + merges.push((parts[0].to_string(), parts[1].to_string())); + } + + Ok(merges) +} diff --git a/tokenizers/src/models/bpe/mod.rs b/tokenizers/src/models/bpe/mod.rs index f0d40b2df..a176fe365 100644 --- a/tokenizers/src/models/bpe/mod.rs +++ b/tokenizers/src/models/bpe/mod.rs @@ -1,6 +1,7 @@ //! [Byte Pair Encoding](https://www.aclweb.org/anthology/P16-1162/) model. use std::{iter, mem}; +mod backtrack; mod model; mod serialization; pub mod trainer; diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs index 50c9815e9..d0e1cf842 100644 --- a/tokenizers/src/models/bpe/model.rs +++ b/tokenizers/src/models/bpe/model.rs @@ -1,20 +1,23 @@ +use super::backtrack::Backtrack; use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word}; use crate::tokenizer::{Model, Result, Token}; use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY, MAX_LENGTH}; use crate::utils::iter::ResultShunt; +use ahash::AHashMap; use serde_json::Value; use std::borrow::Cow; + +use std::collections::HashMap; use std::{ - collections::HashMap, fs::File, io::prelude::*, io::{BufRead, BufReader}, path::{Path, PathBuf}, }; -pub type Vocab = HashMap; -type VocabR = HashMap; -pub type MergeMap = HashMap; +pub type Vocab = AHashMap; +pub type VocabR = AHashMap; +pub type MergeMap = AHashMap; pub type Merges = Vec<(String, String)>; struct Config { @@ -41,7 +44,7 @@ impl Default for BpeBuilder { Self { config: Config { files: None, - vocab: HashMap::new(), + vocab: AHashMap::new(), merges: vec![], cache_capacity: DEFAULT_CACHE_CAPACITY, dropout: None, @@ -71,8 +74,41 @@ impl BpeBuilder { /// Set the vocab (token -> ID) and merges mappings. 
#[must_use] - pub fn vocab_and_merges(mut self, vocab: Vocab, merges: Merges) -> Self { - self.config.vocab = vocab; + pub fn vocab_and_merges>>( + mut self, + vocab: V, + merges: Merges, + ) -> Self { + self.config.vocab = vocab.into(); + // for (i, (left, right)) in merges.iter().enumerate() { + // // println!("{left:?} - {right:?}"); + // let mut result = left.clone(); + // result.push_str(right); + // if result == String::from("Ġvi") { + // println!("Original merge {left:?} - {right:?} - {i}"); + // // panic!("Stop"); + // } + // if result == String::from("á»") { + // println!("Original merge {left:?} - {right:?}"); + // // panic!("Stop"); + // } + // if result == String::from("á»ĩ") { + // println!("Original merge {left:?} - {right:?}"); + // // panic!("Stop"); + // } + // if result == String::from("iá»ĩ") { + // println!("Original merge {left:?} - {right:?}"); + // // panic!("Stop"); + // } + // if result == String::from("iá»ĩc") { + // println!("Original merge {left:?} - {right:?}"); + // // panic!("Stop"); + // } + // if result == String::from("Ġviá»ĩc") { + // println!("Original merge {left:?} - {right:?}"); + // // panic!("Stop"); + // } + // } self.config.merges = merges; self } @@ -199,6 +235,7 @@ impl BpeBuilder { fuse_unk: self.config.fuse_unk, byte_fallback: self.config.byte_fallback, ignore_merges: self.config.ignore_merges, + backtrack: None, }) } } @@ -230,6 +267,8 @@ pub struct BPE { pub byte_fallback: bool, /// Whether or not to direct output words if they are part of the vocab. pub ignore_merges: bool, + + backtrack: Option, } impl std::fmt::Debug for BPE { @@ -271,6 +310,7 @@ impl Clone for BPE { fuse_unk: self.fuse_unk, byte_fallback: self.byte_fallback, ignore_merges: self.ignore_merges, + backtrack: None, } } } @@ -324,7 +364,7 @@ impl BPE { let mut buffer = String::new(); vocab_file.read_to_string(&mut buffer)?; let json: Value = serde_json::from_str(&buffer)?; - let mut vocab = HashMap::new(); + let mut vocab = AHashMap::new(); match json { Value::Object(m) => { for (token, id) in m { @@ -361,8 +401,16 @@ impl BPE { } } - pub fn get_vocab(&self) -> Vocab { - self.vocab.clone() + pub fn get_vocab(&self) -> HashMap { + self.vocab.clone().into_iter().collect() + } + + pub fn get_vocab_r(&self) -> HashMap { + self.vocab_r.clone().into_iter().collect() + } + + pub fn get_merges(&self) -> &AHashMap { + &self.merges } pub fn get_unk_token(&self) -> &Option { @@ -455,7 +503,10 @@ impl BPE { word.add(unk_id, unk_len); } + // println!("Word {word:?}"); + word.merge_all(&self.merges, self.dropout); + // println!("After Word {word:?}"); Ok(word) } @@ -488,13 +539,17 @@ impl BPE { } Ok(ret) } + + pub fn enable_backtrack(&mut self) { + self.backtrack = Some(Backtrack::new(self.vocab.clone(), self.merges.clone())); + } } impl Model for BPE { type Trainer = BpeTrainer; fn get_vocab(&self) -> HashMap { - self.vocab.clone() + self.vocab.clone().into_iter().collect() } fn get_vocab_size(&self) -> usize { @@ -502,10 +557,24 @@ impl Model for BPE { } fn tokenize(&self, sequence: &str) -> Result> { + // println!("Tokenizing {sequence}"); if sequence.is_empty() { return Ok(vec![]); } + if let Some(backtrack) = &self.backtrack { + let ids = backtrack.encode_via_backtracking(sequence.as_bytes()); + let tokens = ids + .into_iter() + .map(|id| Token { + id, + value: self.vocab_r[&id].clone(), + offsets: (0, 0), + }) + .collect(); + return Ok(tokens); + } + if self.dropout.is_none() || self.dropout == Some(0.0) { self.tokenize_with_cache(sequence) } else { diff --git 
a/tokenizers/src/models/bpe/serialization.rs b/tokenizers/src/models/bpe/serialization.rs index 98cc15102..98cf54944 100644 --- a/tokenizers/src/models/bpe/serialization.rs +++ b/tokenizers/src/models/bpe/serialization.rs @@ -1,10 +1,10 @@ use super::{super::OrderedVocabIter, convert_merges_to_hashmap, BpeBuilder, Pair, BPE}; +use ahash::AHashMap; use serde::{ de::{Error, MapAccess, Visitor}, ser::SerializeStruct, Deserialize, Deserializer, Serialize, Serializer, }; -use std::collections::HashMap; impl Serialize for BPE { fn serialize(&self, serializer: S) -> Result @@ -80,7 +80,7 @@ impl<'de> Visitor<'de> for BPEVisitor { V: MapAccess<'de>, { let mut builder = BpeBuilder::new(); - let mut vocab: Option> = None; + let mut vocab: Option> = None; #[derive(Debug, Deserialize)] #[serde(untagged)] diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs index 2484865be..50cc52099 100644 --- a/tokenizers/src/models/bpe/trainer.rs +++ b/tokenizers/src/models/bpe/trainer.rs @@ -4,15 +4,17 @@ use super::{Pair, WithFirstLastIterator, Word, BPE}; use crate::parallelism::*; use crate::tokenizer::{AddedToken, Result, Trainer}; use crate::utils::progress::{ProgressBar, ProgressStyle}; +use ahash::{AHashMap, AHashSet}; +use compact_str::CompactString; +use dary_heap::OctonaryHeap; use serde::{Deserialize, Serialize}; use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap, HashSet}; #[derive(Debug, Eq)] struct Merge { pair: Pair, count: u64, - pos: HashSet, + pos: AHashSet, } impl PartialEq for Merge { fn eq(&self, other: &Self) -> bool { @@ -41,7 +43,7 @@ struct Config { show_progress: bool, special_tokens: Vec, limit_alphabet: Option, - initial_alphabet: HashSet, + initial_alphabet: AHashSet, continuing_subword_prefix: Option, end_of_word_suffix: Option, max_token_length: Option, @@ -62,7 +64,7 @@ impl Default for BpeTrainerBuilder { show_progress: true, special_tokens: vec![], limit_alphabet: None, - initial_alphabet: HashSet::new(), + initial_alphabet: AHashSet::new(), continuing_subword_prefix: None, end_of_word_suffix: None, max_token_length: None, @@ -114,7 +116,7 @@ impl BpeTrainerBuilder { /// Set the initial alphabet #[must_use] - pub fn initial_alphabet(mut self, alphabet: HashSet) -> Self { + pub fn initial_alphabet(mut self, alphabet: AHashSet) -> Self { self.config.initial_alphabet = alphabet; self } @@ -151,7 +153,7 @@ impl BpeTrainerBuilder { continuing_subword_prefix: self.config.continuing_subword_prefix, end_of_word_suffix: self.config.end_of_word_suffix, max_token_length: self.config.max_token_length, - words: HashMap::new(), + words: AHashMap::new(), } } } @@ -187,7 +189,7 @@ pub struct BpeTrainer { pub limit_alphabet: Option, /// The initial alphabet we want absolutely to include. 
This allows to cover /// some characters that are not necessarily in the training set - pub initial_alphabet: HashSet, + pub initial_alphabet: AHashSet, /// An optional prefix to use on any subword that exist only behind another one pub continuing_subword_prefix: Option, /// An optional suffix to characterize and end-of-word subword @@ -195,7 +197,7 @@ pub struct BpeTrainer { /// An optional parameter to limit the max length of any single token pub max_token_length: Option, - words: HashMap, + words: AHashMap, } impl Default for BpeTrainer { @@ -251,11 +253,16 @@ impl BpeTrainer { } /// Add the provided special tokens to the initial vocabulary - fn add_special_tokens(&self, w2id: &mut HashMap, id2w: &mut Vec) { + fn add_special_tokens( + &self, + w2id: &mut AHashMap, + id2w: &mut Vec, + ) { for token in &self.special_tokens { - if !w2id.contains_key(&token.content) { - id2w.push(token.content.to_owned()); - w2id.insert(token.content.to_owned(), (id2w.len() - 1) as u32); + // get hash of content + if !w2id.contains_key(&CompactString::from(&token.content)) { + id2w.push(CompactString::from(&token.content)); + w2id.insert(CompactString::from(&token.content), (id2w.len() - 1) as u32); } } } @@ -263,12 +270,12 @@ impl BpeTrainer { /// Compute the initial alphabet and limit it if relevant fn compute_alphabet( &self, - wc: &HashMap, - w2id: &mut HashMap, - id2w: &mut Vec, + wc: &AHashMap, + w2id: &mut AHashMap, + id2w: &mut Vec, ) { // Compute the alphabet from seen words - let mut alphabet: HashMap = HashMap::new(); + let mut alphabet: AHashMap = AHashMap::new(); for (word, count) in wc { for c in word.chars() { alphabet @@ -312,19 +319,26 @@ impl BpeTrainer { kept.sort_unstable_by_key(|k| (*k.0) as u32); kept.into_iter().for_each(|(c, _)| { let s = c.to_string(); + /* if !w2id.contains_key(&s) { id2w.push(s.clone()); w2id.insert(s, (id2w.len() - 1) as u32); } + */ + // u64 hash version + if !w2id.contains_key(&CompactString::from(&s)) { + id2w.push(CompactString::from(&s)); + w2id.insert(CompactString::from(&s), (id2w.len() - 1) as u32); + } }); } /// Tokenize words and add subwords to the vocabulary when relevant fn tokenize_words( &self, - wc: &HashMap, - w2id: &mut HashMap, - id2w: &mut Vec, + wc: &AHashMap, + w2id: &mut AHashMap, + id2w: &mut Vec, p: &Option, ) -> (Vec, Vec) { let mut words: Vec = Vec::with_capacity(wc.len()); @@ -336,7 +350,7 @@ impl BpeTrainer { for (is_first, is_last, c) in word.chars().with_first_and_last() { let mut s = c.to_string(); - if w2id.contains_key(&s) { + if w2id.contains_key(&CompactString::from(&s)) { // Found the initial char in the authorized alphabet // Add the `continuing_subword_prefix` if relevant @@ -353,11 +367,11 @@ impl BpeTrainer { } // Insert the new formed string if necessary - if !w2id.contains_key(&s) { - id2w.push(s.clone()); - w2id.insert(s.clone(), (id2w.len() - 1) as u32); + if !w2id.contains_key(&CompactString::from(&s)) { + id2w.push(CompactString::from(&s)); + w2id.insert(CompactString::from(&s), (id2w.len() - 1) as u32); } - current_word.add(w2id[&s], 1); // We do not care about the len here + current_word.add(w2id[&CompactString::from(&s)], 1); // We do not care about the len here } } words.push(current_word); @@ -375,13 +389,13 @@ impl BpeTrainer { words: &[Word], counts: &[u64], p: &Option, - ) -> (HashMap, HashMap>) { + ) -> (AHashMap, AHashMap>) { words .maybe_par_iter() .enumerate() .map(|(i, word)| { - let mut pair_counts = HashMap::new(); - let mut where_to_update: HashMap> = HashMap::new(); + let mut pair_counts = 
AHashMap::new(); + let mut where_to_update: AHashMap> = AHashMap::new(); for window in word.get_chars().windows(2) { let cur_pair: Pair = (window[0], window[1]); @@ -399,7 +413,7 @@ impl BpeTrainer { h.insert(i); }) .or_insert_with(|| { - let mut h = HashSet::new(); + let mut h = AHashSet::new(); h.insert(i); h }); @@ -413,7 +427,7 @@ impl BpeTrainer { (pair_counts, where_to_update) }) .reduce( - || (HashMap::new(), HashMap::new()), + || (AHashMap::new(), AHashMap::new()), |(mut pair_counts, mut where_to_update), (pc, wtu)| { for (k, v) in pc { pair_counts.entry(k).and_modify(|c| *c += v).or_insert(v); @@ -431,11 +445,11 @@ impl BpeTrainer { pub fn do_train( &self, - word_counts: &HashMap, + word_counts: &AHashMap, model: &mut BPE, ) -> Result> { - let mut word_to_id: HashMap = HashMap::with_capacity(self.vocab_size); - let mut id_to_word: Vec = Vec::with_capacity(self.vocab_size); + let mut word_to_id: AHashMap = AHashMap::with_capacity(self.vocab_size); + let mut id_to_word: Vec = Vec::with_capacity(self.vocab_size); let max_token_length: usize = self.max_token_length.unwrap_or(usize::MAX); let progress = self.setup_progress(); @@ -464,7 +478,7 @@ impl BpeTrainer { self.update_progress(&progress, words.len(), "Count pairs"); let (mut pair_counts, mut where_to_update) = self.count_pairs(&words, &counts, &progress); // Insert them in the queue - let mut queue = BinaryHeap::with_capacity(pair_counts.len()); + let mut queue = OctonaryHeap::with_capacity(pair_counts.len()); where_to_update.drain().for_each(|(pair, pos)| { let count = pair_counts[&pair]; if count > 0 { @@ -510,7 +524,7 @@ impl BpeTrainer { if let Some(prefix) = &self.continuing_subword_prefix { if part_b.starts_with(prefix) { let prefix_byte_len = prefix.chars().map(|c| c.len_utf8()).sum(); - part_b = part_b[prefix_byte_len..].to_string(); + part_b = CompactString::from(&part_b[prefix_byte_len..]); } } let new_token = format!("{part_a}{part_b}"); @@ -520,19 +534,19 @@ impl BpeTrainer { // Insert new token if it does not already exist let new_token_id = word_to_id - .get(&new_token) + .get(&CompactString::from(&new_token)) .copied() .unwrap_or(id_to_word.len() as u32); - if !word_to_id.contains_key(&new_token) { - id_to_word.push(new_token.clone()); - word_to_id.insert(new_token.clone(), new_token_id); + if !word_to_id.contains_key(&CompactString::from(&new_token)) { + id_to_word.push(CompactString::from(&new_token)); + word_to_id.insert(CompactString::from(&new_token), new_token_id); } merges.push((top.pair, new_token_id)); // Merge the new pair in every words // Safety: This is just a type assertion, the code below may no longer be safe // if the type of `pos` changes - let pos: &HashSet = &top.pos; + let pos: &AHashSet = &top.pos; let words_len = words.len(); struct WordPtr(*mut Word); @@ -544,11 +558,8 @@ impl BpeTrainer { let changes = pos .maybe_par_iter() .flat_map(|&i| { - // Safety: - // We are producing a valid pointer since we are indexing in bounds - // - // We can access each `word` here in parallel because each position - // can be there only once (pos is a HashSet). + // We can merge each of these words in parallel here because each position + // can be there only once (AHashSet). So this is safe. 
unsafe { assert!(i < words_len); // This is words[i], but avoids needing to go through &T (which triggers UB) @@ -577,7 +588,7 @@ impl BpeTrainer { h.insert(iw); }) .or_insert_with(|| { - let mut h = HashSet::new(); + let mut h = AHashSet::new(); h.insert(iw); h }); @@ -601,7 +612,12 @@ impl BpeTrainer { self.finalize_progress(&progress, merges.len()); // Transfer new vocab & options to model - model.vocab = word_to_id; + //model.vocab = word_to_id; + model.vocab = word_to_id + .into_iter() + // we have to look up the string in id_to_word because the key in word_to_id is a hash + .map(|(_key, val)| (id_to_word[val as usize].to_string(), val)) + .collect(); model.vocab_r = model .vocab .iter() @@ -647,18 +663,20 @@ impl Trainer for BpeTrainer { S: AsRef + Send, F: Fn(&str) -> Result> + Sync, { - let words: Result> = iterator + let words: Result> = iterator .maybe_par_bridge() .map(|sequence| { let words = process(sequence.as_ref())?; - let mut map = HashMap::new(); + let mut map = AHashMap::new(); for word in words { - map.entry(word).and_modify(|c| *c += 1).or_insert(1); + map.entry(CompactString::from(word)) + .and_modify(|c| *c += 1) + .or_insert(1); } Ok(map) }) .reduce( - || Ok(HashMap::new()), + || Ok(AHashMap::new()), |acc, ws| { let mut acc = acc?; for (k, v) in ws? { @@ -676,11 +694,12 @@ impl Trainer for BpeTrainer { #[cfg(test)] mod tests { use super::{BpeTrainer, Pair, BPE}; - use std::collections::HashMap; + use ahash::AHashMap; + use compact_str::CompactString; #[test] fn test_train() { - let word_counts: HashMap = [ + let word_counts: AHashMap = [ ("roses".into(), 1), ("are".into(), 2), ("red".into(), 1), @@ -705,7 +724,7 @@ mod tests { // Vocab should contain all of the characters from the `word_counts` mapping // as well as three merges: 're', 'are', and 'is'. - let expected_vocab: HashMap = [ + let expected_vocab: AHashMap = [ ("-".into(), 0), ("2".into(), 1), ("B".into(), 2), @@ -741,7 +760,7 @@ mod tests { // where 'rank' determines the order in which this merge will be applied during // tokenization, and 'id' is the vocab id of the symbol resulting from merging // the pair of symbols in the corresponding key. - let expected_merges: HashMap = [ + let expected_merges: AHashMap = [ ((17, 11), (0, 22)), // 'r' + 'e' -> 're' ((8, 22), (1, 23)), // 'a' + 're' -> 'are' ((13, 18), (2, 24)), // 'i' + 's' -> 'is' @@ -759,7 +778,7 @@ mod tests { */ let max_token_length = 16; - let long_word_counts: HashMap = [ + let long_word_counts: AHashMap = [ ("singlelongtokenwithoutcasechange", 2), ("singleLongTokenWithCamelCaseChange", 2), ("Longsingletokenwithpunctu@t!onwithin", 2), @@ -774,7 +793,7 @@ mod tests { ("GPT-2", 2), ] .iter() - .map(|(key, value)| (key.to_string(), *value)) + .map(|(key, value)| (CompactString::from(key.to_string()), *value)) .collect(); let trainer = BpeTrainer::builder() .max_token_length(Some(max_token_length)) @@ -799,7 +818,7 @@ mod tests { // directly compares tokens with known expected values. // maybe unstable depending on specific settings or changes. 
         */
-        let long_word_counts: HashMap<String, u64> = [
+        let long_word_counts: AHashMap<CompactString, u64> = [
             ("sin", 2),
             ("Sin", 2),
             ("Lon", 2),
@@ -814,7 +833,7 @@ mod tests {
             ("GP", 2),
         ]
         .iter()
-        .map(|(key, value)| (key.to_string(), *value))
+        .map(|(key, value)| (CompactString::from(key.to_string()), *value))
         .collect();
         let trainer = BpeTrainer::builder()
             .max_token_length(Some(2))
@@ -823,8 +842,8 @@ mod tests {
             .build();
         let mut model = BPE::default();
         trainer.do_train(&long_word_counts, &mut model).unwrap();
-        let trained_vocab: HashMap<String, u32> = model.get_vocab();
-        let expected_vocab: HashMap<String, u32> = [
+        let trained_vocab: AHashMap<String, u32> = model.get_vocab().into_iter().collect();
+        let expected_vocab: AHashMap<String, u32> = [
             ("短", 12),
             ("n", 6),
             ("i", 5),
diff --git a/tokenizers/src/models/bpe/word.rs b/tokenizers/src/models/bpe/word.rs
index 9d09fa2af..7bf2dee56 100644
--- a/tokenizers/src/models/bpe/word.rs
+++ b/tokenizers/src/models/bpe/word.rs
@@ -1,7 +1,8 @@
 use super::Pair;
+use ahash::AHashMap;
+use dary_heap::QuaternaryHeap;
 use rand::{rng, Rng};
 use std::cmp::Ordering;
-use std::collections::{BinaryHeap, HashMap};
 
 #[derive(Debug, Eq)]
 struct Merge {
@@ -158,8 +159,8 @@ impl Word {
         changes
     }
 
-    pub(super) fn merge_all(&mut self, merges: &HashMap<Pair, (u32, u32)>, dropout: Option<f32>) {
-        let mut queue = BinaryHeap::with_capacity(self.symbols.len());
+    pub(super) fn merge_all(&mut self, merges: &AHashMap<Pair, (u32, u32)>, dropout: Option<f32>) {
+        let mut queue = QuaternaryHeap::with_capacity(self.symbols.len());
         let mut skip = Vec::with_capacity(queue.len());
 
         queue.extend(
diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs
index 4e5419bad..932bc598d 100644
--- a/tokenizers/src/models/mod.rs
+++ b/tokenizers/src/models/mod.rs
@@ -5,6 +5,7 @@ pub mod unigram;
 pub mod wordlevel;
 pub mod wordpiece;
 
+use ahash::AHashMap;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
 
@@ -19,11 +20,11 @@ use crate::{AddedToken, Model, Result, Token, Trainer};
 
 /// Wraps a vocab mapping (ID -> token) to a struct that will be serialized in order
 /// of token ID, smallest to largest.
 struct OrderedVocabIter<'a> {
-    vocab_r: &'a HashMap<u32, String>,
+    vocab_r: &'a AHashMap<u32, String>,
 }
 
 impl<'a> OrderedVocabIter<'a> {
-    fn new(vocab_r: &'a HashMap<u32, String>) -> Self {
+    fn new(vocab_r: &'a AHashMap<u32, String>) -> Self {
         Self { vocab_r }
     }
 }
@@ -301,8 +302,8 @@ mod tests {
 
     #[test]
     fn incomplete_ordered_vocab() {
-        let vocab_r: HashMap<u32, String> =
-            HashMap::from([(0, "Hi".to_string()), (2, "There".to_string())]);
+        let vocab_r: AHashMap<u32, String> =
+            AHashMap::from([(0, "Hi".to_string()), (2, "There".to_string())]);
 
         let ordered = OrderedVocabIter::new(&vocab_r);
diff --git a/tokenizers/src/models/unigram/lattice.rs b/tokenizers/src/models/unigram/lattice.rs
index 1302019e1..2897bb376 100644
--- a/tokenizers/src/models/unigram/lattice.rs
+++ b/tokenizers/src/models/unigram/lattice.rs
@@ -1,13 +1,13 @@
+use dary_heap::QuaternaryHeap;
 use rand::distr::weighted::WeightedIndex;
 use rand::{prelude::*, rng};
 use std::cell::RefCell;
 use std::cmp::{min, Ordering};
-use std::collections::BinaryHeap;
 use std::rc::Rc;
 
 type NodeRef = Rc<RefCell<Node>>;
 type HypothesisRef = Rc<RefCell<Hypothesis>>;
-type Agenda = BinaryHeap<Hypothesis>;
+type Agenda = QuaternaryHeap<Hypothesis>;
 
 struct Hypothesis {
     node_ref: NodeRef,
@@ -240,7 +240,7 @@ impl<'a> Lattice<'a> {
             1 => vec![self.viterbi()],
             _ => {
                 // let k_reserved_hypothesis_size = 512;
-                let mut agenda: Agenda = BinaryHeap::new();
+                let mut agenda: Agenda = QuaternaryHeap::new();
                 let mut hypotheses: Vec<Vec<NodeRef>> = vec![];
                 let eos = self.eos_node();
                 let score = eos.borrow().score;
@@ -282,7 +282,7 @@ impl<'a> Lattice<'a> {
                     let k_max_agenda_size = 100_000;
                     let k_min_agenda_size = 512;
                     if agenda.len() > k_max_agenda_size {
-                        let mut new_agenda = BinaryHeap::new();
+                        let mut new_agenda = QuaternaryHeap::new();
                         let len = min(k_min_agenda_size, n * 10);
                         for _i in 0..len {
                             new_agenda.push(agenda.pop().unwrap());
diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs
index fd498c822..7b876ec9d 100644
--- a/tokenizers/src/models/unigram/model.rs
+++ b/tokenizers/src/models/unigram/model.rs
@@ -5,13 +5,14 @@ use super::{
 };
 use crate::tokenizer::{Model, Result, Token};
 use crate::utils::cache::{Cache, MAX_LENGTH};
-
 use std::collections::HashMap;
+
+use ahash::AHashMap;
 use std::convert::TryInto;
 use std::fs::read_to_string;
 use std::path::{Path, PathBuf};
 
-type TokenMap = HashMap<String, u32>;
+type TokenMap = AHashMap<String, u32>;
 type Vocab = Vec<(String, f64)>;
 
 /// A `Unigram` model to encode sentences.
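
The `unigram/model.rs` hunks above and below show the pattern the whole patch relies on: `AHashMap` is an API-compatible stand-in for `std::collections::HashMap` with a faster, non-cryptographic hasher, so internal storage switches to it while public signatures such as `get_vocab` keep returning the std type. Below is a minimal, self-contained sketch of that boundary, illustrative only and not part of the diff; it assumes the `ahash` crate and a made-up `Model` type standing in for `Unigram`/`BPE`.

```rust
use ahash::AHashMap;
use std::collections::HashMap;

// Hypothetical model type, standing in for the patch's `Unigram` / `BPE` structs.
struct Model {
    // Internal storage uses ahash's faster hasher.
    token_to_ids: AHashMap<String, u32>,
}

impl Model {
    // The public accessor still exposes std's HashMap, so downstream callers
    // are unaffected by the internal switch.
    fn get_vocab(&self) -> HashMap<String, u32> {
        self.token_to_ids.clone().into_iter().collect()
    }
}

fn main() {
    let model = Model {
        token_to_ids: AHashMap::from([("hello".to_string(), 0), ("world".to_string(), 1)]),
    };
    assert_eq!(model.get_vocab().len(), 2);
}
```

The conversion only happens at the API boundary, so the cost is one pass over the vocabulary per call instead of a slower hash on every internal lookup.
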
@@ -98,7 +99,7 @@ impl Unigram {
         byte_fallback: bool,
     ) -> Result<Self> {
         let n = vocab.len();
-        let mut token_to_ids: TokenMap = HashMap::new();
+        let mut token_to_ids: TokenMap = AHashMap::new();
         let mut builder = TrieBuilder::default();
 
         if let Some(unk_id) = unk_id {
@@ -416,7 +417,7 @@ impl Model for Unigram {
     type Trainer = UnigramTrainer;
 
     fn get_vocab(&self) -> HashMap<String, u32> {
-        self.token_to_ids.clone()
+        self.token_to_ids.clone().into_iter().collect()
     }
 
     fn get_vocab_size(&self) -> usize {
diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index d6d2830fd..920dee525 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -2,10 +2,10 @@ use crate::models::unigram::{lattice::Lattice, model::Unigram};
 use crate::tokenizer::{AddedToken, Result, Trainer};
 use crate::utils::parallelism::*;
 use crate::utils::progress::{ProgressBar, ProgressStyle};
+use ahash::{AHashMap, AHashSet};
 use log::debug;
 use serde::{Deserialize, Serialize};
 use std::cmp::Reverse;
-use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 
 // A token and a score
@@ -57,8 +57,8 @@ pub struct UnigramTrainer {
     pub shrinking_factor: f64,
     #[builder(default = "vec![]")]
     pub special_tokens: Vec<AddedToken>,
-    #[builder(default = "HashSet::new()")]
-    pub initial_alphabet: HashSet<char>,
+    #[builder(default = "AHashSet::new()")]
+    pub initial_alphabet: AHashSet<char>,
     #[builder(default = "None")]
     pub unk_token: Option<String>,
 
@@ -67,8 +67,8 @@ pub struct UnigramTrainer {
     pub max_piece_length: usize,
     #[builder(default = "1_000_000")]
     seed_size: usize,
-    #[builder(default = "HashMap::new()")]
-    words: HashMap<String, u32>,
+    #[builder(default = "AHashMap::new()")]
+    words: AHashMap<String, u32>,
 }
 
 impl Default for UnigramTrainer {
@@ -110,17 +110,17 @@ impl UnigramTrainer {
         true
     }
 
-    fn finalize(&self, model: Unigram, required_chars: HashSet<String>) -> Result<Unigram> {
+    fn finalize(&self, model: Unigram, required_chars: AHashSet<String>) -> Result<Unigram> {
         let mut min_score_penalty = 0.0;
         let min_score_penalty_delta = 0.0001;
 
         let mut pieces: Vec<(String, f64)> = vec![];
-        let mut inserted: HashSet<String> = HashSet::new();
+        let mut inserted: AHashSet<String> = AHashSet::new();
 
         // We don't want to include the <UNK> that was used to train
         inserted.insert("<UNK>".into());
 
-        let existing_pieces: HashMap<String, f64> = model.iter().cloned().collect();
+        let existing_pieces: AHashMap<String, f64> = model.iter().cloned().collect();
         for c in required_chars {
             if let Some(t) = existing_pieces.get(&c) {
                 inserted.insert(c.clone());
@@ -185,7 +185,7 @@ impl UnigramTrainer {
         )
     }
 
-    fn required_chars(&self, word_counts: &[Sentence]) -> HashSet<String> {
+    fn required_chars(&self, word_counts: &[Sentence]) -> AHashSet<String> {
         word_counts
             .iter()
             .flat_map(|(s, _count)| s.chars())
@@ -205,7 +205,7 @@ impl UnigramTrainer {
             .sum::<usize>()
             + sentences.len();
         let mut flat_string = String::with_capacity(total);
-        let mut all_chars: HashMap<char, u64> = HashMap::new();
+        let mut all_chars: AHashMap<char, u64> = AHashMap::new();
         let c_sentence_boundary = '\0';
         let k_sentence_boundary = '\0'.to_string();
         for (string, n) in sentences {
@@ -631,18 +631,18 @@ impl Trainer for UnigramTrainer {
         S: AsRef<str> + Send,
         F: Fn(&str) -> Result<Vec<String>> + Sync,
     {
-        let words: Result<HashMap<String, u32>> = iterator
+        let words: Result<AHashMap<String, u32>> = iterator
             .maybe_par_bridge()
             .map(|sequence| {
                 let words = process(sequence.as_ref())?;
-                let mut map = HashMap::new();
+                let mut map = AHashMap::new();
                 for word in words {
                     map.entry(word).and_modify(|c| *c += 1).or_insert(1);
                 }
                 Ok(map)
             })
             .reduce(
-                || Ok(HashMap::new()),
+                || Ok(AHashMap::new()),
                 |acc, ws| {
                     let mut acc = acc?;
                     for (k, v) in ws? {
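
Both `feed` implementations above share the same map/reduce shape: each parallel worker builds a local `AHashMap` of word counts, and `reduce` merges the partial maps into one. The sketch below is an illustration of that shape only, not the patch's code: it assumes the `rayon` and `ahash` crates, uses plain `par_bridge` in place of the crate's `maybe_par_bridge` helper (which can fall back to a serial iterator), and skips the `Result` plumbing.

```rust
use ahash::AHashMap;
use rayon::iter::{ParallelBridge, ParallelIterator};

fn count_words<I>(sequences: I) -> AHashMap<String, u32>
where
    I: Iterator<Item = String> + Send,
{
    sequences
        .par_bridge()
        .map(|sequence| {
            // One local map per unit of work; no locking needed.
            let mut map = AHashMap::new();
            for word in sequence.split_whitespace() {
                *map.entry(word.to_string()).or_insert(0u32) += 1;
            }
            map
        })
        // Merge the partial maps pairwise into the final counts.
        .reduce(AHashMap::new, |mut acc, partial| {
            for (word, count) in partial {
                *acc.entry(word).or_insert(0) += count;
            }
            acc
        })
}

fn main() {
    let lines = vec!["the cat sat".to_string(), "the cat".to_string()];
    let counts = count_words(lines.into_iter());
    assert_eq!(counts.get("the"), Some(&2));
    assert_eq!(counts.get("sat"), Some(&1));
}
```
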
@@ -716,7 +716,7 @@ mod tests {
     fn test_initial_alphabet() {
         let trainer = UnigramTrainerBuilder::default()
             .show_progress(false)
-            .initial_alphabet(HashSet::from_iter(vec!['a', 'b', 'c', 'd', 'e', 'f']))
+            .initial_alphabet(AHashSet::from_iter(vec!['a', 'b', 'c', 'd', 'e', 'f']))
             .build()
             .unwrap();
 
@@ -727,7 +727,7 @@ mod tests {
             vec!["こ", "ん", "に", "ち", "は", "友", "達", "a", "b", "c", "d", "e", "f"]
                 .into_iter()
                 .map(|s| s.to_owned())
-                .collect::<HashSet<String>>()
+                .collect::<AHashSet<String>>()
         );
     }
diff --git a/tokenizers/src/models/unigram/trie.rs b/tokenizers/src/models/unigram/trie.rs
index 2f94b1766..dd06f7f02 100644
--- a/tokenizers/src/models/unigram/trie.rs
+++ b/tokenizers/src/models/unigram/trie.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use ahash::AHashMap;
 use std::hash::Hash;
 
 #[derive(Default)]
@@ -78,14 +78,14 @@ impl