diff --git a/xllm_service/common/macros.h b/xllm_service/common/macros.h
index 4d66f21..068250c 100644
--- a/xllm_service/common/macros.h
+++ b/xllm_service/common/macros.h
@@ -17,30 +17,26 @@ limitations under the License.
 #pragma once
 
 namespace xllm_service {
-// a central place to define common macros for the project
-// clang-format off
-#define DEFINE_ARG(T, name) \
- public: \
-  inline auto name(const T& name) ->decltype(*this) { \
-    this->name##_ = name; \
-    return *this; \
-  } \
-  inline const T& name() const noexcept { return this->name##_; } \
-  inline T& name() noexcept { return this->name##_; } \
- \
-  T name##_
-
-#define DEFINE_PTR_ARG(T, name) \
- public: \
-  inline auto name(T* name) ->decltype(*this) { \
-    this->name##_ = name; \
-    return *this; \
-  } \
-  inline T* name() const noexcept { return this->name##_; } \
- \
-  T* name##_
-
-// clang-format on
+#define PROPERTY(T, property)                                                  \
+ public:                                                                       \
+  [[nodiscard]] const T& property() const& noexcept { return property##_; }   \
+  [[nodiscard]] T& property() & noexcept { return property##_; }              \
+  [[nodiscard]] T&& property() && noexcept { return std::move(property##_); } \
+                                                                               \
+  auto property(const T& value) & -> decltype(*this) {                         \
+    property##_ = value;                                                       \
+    return *this;                                                              \
+  }                                                                            \
+                                                                               \
+  auto property(T&& value) & -> decltype(*this) {                              \
+    property##_ = std::move(value);                                            \
+    return *this;                                                              \
+  }                                                                            \
+                                                                               \
+  void property(const T& value) && = delete;                                   \
+  void property(T&& value) && = delete;                                        \
+                                                                               \
+  T property##_
 
 #ifndef UNUSED_PARAMETER
 #define UNUSED_PARAMETER(x) ((void)(x))
diff --git a/xllm_service/tokenizer/CMakeLists.txt b/xllm_service/tokenizer/CMakeLists.txt
index 580cd2b..06ae7d9 100644
--- a/xllm_service/tokenizer/CMakeLists.txt
+++ b/xllm_service/tokenizer/CMakeLists.txt
@@ -7,24 +7,25 @@ cc_library(
   NAME
     tokenizer
   HDRS
-    hf_tokenizer.h
-    sentencepiece_tokenizer.h
-    tiktoken_tokenizer.h
-    tokenizer_args_loader.h
     tokenizer_args.h
     tokenizer.h
+    tokenizer_factory.h
+    tiktoken_tokenizer.h
+    sentencepiece_tokenizer.h
+    fast_tokenizer.h
   SRCS
-    hf_tokenizer.cpp
-    sentencepiece_tokenizer.cpp
+    tokenizer_args.cpp
+    tokenizer_factory.cpp
     tiktoken_tokenizer.cpp
-    tokenizer_args_loader.cpp
+    sentencepiece_tokenizer.cpp
+    fast_tokenizer.cpp
   DEPS
     :common
-    sentencepiece
+    :sentencepiece
     absl::flat_hash_map
     absl::strings
     glog::glog
     rust_tokenizers
     re2::re2
-    nlohmann_json::nlohmann_json
 )
+
diff --git a/xllm_service/tokenizer/fast_tokenizer.cpp b/xllm_service/tokenizer/fast_tokenizer.cpp
new file mode 100644
index 0000000..e863ef9
--- /dev/null
+++ b/xllm_service/tokenizer/fast_tokenizer.cpp
@@ -0,0 +1,67 @@
+#include "fast_tokenizer.h"
+
+#include <glog/logging.h>
+
+namespace xllm_service {
+
+FastTokenizer::FastTokenizer(const std::string& tokenizer_json_path)
+    : tokenizer_json_path_(tokenizer_json_path) {
+  handle_ = tokenizers_new_from_path(tokenizer_json_path.c_str());
+  CHECK(handle_ != nullptr)
+      << "Failed to load tokenizer from file: " << tokenizer_json_path;
+}
+
+std::unique_ptr<Tokenizer> FastTokenizer::clone() const {
+  return std::make_unique<FastTokenizer>(tokenizer_json_path_);
+}
+
+FastTokenizer::~FastTokenizer() { tokenizers_free(handle_); }
+
+bool FastTokenizer::encode(const std::string_view& text,
+                           std::vector<int32_t>* ids) const {
+  TokenizerEncodeResult result;
+  tokenizers_encode(
+      handle_, text.data(), text.size(), /*add_special_tokens=*/1, &result);
+
+  std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
+  *ids = std::move(ret);
+
+  return true;
+}
+
+std::string FastTokenizer::decode(const Slice<int32_t>& ids,
+                                  bool skip_special_tokens) const {
+  const char* data = nullptr;
+  size_t len = 0;
+  tokenizers_decode(handle_,
+                    reinterpret_cast<const uint32_t*>(ids.data()),
+                    ids.size(),
+                    skip_special_tokens,
+                    &data,
+                    &len);
+  return {data, len};
+}
+
+std::optional<int32_t> FastTokenizer::token_to_id(
+    const std::string_view& token) const {
+  int32_t id = -1;
+  tokenizers_token_to_id(handle_, token.data(), token.size(), &id);
+  return id == -1 ? std::optional<int32_t>(std::nullopt)
+                  : std::optional<int32_t>(id);
+}
+
+std::string FastTokenizer::id_to_token(int32_t id) const {
+  const char* data = nullptr;
+  size_t len = 0;
+  tokenizers_id_to_token(handle_, id, &data, &len);
+  return {data, len};
+}
+
+size_t FastTokenizer::vocab_size() const {
+  size_t size;
+  tokenizers_get_vocab_size(handle_, &size);
+  CHECK(size > 0) << "vocab_size must be greater than 0.";
+  return size;
+}
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/hf_tokenizer.h b/xllm_service/tokenizer/fast_tokenizer.h
similarity index 79%
rename from xllm_service/tokenizer/hf_tokenizer.h
rename to xllm_service/tokenizer/fast_tokenizer.h
index f18a692..0802ee7 100644
--- a/xllm_service/tokenizer/hf_tokenizer.h
+++ b/xllm_service/tokenizer/fast_tokenizer.h
@@ -21,13 +21,11 @@ limitations under the License.
 
 namespace xllm_service {
 
-// a tokenizer that uses hf/tokenizers
-// not thread-safe, can't be used in multiple threads.
-class HFTokenizer : public Tokenizer {
+class FastTokenizer : public Tokenizer {
  public:
-  HFTokenizer(const std::string& tokenizer_file_path, TokenizerHandle handle);
+  FastTokenizer(const std::string& tokenizer_json_path);
 
-  ~HFTokenizer() override;
+  ~FastTokenizer() override;
 
   bool encode(const std::string_view& text,
               std::vector<int32_t>* ids) const override;
 
@@ -44,10 +42,8 @@ class HFTokenizer : public Tokenizer {
 
   std::unique_ptr<Tokenizer> clone() const override;
 
-  static std::unique_ptr<HFTokenizer> from_file(const std::string& path);
-
  private:
-  std::string tokenizer_file_path_;
+  std::string tokenizer_json_path_;
 
   TokenizerHandle handle_ = nullptr;
 };
diff --git a/xllm_service/tokenizer/hf_tokenizer.cpp b/xllm_service/tokenizer/hf_tokenizer.cpp
deleted file mode 100644
index a98480c..0000000
--- a/xllm_service/tokenizer/hf_tokenizer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2025 The xLLM Authors. All Rights Reserved.
-Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tokenizer/hf_tokenizer.h" - -#include - -#include "tokenizers/tokenizers.h" - -namespace xllm_service { - -std::unique_ptr HFTokenizer::from_file( - const std::string& tokenizer_file_path) { - TokenizerHandle handle = tokenizer_from_file(tokenizer_file_path.c_str()); - CHECK(handle != nullptr) << "Failed to load tokenizer from file: " - << tokenizer_file_path; - return std::make_unique(tokenizer_file_path, handle); -} - -HFTokenizer::HFTokenizer(const std::string& tokenizer_file_path, - TokenizerHandle handle) - : tokenizer_file_path_(tokenizer_file_path), handle_(handle) { - CHECK(handle_ != nullptr); -} - -std::unique_ptr HFTokenizer::clone() const { - return from_file(tokenizer_file_path_); -} - -HFTokenizer::~HFTokenizer() { tokenizer_free(handle_); } - -bool HFTokenizer::encode(const std::string_view& text, - std::vector* ids) const { - tokenizer_encode( - handle_, text.data(), text.size(), /*add_special_tokens=*/true); - const uint32_t* data = nullptr; - size_t len = 0; - tokenizer_get_encode_ids(handle_, &data, &len); - ids->reserve(len); - for (size_t i = 0; i < len; ++i) { - ids->push_back(static_cast(data[i])); - } - return true; -} - -std::string HFTokenizer::decode(const Slice& ids, - bool skip_special_tokens) const { - tokenizer_decode(handle_, - reinterpret_cast(ids.data()), - ids.size(), - skip_special_tokens); - const char* data = nullptr; - size_t len = 0; - tokenizer_get_decode_str(handle_, &data, &len); - return {data, len}; -} - -std::optional HFTokenizer::token_to_id( - const std::string_view& token) const { - int32_t id = tokenizer_token_to_id(handle_, token.data(), token.size()); - if (id == -1) { - return std::nullopt; - } - return id; -} - -std::string HFTokenizer::id_to_token(int32_t id) const { - const char* data = nullptr; - size_t len = 0; - tokenizer_id_to_token(handle_, id, &data, &len); - return {data, len}; -} - -size_t HFTokenizer::vocab_size() const { - return tokenizer_get_vocab_size(handle_, /*with_added_tokens=*/true); -} - -} // namespace xllm_service diff --git a/xllm_service/tokenizer/tiktoken_tokenizer.h b/xllm_service/tokenizer/tiktoken_tokenizer.h index 3e7a0e5..3941023 100644 --- a/xllm_service/tokenizer/tiktoken_tokenizer.h +++ b/xllm_service/tokenizer/tiktoken_tokenizer.h @@ -15,6 +15,7 @@ limitations under the License. ==============================================================================*/ #pragma once + #include #include diff --git a/xllm_service/tokenizer/tokenizer.h b/xllm_service/tokenizer/tokenizer.h index e7990b3..f8c6869 100644 --- a/xllm_service/tokenizer/tokenizer.h +++ b/xllm_service/tokenizer/tokenizer.h @@ -15,30 +15,16 @@ limitations under the License. ==============================================================================*/ #pragma once - -#include - #include #include #include #include #include +#include "common/slice.h" + namespace xllm_service { -// Fundamentally, Large Language Models (LLM) are designed to generate text -// based on given prompts. To process text effectively, LLM models typically -// work with sequences of integers as inputs and produce sequences of integers -// as outputs. The conversion between text and integer sequences is handled by a -// tokenizer during preprocessing. The tokenizer serves two primary functions: -// 1. Breaking down text into tokens and then mapping those tokens to -// corresponding integers using a predefined vocabulary. -// 2. 
Reversing this process by converting a sequence of integers back into
-//    human-readable text using the same vocabulary.
-//
-// For example:
-//   ids = tokenizer.Encode("Hello, world!")  # [1, 2, 3]
-//   text = tokenizer.Decode(ids)  # "Hello, world!"
 class Tokenizer {
  public:
   virtual ~Tokenizer() = default;
diff --git a/xllm_service/tokenizer/tokenizer_args.cpp b/xllm_service/tokenizer/tokenizer_args.cpp
new file mode 100644
index 0000000..e463296
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_args.cpp
@@ -0,0 +1,75 @@
+#include "tokenizer_args.h"
+
+#include <fstream>
+
+#include "common/json_reader.h"
+
+namespace xllm_service {
+namespace {
+std::optional<std::string> load_chat_template_file(const std::string& dir) {
+  // chat_template.json
+  const std::string chat_template_path = dir + "/chat_template.json";
+  JsonReader reader;
+  if (reader.parse(chat_template_path);
+      auto v = reader.value<std::string>("chat_template")) {
+    return v;
+  }
+  // chat_template.jinja
+  const std::string raw_chat_template_path = dir + "/chat_template.jinja";
+  std::ifstream file(raw_chat_template_path);
+  if (file.is_open()) {
+    std::ostringstream content;
+    content << file.rdbuf();
+    file.close();
+    return content.str();
+  }
+  return std::nullopt;
+}
+}  // namespace
+
+bool load_tokenizer_args(const std::string& model_weights_path,
+                         TokenizerArgs& tokenizer_args) {
+  // tokenizer args from tokenizer_config.json
+  JsonReader tokenizer_reader;
+  const std::string tokenizer_args_file_path =
+      model_weights_path + "/tokenizer_config.json";
+  if (tokenizer_reader.parse(tokenizer_args_file_path)) {
+    // read chat template if exists
+    if (auto v = load_chat_template_file(model_weights_path)) {
+      tokenizer_args.chat_template() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("chat_template")) {
+      tokenizer_args.chat_template() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<bool>("add_bos_token")) {
+      tokenizer_args.add_bos_token() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<bool>("add_eos_token")) {
+      tokenizer_args.add_eos_token() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<std::string>("tokenizer_class")) {
+      tokenizer_args.tokenizer_class() = v.value();
+    }
+    // read bos_token
+    if (auto v = tokenizer_reader.value<std::string>("bos_token.content")) {
+      tokenizer_args.bos_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("bos_token")) {
+      tokenizer_args.bos_token() = v.value();
+    }
+    // read eos_token
+    if (auto v = tokenizer_reader.value<std::string>("eos_token.content")) {
+      tokenizer_args.eos_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("eos_token")) {
+      tokenizer_args.eos_token() = v.value();
+    }
+    // read pad_token
+    if (auto v = tokenizer_reader.value<std::string>("pad_token.content")) {
+      tokenizer_args.pad_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("pad_token")) {
+      tokenizer_args.pad_token() = v.value();
+    }
+  }
+
+  return true;
+}
+
+}  // namespace xllm_service
\ No newline at end of file
diff --git a/xllm_service/tokenizer/tokenizer_args.h b/xllm_service/tokenizer/tokenizer_args.h
index a690791..c1951e1 100644
--- a/xllm_service/tokenizer/tokenizer_args.h
+++ b/xllm_service/tokenizer/tokenizer_args.h
@@ -31,40 +31,40 @@ using SpecialToken = std::pair<std::string, int32_t>;
 struct TokenizerArgs {
   // Type of tokenizer to use. valid values are "sentencepiece" and "tiktoken".
-  DEFINE_ARG(std::string, tokenizer_type) = "sentencepiece";
+  PROPERTY(std::string, tokenizer_type) = "sentencepiece";
 
   // Vocab file name.
-  DEFINE_ARG(std::string, vocab_file) = "tokenizer.model";
+  PROPERTY(std::string, vocab_file) = "tokenizer.model";
 
   // Special tokens to add to the vocabulary.
-  DEFINE_ARG(std::vector<SpecialToken>, special_tokens);
+  PROPERTY(std::vector<SpecialToken>, special_tokens);
 
   // Regex pattern used by tiktok tokenizer only.
-  DEFINE_ARG(std::string, pattern);
+  PROPERTY(std::string, pattern);
 
   // tokens to add to the beginning of the input sequence.
-  DEFINE_ARG(std::vector<std::string>, prefix_tokens);
+  PROPERTY(std::vector<std::string>, prefix_tokens);
 
   // chat template
-  DEFINE_ARG(std::string, chat_template);
+  PROPERTY(std::string, chat_template);
 
   // add_bos_token
-  DEFINE_ARG(bool, add_bos_token) = false;
+  PROPERTY(bool, add_bos_token) = false;
 
   // add_eos_token
-  DEFINE_ARG(bool, add_eos_token) = false;
+  PROPERTY(bool, add_eos_token) = false;
 
   // bos_token
-  DEFINE_ARG(std::string, bos_token);
+  PROPERTY(std::string, bos_token);
 
   // eos_token
-  DEFINE_ARG(std::string, eos_token);
+  PROPERTY(std::string, eos_token);
 
   // pad_token
-  DEFINE_ARG(std::string, pad_token);
+  PROPERTY(std::string, pad_token);
 
   // tokenizer_class
-  DEFINE_ARG(std::string, tokenizer_class);
+  PROPERTY(std::string, tokenizer_class);
 };
 
 inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) {
@@ -93,4 +93,7 @@ inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) {
   return os;
 }
 
+bool load_tokenizer_args(const std::string& model_weights_path,
+                         TokenizerArgs& tokenizer_args);
+
 }  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_args_loader.cpp b/xllm_service/tokenizer/tokenizer_args_loader.cpp
deleted file mode 100644
index 5fb387d..0000000
--- a/xllm_service/tokenizer/tokenizer_args_loader.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright 2025 The xLLM Authors. All Rights Reserved.
-Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tokenizer/tokenizer_args_loader.h" - -#include - -namespace xllm_service { -#define SET_ARG(arg_name, value) [&] { args->arg_name() = value; }() - -static std::string CHATGLM = "chatglm"; -static std::string CHATGLM4 = "chatglm4"; -static std::string YI = "Yi"; -static std::string QWEN = "qwen"; - -void TokenizerArgsLoader::load(const std::string& model_type, - const std::string& tokenizer_args_file_path, - TokenizerArgs* tokenizer_args) { - JsonReader tokenizer_reader; - if (tokenizer_reader.parse(tokenizer_args_file_path)) { - // read chat template if exists - if (auto v = tokenizer_reader.value("chat_template")) { - tokenizer_args->chat_template() = v.value(); - } - if (auto v = tokenizer_reader.value("add_bos_token")) { - tokenizer_args->add_bos_token() = v.value(); - } - if (auto v = tokenizer_reader.value("add_eos_token")) { - tokenizer_args->add_eos_token() = v.value(); - } - if (auto v = tokenizer_reader.value("tokenizer_class")) { - tokenizer_args->tokenizer_class() = v.value(); - } - // read bos_token - if (auto v = tokenizer_reader.value("bos_token.content")) { - tokenizer_args->bos_token() = v.value(); - } else if (auto v = tokenizer_reader.value("bos_token")) { - tokenizer_args->bos_token() = v.value(); - } - // read eos_token - if (auto v = tokenizer_reader.value("eos_token.content")) { - tokenizer_args->eos_token() = v.value(); - } else if (auto v = tokenizer_reader.value("eos_token")) { - tokenizer_args->eos_token() = v.value(); - } - // read pad_token - if (auto v = tokenizer_reader.value("pad_token.content")) { - tokenizer_args->pad_token() = v.value(); - } else if (auto v = tokenizer_reader.value("pad_token")) { - tokenizer_args->pad_token() = v.value(); - } - } - - if (model_type == CHATGLM) { - load_chatglm_args(tokenizer_args); - } else if (model_type == CHATGLM) { - load_chatglm_args(tokenizer_args); - } else if (model_type == CHATGLM4) { - load_chatglm4_args(tokenizer_args); - } else if (model_type == YI) { - load_Yi_args(tokenizer_args); - } else if (model_type == QWEN) { - load_qwen_args(tokenizer_args); - } else { - LOG(ERROR) << "unrecognized model type: " << model_type; - } -} - -void TokenizerArgsLoader::load_chatglm_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "sentencepiece"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenizer_config.json - const std::vector special_tokens({{"[MASK]", 64789}, - {"[gMASK]", 64790}, - {"[sMASK]", 64791}, - {"sop", 64792}, - {"eop", 64793}, - {"<|system|>", 64794}, - {"<|user|>", 64795}, - {"<|assistant|>", 64796}, - {"<|observation|>", 64797}}); - SET_ARG(special_tokens, special_tokens); - SET_ARG(prefix_tokens, std::vector({"[gMASK]", "sop"})); -} - -void TokenizerArgsLoader::load_chatglm4_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "tiktoken"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/THUDM/glm-4-9b/blob/main/tokenizer_config.json - const std::vector special_tokens( - {{"<|endoftext|>", 151329}, - {"[MASK]", 151330}, - {"[gMASK]", 151331}, - {"[sMASK]", 151332}, - {"", 151333}, - {"", 151334}, - {"<|system|>", 151335}, - {"<|user|>", 151336}, - {"<|assistant|>", 151337}, - {"<|observation|>", 151338}, - {"<|begin_of_image|>", 151339}, - {"<|end_of_image|>", 151340}, - {"<|begin_of_video|>", 151341}, - {"<|end_of_video|>", 151342}}); - 
SET_ARG(special_tokens, special_tokens); - - SET_ARG(prefix_tokens, std::vector({"[gMASK]", ""})); - - // set regex pattern for tiktoken tokenizer. - // ref to: - // https://huggingface.co/THUDM/glm-4-9b/blob/main/tokenization_chatglm.py#L27 - // N.B. replaced '\s+(?!\S)' with '\s+[^\s]' to avoid regex error - const std::string pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+[^\S]|\s+)"; - SET_ARG(pattern, pattern); -} - -void TokenizerArgsLoader::load_Yi_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "sentencepiece"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/01-ai/Yi-34B-Chat-4bits/blob/main/tokenizer_config.json - const std::vector special_tokens({{"", 0}, - {"<|startoftext|>", 1}, - {"<|endoftext|>", 2}, - {"<|im_start|>", 6}, - {"<|im_end|>", 7}, - {"<|im_sep|>", 8}}); - SET_ARG(special_tokens, special_tokens); -} - -void TokenizerArgsLoader::load_qwen_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "tiktoken"); - // adapted from - // https://huggingface.co/Qwen/Qwen-14B-Chat-Int4/blob/main/tokenization_qwen.py - SET_ARG(vocab_file, "qwen.tiktoken"); - - // set special tokens - std::vector special_tokens; - int32_t next_id = 151643; - special_tokens.emplace_back("<|endoftext|>", next_id++); - special_tokens.emplace_back("<|im_start|>", next_id++); - special_tokens.emplace_back("<|im_end|>", next_id++); - for (int32_t i = 0; i < 205; ++i) { - special_tokens.emplace_back("<|extra_" + std::to_string(i) + "|>", - next_id++); - } - SET_ARG(special_tokens, special_tokens); - - // set regex pattern for tiktoken tokenizer. - const std::string pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+[^\S]|\s+)"; - SET_ARG(pattern, pattern); -} - -} // namespace xllm_service diff --git a/xllm_service/tokenizer/tokenizer_args_loader.h b/xllm_service/tokenizer/tokenizer_args_loader.h deleted file mode 100644 index 07a99aa..0000000 --- a/xllm_service/tokenizer/tokenizer_args_loader.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. -Copyright 2024 The ScaleLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm-service/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-
-#pragma once
-#include "common/json_reader.h"
-#include "tokenizer_args.h"
-
-namespace xllm_service {
-
-class TokenizerArgsLoader {
- public:
-  static void load(const std::string& model_type,
-                   const std::string& tokenizer_args_file_path,
-                   TokenizerArgs* tokenizer_args);
-
- private:
-  static void load_chatglm_args(TokenizerArgs* args);
-
-  static void load_chatglm4_args(TokenizerArgs* args);
-
-  static void load_Yi_args(TokenizerArgs* args);
-
-  static void load_qwen_args(TokenizerArgs* args);
-};
-
-}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_factory.cpp b/xllm_service/tokenizer/tokenizer_factory.cpp
new file mode 100644
index 0000000..204d868
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_factory.cpp
@@ -0,0 +1,30 @@
+#include "tokenizer_factory.h"
+
+#include <filesystem>
+
+namespace xllm_service {
+
+std::unique_ptr<Tokenizer> TokenizerFactory::create_tokenizer(
+    const std::string& model_weights_path,
+    TokenizerArgs tokenizer_args) {
+  const std::string tokenizer_json_path =
+      model_weights_path + "/tokenizer.json";
+  if (std::filesystem::exists(tokenizer_json_path)) {
+    // 1. fast tokenizer
+    LOG(INFO) << "Create fast tokenizer.";
+    return std::make_unique<FastTokenizer>(tokenizer_json_path);
+  } else if (tokenizer_args.tokenizer_type() == "tiktoken" ||
+             tokenizer_args.tokenizer_class() == "TikTokenTokenizer") {
+    // 2. create tiktoken tokenizer
+    LOG(INFO) << "Create Tiktoken tokenizer.";
+    return std::make_unique<TiktokenTokenizer>(model_weights_path,
+                                               tokenizer_args);
+  } else {
+    // 3. create sentencepiece tokenizer
+    LOG(INFO) << "Create SentencePiece tokenizer.";
+    return std::make_unique<SentencePieceTokenizer>(model_weights_path,
+                                                    tokenizer_args);
+  }
+}
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_factory.h b/xllm_service/tokenizer/tokenizer_factory.h
new file mode 100644
index 0000000..2120071
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_factory.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "fast_tokenizer.h"
+#include "sentencepiece_tokenizer.h"
+#include "tiktoken_tokenizer.h"
+#include "tokenizer_args.h"
+
+namespace xllm_service {
+
+class TokenizerFactory {
+ public:
+  static std::unique_ptr<Tokenizer> create_tokenizer(
+      const std::string& model_weights_path,
+      TokenizerArgs tokenizer_args);
+};
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizers/Cargo.toml b/xllm_service/tokenizer/tokenizers/Cargo.toml
index 108edff..4de532c 100644
--- a/xllm_service/tokenizer/tokenizers/Cargo.toml
+++ b/xllm_service/tokenizer/tokenizers/Cargo.toml
@@ -8,4 +8,4 @@ name = "rust_tokenizers"
 crate-type = ["cdylib"]
 
 [dependencies]
-tokenizers = { version = "0.20.0", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] }
diff --git a/xllm_service/tokenizer/tokenizers/src/lib.rs b/xllm_service/tokenizer/tokenizers/src/lib.rs
index 2391de3..5c4d6be 100644
--- a/xllm_service/tokenizer/tokenizers/src/lib.rs
+++ b/xllm_service/tokenizer/tokenizers/src/lib.rs
@@ -1,134 +1,174 @@
-// Import the needed libraries
+// copied from https://github.com/mlc-ai/tokenizers-cpp/blob/v0.1.1/rust/src/lib.rs
+
+// A simple C wrapper of tokenzier library
+use std::{collections::HashMap, str::FromStr};
+use std::fs;
 use std::ffi::{c_char, CStr};
+use std::io;
 use tokenizers::tokenizer::Tokenizer;
 
-// ported from https://github.com/mlc-ai/tokenizers-cpp
-
 pub struct TokenizerWrapper {
-    // The tokenizer
     tokenizer: Tokenizer,
-    // Holds
the encoded ids to avoid dropping them - encode_ids: Vec, - // Holds the decoded string to avoid dropping it decode_str: String, - // Holds the result of the token_to_id function id_to_token_result: String, } +pub type Vocab = HashMap; +pub type Merges = Vec<(String, String)>; + +#[repr(C)] +pub struct TokenizerEncodeResult { + token_ids: *mut u32, + len: usize, +} + +fn read_file_as_u8(path: &str) -> Result, io::Error> { + fs::read(path) +} + impl TokenizerWrapper { - pub fn encode(&mut self, text: &str, add_special_tokens: bool) { - // Encode the text and store the ids - self.encode_ids = Vec::from( - self.tokenizer - .encode(text, add_special_tokens) - .unwrap() - .get_ids(), - ); + pub fn from_str(json: &str) -> TokenizerWrapper { + TokenizerWrapper { + tokenizer: Tokenizer::from_str(json).unwrap().into(), + decode_str: String::new(), + id_to_token_result: String::new(), + } } - pub fn decode(&mut self, ids: Vec, skip_special_tokens: bool) { - // Decode the ids and store the string - self.decode_str = self.tokenizer.decode(&ids, skip_special_tokens).unwrap(); + pub fn encode(&mut self, text: &str, add_special_tokens: bool) -> Vec { + let encoded = self.tokenizer.encode(text, add_special_tokens).unwrap(); + return encoded.get_ids().to_vec(); } - pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize { - self.tokenizer.get_vocab_size(with_added_tokens) + pub fn encode_batch(&mut self, texts: Vec<&str>, add_special_tokens: bool) -> Vec> { + let results = self.tokenizer.encode_batch(texts, add_special_tokens).unwrap() + .into_iter() + .map(|encoded| encoded.get_ids().to_vec()) + .collect::>>(); + return results; + } + + pub fn decode(&mut self, ids: &[u32], skip_special_tokens: bool) { + self.decode_str = self.tokenizer.decode(ids, skip_special_tokens).unwrap(); } } #[no_mangle] -extern "C" fn tokenizer_from_file(path: *const c_char) -> *mut TokenizerWrapper { +extern "C" fn tokenizers_new_from_str(input_cstr: *const u8, len: usize) -> *mut TokenizerWrapper { + unsafe { + let json = &String::from_utf8_lossy(std::slice::from_raw_parts(input_cstr, len)); + return Box::into_raw(Box::new(TokenizerWrapper::from_str(json))); + } +} + +#[no_mangle] +extern "C" fn tokenizers_new_from_path(path: *const c_char) -> *mut TokenizerWrapper { let c_str = unsafe { CStr::from_ptr(path) }; let path_str = match c_str.to_str() { Ok(s) => s, - Err(_) => panic!("Failed to convert C string to Rust string"), + Err(_) => panic!("Failed to convert C path string to Rust string"), }; - let boxed = Box::new(TokenizerWrapper { - tokenizer: Tokenizer::from_file(path_str).unwrap().into(), - encode_ids: Vec::new(), - decode_str: String::new(), - id_to_token_result: String::new(), - }); - - Box::into_raw(boxed) + match read_file_as_u8(path_str) { + Ok(bytes) => { + return tokenizers_new_from_str(bytes.as_ptr(), bytes.len()); + } + Err(_) => { + panic!("Failed to read tokenizer file."); + } + } } #[no_mangle] -extern "C" fn tokenizer_encode( +extern "C" fn tokenizers_encode( handle: *mut TokenizerWrapper, input_cstr: *const u8, len: usize, - add_special_tokens: bool, + add_special_tokens: i32, + out_result: *mut TokenizerEncodeResult, ) { unsafe { let input_data = std::str::from_utf8(std::slice::from_raw_parts(input_cstr, len)).unwrap(); - (*handle).encode(input_data, add_special_tokens); + let encoded = (*handle).encode(input_data, add_special_tokens != 0); + let len = encoded.len(); + *out_result = TokenizerEncodeResult { + token_ids: Box::into_raw(encoded.into_boxed_slice()) as *mut u32, + len: len, + }; } } 
#[no_mangle] -extern "C" fn tokenizer_get_encode_ids( +extern "C" fn tokenizers_encode_batch( handle: *mut TokenizerWrapper, - out_data: *mut *mut u32, - out_len: *mut usize, + input_cstr: *const *const u8, + input_len: *const usize, + num_seqs: usize, + add_special_tokens: i32, + out_result: *mut TokenizerEncodeResult, ) { unsafe { - *out_data = (*handle).encode_ids.as_mut_ptr(); - *out_len = (*handle).encode_ids.len() + let input_data = (0..num_seqs) + .map(|i| { + std::str::from_utf8(std::slice::from_raw_parts(*input_cstr.offset(i as isize), *input_len.offset(i as isize))).unwrap() + }) + .collect::>(); + let encoded_batch = (*handle).encode_batch(input_data, add_special_tokens != 0); + for (i, encoded) in encoded_batch.into_iter().enumerate() { + let len = encoded.len(); + let result = TokenizerEncodeResult { + token_ids: Box::into_raw(encoded.into_boxed_slice()) as *mut u32, + len: len, + }; + *out_result.offset(i as isize) = result; + } } } #[no_mangle] -extern "C" fn tokenizer_decode( - handle: *mut TokenizerWrapper, - input_ids: *const u32, - len: usize, - skip_special_tokens: bool, -) { +extern "C" fn tokenizers_free_encode_results(results: *mut TokenizerEncodeResult, num_seqs: usize) { unsafe { - let input_data = Vec::from(std::slice::from_raw_parts(input_ids, len)); - (*handle).decode(input_data, skip_special_tokens); + let slice = std::slice::from_raw_parts_mut(results, num_seqs); + for result in &mut *slice { + drop(Box::from_raw(std::slice::from_raw_parts_mut(result.token_ids, result.len))); + } } } #[no_mangle] -extern "C" fn tokenizer_get_decode_str( +extern "C" fn tokenizers_decode( handle: *mut TokenizerWrapper, + input_ids: *const u32, + len: usize, + skip_special_tokens: i32, out_cstr: *mut *mut u8, out_len: *mut usize, ) { unsafe { + let input_data = std::slice::from_raw_parts(input_ids, len); + (*handle).decode(input_data, skip_special_tokens != 0); + *out_cstr = (*handle).decode_str.as_mut_ptr(); - *out_len = (*handle).decode_str.len(); + *out_len = (&(*handle).decode_str).len(); } } #[no_mangle] -extern "C" fn tokenizer_free(wrapper: *mut TokenizerWrapper) { +extern "C" fn tokenizers_free(wrapper: *mut TokenizerWrapper) { unsafe { drop(Box::from_raw(wrapper)); } } #[no_mangle] -extern "C" fn tokenizer_token_to_id( - handle: *mut TokenizerWrapper, - token: *const u8, - len: usize -) { +extern "C" fn tokenizers_get_vocab_size(handle: *mut TokenizerWrapper, size: *mut usize) { unsafe { - let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap(); - let id = (*handle).tokenizer.token_to_id(token); - match id { - Some(id) => id as i32, - None => -1, - }; + *size = (*handle).tokenizer.get_vocab_size(true); } } #[no_mangle] -extern "C" fn tokenizer_id_to_token( +extern "C" fn tokenizers_id_to_token( handle: *mut TokenizerWrapper, id: u32, out_cstr: *mut *mut u8, @@ -142,15 +182,23 @@ extern "C" fn tokenizer_id_to_token( }; *out_cstr = (*handle).id_to_token_result.as_mut_ptr(); - *out_len = (*handle).id_to_token_result.len(); + *out_len = (&(*handle).id_to_token_result).len(); } } #[no_mangle] -extern "C" fn tokenizer_get_vocab_size( - handle: *mut TokenizerWrapper, - with_added_tokens: bool) -> usize { +extern "C" fn tokenizers_token_to_id( + handle: *mut TokenizerWrapper, + token: *const u8, + len: usize, + out_id: *mut i32, +) { unsafe { - (*handle).get_vocab_size(with_added_tokens) + let token: &str = &String::from_utf8_lossy(std::slice::from_raw_parts(token, len)); + let id = (*handle).tokenizer.token_to_id(token); + *out_id = match id { + 
Some(id) => id as i32, + None => -1, + }; } } diff --git a/xllm_service/tokenizer/tokenizers/tokenizers.h b/xllm_service/tokenizer/tokenizers/tokenizers.h index daefbef..b0eba46 100644 --- a/xllm_service/tokenizer/tokenizers/tokenizers.h +++ b/xllm_service/tokenizer/tokenizers/tokenizers.h @@ -26,42 +26,42 @@ extern "C" { #include #include -using TokenizerHandle = void*; - -TokenizerHandle tokenizer_from_file(const char* path); -// TokenizerHandle tokenizer_from_pretrained(const char* identifier); - -void tokenizer_encode(TokenizerHandle handle, - const char* data, - size_t len, - bool add_special_tokens); - -void tokenizer_decode(TokenizerHandle handle, - const uint32_t* data, - size_t len, - bool skip_special_tokens); - -void tokenizer_get_decode_str(TokenizerHandle handle, - const char** data, - size_t* len); - -void tokenizer_get_encode_ids(TokenizerHandle handle, - const uint32_t** id_data, - size_t* len); - -void tokenizer_id_to_token(TokenizerHandle handle, - uint32_t id, - const char** data, - size_t* len); - -// -1 if token is not in vocab -int32_t tokenizer_token_to_id(TokenizerHandle handle, - const char* token, - size_t len); - -void tokenizer_free(TokenizerHandle handle); - -size_t tokenizer_get_vocab_size(TokenizerHandle handle, bool with_added_tokens); +typedef void* TokenizerHandle; + +typedef struct { + int* token_ids; + size_t len; +} TokenizerEncodeResult; + +TokenizerHandle tokenizers_new_from_path(const char* path); + +void tokenizers_encode(TokenizerHandle handle, + const char* data, + size_t len, + int add_special_token, + TokenizerEncodeResult* result); + +void tokenizers_decode(TokenizerHandle handle, + const uint32_t* data, + size_t len, + int skip_special_tokens, + const char** decode_data, + size_t* decode_len); + +void tokenizers_id_to_token(TokenizerHandle handle, + uint32_t id, + const char** data, + size_t* len); + +// tokenizers_token_to_id stores -1 to *id if the token is not in the vocab +void tokenizers_token_to_id(TokenizerHandle handle, + const char* token, + size_t len, + int32_t* id); + +void tokenizers_free(TokenizerHandle handle); + +void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size); #ifdef __cplusplus }
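
Usage sketch (not part of the patch): the snippet below shows how the pieces introduced above fit together, with load_tokenizer_args() filling TokenizerArgs from tokenizer_config.json and TokenizerFactory::create_tokenizer() returning a FastTokenizer when tokenizer.json is present (otherwise the tiktoken or sentencepiece tokenizer). The model directory, the include path prefixes, and the implicit std::vector-to-Slice conversion in decode() are assumptions, not taken from this diff.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "tokenizer/tokenizer_args.h"     // include prefix assumed
#include "tokenizer/tokenizer_factory.h"  // include prefix assumed

int main() {
  const std::string model_dir = "/path/to/model";  // hypothetical model directory

  // Read chat_template / bos / eos / pad settings from <model_dir>/tokenizer_config.json.
  xllm_service::TokenizerArgs args;
  xllm_service::load_tokenizer_args(model_dir, args);

  // FastTokenizer if <model_dir>/tokenizer.json exists, otherwise tiktoken/sentencepiece.
  auto tokenizer =
      xllm_service::TokenizerFactory::create_tokenizer(model_dir, args);

  std::vector<int32_t> ids;
  tokenizer->encode("Hello, world!", &ids);
  std::cout << tokenizer->decode(ids, /*skip_special_tokens=*/true) << "\n";
  return 0;
}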