diff --git a/xllm_service/common/macros.h b/xllm_service/common/macros.h
index 4d66f21..068250c 100644
--- a/xllm_service/common/macros.h
+++ b/xllm_service/common/macros.h
@@ -17,30 +17,26 @@ limitations under the License.
 #pragma once
 
 namespace xllm_service {
-// a central place to define common macros for the project
-// clang-format off
-#define DEFINE_ARG(T, name) \
- public: \
-  inline auto name(const T& name) ->decltype(*this) { \
-    this->name##_ = name; \
-    return *this; \
-  } \
-  inline const T& name() const noexcept { return this->name##_; } \
-  inline T& name() noexcept { return this->name##_; } \
- \
-  T name##_
-
-#define DEFINE_PTR_ARG(T, name) \
- public: \
-  inline auto name(T* name) ->decltype(*this) { \
-    this->name##_ = name; \
-    return *this; \
-  } \
-  inline T* name() const noexcept { return this->name##_; } \
- \
-  T* name##_
-
-// clang-format on
+#define PROPERTY(T, property)                                                  \
+ public:                                                                       \
+  [[nodiscard]] const T& property() const& noexcept { return property##_; }   \
+  [[nodiscard]] T& property() & noexcept { return property##_; }              \
+  [[nodiscard]] T&& property() && noexcept { return std::move(property##_); } \
+                                                                               \
+  auto property(const T& value) & -> decltype(*this) {                         \
+    property##_ = value;                                                       \
+    return *this;                                                              \
+  }                                                                            \
+                                                                               \
+  auto property(T&& value) & -> decltype(*this) {                              \
+    property##_ = std::move(value);                                            \
+    return *this;                                                              \
+  }                                                                            \
+                                                                               \
+  void property(const T& value) && = delete;                                   \
+  void property(T&& value) && = delete;                                        \
+                                                                               \
+  T property##_
 
 #ifndef UNUSED_PARAMETER
 #define UNUSED_PARAMETER(x) ((void)(x))
diff --git a/xllm_service/tokenizer/CMakeLists.txt b/xllm_service/tokenizer/CMakeLists.txt
index 580cd2b..06ae7d9 100644
--- a/xllm_service/tokenizer/CMakeLists.txt
+++ b/xllm_service/tokenizer/CMakeLists.txt
@@ -7,24 +7,25 @@ cc_library(
   NAME
     tokenizer
   HDRS
-    hf_tokenizer.h
-    sentencepiece_tokenizer.h
-    tiktoken_tokenizer.h
-    tokenizer_args_loader.h
     tokenizer_args.h
     tokenizer.h
+    tokenizer_factory.h
+    tiktoken_tokenizer.h
+    sentencepiece_tokenizer.h
+    fast_tokenizer.h
   SRCS
-    hf_tokenizer.cpp
-    sentencepiece_tokenizer.cpp
+    tokenizer_args.cpp
+    tokenizer_factory.cpp
     tiktoken_tokenizer.cpp
-    tokenizer_args_loader.cpp
+    sentencepiece_tokenizer.cpp
+    fast_tokenizer.cpp
   DEPS
     :common
-    sentencepiece
+    :sentencepiece
     absl::flat_hash_map
     absl::strings
     glog::glog
     rust_tokenizers
     re2::re2
-    nlohmann_json::nlohmann_json
 )
+
diff --git a/xllm_service/tokenizer/fast_tokenizer.cpp b/xllm_service/tokenizer/fast_tokenizer.cpp
new file mode 100644
index 0000000..e863ef9
--- /dev/null
+++ b/xllm_service/tokenizer/fast_tokenizer.cpp
@@ -0,0 +1,67 @@
+#include "fast_tokenizer.h"
+
+#include <glog/logging.h>
+
+namespace xllm_service {
+
+FastTokenizer::FastTokenizer(const std::string& tokenizer_json_path)
+    : tokenizer_json_path_(tokenizer_json_path) {
+  handle_ = tokenizers_new_from_path(tokenizer_json_path.c_str());
+  CHECK(handle_ != nullptr)
+      << "Failed to load tokenizer from file: " << tokenizer_json_path;
+}
+
+std::unique_ptr<Tokenizer> FastTokenizer::clone() const {
+  return std::make_unique<FastTokenizer>(tokenizer_json_path_);
+}
+
+FastTokenizer::~FastTokenizer() { tokenizers_free(handle_); }
+
+bool FastTokenizer::encode(const std::string_view& text,
+                           std::vector<int32_t>* ids) const {
+  TokenizerEncodeResult result;
+  tokenizers_encode(
+      handle_, text.data(), text.size(), /*add_special_tokens=*/1, &result);
+
+  std::vector<int32_t> ret(result.token_ids, result.token_ids + result.len);
+  *ids = std::move(ret);
+
+  return true;
+}
+
+std::string FastTokenizer::decode(const Slice<int32_t>& ids,
+                                  bool skip_special_tokens) const {
+  const char* data = nullptr;
+  size_t len = 0;
+  tokenizers_decode(handle_,
+                    reinterpret_cast<const uint32_t*>(ids.data()),
+                    ids.size(),
+                    skip_special_tokens,
+                    &data,
+                    &len);
+  return {data, len};
+}
+
+std::optional<int32_t> FastTokenizer::token_to_id(
+    const std::string_view& token) const {
+  int32_t id = -1;
+  tokenizers_token_to_id(handle_, token.data(), token.size(), &id);
+  return id == -1 ? std::optional<int32_t>(std::nullopt)
+                  : std::optional<int32_t>(id);
+}
+
+std::string FastTokenizer::id_to_token(int32_t id) const {
+  const char* data = nullptr;
+  size_t len = 0;
+  tokenizers_id_to_token(handle_, id, &data, &len);
+  return {data, len};
+}
+
+size_t FastTokenizer::vocab_size() const {
+  size_t size;
+  tokenizers_get_vocab_size(handle_, &size);
+  CHECK(size > 0) << "vocab_size must be greater than 0.";
+  return size;
+}
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/hf_tokenizer.h b/xllm_service/tokenizer/fast_tokenizer.h
similarity index 79%
rename from xllm_service/tokenizer/hf_tokenizer.h
rename to xllm_service/tokenizer/fast_tokenizer.h
index f18a692..0802ee7 100644
--- a/xllm_service/tokenizer/hf_tokenizer.h
+++ b/xllm_service/tokenizer/fast_tokenizer.h
@@ -21,13 +21,11 @@ limitations under the License.
 
 namespace xllm_service {
 
-// a tokenizer that uses hf/tokenizers
-// not thread-safe, can't be used in multiple threads.
-class HFTokenizer : public Tokenizer {
+class FastTokenizer : public Tokenizer {
  public:
-  HFTokenizer(const std::string& tokenizer_file_path, TokenizerHandle handle);
+  FastTokenizer(const std::string& tokenizer_json_path);
 
-  ~HFTokenizer() override;
+  ~FastTokenizer() override;
 
   bool encode(const std::string_view& text,
               std::vector<int32_t>* ids) const override;
 
@@ -44,10 +42,8 @@ class HFTokenizer : public Tokenizer {
 
   std::unique_ptr<Tokenizer> clone() const override;
 
-  static std::unique_ptr<HFTokenizer> from_file(const std::string& path);
-
  private:
-  std::string tokenizer_file_path_;
+  std::string tokenizer_json_path_;
 
   TokenizerHandle handle_ = nullptr;
 };
diff --git a/xllm_service/tokenizer/hf_tokenizer.cpp b/xllm_service/tokenizer/hf_tokenizer.cpp
deleted file mode 100644
index a98480c..0000000
--- a/xllm_service/tokenizer/hf_tokenizer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2025 The xLLM Authors. All Rights Reserved.
-Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tokenizer/hf_tokenizer.h" - -#include - -#include "tokenizers/tokenizers.h" - -namespace xllm_service { - -std::unique_ptr HFTokenizer::from_file( - const std::string& tokenizer_file_path) { - TokenizerHandle handle = tokenizer_from_file(tokenizer_file_path.c_str()); - CHECK(handle != nullptr) << "Failed to load tokenizer from file: " - << tokenizer_file_path; - return std::make_unique(tokenizer_file_path, handle); -} - -HFTokenizer::HFTokenizer(const std::string& tokenizer_file_path, - TokenizerHandle handle) - : tokenizer_file_path_(tokenizer_file_path), handle_(handle) { - CHECK(handle_ != nullptr); -} - -std::unique_ptr HFTokenizer::clone() const { - return from_file(tokenizer_file_path_); -} - -HFTokenizer::~HFTokenizer() { tokenizer_free(handle_); } - -bool HFTokenizer::encode(const std::string_view& text, - std::vector* ids) const { - tokenizer_encode( - handle_, text.data(), text.size(), /*add_special_tokens=*/true); - const uint32_t* data = nullptr; - size_t len = 0; - tokenizer_get_encode_ids(handle_, &data, &len); - ids->reserve(len); - for (size_t i = 0; i < len; ++i) { - ids->push_back(static_cast(data[i])); - } - return true; -} - -std::string HFTokenizer::decode(const Slice& ids, - bool skip_special_tokens) const { - tokenizer_decode(handle_, - reinterpret_cast(ids.data()), - ids.size(), - skip_special_tokens); - const char* data = nullptr; - size_t len = 0; - tokenizer_get_decode_str(handle_, &data, &len); - return {data, len}; -} - -std::optional HFTokenizer::token_to_id( - const std::string_view& token) const { - int32_t id = tokenizer_token_to_id(handle_, token.data(), token.size()); - if (id == -1) { - return std::nullopt; - } - return id; -} - -std::string HFTokenizer::id_to_token(int32_t id) const { - const char* data = nullptr; - size_t len = 0; - tokenizer_id_to_token(handle_, id, &data, &len); - return {data, len}; -} - -size_t HFTokenizer::vocab_size() const { - return tokenizer_get_vocab_size(handle_, /*with_added_tokens=*/true); -} - -} // namespace xllm_service diff --git a/xllm_service/tokenizer/tiktoken_tokenizer.h b/xllm_service/tokenizer/tiktoken_tokenizer.h index 3e7a0e5..3941023 100644 --- a/xllm_service/tokenizer/tiktoken_tokenizer.h +++ b/xllm_service/tokenizer/tiktoken_tokenizer.h @@ -15,6 +15,7 @@ limitations under the License. ==============================================================================*/ #pragma once + #include #include diff --git a/xllm_service/tokenizer/tokenizer.h b/xllm_service/tokenizer/tokenizer.h index e7990b3..f8c6869 100644 --- a/xllm_service/tokenizer/tokenizer.h +++ b/xllm_service/tokenizer/tokenizer.h @@ -15,30 +15,16 @@ limitations under the License. ==============================================================================*/ #pragma once - -#include - #include #include #include #include #include +#include "common/slice.h" + namespace xllm_service { -// Fundamentally, Large Language Models (LLM) are designed to generate text -// based on given prompts. To process text effectively, LLM models typically -// work with sequences of integers as inputs and produce sequences of integers -// as outputs. The conversion between text and integer sequences is handled by a -// tokenizer during preprocessing. The tokenizer serves two primary functions: -// 1. Breaking down text into tokens and then mapping those tokens to -// corresponding integers using a predefined vocabulary. -// 2. 
Reversing this process by converting a sequence of integers back into
-//    human-readable text using the same vocabulary.
-//
-// For example:
-//   ids = tokenizer.Encode("Hello, world!")  # [1, 2, 3]
-//   text = tokenizer.Decode(ids)  # "Hello, world!"
 class Tokenizer {
  public:
   virtual ~Tokenizer() = default;
diff --git a/xllm_service/tokenizer/tokenizer_args.cpp b/xllm_service/tokenizer/tokenizer_args.cpp
new file mode 100644
index 0000000..e463296
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_args.cpp
@@ -0,0 +1,75 @@
+#include "tokenizer_args.h"
+
+#include <fstream>
+
+#include "common/json_reader.h"
+
+namespace xllm_service {
+namespace {
+std::optional<std::string> load_chat_template_file(const std::string& dir) {
+  // chat_template.json
+  const std::string chat_template_path = dir + "/chat_template.json";
+  JsonReader reader;
+  if (reader.parse(chat_template_path);
+      auto v = reader.value<std::string>("chat_template")) {
+    return v;
+  }
+  // chat_template.jinja
+  const std::string raw_chat_template_path = dir + "/chat_template.jinja";
+  std::ifstream file(raw_chat_template_path);
+  if (file.is_open()) {
+    std::ostringstream content;
+    content << file.rdbuf();
+    file.close();
+    return content.str();
+  }
+  return std::nullopt;
+}
+}  // namespace
+
+bool load_tokenizer_args(const std::string& model_weights_path,
+                         TokenizerArgs& tokenizer_args) {
+  // tokenizer args from tokenizer_config.json
+  JsonReader tokenizer_reader;
+  const std::string tokenizer_args_file_path =
+      model_weights_path + "/tokenizer_config.json";
+  if (tokenizer_reader.parse(tokenizer_args_file_path)) {
+    // read chat template if exists
+    if (auto v = load_chat_template_file(model_weights_path)) {
+      tokenizer_args.chat_template() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("chat_template")) {
+      tokenizer_args.chat_template() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<bool>("add_bos_token")) {
+      tokenizer_args.add_bos_token() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<bool>("add_eos_token")) {
+      tokenizer_args.add_eos_token() = v.value();
+    }
+    if (auto v = tokenizer_reader.value<std::string>("tokenizer_class")) {
+      tokenizer_args.tokenizer_class() = v.value();
+    }
+    // read bos_token
+    if (auto v = tokenizer_reader.value<std::string>("bos_token.content")) {
+      tokenizer_args.bos_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("bos_token")) {
+      tokenizer_args.bos_token() = v.value();
+    }
+    // read eos_token
+    if (auto v = tokenizer_reader.value<std::string>("eos_token.content")) {
+      tokenizer_args.eos_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("eos_token")) {
+      tokenizer_args.eos_token() = v.value();
+    }
+    // read pad_token
+    if (auto v = tokenizer_reader.value<std::string>("pad_token.content")) {
+      tokenizer_args.pad_token() = v.value();
+    } else if (auto v = tokenizer_reader.value<std::string>("pad_token")) {
+      tokenizer_args.pad_token() = v.value();
+    }
+  }
+
+  return true;
+}
+
+}  // namespace xllm_service
\ No newline at end of file
diff --git a/xllm_service/tokenizer/tokenizer_args.h b/xllm_service/tokenizer/tokenizer_args.h
index a690791..c1951e1 100644
--- a/xllm_service/tokenizer/tokenizer_args.h
+++ b/xllm_service/tokenizer/tokenizer_args.h
@@ -31,40 +31,40 @@ using SpecialToken = std::pair<std::string, int32_t>;
 struct TokenizerArgs {
   // Type of tokenizer to use. valid values are "sentencepiece" and "tiktoken".
-  DEFINE_ARG(std::string, tokenizer_type) = "sentencepiece";
+  PROPERTY(std::string, tokenizer_type) = "sentencepiece";
 
   // Vocab file name.
-  DEFINE_ARG(std::string, vocab_file) = "tokenizer.model";
+  PROPERTY(std::string, vocab_file) = "tokenizer.model";
 
   // Special tokens to add to the vocabulary.
-  DEFINE_ARG(std::vector<SpecialToken>, special_tokens);
+  PROPERTY(std::vector<SpecialToken>, special_tokens);
 
   // Regex pattern used by tiktok tokenizer only.
-  DEFINE_ARG(std::string, pattern);
+  PROPERTY(std::string, pattern);
 
   // tokens to add to the beginning of the input sequence.
-  DEFINE_ARG(std::vector<std::string>, prefix_tokens);
+  PROPERTY(std::vector<std::string>, prefix_tokens);
 
   // chat template
-  DEFINE_ARG(std::string, chat_template);
+  PROPERTY(std::string, chat_template);
 
   // add_bos_token
-  DEFINE_ARG(bool, add_bos_token) = false;
+  PROPERTY(bool, add_bos_token) = false;
 
   // add_eos_token
-  DEFINE_ARG(bool, add_eos_token) = false;
+  PROPERTY(bool, add_eos_token) = false;
 
   // bos_token
-  DEFINE_ARG(std::string, bos_token);
+  PROPERTY(std::string, bos_token);
 
   // eos_token
-  DEFINE_ARG(std::string, eos_token);
+  PROPERTY(std::string, eos_token);
 
   // pad_token
-  DEFINE_ARG(std::string, pad_token);
+  PROPERTY(std::string, pad_token);
 
   // tokenizer_class
-  DEFINE_ARG(std::string, tokenizer_class);
+  PROPERTY(std::string, tokenizer_class);
 };
 
 inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) {
@@ -93,4 +93,7 @@ inline std::ostream& operator<<(std::ostream& os, const TokenizerArgs& args) {
   return os;
 }
 
+bool load_tokenizer_args(const std::string& model_weights_path,
+                         TokenizerArgs& tokenizer_args);
+
 }  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_args_loader.cpp b/xllm_service/tokenizer/tokenizer_args_loader.cpp
deleted file mode 100644
index 5fb387d..0000000
--- a/xllm_service/tokenizer/tokenizer_args_loader.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright 2025 The xLLM Authors. All Rights Reserved.
-Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://github.com/jd-opensource/xllm-service/blob/main/LICENSE
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/ - -#include "tokenizer/tokenizer_args_loader.h" - -#include - -namespace xllm_service { -#define SET_ARG(arg_name, value) [&] { args->arg_name() = value; }() - -static std::string CHATGLM = "chatglm"; -static std::string CHATGLM4 = "chatglm4"; -static std::string YI = "Yi"; -static std::string QWEN = "qwen"; - -void TokenizerArgsLoader::load(const std::string& model_type, - const std::string& tokenizer_args_file_path, - TokenizerArgs* tokenizer_args) { - JsonReader tokenizer_reader; - if (tokenizer_reader.parse(tokenizer_args_file_path)) { - // read chat template if exists - if (auto v = tokenizer_reader.value("chat_template")) { - tokenizer_args->chat_template() = v.value(); - } - if (auto v = tokenizer_reader.value("add_bos_token")) { - tokenizer_args->add_bos_token() = v.value(); - } - if (auto v = tokenizer_reader.value("add_eos_token")) { - tokenizer_args->add_eos_token() = v.value(); - } - if (auto v = tokenizer_reader.value("tokenizer_class")) { - tokenizer_args->tokenizer_class() = v.value(); - } - // read bos_token - if (auto v = tokenizer_reader.value("bos_token.content")) { - tokenizer_args->bos_token() = v.value(); - } else if (auto v = tokenizer_reader.value("bos_token")) { - tokenizer_args->bos_token() = v.value(); - } - // read eos_token - if (auto v = tokenizer_reader.value("eos_token.content")) { - tokenizer_args->eos_token() = v.value(); - } else if (auto v = tokenizer_reader.value("eos_token")) { - tokenizer_args->eos_token() = v.value(); - } - // read pad_token - if (auto v = tokenizer_reader.value("pad_token.content")) { - tokenizer_args->pad_token() = v.value(); - } else if (auto v = tokenizer_reader.value("pad_token")) { - tokenizer_args->pad_token() = v.value(); - } - } - - if (model_type == CHATGLM) { - load_chatglm_args(tokenizer_args); - } else if (model_type == CHATGLM) { - load_chatglm_args(tokenizer_args); - } else if (model_type == CHATGLM4) { - load_chatglm4_args(tokenizer_args); - } else if (model_type == YI) { - load_Yi_args(tokenizer_args); - } else if (model_type == QWEN) { - load_qwen_args(tokenizer_args); - } else { - LOG(ERROR) << "unrecognized model type: " << model_type; - } -} - -void TokenizerArgsLoader::load_chatglm_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "sentencepiece"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenizer_config.json - const std::vector special_tokens({{"[MASK]", 64789}, - {"[gMASK]", 64790}, - {"[sMASK]", 64791}, - {"sop", 64792}, - {"eop", 64793}, - {"<|system|>", 64794}, - {"<|user|>", 64795}, - {"<|assistant|>", 64796}, - {"<|observation|>", 64797}}); - SET_ARG(special_tokens, special_tokens); - SET_ARG(prefix_tokens, std::vector({"[gMASK]", "sop"})); -} - -void TokenizerArgsLoader::load_chatglm4_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "tiktoken"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/THUDM/glm-4-9b/blob/main/tokenizer_config.json - const std::vector special_tokens( - {{"<|endoftext|>", 151329}, - {"[MASK]", 151330}, - {"[gMASK]", 151331}, - {"[sMASK]", 151332}, - {"", 151333}, - {"", 151334}, - {"<|system|>", 151335}, - {"<|user|>", 151336}, - {"<|assistant|>", 151337}, - {"<|observation|>", 151338}, - {"<|begin_of_image|>", 151339}, - {"<|end_of_image|>", 151340}, - {"<|begin_of_video|>", 151341}, - {"<|end_of_video|>", 151342}}); - 
SET_ARG(special_tokens, special_tokens); - - SET_ARG(prefix_tokens, std::vector({"[gMASK]", ""})); - - // set regex pattern for tiktoken tokenizer. - // ref to: - // https://huggingface.co/THUDM/glm-4-9b/blob/main/tokenization_chatglm.py#L27 - // N.B. replaced '\s+(?!\S)' with '\s+[^\s]' to avoid regex error - const std::string pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+[^\S]|\s+)"; - SET_ARG(pattern, pattern); -} - -void TokenizerArgsLoader::load_Yi_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "sentencepiece"); - SET_ARG(vocab_file, "tokenizer.model"); - - // set special tokens - // ref to: - // https://huggingface.co/01-ai/Yi-34B-Chat-4bits/blob/main/tokenizer_config.json - const std::vector special_tokens({{"", 0}, - {"<|startoftext|>", 1}, - {"<|endoftext|>", 2}, - {"<|im_start|>", 6}, - {"<|im_end|>", 7}, - {"<|im_sep|>", 8}}); - SET_ARG(special_tokens, special_tokens); -} - -void TokenizerArgsLoader::load_qwen_args(TokenizerArgs* args) { - SET_ARG(tokenizer_type, "tiktoken"); - // adapted from - // https://huggingface.co/Qwen/Qwen-14B-Chat-Int4/blob/main/tokenization_qwen.py - SET_ARG(vocab_file, "qwen.tiktoken"); - - // set special tokens - std::vector special_tokens; - int32_t next_id = 151643; - special_tokens.emplace_back("<|endoftext|>", next_id++); - special_tokens.emplace_back("<|im_start|>", next_id++); - special_tokens.emplace_back("<|im_end|>", next_id++); - for (int32_t i = 0; i < 205; ++i) { - special_tokens.emplace_back("<|extra_" + std::to_string(i) + "|>", - next_id++); - } - SET_ARG(special_tokens, special_tokens); - - // set regex pattern for tiktoken tokenizer. - const std::string pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+[^\S]|\s+)"; - SET_ARG(pattern, pattern); -} - -} // namespace xllm_service diff --git a/xllm_service/tokenizer/tokenizer_args_loader.h b/xllm_service/tokenizer/tokenizer_args_loader.h deleted file mode 100644 index 07a99aa..0000000 --- a/xllm_service/tokenizer/tokenizer_args_loader.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright 2025 The xLLM Authors. All Rights Reserved. -Copyright 2024 The ScaleLLM Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://github.com/jd-opensource/xllm-service/blob/main/LICENSE - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-
-#pragma once
-#include "common/json_reader.h"
-#include "tokenizer_args.h"
-
-namespace xllm_service {
-
-class TokenizerArgsLoader {
- public:
-  static void load(const std::string& model_type,
-                   const std::string& tokenizer_args_file_path,
-                   TokenizerArgs* tokenizer_args);
-
- private:
-  static void load_chatglm_args(TokenizerArgs* args);
-
-  static void load_chatglm4_args(TokenizerArgs* args);
-
-  static void load_Yi_args(TokenizerArgs* args);
-
-  static void load_qwen_args(TokenizerArgs* args);
-};
-
-}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_factory.cpp b/xllm_service/tokenizer/tokenizer_factory.cpp
new file mode 100644
index 0000000..204d868
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_factory.cpp
@@ -0,0 +1,30 @@
+#include "tokenizer_factory.h"
+
+#include <filesystem>
+
+namespace xllm_service {
+
+std::unique_ptr<Tokenizer> TokenizerFactory::create_tokenizer(
+    const std::string& model_weights_path,
+    TokenizerArgs tokenizer_args) {
+  const std::string tokenizer_json_path =
+      model_weights_path + "/tokenizer.json";
+  if (std::filesystem::exists(tokenizer_json_path)) {
+    // 1. fast tokenizer
+    LOG(INFO) << "Create fast tokenizer.";
+    return std::make_unique<FastTokenizer>(tokenizer_json_path);
+  } else if (tokenizer_args.tokenizer_type() == "tiktoken" ||
+             tokenizer_args.tokenizer_class() == "TikTokenTokenizer") {
+    // 2. create tiktoken tokenizer
+    LOG(INFO) << "Create Tiktoken tokenizer.";
+    return std::make_unique<TiktokenTokenizer>(model_weights_path,
+                                               tokenizer_args);
+  } else {
+    // 3. create sentencepiece tokenizer
+    LOG(INFO) << "Create SentencePiece tokenizer.";
+    return std::make_unique<SentencePieceTokenizer>(model_weights_path,
+                                                    tokenizer_args);
+  }
+}
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizer_factory.h b/xllm_service/tokenizer/tokenizer_factory.h
new file mode 100644
index 0000000..2120071
--- /dev/null
+++ b/xllm_service/tokenizer/tokenizer_factory.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "fast_tokenizer.h"
+#include "sentencepiece_tokenizer.h"
+#include "tiktoken_tokenizer.h"
+#include "tokenizer_args.h"
+
+namespace xllm_service {
+
+class TokenizerFactory {
+ public:
+  static std::unique_ptr<Tokenizer> create_tokenizer(
+      const std::string& model_weights_path,
+      TokenizerArgs tokenizer_args);
+};
+
+}  // namespace xllm_service
diff --git a/xllm_service/tokenizer/tokenizers/Cargo.toml b/xllm_service/tokenizer/tokenizers/Cargo.toml
index 108edff..4de532c 100644
--- a/xllm_service/tokenizer/tokenizers/Cargo.toml
+++ b/xllm_service/tokenizer/tokenizers/Cargo.toml
@@ -8,4 +8,4 @@ name = "rust_tokenizers"
 crate-type = ["cdylib"]
 
 [dependencies]
-tokenizers = { version = "0.20.0", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.21.0", default-features = false, features = ["onig"] }
diff --git a/xllm_service/tokenizer/tokenizers/src/lib.rs b/xllm_service/tokenizer/tokenizers/src/lib.rs
index 2391de3..5c4d6be 100644
--- a/xllm_service/tokenizer/tokenizers/src/lib.rs
+++ b/xllm_service/tokenizer/tokenizers/src/lib.rs
@@ -1,134 +1,174 @@
-// Import the needed libraries
+// copied from https://github.com/mlc-ai/tokenizers-cpp/blob/v0.1.1/rust/src/lib.rs
+
+// A simple C wrapper of tokenzier library
+use std::{collections::HashMap, str::FromStr};
+use std::fs;
 use std::ffi::{c_char, CStr};
+use std::io;
 use tokenizers::tokenizer::Tokenizer;
 
-// ported from https://github.com/mlc-ai/tokenizers-cpp
-
 pub struct TokenizerWrapper {
-    // The tokenizer
     tokenizer: Tokenizer,
-    // Holds
the encoded ids to avoid dropping them - encode_ids: Vec, - // Holds the decoded string to avoid dropping it decode_str: String, - // Holds the result of the token_to_id function id_to_token_result: String, } +pub type Vocab = HashMap; +pub type Merges = Vec<(String, String)>; + +#[repr(C)] +pub struct TokenizerEncodeResult { + token_ids: *mut u32, + len: usize, +} + +fn read_file_as_u8(path: &str) -> Result, io::Error> { + fs::read(path) +} + impl TokenizerWrapper { - pub fn encode(&mut self, text: &str, add_special_tokens: bool) { - // Encode the text and store the ids - self.encode_ids = Vec::from( - self.tokenizer - .encode(text, add_special_tokens) - .unwrap() - .get_ids(), - ); + pub fn from_str(json: &str) -> TokenizerWrapper { + TokenizerWrapper { + tokenizer: Tokenizer::from_str(json).unwrap().into(), + decode_str: String::new(), + id_to_token_result: String::new(), + } } - pub fn decode(&mut self, ids: Vec, skip_special_tokens: bool) { - // Decode the ids and store the string - self.decode_str = self.tokenizer.decode(&ids, skip_special_tokens).unwrap(); + pub fn encode(&mut self, text: &str, add_special_tokens: bool) -> Vec { + let encoded = self.tokenizer.encode(text, add_special_tokens).unwrap(); + return encoded.get_ids().to_vec(); } - pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize { - self.tokenizer.get_vocab_size(with_added_tokens) + pub fn encode_batch(&mut self, texts: Vec<&str>, add_special_tokens: bool) -> Vec> { + let results = self.tokenizer.encode_batch(texts, add_special_tokens).unwrap() + .into_iter() + .map(|encoded| encoded.get_ids().to_vec()) + .collect::>>(); + return results; + } + + pub fn decode(&mut self, ids: &[u32], skip_special_tokens: bool) { + self.decode_str = self.tokenizer.decode(ids, skip_special_tokens).unwrap(); } } #[no_mangle] -extern "C" fn tokenizer_from_file(path: *const c_char) -> *mut TokenizerWrapper { +extern "C" fn tokenizers_new_from_str(input_cstr: *const u8, len: usize) -> *mut TokenizerWrapper { + unsafe { + let json = &String::from_utf8_lossy(std::slice::from_raw_parts(input_cstr, len)); + return Box::into_raw(Box::new(TokenizerWrapper::from_str(json))); + } +} + +#[no_mangle] +extern "C" fn tokenizers_new_from_path(path: *const c_char) -> *mut TokenizerWrapper { let c_str = unsafe { CStr::from_ptr(path) }; let path_str = match c_str.to_str() { Ok(s) => s, - Err(_) => panic!("Failed to convert C string to Rust string"), + Err(_) => panic!("Failed to convert C path string to Rust string"), }; - let boxed = Box::new(TokenizerWrapper { - tokenizer: Tokenizer::from_file(path_str).unwrap().into(), - encode_ids: Vec::new(), - decode_str: String::new(), - id_to_token_result: String::new(), - }); - - Box::into_raw(boxed) + match read_file_as_u8(path_str) { + Ok(bytes) => { + return tokenizers_new_from_str(bytes.as_ptr(), bytes.len()); + } + Err(_) => { + panic!("Failed to read tokenizer file."); + } + } } #[no_mangle] -extern "C" fn tokenizer_encode( +extern "C" fn tokenizers_encode( handle: *mut TokenizerWrapper, input_cstr: *const u8, len: usize, - add_special_tokens: bool, + add_special_tokens: i32, + out_result: *mut TokenizerEncodeResult, ) { unsafe { let input_data = std::str::from_utf8(std::slice::from_raw_parts(input_cstr, len)).unwrap(); - (*handle).encode(input_data, add_special_tokens); + let encoded = (*handle).encode(input_data, add_special_tokens != 0); + let len = encoded.len(); + *out_result = TokenizerEncodeResult { + token_ids: Box::into_raw(encoded.into_boxed_slice()) as *mut u32, + len: len, + }; } } 
#[no_mangle] -extern "C" fn tokenizer_get_encode_ids( +extern "C" fn tokenizers_encode_batch( handle: *mut TokenizerWrapper, - out_data: *mut *mut u32, - out_len: *mut usize, + input_cstr: *const *const u8, + input_len: *const usize, + num_seqs: usize, + add_special_tokens: i32, + out_result: *mut TokenizerEncodeResult, ) { unsafe { - *out_data = (*handle).encode_ids.as_mut_ptr(); - *out_len = (*handle).encode_ids.len() + let input_data = (0..num_seqs) + .map(|i| { + std::str::from_utf8(std::slice::from_raw_parts(*input_cstr.offset(i as isize), *input_len.offset(i as isize))).unwrap() + }) + .collect::>(); + let encoded_batch = (*handle).encode_batch(input_data, add_special_tokens != 0); + for (i, encoded) in encoded_batch.into_iter().enumerate() { + let len = encoded.len(); + let result = TokenizerEncodeResult { + token_ids: Box::into_raw(encoded.into_boxed_slice()) as *mut u32, + len: len, + }; + *out_result.offset(i as isize) = result; + } } } #[no_mangle] -extern "C" fn tokenizer_decode( - handle: *mut TokenizerWrapper, - input_ids: *const u32, - len: usize, - skip_special_tokens: bool, -) { +extern "C" fn tokenizers_free_encode_results(results: *mut TokenizerEncodeResult, num_seqs: usize) { unsafe { - let input_data = Vec::from(std::slice::from_raw_parts(input_ids, len)); - (*handle).decode(input_data, skip_special_tokens); + let slice = std::slice::from_raw_parts_mut(results, num_seqs); + for result in &mut *slice { + drop(Box::from_raw(std::slice::from_raw_parts_mut(result.token_ids, result.len))); + } } } #[no_mangle] -extern "C" fn tokenizer_get_decode_str( +extern "C" fn tokenizers_decode( handle: *mut TokenizerWrapper, + input_ids: *const u32, + len: usize, + skip_special_tokens: i32, out_cstr: *mut *mut u8, out_len: *mut usize, ) { unsafe { + let input_data = std::slice::from_raw_parts(input_ids, len); + (*handle).decode(input_data, skip_special_tokens != 0); + *out_cstr = (*handle).decode_str.as_mut_ptr(); - *out_len = (*handle).decode_str.len(); + *out_len = (&(*handle).decode_str).len(); } } #[no_mangle] -extern "C" fn tokenizer_free(wrapper: *mut TokenizerWrapper) { +extern "C" fn tokenizers_free(wrapper: *mut TokenizerWrapper) { unsafe { drop(Box::from_raw(wrapper)); } } #[no_mangle] -extern "C" fn tokenizer_token_to_id( - handle: *mut TokenizerWrapper, - token: *const u8, - len: usize -) { +extern "C" fn tokenizers_get_vocab_size(handle: *mut TokenizerWrapper, size: *mut usize) { unsafe { - let token: &str = std::str::from_utf8(std::slice::from_raw_parts(token, len)).unwrap(); - let id = (*handle).tokenizer.token_to_id(token); - match id { - Some(id) => id as i32, - None => -1, - }; + *size = (*handle).tokenizer.get_vocab_size(true); } } #[no_mangle] -extern "C" fn tokenizer_id_to_token( +extern "C" fn tokenizers_id_to_token( handle: *mut TokenizerWrapper, id: u32, out_cstr: *mut *mut u8, @@ -142,15 +182,23 @@ extern "C" fn tokenizer_id_to_token( }; *out_cstr = (*handle).id_to_token_result.as_mut_ptr(); - *out_len = (*handle).id_to_token_result.len(); + *out_len = (&(*handle).id_to_token_result).len(); } } #[no_mangle] -extern "C" fn tokenizer_get_vocab_size( - handle: *mut TokenizerWrapper, - with_added_tokens: bool) -> usize { +extern "C" fn tokenizers_token_to_id( + handle: *mut TokenizerWrapper, + token: *const u8, + len: usize, + out_id: *mut i32, +) { unsafe { - (*handle).get_vocab_size(with_added_tokens) + let token: &str = &String::from_utf8_lossy(std::slice::from_raw_parts(token, len)); + let id = (*handle).tokenizer.token_to_id(token); + *out_id = match id { + 
Some(id) => id as i32, + None => -1, + }; } } diff --git a/xllm_service/tokenizer/tokenizers/tokenizers.h b/xllm_service/tokenizer/tokenizers/tokenizers.h index daefbef..b0eba46 100644 --- a/xllm_service/tokenizer/tokenizers/tokenizers.h +++ b/xllm_service/tokenizer/tokenizers/tokenizers.h @@ -26,42 +26,42 @@ extern "C" { #include #include -using TokenizerHandle = void*; - -TokenizerHandle tokenizer_from_file(const char* path); -// TokenizerHandle tokenizer_from_pretrained(const char* identifier); - -void tokenizer_encode(TokenizerHandle handle, - const char* data, - size_t len, - bool add_special_tokens); - -void tokenizer_decode(TokenizerHandle handle, - const uint32_t* data, - size_t len, - bool skip_special_tokens); - -void tokenizer_get_decode_str(TokenizerHandle handle, - const char** data, - size_t* len); - -void tokenizer_get_encode_ids(TokenizerHandle handle, - const uint32_t** id_data, - size_t* len); - -void tokenizer_id_to_token(TokenizerHandle handle, - uint32_t id, - const char** data, - size_t* len); - -// -1 if token is not in vocab -int32_t tokenizer_token_to_id(TokenizerHandle handle, - const char* token, - size_t len); - -void tokenizer_free(TokenizerHandle handle); - -size_t tokenizer_get_vocab_size(TokenizerHandle handle, bool with_added_tokens); +typedef void* TokenizerHandle; + +typedef struct { + int* token_ids; + size_t len; +} TokenizerEncodeResult; + +TokenizerHandle tokenizers_new_from_path(const char* path); + +void tokenizers_encode(TokenizerHandle handle, + const char* data, + size_t len, + int add_special_token, + TokenizerEncodeResult* result); + +void tokenizers_decode(TokenizerHandle handle, + const uint32_t* data, + size_t len, + int skip_special_tokens, + const char** decode_data, + size_t* decode_len); + +void tokenizers_id_to_token(TokenizerHandle handle, + uint32_t id, + const char** data, + size_t* len); + +// tokenizers_token_to_id stores -1 to *id if the token is not in the vocab +void tokenizers_token_to_id(TokenizerHandle handle, + const char* token, + size_t len, + int32_t* id); + +void tokenizers_free(TokenizerHandle handle); + +void tokenizers_get_vocab_size(TokenizerHandle handle, size_t* size); #ifdef __cplusplus }
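
Usage sketch (not part of the patch): the snippet below shows how the pieces introduced above fit together, with load_tokenizer_args() filling TokenizerArgs from tokenizer_config.json and TokenizerFactory::create_tokenizer() returning a FastTokenizer when tokenizer.json is present (otherwise the tiktoken or sentencepiece tokenizer). The model directory, the include path prefixes, and the implicit std::vector-to-Slice conversion in decode() are assumptions, not taken from this diff.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "tokenizer/tokenizer_args.h"     // include prefix assumed
#include "tokenizer/tokenizer_factory.h"  // include prefix assumed

int main() {
  const std::string model_dir = "/path/to/model";  // hypothetical model directory

  // Read chat_template / bos / eos / pad settings from <model_dir>/tokenizer_config.json.
  xllm_service::TokenizerArgs args;
  xllm_service::load_tokenizer_args(model_dir, args);

  // FastTokenizer if <model_dir>/tokenizer.json exists, otherwise tiktoken/sentencepiece.
  auto tokenizer =
      xllm_service::TokenizerFactory::create_tokenizer(model_dir, args);

  std::vector<int32_t> ids;
  tokenizer->encode("Hello, world!", &ids);
  std::cout << tokenizer->decode(ids, /*skip_special_tokens=*/true) << "\n";
  return 0;
}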