diff --git a/.gitignore b/.gitignore
index 9d9ac376a..77b7cb3cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ pip-wheel-metadata
 
 .vscode
 *.code-workspace
+
+.venv/
\ No newline at end of file
diff --git a/bindings/cpp/.cargo/config.toml b/bindings/cpp/.cargo/config.toml
new file mode 100644
index 000000000..0c17df095
--- /dev/null
+++ b/bindings/cpp/.cargo/config.toml
@@ -0,0 +1,2 @@
+[target.x86_64-pc-windows-msvc]
+rustflags = ["-C", "target-feature=+crt-static"]
\ No newline at end of file
diff --git a/bindings/cpp/Cargo.toml b/bindings/cpp/Cargo.toml
index 398df62ee..e4082827e 100644
--- a/bindings/cpp/Cargo.toml
+++ b/bindings/cpp/Cargo.toml
@@ -7,11 +7,13 @@ edition = "2018"
 [lib]
 name = "tokenizers"
 path = "tokenizers-cpp/lib.rs"
-crate-type = ["cdylib"]
+crate-type = ["cdylib", "staticlib"]
 
 [dependencies]
 cxx = "1.0.27"
 derive_more = "0.99.11"
+serde = { version = "1.0", features = [ "rc", "derive" ] }
+serde_json = "1.0"
 
 [dependencies.tokenizers]
 version = "*"
diff --git a/bindings/cpp/build.rs b/bindings/cpp/build.rs
index 35dcfd8eb..a9402c825 100644
--- a/bindings/cpp/build.rs
+++ b/bindings/cpp/build.rs
@@ -49,6 +49,7 @@ fn main() {
         .includes(include_dirs)
         .flag_if_supported(format!("-std={}", &standard).as_str())
         .flag_if_supported(format!("/std:{}", &standard).as_str())
+        .flag_if_supported("-Wno-c++11-extensions")
         // enable exception handling for MSVC
         .flag_if_supported("/EHsc")
         .compile(output);
@@ -63,8 +64,14 @@ fn main() {
     if cfg!(feature = "test") {
         compile(
             cc::Build::new()
+                .cpp(true)
                 .file("tokenizers-cpp/redefine_result_tests.cpp")
-                .include(format!("{}/cxxbridge/include", out_dir)),
+                .include(format!("{}/cxxbridge/include", out_dir))
+                .flag_if_supported(format!("-std={}", &standard).as_str())
+                .flag_if_supported(format!("/std:{}", &standard).as_str())
+                .flag_if_supported("-Wno-c++11-extensions")
+                // enable exception handling for MSVC
+                .flag_if_supported("/EHsc"),
             "redefine_result_tests",
         );
     }
diff --git a/bindings/cpp/tokenizers-cpp/decoders.rs b/bindings/cpp/tokenizers-cpp/decoders.rs
index 173e0f06d..3a5fefba9 100644
--- a/bindings/cpp/tokenizers-cpp/decoders.rs
+++ b/bindings/cpp/tokenizers-cpp/decoders.rs
@@ -20,7 +20,7 @@ mod ffi {
         fn decode_decoder(decoder: &Decoder, tokens: Vec<String>) -> Result<String>;
     }
 }
-
+use serde::{Serialize, Deserialize};
 use derive_more::{Deref, DerefMut};
 use tk::{
     decoders::{bpe::BPEDecoder, byte_level::ByteLevel, wordpiece::WordPiece},
@@ -30,7 +30,7 @@ use tk::{
 
 use crate::pre_tokenizers::u32_to_char;
 
-#[derive(Deref, DerefMut, Clone)]
+#[derive(Serialize, Deserialize, Deref, DerefMut, Clone)]
 pub struct Decoder(pub DecoderWrapper);
 
 impl DecoderTrait for Decoder {
diff --git a/bindings/cpp/tokenizers-cpp/models.rs b/bindings/cpp/tokenizers-cpp/models.rs
index 866393cdc..545e2caa8 100644
--- a/bindings/cpp/tokenizers-cpp/models.rs
+++ b/bindings/cpp/tokenizers-cpp/models.rs
@@ -99,7 +99,7 @@ use std::{
     collections::HashMap,
     path::{Path, PathBuf},
 };
-
+use serde::{Deserialize, Serialize};
 use crate::{tokens::wrap_tokens, wrap_option};
 use derive_more::{Deref, DerefMut};
 use ffi::*;
@@ -112,7 +112,7 @@ use tk::{
     Model as ModelTrait, ModelWrapper, Result, Trainer as TrainerTrait,
 };
 
-#[derive(Deref, DerefMut, Clone)]
+#[derive(Serialize, Deserialize, Deref, DerefMut, Clone)]
 pub struct Model(pub ModelWrapper);
 
 #[derive(Deref, DerefMut)]
diff --git a/bindings/cpp/tokenizers-cpp/normalizers.rs b/bindings/cpp/tokenizers-cpp/normalizers.rs
index 699a0062d..9c0b15d72 100644
--- a/bindings/cpp/tokenizers-cpp/normalizers.rs
+++ b/bindings/cpp/tokenizers-cpp/normalizers.rs
@@ -60,7 +60,7 @@ pub mod ffi {
         fn get_original(normalized: &NormalizedString) -> &str;
     }
 }
-
+use serde::{Deserialize, Serialize};
 use derive_more::{Deref, DerefMut};
 use tk::{
     normalizers::{
@@ -73,7 +73,7 @@ use tk::{
 #[derive(Deref, DerefMut)]
 pub struct NormalizedString(pub tk::NormalizedString);
 
-#[derive(Deref, DerefMut, Clone)]
+#[derive(Serialize, Deserialize, Deref, DerefMut, Clone)]
 pub struct Normalizer(pub tk::NormalizerWrapper);
 
 #[derive(Deref, DerefMut, Clone)]
diff --git a/bindings/cpp/tokenizers-cpp/pre_tokenizers.rs b/bindings/cpp/tokenizers-cpp/pre_tokenizers.rs
index 86e46043b..5eeadb7dc 100644
--- a/bindings/cpp/tokenizers-cpp/pre_tokenizers.rs
+++ b/bindings/cpp/tokenizers-cpp/pre_tokenizers.rs
@@ -107,6 +107,7 @@ mod ffi {
 }
 
 use crate::{forward_cxx_enum, impl_extern_type, tokens::wrap_tokens_ref};
+use serde::{Deserialize, Serialize};
 use derive_more::{Deref, DerefMut};
 use ffi::*;
 use tk::{
@@ -131,7 +132,7 @@ impl_extern_type!(NormalizedString, "huggingface::tokenizers::ffi::NormalizedStr
 #[derive(Deref, DerefMut)]
 struct PreTokenizedString(tk::PreTokenizedString);
 
-#[derive(Deref, DerefMut, Clone)]
+#[derive(Serialize, Deserialize, Deref, DerefMut, Clone)]
 pub struct PreTokenizer(pub PreTokenizerWrapper);
 
 #[derive(Deref, DerefMut, Clone)]
diff --git a/bindings/cpp/tokenizers-cpp/processors.rs b/bindings/cpp/tokenizers-cpp/processors.rs
index e54d779b2..058a4a71d 100644
--- a/bindings/cpp/tokenizers-cpp/processors.rs
+++ b/bindings/cpp/tokenizers-cpp/processors.rs
@@ -101,6 +101,7 @@ mod ffi {
 }
 
 use crate::wrap_option;
+use serde::{Deserialize, Serialize};
 use derive_more::{Deref, DerefMut};
 use ffi::*;
 use tk::{
@@ -188,7 +189,7 @@ impl Encoding {
     }
 }
 
-#[derive(Deref, DerefMut, Clone)]
+#[derive(Serialize, Deserialize, Deref, DerefMut, Clone)]
 pub struct PostProcessor(pub PostProcessorWrapper);
 
 impl PostProcessorTrait for PostProcessor {
diff --git a/bindings/cpp/tokenizers-cpp/tests.h b/bindings/cpp/tokenizers-cpp/tests.h
index 5403e924d..f759b113e 100644
--- a/bindings/cpp/tokenizers-cpp/tests.h
+++ b/bindings/cpp/tokenizers-cpp/tests.h
@@ -360,6 +360,68 @@ TEST_SUITE("Tokenizers") {
         }
     }
 
+    void print_encoding(const Encoding& encoding) {
+        std::cout << "encoding length: " << encoding.length() << std::endl;
+        std::cout << "encoding number of sequences: " << encoding.number_of_sequences() << std::endl;
+        std::cout << "input_ids: ";
+        for (uint32_t id : encoding.get_ids()) {
+            std::cout << id << ' ';
+        }
+        std::cout << std::endl;
+
+        std::cout << "token_type_ids: ";
+        for (uint32_t type_id : encoding.get_type_ids()) {
+            std::cout << type_id << ' ';
+        }
+        std::cout << std::endl;
+
+        std::cout << "attention_mask: ";
+        for (uint32_t attention_mask : encoding.get_attention_mask()) {
+            std::cout << attention_mask << ' ';
+        }
+        std::cout << std::endl;
+    }
+
+    TEST_CASE("load tokenizer") {
+        SUBCASE("encode") {
+            Tokenizer tokenizer = Tokenizer::from_file(data_file("roberta.json"));
+            const char* example = "This is an example";
+            rust::Vec<uint32_t> ids{713, 16, 41, 1246};
+            std::vector<std::string> tokens{"This", "Ġis", "Ġan", "Ġexample"};
+
+            Encoding encoding = tokenizer.encode(InputSequence(example), false);
+
+            COMPARE_CONTAINERS(encoding.get_ids(), ids);
+            COMPARE_CONTAINERS(encoding.get_tokens(), tokens);
+
+            rust::String decoded = tokenizer.decode(ids, false);
+            CHECK(decoded == example);
+        }
+
+        SUBCASE("encode_batch") {
+            std::vector<std::string> examples{
+                "This is an example",
+                "hello world"
+            };
+            std::vector<InputSequence> batch;
+            for (const auto& input : examples) {
+                batch.emplace_back(input.c_str());
+            }
+            Tokenizer tokenizer = Tokenizer::from_file(data_file("roberta.json"));
+            auto encodings = tokenizer
+                .with_padding(PaddingParams().with_batch_longest())
+                .encode_batch(batch);
+            for (const auto& encoding : encodings) {
+                // print_encoding(encoding);
+                CHECK(encoding.get_ids().size() == 4);
+                CHECK(encoding.get_type_ids().size() == 4);
+                CHECK(encoding.get_attention_mask().size() == 4);
+            }
+        }
+    }
+
     TEST_CASE("Bert") {
         Tokenizer tokenizer(WordPieceBuilder()
                                 .files(bert_vocab())
diff --git a/bindings/cpp/tokenizers-cpp/tokenizer.h b/bindings/cpp/tokenizers-cpp/tokenizer.h
index 634d4d2d8..70f590fdf 100644
--- a/bindings/cpp/tokenizers-cpp/tokenizer.h
+++ b/bindings/cpp/tokenizers-cpp/tokenizer.h
@@ -143,6 +143,15 @@ struct Tokenizer {
      */
    explicit Tokenizer(Model&& model) : inner_(ffi::tokenizer(*model)){};
 
+    /**
+     * @brief Constructs a Tokenizer from a JSON file.
+     *
+     * @param path Path to the JSON file.
+     */
+    static Tokenizer from_file(const std::string& path) {
+        return Tokenizer(ffi::from_file(path));
+    }
+
     /**
      * @brief Specifies the normalizer.
      */
diff --git a/bindings/cpp/tokenizers-cpp/tokenizer.rs b/bindings/cpp/tokenizers-cpp/tokenizer.rs
index 54cc931f7..b2accf336 100644
--- a/bindings/cpp/tokenizers-cpp/tokenizer.rs
+++ b/bindings/cpp/tokenizers-cpp/tokenizer.rs
@@ -59,6 +59,7 @@ mod ffi {
 
         fn box_encoding1(encoding: &Encoding1) -> Box<Encoding>;
 
+        fn from_file(path: &str) -> Result<Box<Tokenizer>>;
         // FIXME many of the below functions should take Box, not &.
         // Look for clone() in the implementations.
         fn tokenizer(model: &Model) -> Box<Tokenizer>;
@@ -136,6 +137,7 @@ mod ffi {
 
 use crate::{forward_cxx_enum, impl_extern_type, models::vocab_to_vec, wrap_option};
 use cxx::CxxVector;
+use serde::{Serialize, Deserialize};
 use derive_more::{Deref, DerefMut};
 use ffi::*;
 use tk::{EncodeInput, PaddingParams, PaddingStrategy, Result, TruncationParams};
@@ -158,13 +160,20 @@ impl_extern_type!(PostProcessor, "huggingface::tokenizers::ffi::PostProcessor");
 
 impl_extern_type!(Decoder, "huggingface::tokenizers::ffi::Decoder");
 
-#[derive(Deref, DerefMut)]
+#[derive(Serialize, Deserialize, Deref, DerefMut)]
 struct Tokenizer(tk::TokenizerImpl<Model, Normalizer, PreTokenizer, PostProcessor, Decoder>);
 
 fn tokenizer(model: &Model) -> Box<Tokenizer> {
     Box::new(Tokenizer(tk::TokenizerImpl::new(model.clone())))
 }
 
+fn from_file(path: &str) -> Result<Box<Tokenizer>> {
+    match tk::TokenizerImpl::from_file(path) {
+        Ok(tokenizer) => Ok(Box::new(Tokenizer(tokenizer))),
+        Err(msg) => Err(msg),
+    }
+}
+
 fn set_normalizer(tokenizer: &mut Tokenizer, normalizer: &Normalizer) {
     tokenizer.with_normalizer(normalizer.clone());
 }
diff --git a/tokenizers/Makefile b/tokenizers/Makefile
index 29173d75f..486f5a568 100644
--- a/tokenizers/Makefile
+++ b/tokenizers/Makefile
@@ -55,7 +55,7 @@ $(DATA_DIR)/bert-% :
 
 $(DATA_DIR)/unigram% :
 	$(dir_guard)
-	wget https://storage.googleapis.com/tokenizers/unigram$* -O $@
+	wget https://huggingface.co/Narsil/small/raw/main/unigram$* -O $@
 
 $(DATA_DIR)/albert-base-v1-tokenizer.json :
 	$(dir_guard)
@@ -70,7 +70,7 @@ $(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
 
 $(DATA_DIR)/roberta.json :
 	$(dir_guard)
-	wget https://storage.googleapis.com/tokenizers/roberta.json -O $@
+	wget https://huggingface.co/Narsil/small/raw/main/roberta.json -O $@
 
 $(DATA_DIR)/tokenizer-wiki.json :
 	$(dir_guard)
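
Usage sketch (not part of the patch): a minimal C++ program exercising the new Tokenizer::from_file entry point added above. It only uses calls confirmed by the diff (from_file, encode, get_ids, decode); the include path, the huggingface::tokenizers namespace, and the "data/roberta.json" location are assumptions based on the repository layout, not verified details.

    // Sketch: load a serialized tokenizer from JSON and round-trip one sentence.
    // Header path and namespace are assumed; adjust to the actual build setup.
    #include <cstdint>
    #include <iostream>
    #include "tokenizers-cpp/tokenizer.h"

    using namespace huggingface::tokenizers;

    int main() {
        // Placeholder path to the roberta.json fetched by the Makefile rule above.
        Tokenizer tokenizer = Tokenizer::from_file("data/roberta.json");

        // Encode without adding special tokens, mirroring the "load tokenizer" test.
        Encoding encoding = tokenizer.encode(InputSequence("This is an example"), false);
        for (uint32_t id : encoding.get_ids()) {
            std::cout << id << ' ';
        }
        std::cout << std::endl;

        // Decode the expected ids back to text.
        rust::Vec<uint32_t> ids{713, 16, 41, 1246};
        std::cout << tokenizer.decode(ids, false) << std::endl;
        return 0;
    }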