diff --git a/lib/tokenizers/decode_stream.ex b/lib/tokenizers/decode_stream.ex new file mode 100644 index 0000000..1d7258d --- /dev/null +++ b/lib/tokenizers/decode_stream.ex @@ -0,0 +1,60 @@ +defmodule Tokenizers.DecodeStream do + @moduledoc """ + Implements streaming decoding functionality for tokenizers. + """ + + @enforce_keys [:resource] + defstruct [:resource] + + @type t :: %__MODULE__{ + resource: reference() + } + + @doc """ + Creates a new decode stream. + + ## Options + + * `:skip_special_tokens` - determines whether special tokens should be + skipped during decoding. By default, it is set to `false`. + + """ + @spec new(keyword()) :: t() + def new(opts \\ []) when is_list(opts) do + opts = Keyword.validate!(opts, skip_special_tokens: false) + Tokenizers.Native.decoder_stream_new(opts[:skip_special_tokens]) + end + + @doc """ + Steps through the decode stream with the given tokenizer and token ID. + + Returns `{:ok, String.t()}` if there's a decoded string, or `{:ok, :out_of_range}` if the native step yields no text for the token (for example, the first byte of an incomplete byte-fallback sequence). + Returns `{:error, reason}` if an error occurs during decoding. + """ + def step(%__MODULE__{} = decode_stream, tokenizer, id) when is_integer(id) do + case Tokenizers.Native.decoder_stream_step(decode_stream, tokenizer, id) do + {:ok, decoded} when is_binary(decoded) -> + {:ok, decoded} + + {:ok, nil} -> + {:ok, :out_of_range} + + {:error, reason} -> + {:error, reason} + end + end + + @doc """ + Returns information about the decode stream state. 
+ """ + defdelegate info(decode_stream), to: Tokenizers.Native, as: :decoder_stream_info + + defimpl Inspect do + import Inspect.Algebra + alias Tokenizers.DecodeStream + + def inspect(decode_stream, opts) do + concat(["#Tokenizers.DecodeStream<", to_doc(DecodeStream.info(decode_stream), opts), ">"]) + end + end +end diff --git a/lib/tokenizers/native.ex b/lib/tokenizers/native.ex index a20b33f..845c951 100644 --- a/lib/tokenizers/native.ex +++ b/lib/tokenizers/native.ex @@ -33,6 +33,13 @@ defmodule Tokenizers.Native do def decoders_ctc(_options), do: err() def decoders_sequence(_decoders), do: err() + # DecoderStream + def decoder_stream_step(_decoder_stream, _tokenizer, _id), do: err() + # + def decoder_stream_info(_decoder_stream), do: err() + # + def decoder_stream_new(_skip_special_tokens), do: err() + # Encoding def encoding_get_length(_encoding), do: err() def encoding_get_n_sequences(_encoding), do: err() diff --git a/native/ex_tokenizers/Cargo.lock b/native/ex_tokenizers/Cargo.lock index 23bc089..2788e93 100644 --- a/native/ex_tokenizers/Cargo.lock +++ b/native/ex_tokenizers/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "aho-corasick" @@ -13,9 +13,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "base64" @@ -29,17 +29,11 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "cc" -version = "1.1.24" +version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812acba72f0a070b003d3697490d2b55b837230ae7c6c6497f05cc2ddbb8d938" +checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ "shlex", ] @@ -52,9 +46,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -71,15 +65,15 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "darling" -version = "0.20.10" +version = "0.20.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ "darling_core", "darling_macro", @@ -87,9 +81,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.10" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" dependencies = [ "fnv", "ident_case", @@ -101,9 +95,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.10" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", @@ -112,18 +106,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ "darling", "proc-macro2", @@ -133,9 +127,9 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", "syn", @@ -143,9 +137,9 @@ dependencies = [ [[package]] name = "either" -version = "1.13.0" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "esaxx-rs" @@ -198,9 +192,12 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "inventory" -version = "0.3.15" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f958d3d68f4167080a18141e10381e7634563984a537f2a49a30fd8e53ac5767" +checksum = "ab08d7cd2c5897f2c949e5383ea7c7db03fb19130ffcfbf7eda795137ae3cb83" +dependencies = [ + "rustversion", +] [[package]] name = "itertools" @@ -213,18 +210,18 @@ dependencies = [ [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "lazy_static" @@ -234,15 +231,25 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.159" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "libloading" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +dependencies = [ + "cfg-if", + "windows-targets", +] [[package]] name = "log" -version = "0.4.22" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "macro_rules_attribute" @@ -274,9 +281,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "monostate" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e" +checksum = "aafe1be9d0c75642e3e50fedc7ecadf1ef1cbce6eb66462153fc44245343fbee" dependencies = [ "monostate-impl", "serde", @@ -284,9 +291,9 @@ dependencies = [ [[package]] name = "monostate-impl" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" +checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5" dependencies = [ "proc-macro2", "quote", @@ -305,12 +312,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.1" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" -dependencies = [ - "portable-atomic", -] +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "onig" @@ -342,39 +346,33 @@ checksum = 
"57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pkg-config" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" - -[[package]] -name = "portable-atomic" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -442,9 +440,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -454,15 +452,21 @@ dependencies = [ [[package]] name = "regex-automata" 
-version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -471,20 +475,21 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustler" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94bdfa68c0388cbd725f1ca54e975956482c262599e5cced04a903eec918b7f" +checksum = "f04a7b61bf2db5495d6c0d2eb4b3f0f366864d47f2482834656e25d1b25fe290" dependencies = [ "inventory", + "libloading", + "regex-lite", "rustler_codegen", - "rustler_sys", ] [[package]] name = "rustler_codegen" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "996dc019acb78b91b4e0c1bd6fa2cd509a835d309de762dc15213b97eac399da" +checksum = "bf9365a04e3a3a4d3136953d97c67fd0a9c036d36197917961551c2cc1ecb385" dependencies = [ "heck", "inventory", @@ -494,35 +499,31 @@ dependencies = [ ] [[package]] -name = "rustler_sys" -version = "2.4.3" +name = "rustversion" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd0e2c955cfc86ea4680067e1d5e711427b43f7befcb6e23c7807cf3dd90e97" -dependencies = [ - "regex", - "unreachable", -] +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", @@ -531,9 +532,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -549,9 +550,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "smallvec" -version = "1.13.2" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "spm_precompiled" @@ -573,9 +574,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.79" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" 
dependencies = [ "proc-macro2", "quote", @@ -584,18 +585,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.64" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" dependencies = [ "proc-macro2", "quote", @@ -604,15 +605,15 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.20.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70" +checksum = "3169b3195f925496c895caee7978a335d49218488ef22375267fba5a46a40bd7" dependencies = [ "aho-corasick", "derive_builder", "esaxx-rs", "getrandom", - "itertools 0.12.1", + "itertools 0.13.0", "lazy_static", "log", "macro_rules_attribute", @@ -635,9 +636,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-normalization-alignments" @@ -661,41 +662,89 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" [[package]] -name = "unreachable" -version = "1.0.0" +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "void", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] [[package]] -name = "void" -version = "1.0.2" +name = "windows_aarch64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +name = "windows_aarch64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "zerocopy" -version = "0.7.35" +version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.35" +version = "0.8.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" dependencies = [ "proc-macro2", "quote", diff --git a/native/ex_tokenizers/Cargo.toml b/native/ex_tokenizers/Cargo.toml index 61322ac..0907306 100644 --- a/native/ex_tokenizers/Cargo.toml +++ b/native/ex_tokenizers/Cargo.toml @@ -11,7 +11,7 @@ crate-type = ["cdylib"] [dependencies] anyhow = "1" -rustler = "0.34.0" -thiserror = "1" -tokenizers = { version = "0.20.0", default-features = false, features = ["onig", "esaxx_fast"]} +rustler = "0.36.1" +thiserror = "2" +tokenizers = { version = "0.21.1", default-features = false, features = ["onig", "esaxx_fast"]} serde = { version = "1.0", features = [ "rc", "derive" ] } diff --git a/native/ex_tokenizers/src/decode_stream.rs b/native/ex_tokenizers/src/decode_stream.rs new file mode 100644 index 0000000..c9ebd66 --- /dev/null +++ 
b/native/ex_tokenizers/src/decode_stream.rs @@ -0,0 +1,123 @@ +use serde::{Deserialize, Serialize}; + +use crate::{new_info, tokenizer::ExTokenizersTokenizer, util::Info, ExTokenizersError}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct ExTokenizersDecodeStreamRef { + skip_special_tokens: bool, + ids: Vec<u32>, + prefix: String, + prefix_index: usize, + read_index: usize, +} + +impl ExTokenizersDecodeStreamRef { + pub fn step( + &mut self, + tokenizer: ExTokenizersTokenizer, + id: u32, + ) -> tokenizers::tokenizer::Result<Option<String>> { + tokenizers::step_decode_stream( + &tokenizer.resource.0, + id, + self.skip_special_tokens, + &mut self.ids, + &mut self.prefix, + &mut self.prefix_index, + ) + } +} + +pub struct ExTokenizerDecodeStreamLock { + pub inner: std::sync::RwLock<ExTokenizersDecodeStreamRef>, +} + +#[rustler::resource_impl] +impl rustler::Resource for ExTokenizerDecodeStreamLock {} + +#[derive(rustler::NifStruct)] +#[module = "Tokenizers.DecodeStream"] +pub struct ExTokenizersDecodeStream { + pub resource: rustler::ResourceArc<ExTokenizerDecodeStreamLock>, +} + +impl Serialize for ExTokenizersDecodeStream { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + self.resource.inner.serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for ExTokenizersDecodeStream { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + Ok(ExTokenizersDecodeStream::new( + ExTokenizersDecodeStreamRef::deserialize(deserializer)?, + )) + } +} + +impl Clone for ExTokenizersDecodeStream { + fn clone(&self) -> Self { + Self { + resource: rustler::ResourceArc::new(ExTokenizerDecodeStreamLock { + inner: std::sync::RwLock::new(self.resource.inner.read().unwrap().clone()), + }), + } + } +} + +impl ExTokenizersDecodeStream { + pub fn new(data: ExTokenizersDecodeStreamRef) -> Self { + Self { + resource: rustler::ResourceArc::new(ExTokenizerDecodeStreamLock { + inner: std::sync::RwLock::new(data), + }), + } + } +} + +#[rustler::nif(schedule = "DirtyCpu")] +fn 
decoder_stream_step( + decode_stream: ExTokenizersDecodeStream, + tokenizer: ExTokenizersTokenizer, + id: u32, +) -> Result<Option<String>, ExTokenizersError> { + decode_stream + .resource + .inner + .write() + .unwrap() + .step(tokenizer, id) + .map_err(ExTokenizersError::Tokenizer) +} + +#[rustler::nif] +fn decoder_stream_new(skip_special_tokens: bool) -> ExTokenizersDecodeStream { + let ds = ExTokenizersDecodeStreamRef { + skip_special_tokens, + ids: vec![], + prefix: "".to_string(), + prefix_index: 0, + read_index: 0, + }; + + ExTokenizersDecodeStream::new(ds) +} + +/////////////////////////////////////////////////////////////////////////////// +/// Inspection +/////////////////////////////////////////////////////////////////////////////// + +#[rustler::nif] +fn decoder_stream_info(decode_stream: ExTokenizersDecodeStream) -> Info { + let ds = decode_stream.resource.inner.read().unwrap(); + + new_info! { + skip_special_tokens: ds.skip_special_tokens + } +} diff --git a/native/ex_tokenizers/src/error.rs b/native/ex_tokenizers/src/error.rs index 4ee3f88..1bd83c8 100644 --- a/native/ex_tokenizers/src/error.rs +++ b/native/ex_tokenizers/src/error.rs @@ -1,5 +1,5 @@ use rustler::{Encoder, Env, Term}; -use std::io; +use std::{io, panic::RefUnwindSafe}; use thiserror::Error; rustler::atoms! 
{ @@ -12,7 +12,7 @@ pub enum ExTokenizersError { #[error("Invalid Char")] InvalidChar, #[error("Tokenizer Error")] - Tokenizer(#[from] Box), + Tokenizer(#[from] tokenizers::Error), #[error("IO Error")] Io(#[from] io::Error), #[error("Internal Error: {0}")] @@ -28,3 +28,5 @@ impl Encoder for ExTokenizersError { format!("{self:?}").encode(env) } } + +impl RefUnwindSafe for ExTokenizersError {} diff --git a/native/ex_tokenizers/src/lib.rs b/native/ex_tokenizers/src/lib.rs index bf4a2a3..86eee2a 100644 --- a/native/ex_tokenizers/src/lib.rs +++ b/native/ex_tokenizers/src/lib.rs @@ -1,4 +1,5 @@ mod added_token; +mod decode_stream; mod decoders; mod encoding; mod error; diff --git a/native/ex_tokenizers/src/tokenizer.rs b/native/ex_tokenizers/src/tokenizer.rs index d554578..b23f4d0 100644 --- a/native/ex_tokenizers/src/tokenizer.rs +++ b/native/ex_tokenizers/src/tokenizer.rs @@ -29,7 +29,7 @@ type ExTokenizerImpl = TokenizerImpl< ExTokenizersDecoder, >; -pub struct ExTokenizersTokenizerRef(ExTokenizerImpl); +pub struct ExTokenizersTokenizerRef(pub ExTokenizerImpl); #[rustler::resource_impl] impl rustler::Resource for ExTokenizersTokenizerRef {} diff --git a/native/ex_tokenizers/src/util.rs b/native/ex_tokenizers/src/util.rs index 96d6325..224ed91 100644 --- a/native/ex_tokenizers/src/util.rs +++ b/native/ex_tokenizers/src/util.rs @@ -1,3 +1,5 @@ +use std::panic::RefUnwindSafe; + use rustler::Encoder; use tokenizers::{PaddingDirection, TruncationDirection}; @@ -10,6 +12,7 @@ macro_rules! 
new_info { } pub struct Info(pub Vec<(Box, Box)>); +impl RefUnwindSafe for Info {} impl rustler::Encoder for Info { fn encode<'a>(&self, env: rustler::Env<'a>) -> rustler::Term<'a> { diff --git a/test/tokenizers/decode_stream_test.exs b/test/tokenizers/decode_stream_test.exs new file mode 100644 index 0000000..daca185 --- /dev/null +++ b/test/tokenizers/decode_stream_test.exs @@ -0,0 +1,88 @@ +defmodule Tokenizers.DecodeStreamTest do + use ExUnit.Case, async: true + doctest Tokenizers.DecodeStream + + describe "Minimal tokenizer" do + test "Decodes with stream" do + {:ok, bpe} = Tokenizers.Model.BPE.empty() + {:ok, tk} = Tokenizers.Tokenizer.init(bpe) + + tk = + tk + |> Tokenizers.Tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + + ds = Tokenizers.DecodeStream.new() + + assert {:ok, "my"} = Tokenizers.DecodeStream.step(ds, tk, 0) + assert {:ok, " name"} = Tokenizers.DecodeStream.step(ds, tk, 1) + assert {:ok, " is"} = Tokenizers.DecodeStream.step(ds, tk, 2) + assert {:ok, " john"} = Tokenizers.DecodeStream.step(ds, tk, 3) + assert {:ok, " pair"} = Tokenizers.DecodeStream.step(ds, tk, 4) + end + end + + describe "Byte fallback decode stream" do + test "handles byte fallback decoding" do + vocab = [ + {"<unk>", 0.0}, + {"<0x20>", -0.1}, + {"<0xC3>", -0.2}, + {"<0xA9>", -0.3} + ] + + {:ok, model} = Tokenizers.Model.Unigram.init(vocab, byte_fallback: true, unk_id: 0) + + {:ok, tk} = Tokenizers.Tokenizer.init(model) + + tk = + tk + |> Tokenizers.Tokenizer.set_decoder(Tokenizers.Decoder.byte_fallback()) + + ds = Tokenizers.DecodeStream.new() + + assert {:ok, " "} = Tokenizers.DecodeStream.step(ds, tk, 1) + assert {:ok, :out_of_range} = Tokenizers.DecodeStream.step(ds, tk, 2) + assert {:ok, "é"} = Tokenizers.DecodeStream.step(ds, tk, 3) + end + + test "handles metaspace decoding" do + vocab = [ + {"<unk>", 0.0}, + {"▁This", -0.1} + ] + + {:ok, model} = Tokenizers.Model.Unigram.init(vocab, byte_fallback: false, unk_id: 0) + {:ok, tk} = Tokenizers.Tokenizer.init(model) + + tk = + tk + |> 
Tokenizers.Tokenizer.set_decoder(Tokenizers.Decoder.metaspace()) + + ds = Tokenizers.DecodeStream.new() + + assert {:ok, "This"} = Tokenizers.DecodeStream.step(ds, tk, 1) + assert {:ok, " This"} = Tokenizers.DecodeStream.step(ds, tk, 1) + end + end + + describe "DecodeStream info" do + test "skip_special_tokens false" do + assert Tokenizers.DecodeStream.info(Tokenizers.DecodeStream.new()) == %{ + "skip_special_tokens" => false + } + end + + test "skip_special_tokens true" do + assert Tokenizers.DecodeStream.info(Tokenizers.DecodeStream.new(skip_special_tokens: true)) == + %{ + "skip_special_tokens" => true + } + end + + test "default DecodeStream" do + assert Tokenizers.DecodeStream.info(Tokenizers.DecodeStream.new()) == %{ + "skip_special_tokens" => false + } + end + end +end