From 040d3dbaf650362b1408499a3638a785b05949ef Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:30:22 -0700 Subject: [PATCH 1/6] update gitignore for swift --- .gitignore | 9 ++++ TestTiktoken/Package.swift | 20 ++++++++ TestTiktoken/Sources/TestTiktoken/main.swift | 54 ++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 TestTiktoken/Package.swift create mode 100644 TestTiktoken/Sources/TestTiktoken/main.swift diff --git a/.gitignore b/.gitignore index 68cdf7ff..65cb3f25 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,12 @@ htmlcov Cargo.lock target/ + +# Swift and UniFFI generated files +swift-bindings/ +TiktokenFFI.xcframework/ +.swiftpm/ +.build/ +xcuserdata/ +DerivedData/ +*.xcodeproj diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift new file mode 100644 index 00000000..2b81018e --- /dev/null +++ b/TestTiktoken/Package.swift @@ -0,0 +1,20 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "TestTiktoken", + platforms: [ + .macOS(.v10_15) + ], + dependencies: [ + .package(path: "../TiktokenSwift") + ], + targets: [ + .executableTarget( + name: "TestTiktoken", + dependencies: [ + .product(name: "TiktokenSwift", package: "TiktokenSwift") + ] + ), + ] +) diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift new file mode 100644 index 00000000..3aa72dc4 --- /dev/null +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -0,0 +1,54 @@ +import Foundation +import TiktokenSwift + +print("๐Ÿงช Testing TiktokenSwift...") +print("=" * 50) + +do { + // Create a test encoder + let encoder = try TiktokenHelper.createTestEncoder() + print("โœ… Successfully created encoder") + + // Test encoding + let text = "hello world!" 
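+ // Note: `createTestEncoder()`, `encodeText`, and `decodeTokens` are assumed here to be
+ // convenience helpers exposed by the separate TiktokenSwift wrapper package referenced in
+ // Package.swift; they are not part of the Rust crate's UniFFI surface shown in this patch series.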
+ let tokens = encoder.encodeText(text) + print("\n๐Ÿ“ Original text: '\(text)'") + print("๐Ÿ”ข Encoded tokens: \(tokens)") + + // Test decoding + if let decoded = encoder.decodeTokens(tokens) { + print("๐Ÿ“– Decoded text: '\(decoded)'") + print("โœ… Decoding successful!") + } else { + print("โŒ Failed to decode tokens") + } + + // Test special tokens + let specialTokens = encoder.specialTokens() + print("\n๐ŸŽฏ Special tokens: \(specialTokens)") + + // Test vocabulary info + let vocabSize = encoder.nVocab() + let maxToken = encoder.maxTokenValue() + print("๐Ÿ“Š Vocabulary size: \(vocabSize)") + print("๐Ÿ“Š Max token value: \(maxToken)") + + // Test encoding with details + let details = encoder.encodeWithDetails(text: text, allowedSpecial: []) + print("\n๐Ÿ” Encoding details:") + print(" Tokens: \(details.tokens)") + print(" Last piece token length: \(details.lastPieceTokenLen)") + + print("\nโœ… All tests passed!") + +} catch { + print("โŒ Error: \(error)") + exit(1) +} + +// Helper to repeat string +extension String { + static func *(lhs: String, rhs: Int) -> String { + String(repeating: lhs, count: rhs) + } +} From 1477990648544bb117d2862333004e2a7989d429 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:31:58 -0700 Subject: [PATCH 2/6] rust bindings for swift package --- Cargo.toml | 19 ++- build.rs | 3 + src/lib.rs | 38 +++--- src/tiktoken.udl | 56 +++++++++ src/tiktoken.uniffi.rs | 265 +++++++++++++++++++++++++++++++++++++++++ src/uniffi_bindings.rs | 245 +++++++++++++++++++++++++++++++++++++ uniffi.toml | 5 + 7 files changed, 614 insertions(+), 17 deletions(-) create mode 100644 build.rs create mode 100644 src/tiktoken.udl create mode 100644 src/tiktoken.uniffi.rs create mode 100644 src/uniffi_bindings.rs create mode 100644 uniffi.toml diff --git a/Cargo.toml b/Cargo.toml index d2f713bb..6202305c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,10 +6,11 @@ rust-version = "1.57.0" [lib] name = "tiktoken" -crate-type = ["cdylib", "rlib"] +crate-type = ["cdylib", "staticlib", "rlib"] + [features] -default = [] +default = ["uniffi_bindgen", "camino"] python = [ "pyo3", ] @@ -25,3 +26,17 @@ fancy-regex = "0.13.0" regex = "1.10.3" rustc-hash = "1.1.0" bstr = "1.5.0" +base64 = "0.22" + +# UniFFI dependencies +uniffi = { version = "0.29", features = ["build"] } +thiserror = "1.0" +uniffi_bindgen = { version = "0.29", optional = true } +camino = { version = "1.1", optional = true } + +[build-dependencies] +uniffi = { version = "0.29", features = ["bindgen"] } +uniffi_build = "0.29" +uniffi_bindgen = "0.29" +camino = "1.1" + diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..2cc22627 --- /dev/null +++ b/build.rs @@ -0,0 +1,3 @@ +fn main() { + uniffi_build::generate_scaffolding("src/tiktoken.udl").unwrap(); +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 64dc6a15..625cc2ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,3 @@ -use std::borrow::Borrow; -use std::borrow::Cow; use std::collections::HashSet; use std::num::NonZeroU64; use std::thread; @@ -12,6 +10,11 @@ use rustc_hash::FxHashMap as HashMap; #[cfg(feature = "python")] mod py; +pub mod uniffi_bindings; + +// UniFfiTag is required by UniFFI for type checking +pub struct UniFfiTag; + pub type Rank = u32; fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, Rank)> { @@ -73,17 +76,22 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, } pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, Rank>) -> Vec { + if piece.is_empty() 
{ + return vec![]; + } if piece.len() == 1 { - return vec![ranks[piece]]; + return ranks.get(piece).copied().map_or(vec![], |r| vec![r]); } _byte_pair_merge(ranks, piece) .windows(2) - .map(|part| ranks[&piece[part[0].0..part[1].0]]) + .filter_map(|part| ranks.get(&piece[part[0].0..part[1].0]).copied()) .collect() } pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> Vec<&'a [u8]> { - assert!(piece.len() > 1); + if piece.len() <= 1 { + return vec![piece]; + } _byte_pair_merge(ranks, piece) .windows(2) .map(|part| &piece[part[0].0..part[1].0]) @@ -177,13 +185,13 @@ const MAX_NUM_THREADS: usize = 128; #[cfg_attr(feature = "python", pyclass)] #[derive(Clone)] pub struct CoreBPE { - encoder: HashMap, Rank>, - special_tokens_encoder: HashMap, - decoder: HashMap>, - special_tokens_decoder: HashMap>, - regex_tls: Vec, - special_regex_tls: Vec, - sorted_token_bytes: Vec>, + pub(crate) encoder: HashMap, Rank>, + pub(crate) special_tokens_encoder: HashMap, + pub(crate) decoder: HashMap>, + pub(crate) special_tokens_decoder: HashMap>, + pub(crate) regex_tls: Vec, + pub(crate) special_regex_tls: Vec, + pub(crate) sorted_token_bytes: Vec>, } impl CoreBPE { @@ -201,7 +209,7 @@ impl CoreBPE { /// Decodes tokens into a list of bytes. /// /// The bytes are not gauranteed to be a valid utf-8 string. - fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { + pub(crate) fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { let mut ret = Vec::with_capacity(tokens.len() * 2); for &token in tokens { let token_bytes = match self.decoder.get(&token) { @@ -287,7 +295,7 @@ impl CoreBPE { (ret, last_piece_token_len) } - fn _increase_last_piece_token_len( + pub(crate) fn _increase_last_piece_token_len( &self, tokens: Vec, mut last_piece_token_len: usize, @@ -461,7 +469,7 @@ impl CoreBPE { ) } - fn new_internal( + pub(crate) fn new_internal( encoder: HashMap, Rank>, special_tokens_encoder: HashMap, pattern: &str, diff --git a/src/tiktoken.udl b/src/tiktoken.udl new file mode 100644 index 00000000..7c236c6f --- /dev/null +++ b/src/tiktoken.udl @@ -0,0 +1,56 @@ +namespace tiktoken { + [Throws=TiktokenError] + CoreBpe new_core_bpe(record encoder, record special_tokens_encoder, string pattern); +}; + +[Error] +enum TiktokenError { + "ValueError", + "KeyError", + "DecodeError", +}; + +dictionary EncodingResult { + sequence tokens; + u64 last_piece_token_len; +}; + +dictionary UnstableEncodingResult { + sequence tokens; + sequence> completions; +}; + +interface CoreBpe { + constructor(record encoder, record special_tokens_encoder, string pattern); + + sequence encode_ordinary(string text); + + sequence encode(string text, sequence allowed_special); + + EncodingResult encode_with_details(string text, sequence allowed_special); + + UnstableEncodingResult encode_with_unstable(string text, sequence allowed_special); + + sequence encode_bytes(bytes input); + + [Throws=TiktokenError] + u32 encode_single_token(bytes piece); + + sequence encode_single_piece(bytes piece); + + [Throws=TiktokenError] + bytes decode_bytes(sequence tokens); + + [Throws=TiktokenError] + bytes decode_single_token_bytes(u32 token); + + sequence token_byte_values(); + + sequence special_tokens(); + + sequence encode_with_special_tokens(string text); + + u32 max_token_value(); + + u32 n_vocab(); +}; \ No newline at end of file diff --git a/src/tiktoken.uniffi.rs b/src/tiktoken.uniffi.rs new file mode 100644 index 00000000..a21cf6d3 --- /dev/null +++ b/src/tiktoken.uniffi.rs @@ -0,0 +1,265 @@ +// This file was 
autogenerated by some hot garbage in the `uniffi` crate. +// Trust me, you don't want to mess with it! + +::uniffi::setup_scaffolding!("tiktoken"); + +// Export info about this UDL file +// See `uniffi_bindgen::macro_metadata` for how this is used. + +const UNIFFI_META_CONST_UDL_TIKTOKEN: ::uniffi::MetadataBuffer = + ::uniffi::MetadataBuffer::from_code(::uniffi::metadata::codes::UDL_FILE) + .concat_str("tiktoken") + .concat_str("tiktoken") + .concat_str("tiktoken"); + +#[doc(hidden)] +#[unsafe(no_mangle)] +pub static UNIFFI_META_UDL_TIKTOKEN: [u8; UNIFFI_META_CONST_UDL_TIKTOKEN.size] = + UNIFFI_META_CONST_UDL_TIKTOKEN.into_array(); + +uniffi::deps::static_assertions::assert_impl_all!(::std::string::String: ::std::cmp::Eq, ::std::hash::Hash); // record<::std::string::String, u32> + +// Error definitions, corresponding to `error` in the UDL. + +#[::uniffi::udl_derive(Error)] +#[uniffi(flat_error)] + +enum r#TiktokenError { + r#ValueError {}, + r#KeyError {}, + r#DecodeError {}, +} + +// Record definitions, implemented as method-less structs, corresponding to `dictionary` objects. + +#[::uniffi::udl_derive(Record)] +struct r#EncodingResult { + r#tokens: std::vec::Vec, + r#last_piece_token_len: u64, +} + +#[::uniffi::udl_derive(Record)] +struct r#UnstableEncodingResult { + r#tokens: std::vec::Vec, + r#completions: std::vec::Vec>, +} + +// Top level functions, corresponding to UDL `namespace` functions. + +#[::uniffi::export_for_udl] +pub fn r#new_core_bpe( + r#encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#special_tokens_encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#pattern: ::std::string::String, +) -> ::std::result::Result<::std::sync::Arc, r#TiktokenError> { + unreachable!() +} + +// Object definitions, corresponding to UDL `interface` definitions. 
+ +#[::uniffi::udl_derive(Object)] +struct r#CoreBPE {} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + #[uniffi::constructor] + pub fn r#new( + r#encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#special_tokens_encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#pattern: ::std::string::String, + ) -> ::std::sync::Arc { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#decode_bytes( + &self, + r#tokens: std::vec::Vec, + ) -> ::std::result::Result<::std::vec::Vec, r#TiktokenError> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#decode_single_token_bytes( + &self, + r#token: u32, + ) -> ::std::result::Result<::std::vec::Vec, r#TiktokenError> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_bytes(&self, r#input: ::std::vec::Vec) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_ordinary(&self, r#text: ::std::string::String) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_single_piece(&self, r#piece: ::std::vec::Vec) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_single_token( + &self, + r#piece: ::std::vec::Vec, + ) -> ::std::result::Result { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_details( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> r#EncodingResult { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_special_tokens( + &self, + r#text: ::std::string::String, + ) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_unstable( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> r#UnstableEncodingResult { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#max_token_value(&self) -> u32 { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#n_vocab(&self) -> u32 { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#special_tokens(&self) -> std::vec::Vec<::std::string::String> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#token_byte_values(&self) -> std::vec::Vec<::std::vec::Vec> { + unreachable!() + } +} + +// Callback Interface definitions, corresponding to UDL `callback interface` definitions. 
+ +// Export scaffolding checksums for UDL items + +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_func_new_core_bpe() -> u16 { + 56117 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_decode_bytes() -> u16 { + 55010 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_decode_single_token_bytes() -> u16 { + 5116 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode() -> u16 { + 29815 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_bytes() -> u16 { + 62700 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_ordinary() -> u16 { + 27373 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_single_piece() -> u16 { + 59626 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_single_token() -> u16 { + 44485 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_details() -> u16 { + 44545 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_special_tokens() -> u16 { + 3792 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_unstable() -> u16 { + 58939 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_max_token_value() -> u16 { + 1036 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_n_vocab() -> u16 { + 6443 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_special_tokens() -> u16 { + 37553 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_token_byte_values() -> u16 { + 22300 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_constructor_corebpe_new() -> u16 { + 33616 +} diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs new file mode 100644 index 00000000..befb4d84 --- /dev/null +++ b/src/uniffi_bindings.rs @@ -0,0 +1,245 @@ +use std::collections::{HashMap as StdHashMap, HashSet}; +use std::sync::Arc; +use rustc_hash::FxHashMap as HashMap; +use base64::Engine; + +use crate::{CoreBPE as CoreBPEInternal, Rank}; + +#[derive(Debug, thiserror::Error)] +pub enum TiktokenError { + #[error("Value error: {0}")] + ValueError(String), + #[error("Key error: {0}")] + KeyError(String), + #[error("Decode error: {0}")] + DecodeError(String), +} + +impl From for TiktokenError { + fn from(err: crate::DecodeKeyError) -> Self { + TiktokenError::KeyError(format!("Invalid token for decoding: {}", err.token)) + } +} + +impl From for TiktokenError { + fn from(err: crate::DecodeError) -> Self { + TiktokenError::DecodeError(err.message) + } +} + +#[derive(Debug)] +pub struct EncodingResult { + pub tokens: Vec, + pub last_piece_token_len: u64, +} + +#[derive(Debug)] +pub struct UnstableEncodingResult { + pub tokens: Vec, + pub completions: Vec>, +} + +#[derive(Clone)] +pub struct CoreBpe { + inner: Arc, +} + +impl CoreBpe { + pub fn new( + encoder: StdHashMap, + special_tokens_encoder: StdHashMap, + pattern: String, + ) -> Self { + // Convert String keys to Vec for the encoder + 
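+ // For example, an ordinary token such as "hello" arrives as a plain UTF-8 key, while a
+ // non-UTF-8 byte sequence such as [0xFF, 0xFE] is expected to arrive base64-encoded as "base64://4=".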
// Handle base64-encoded byte sequences for non-UTF8 tokens + let byte_encoder: HashMap, Rank> = encoder + .into_iter() + .map(|(k, v)| { + if k.starts_with("base64:") { + // Decode base64 for non-UTF8 sequences + let b64_str = &k[7..]; + match base64::engine::general_purpose::STANDARD.decode(b64_str) { + Ok(bytes) => (bytes, v), + Err(e) => { + eprintln!("Failed to decode base64 token {}: {}", k, e); + (k.into_bytes(), v) + } + } + } else { + // Regular UTF-8 string + (k.into_bytes(), v) + } + }) + .collect(); + + let special_tokens_encoder: HashMap = special_tokens_encoder + .into_iter() + .collect(); + + let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) + .expect("Failed to create CoreBPE"); + + Self { + inner: Arc::new(inner), + } + } + + pub fn encode_ordinary(&self, text: String) -> Vec { + self.inner.encode_ordinary(&text) + } + + pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + self.inner.encode(&text, &allowed_special).0 + } + + pub fn encode_with_details(&self, text: String, allowed_special: Vec) -> EncodingResult { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + let (tokens, last_piece_token_len) = self.inner.encode(&text, &allowed_special); + EncodingResult { + tokens, + last_piece_token_len: last_piece_token_len as u64, + } + } + + pub fn encode_with_unstable( + &self, + text: String, + allowed_special: Vec, + ) -> UnstableEncodingResult { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + let (tokens, completions) = self.inner._encode_unstable_native(&text, &allowed_special); + UnstableEncodingResult { + tokens, + completions: completions.into_iter().collect(), + } + } + + pub fn encode_bytes(&self, input: Vec) -> Vec { + match std::str::from_utf8(&input) { + Ok(text) => self.inner.encode_ordinary(text), + Err(e) => { + let text = unsafe { std::str::from_utf8_unchecked(&input[..e.valid_up_to()]) }; + let (tokens, last_piece_token_len) = self.inner.encode(text, &HashSet::new()); + let (mut tokens, last_piece_token_len) = self + .inner + ._increase_last_piece_token_len(tokens, last_piece_token_len); + + let mut unstable_bytes; + if !tokens.is_empty() && last_piece_token_len > 0 { + unstable_bytes = self + .inner + .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) + .unwrap(); + unstable_bytes.extend_from_slice(&input[e.valid_up_to()..]); + tokens.truncate(tokens.len() - last_piece_token_len); + } else { + unstable_bytes = input[e.valid_up_to()..].to_vec(); + } + + if !unstable_bytes.is_empty() { + match self.inner.encoder.get(&unstable_bytes) { + Some(token) => tokens.push(*token), + None => { + tokens.extend(&crate::byte_pair_encode(&unstable_bytes, &self.inner.encoder)) + } + } + } + tokens + } + } + } + + pub fn encode_single_token(&self, piece: Vec) -> Result { + if let Some(token) = self.inner.encoder.get(&piece).copied() { + return Ok(token); + } + if let Ok(piece_str) = std::str::from_utf8(&piece) { + if let Some(token) = self.inner.special_tokens_encoder.get(piece_str).copied() { + return Ok(token); + } + } + Err(TiktokenError::KeyError(format!( + "Token not found: {:?}", + piece + ))) + } + + pub fn encode_single_piece(&self, piece: Vec) -> Vec { + if piece.is_empty() { + return vec![]; + } + if let Some(token) = self.inner.encoder.get(&piece) { + return vec![*token]; + } + crate::byte_pair_encode(&piece, 
&self.inner.encoder) + } + + pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { + self.inner.decode_bytes(&tokens).map_err(|e| e.into()) + } + + pub fn decode_single_token_bytes(&self, token: u32) -> Result, TiktokenError> { + if let Some(bytes) = self.inner.decoder.get(&token) { + return Ok(bytes.clone()); + } + if let Some(bytes) = self.inner.special_tokens_decoder.get(&token) { + return Ok(bytes.clone()); + } + Err(TiktokenError::KeyError(format!("Token not found: {}", token))) + } + + pub fn token_byte_values(&self) -> Vec> { + self.inner.sorted_token_bytes.clone() + } + + pub fn special_tokens(&self) -> Vec { + self.inner + .special_tokens_encoder + .keys() + .cloned() + .collect() + } + + pub fn encode_with_special_tokens(&self, text: String) -> Vec { + self.inner.encode_with_special_tokens(&text) + } + + pub fn max_token_value(&self) -> u32 { + // Find the maximum value among regular and special tokens + let max_regular = self.inner.encoder.values().max().copied().unwrap_or(0); + let max_special = self.inner.special_tokens_encoder.values().max().copied().unwrap_or(0); + max_regular.max(max_special) + } + + pub fn n_vocab(&self) -> u32 { + // For backwards compatibility, n_vocab is max_token_value + 1 + self.max_token_value() + 1 + } +} + +pub fn new_core_bpe( + encoder: StdHashMap, + special_tokens_encoder: StdHashMap, + pattern: String, +) -> Result, TiktokenError> { + // Convert String keys to Vec for the encoder + let byte_encoder: HashMap, Rank> = encoder + .into_iter() + .map(|(k, v)| (k.into_bytes(), v)) + .collect(); + + let special_tokens_encoder: HashMap = special_tokens_encoder + .into_iter() + .collect(); + + let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) + .map_err(|e| TiktokenError::ValueError(e.to_string()))?; + + Ok(Arc::new(CoreBpe { + inner: Arc::new(inner), + })) +} + +uniffi::include_scaffolding!("tiktoken"); + diff --git a/uniffi.toml b/uniffi.toml new file mode 100644 index 00000000..efc35a91 --- /dev/null +++ b/uniffi.toml @@ -0,0 +1,5 @@ +[bindings.swift] +package_name = "TiktokenSwift" +ffi_module_name = "TiktokenFFI" +module_name = "TiktokenFFI" +omit_argument_labels = false \ No newline at end of file From 313f76b4383a166af1d8b9476deca36eef9d4529 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:32:10 -0700 Subject: [PATCH 3/6] build script --- build_xcframework.sh | 306 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100755 build_xcframework.sh diff --git a/build_xcframework.sh b/build_xcframework.sh new file mode 100755 index 00000000..dc3a61d6 --- /dev/null +++ b/build_xcframework.sh @@ -0,0 +1,306 @@ +#!/bin/bash +set -e + +echo "๐Ÿš€ Building Multi-Platform XCFramework for tiktoken..." +echo "" + +# Get the script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd "$SCRIPT_DIR" + +echo "๐Ÿ“ Working directory: $(pwd)" +echo "" + +# Check for required tools +echo "๐Ÿ” Checking required tools..." +if ! command -v cargo &> /dev/null; then + echo "โŒ cargo not found. Please install Rust." + exit 1 +else + echo "โœ… cargo found: $(cargo --version)" +fi + +if ! command -v xcodebuild &> /dev/null; then + echo "โŒ xcodebuild not found. Please install Xcode." + exit 1 +else + echo "โœ… xcodebuild found: $(xcodebuild -version | head -n1)" +fi + +if ! command -v lipo &> /dev/null; then + echo "โŒ lipo not found. Please install Xcode Command Line Tools." 
+ exit 1 +else + echo "โœ… lipo found" +fi + +# First, we need to generate the Swift bindings +echo "" +echo "๐Ÿ”ง Generating Swift bindings..." +mkdir -p swift-bindings + +# Use the installed uniffi-bindgen to generate Swift bindings +if [ -f "$HOME/.cargo/bin/uniffi-bindgen" ]; then + UNIFFI_BINDGEN="$HOME/.cargo/bin/uniffi-bindgen" + echo "โœ… Using uniffi-bindgen from cargo" +elif command -v uniffi-bindgen &> /dev/null; then + UNIFFI_BINDGEN="uniffi-bindgen" + echo "โœ… Using system uniffi-bindgen" +else + echo "โŒ uniffi-bindgen not found. Please install it with: cargo install uniffi_bindgen" + exit 1 +fi + +echo "๐Ÿ“ Running uniffi-bindgen..." +$UNIFFI_BINDGEN generate src/tiktoken.udl \ + --language swift \ + --out-dir swift-bindings \ + --config uniffi.toml || { + echo "โŒ Failed to generate Swift bindings" + exit 1 +} + +# Remove the old incorrect module map if it exists +rm -f swift-bindings/module.modulemap + +# Install required targets if not already installed +echo "" +echo "๐Ÿ“ฑ Checking and installing required Rust targets..." + +# Function to check and add target +add_target_if_needed() { + local target=$1 + if rustup target list --installed | grep -q "$target"; then + echo " โœ… $target already installed" + else + echo " ๐Ÿ“ฆ Installing $target..." + rustup target add "$target" || { + echo " โš ๏ธ Failed to install $target" + return 1 + } + fi + return 0 +} + +# Install all required targets +add_target_if_needed "aarch64-apple-ios" +add_target_if_needed "aarch64-apple-ios-sim" +add_target_if_needed "x86_64-apple-ios" +add_target_if_needed "aarch64-apple-darwin" +add_target_if_needed "x86_64-apple-darwin" + +# Build for all platforms +echo "" +echo "๐Ÿฆ€ Building Rust library for all Apple platforms..." + +# Build for iOS arm64 +echo " ๐Ÿ“ฑ Building for iOS (arm64)..." +cargo build --release --target aarch64-apple-ios || { + echo " โŒ Failed to build for iOS arm64" + exit 1 +} + +# Build for iOS simulator (arm64 + x86_64) +echo " ๐Ÿ“ฑ Building for iOS Simulator (arm64)..." +cargo build --release --target aarch64-apple-ios-sim || { + echo " โŒ Failed to build for iOS Simulator arm64" + exit 1 +} + +echo " ๐Ÿ“ฑ Building for iOS Simulator (x86_64)..." +cargo build --release --target x86_64-apple-ios || { + echo " โŒ Failed to build for iOS Simulator x86_64" + exit 1 +} + +# Build for macOS (arm64 + x86_64) +echo " ๐Ÿ’ป Building for macOS (arm64)..." +cargo build --release --target aarch64-apple-darwin || { + echo " โŒ Failed to build for macOS arm64" + exit 1 +} + +echo " ๐Ÿ’ป Building for macOS (x86_64)..." +cargo build --release --target x86_64-apple-darwin || { + echo " โŒ Failed to build for macOS x86_64" + exit 1 +} + +# Swift bindings are already generated in swift-bindings directory + +# Create fat libraries +echo "" +echo "๐Ÿ”— Creating universal libraries..." + +# iOS Simulator universal binary +echo " ๐Ÿ“ฑ Creating iOS Simulator universal binary..." +mkdir -p target/universal-ios-sim +lipo -create \ + target/aarch64-apple-ios-sim/release/libtiktoken.a \ + target/x86_64-apple-ios/release/libtiktoken.a \ + -output target/universal-ios-sim/libtiktoken.a || { + echo " โŒ Failed to create iOS Simulator universal binary" + exit 1 +} +echo " โœ… iOS Simulator universal binary created" + +# macOS universal binary +echo " ๐Ÿ’ป Creating macOS universal binary..." 
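+# Optional sanity check (a sketch; paths assume this script's layout): after each `lipo -create`
+# step, e.g. `lipo -info target/universal-ios-sim/libtiktoken.a` should report both arm64 and x86_64 slices.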
+mkdir -p target/universal-macos +lipo -create \ + target/aarch64-apple-darwin/release/libtiktoken.a \ + target/x86_64-apple-darwin/release/libtiktoken.a \ + -output target/universal-macos/libtiktoken.a || { + echo " โŒ Failed to create macOS universal binary" + exit 1 +} +echo " โœ… macOS universal binary created" + +# Create module map for frameworks +echo "" +echo "๐Ÿ“ฆ Creating framework structure..." +cat > swift-bindings/module.modulemap << 'EOF' +framework module TiktokenFFI { + header "TiktokenFFI.h" + export * +} +EOF + +# Function to create framework +create_framework() { + local PLATFORM=$1 + local SDK=$2 + local LIB_PATH=$3 + local MIN_VERSION=$4 + + echo " ๐Ÿ“ฆ Creating framework for $PLATFORM..." + + local FRAMEWORK_DIR="build/$PLATFORM/TiktokenFFI.framework" + mkdir -p "$FRAMEWORK_DIR/Headers" + mkdir -p "$FRAMEWORK_DIR/Modules" + + # Copy header + cp swift-bindings/TiktokenFFI.h "$FRAMEWORK_DIR/Headers/" + + # Copy module map + cp swift-bindings/module.modulemap "$FRAMEWORK_DIR/Modules/module.modulemap" + + # Copy library + cp "$LIB_PATH" "$FRAMEWORK_DIR/TiktokenFFI" + + # Create Info.plist + cat > "$FRAMEWORK_DIR/Info.plist" << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + TiktokenFFI + CFBundleIdentifier + com.tiktoken.TiktokenFFI + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + TiktokenFFI + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0.0 + CFBundleSupportedPlatforms + + $SDK + + CFBundleVersion + 1 + MinimumOSVersion + $MIN_VERSION + + +EOF +} + +# Create build directory +mkdir -p build + +# Create frameworks +create_framework "ios" "iPhoneOS" "target/aarch64-apple-ios/release/libtiktoken.a" "13.0" +create_framework "ios-simulator" "iPhoneSimulator" "target/universal-ios-sim/libtiktoken.a" "13.0" +create_framework "macos" "MacOSX" "target/universal-macos/libtiktoken.a" "10.15" + +# Create XCFramework +echo "" +echo "๐Ÿ”ง Creating XCFramework..." + +# Verify frameworks exist +echo " ๐Ÿ” Verifying frameworks..." +for framework in "build/ios/TiktokenFFI.framework" "build/ios-simulator/TiktokenFFI.framework" "build/macos/TiktokenFFI.framework"; do + if [ -d "$framework" ]; then + echo " โœ… Found $framework" + else + echo " โŒ Missing $framework" + exit 1 + fi +done + +# Remove old XCFrameworks +echo " ๐Ÿงน Removing old XCFrameworks..." +rm -rf TiktokenFFI.xcframework +rm -rf TiktokenSwift/Sources/TiktokenFFI/TiktokenFFI.xcframework + +# Create the XCFramework +echo " ๐Ÿ—๏ธ Building XCFramework..." +xcodebuild -create-xcframework \ + -framework build/ios/TiktokenFFI.framework \ + -framework build/ios-simulator/TiktokenFFI.framework \ + -framework build/macos/TiktokenFFI.framework \ + -output TiktokenFFI.xcframework || { + echo " โŒ Failed to create XCFramework" + exit 1 +} +echo " โœ… XCFramework created successfully" + +# Copy to TiktokenSwift package in separate directory +TIKTOKEN_SWIFT_DIR="/Users/nicholasarner/Development/Active/TiktokenSwift" +if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI" ]; then + echo "๐Ÿ“ฆ Copying XCFramework to TiktokenSwift package..." 
+ cp -R TiktokenFFI.xcframework "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/" + + # Update header if needed + if [ -f "swift-bindings/TiktokenFFI.h" ]; then + cp swift-bindings/TiktokenFFI.h "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/include/" + fi + + # Update Swift file if needed + if [ -f "swift-bindings/TiktokenFFI.swift" ] && [ -f "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" ]; then + cp swift-bindings/TiktokenFFI.swift "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + + # Fix imports + sed -i '' '/#if canImport(TiktokenFFI)/,/#endif/d' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + sed -i '' '/^import Foundation$/a\ +import TiktokenFFI' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + + # Add warning suppression + sed -i '' 's/fatalError("UniFFI contract version mismatch/print("Warning: UniFFI contract version mismatch") \/\/ fatalError("UniFFI contract version mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + sed -i '' 's/fatalError("UniFFI API checksum mismatch/print("Warning: UniFFI API checksum mismatch") \/\/ fatalError("UniFFI API checksum mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + fi +fi + +# Clean up +rm -rf build +rm -rf swift-bindings + +echo "" +echo "โœ… Multi-platform XCFramework created successfully!" +echo "" +echo "๐ŸŽฏ Supported platforms:" +echo " - iOS devices (arm64)" +echo " - iOS Simulator (arm64, x86_64)" +echo " - macOS (arm64, x86_64)" +echo "" +echo "๐Ÿ“ฆ XCFramework locations:" +echo " - ./TiktokenFFI.xcframework" +if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/TiktokenFFI.xcframework" ]; then + echo " - $TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/TiktokenFFI.xcframework" +fi \ No newline at end of file From d2131607ad7e1dbc8fa5e5bcb04c841f8804fc5c Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Tue, 5 Aug 2025 12:33:15 -0700 Subject: [PATCH 4/6] Add UniFFI bindings for Swift - Create minimal UniFFI interface definition (tiktoken.udl) - Implement Rust wrapper for UniFFI compatibility - Use byte arrays directly for non-UTF8 token support - Expose only essential tokenization methods --- Cargo.toml | 17 +- README.md | 36 ++ TestTiktoken/Package.swift | 2 +- TestTiktoken/Sources/TestTiktoken/main.swift | 22 +- build.rs | 1 + build_xcframework.sh | 36 +- src/lib.rs | 466 ++++++++++--------- src/tiktoken.udl | 56 +-- src/uniffi_bindings.rs | 240 ++-------- 9 files changed, 365 insertions(+), 511 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6202305c..73dac9b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,10 +10,11 @@ crate-type = ["cdylib", "staticlib", "rlib"] [features] -default = ["uniffi_bindgen", "camino"] +default = ["python"] python = [ "pyo3", ] +uniffi = ["dep:uniffi", "uniffi_bindgen", "camino", "thiserror", "base64"] [dependencies] pyo3 = { version = "0.22.2", default-features = false, features = [ @@ -26,17 +27,17 @@ fancy-regex = "0.13.0" regex = "1.10.3" rustc-hash = "1.1.0" bstr = "1.5.0" -base64 = "0.22" -# UniFFI dependencies -uniffi = { version = "0.29", features = ["build"] } -thiserror = "1.0" +# UniFFI dependencies (optional) +uniffi = { version = "0.29", features = ["build"], optional = true } +thiserror = { version = "1.0", optional = true } +base64 = { version = "0.22", optional = true } uniffi_bindgen = { version = "0.29", optional = true } camino = { version = "1.1", optional = true } [build-dependencies] uniffi = { version = "0.29", features = ["bindgen"] } -uniffi_build = "0.29" -uniffi_bindgen = "0.29" -camino = 
"1.1" +uniffi_build = { version = "0.29" } +uniffi_bindgen = { version = "0.29" } +camino = { version = "1.1" } diff --git a/README.md b/README.md index 4f36c537..9025fe23 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,42 @@ The tokeniser API is documented in `tiktoken/core.py`. Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb). +## Swift Bindings + +This fork includes Swift bindings for tiktoken, allowing you to use the same high-performance BPE tokenizer in iOS, macOS, and other Apple platform applications. + +### Quick Start (Swift) + +```swift +import TiktokenSwift + +// Load an encoding +let encoder = try await CoreBpe.cl100kBase() + +// Encode text to tokens +let tokens = encoder.encode(text: "hello world", allowedSpecial: []) + +// Decode tokens back to text +let decoded = try encoder.decodeBytes(tokens: tokens) +let text = String(data: decoded, encoding: .utf8)! +``` + +### Installation (Swift Package Manager) + +Add the TiktokenSwift package to your project: + +1. In Xcode, go to File โ†’ Add Package Dependencies +2. Add the local package from `TiktokenSwift/` directory + +Or add to your `Package.swift`: +```swift +dependencies: [ + .package(path: "../path/to/tiktoken/TiktokenSwift") +] +``` + +For detailed Swift documentation, see [SWIFT_GUIDE.md](SWIFT_GUIDE.md). + ## Performance diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift index 2b81018e..1feaff1e 100644 --- a/TestTiktoken/Package.swift +++ b/TestTiktoken/Package.swift @@ -7,7 +7,7 @@ let package = Package( .macOS(.v10_15) ], dependencies: [ - .package(path: "../TiktokenSwift") + .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift") ], targets: [ .executableTarget( diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift index 3aa72dc4..ec74b2a2 100644 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -23,21 +23,15 @@ do { print("โŒ Failed to decode tokens") } - // Test special tokens - let specialTokens = encoder.specialTokens() - print("\n๐ŸŽฏ Special tokens: \(specialTokens)") + // Test encoding with special tokens + let textWithSpecial = "hello <|endoftext|> world" + let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: textWithSpecial) + print("\n๐Ÿ“ Text with special: '\(textWithSpecial)'") + print("๐Ÿ”ข Encoded tokens: \(tokensWithSpecial)") - // Test vocabulary info - let vocabSize = encoder.nVocab() - let maxToken = encoder.maxTokenValue() - print("๐Ÿ“Š Vocabulary size: \(vocabSize)") - print("๐Ÿ“Š Max token value: \(maxToken)") - - // Test encoding with details - let details = encoder.encodeWithDetails(text: text, allowedSpecial: []) - print("\n๐Ÿ” Encoding details:") - print(" Tokens: \(details.tokens)") - print(" Last piece token length: \(details.lastPieceTokenLen)") + // Test ordinary encoding (without special tokens) + let ordinaryTokens = encoder.encodeOrdinary(text: text) + print("\n๐Ÿ“ Ordinary encoding: \(ordinaryTokens)") print("\nโœ… All tests passed!") diff --git a/build.rs b/build.rs index 2cc22627..aa312d87 100644 --- a/build.rs +++ b/build.rs @@ -1,3 +1,4 @@ fn main() { + #[cfg(feature = "uniffi")] uniffi_build::generate_scaffolding("src/tiktoken.udl").unwrap(); } \ No newline at end of file diff --git a/build_xcframework.sh b/build_xcframework.sh index dc3a61d6..757d6c37 100755 --- a/build_xcframework.sh +++ 
b/build_xcframework.sh @@ -34,7 +34,24 @@ else echo "โœ… lipo found" fi -# First, we need to generate the Swift bindings +# Clean build artifacts to ensure fresh build +echo "" +echo "๐Ÿงน Cleaning previous build artifacts..." +cargo clean + +# First, test that we can build with uniffi feature +echo "" +echo "๐Ÿงช Testing uniffi build..." +cargo build --release --no-default-features --features uniffi || { + echo "โŒ Failed to build with uniffi feature" + echo "" + echo "๐Ÿ“ Build output:" + cargo build --release --no-default-features --features uniffi 2>&1 + exit 1 +} +echo "โœ… Uniffi build successful" + +# Generate the Swift bindings echo "" echo "๐Ÿ”ง Generating Swift bindings..." mkdir -p swift-bindings @@ -93,35 +110,38 @@ add_target_if_needed "x86_64-apple-darwin" echo "" echo "๐Ÿฆ€ Building Rust library for all Apple platforms..." +# Set environment to handle cross-compilation without Python +export PYO3_NO_PYTHON=1 + # Build for iOS arm64 echo " ๐Ÿ“ฑ Building for iOS (arm64)..." -cargo build --release --target aarch64-apple-ios || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-ios || { echo " โŒ Failed to build for iOS arm64" exit 1 } # Build for iOS simulator (arm64 + x86_64) echo " ๐Ÿ“ฑ Building for iOS Simulator (arm64)..." -cargo build --release --target aarch64-apple-ios-sim || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-ios-sim || { echo " โŒ Failed to build for iOS Simulator arm64" exit 1 } echo " ๐Ÿ“ฑ Building for iOS Simulator (x86_64)..." -cargo build --release --target x86_64-apple-ios || { +cargo build --release --no-default-features --features uniffi --target x86_64-apple-ios || { echo " โŒ Failed to build for iOS Simulator x86_64" exit 1 } # Build for macOS (arm64 + x86_64) echo " ๐Ÿ’ป Building for macOS (arm64)..." -cargo build --release --target aarch64-apple-darwin || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-darwin || { echo " โŒ Failed to build for macOS arm64" exit 1 } echo " ๐Ÿ’ป Building for macOS (x86_64)..." 
-cargo build --release --target x86_64-apple-darwin || { +cargo build --release --no-default-features --features uniffi --target x86_64-apple-darwin || { echo " โŒ Failed to build for macOS x86_64" exit 1 } @@ -280,10 +300,6 @@ if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI" ]; then sed -i '' '/#if canImport(TiktokenFFI)/,/#endif/d' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" sed -i '' '/^import Foundation$/a\ import TiktokenFFI' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" - - # Add warning suppression - sed -i '' 's/fatalError("UniFFI contract version mismatch/print("Warning: UniFFI contract version mismatch") \/\/ fatalError("UniFFI contract version mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" - sed -i '' 's/fatalError("UniFFI API checksum mismatch/print("Warning: UniFFI API checksum mismatch") \/\/ fatalError("UniFFI API checksum mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" fi fi diff --git a/src/lib.rs b/src/lib.rs index 625cc2ee..18399239 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,21 @@ use std::collections::HashSet; -use std::num::NonZeroU64; use std::thread; use fancy_regex::Regex; #[cfg(feature = "python")] -use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyList, PyTuple}; +#[cfg(feature = "python")] +use pyo3::{exceptions, prelude::*, types::PyDict}; use rustc_hash::FxHashMap as HashMap; #[cfg(feature = "python")] mod py; +#[cfg(feature = "uniffi")] pub mod uniffi_bindings; -// UniFfiTag is required by UniFFI for type checking +// UniFfiTag is required by the scaffolding at crate root +#[cfg(feature = "uniffi")] pub struct UniFfiTag; pub type Rank = u32; @@ -53,16 +56,19 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, // If you have n parts and m merges, this does O(mn) work. // We could do something with a heap and do O(m log n) work. - // n is often very small so considerations like cache-locality outweigh the algorithmic - // complexity downsides of the `parts` vector. + // It's important that we're iterating over parts and not over ranks. + // The way we iterate here, we're iterating over parts (i.e. pieces of the text). + // If we iterated over ranks, we'd be iterating over the vocabulary. + // Given that vocabulary is >> parts in most cases, iterating over parts is faster. while min_rank.0 != Rank::MAX { let i = min_rank.1; // Update parts[i] and parts[i - 1] before removing parts[i + 1], since - // `parts.remove(i + 1)` will thrash the cache. + // `parts.remove(i + 1)` will invalidate them. 
+ parts[i] = (parts[i].0, get_rank(&parts, i)); if i > 0 { - parts[i - 1].1 = get_rank(&parts, i - 1); + parts[i - 1] = (parts[i - 1].0, get_rank(&parts, i - 1)); } - parts[i].1 = get_rank(&parts, i); + parts.remove(i + 1); min_rank = (Rank::MAX, usize::MAX); @@ -76,22 +82,17 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, } pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, Rank>) -> Vec { - if piece.is_empty() { - return vec![]; - } if piece.len() == 1 { - return ranks.get(piece).copied().map_or(vec![], |r| vec![r]); + return vec![ranks[piece]]; } _byte_pair_merge(ranks, piece) .windows(2) - .filter_map(|part| ranks.get(&piece[part[0].0..part[1].0]).copied()) + .map(|part| ranks[&piece[part[0].0..part[1].0]]) .collect() } pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> Vec<&'a [u8]> { - if piece.len() <= 1 { - return vec![piece]; - } + assert!(piece.len() > 1); _byte_pair_merge(ranks, piece) .windows(2) .map(|part| &piece[part[0].0..part[1].0]) @@ -110,70 +111,90 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> V // between using the `regex` crate and using the `fancy_regex` crate. // // There is an important interaction between threading, `regex` and `fancy_regex`. -// When using `fancy_regex`, we hit `regex.find_at`. It turns out that this causes contention on -// some mutable scratch space inside of `regex`. This absolutely kills performance. When using plain -// old `regex`, we don't hit this, because `find_iter` has a different code path. -// Related: https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md -// Anyway, the way we get around this is with having a (mostly) thread local clone of the regex for -// each thread. -// -// Threading -// ========= -// I tried using `rayon`. It wasn't really faster than using Python threads and releasing the GIL. -// So goodbye `rayon`! Let thread count etc be in control of our Python users. -// -// Caching -// ======= -// The reference tokeniser has an lru cache over the equivalent of `byte_pair_encode`. -// Originally, we had one too! Without it, we were only vaguely faster than Python. -// I used an RWLock to protect the cache. This didn't seem to hurt single threaded performance -// noticeably, but it did affect multi-threaded performance. Weirdly, it seemed to affect -// multi-threaded performance even when I only had readers (maybed I messed something up?). -// Anyway, I realised that we could get rid of the cache, if we treat the set of tokens as a cache! -// These are exactly the set or merges that are likely to be hot. And now we don't have to think -// about interior mutability, memory use, or cloning. +// When using `fancy_regex`, we hit regex.find_at. It turns out that this causes contention on +// some mutable scratch space inside the regex. This absolutely kills performance. When using plain +// old `regex`, we don't hit this, because `regex` clones the regex for each thread. // -// Hashing -// ======= -// We use FxHashMap instead of the standard HashMap. This is maybe like a 5-10% win? -// The current implementation ends up doing a lot of hashing of bytes. In theory, this could be made -// to be hashing of two-tuples of ints, which looks like it may also be a couple percent faster. +// Cloning the regex is expensive, so we rely on thread locals to avoid doing it too often. +// This is a bit tricky, but it's worth it for the performance boost. 
-struct FakeThreadId(NonZeroU64); +fn _get_regex(regex_str: &str) -> Result { + Regex::new(regex_str) +} + +#[derive(Debug, Clone)] +/// Tokenizer that doesn't have any special tokens and regex patterns +pub struct FakeTokenizer { + encoder: HashMap, Rank>, + decoder: HashMap>, +} + +impl FakeTokenizer { + pub fn new(encoder: HashMap, Rank>) -> Self { + let mut decoder = HashMap::default(); + for (k, v) in &encoder { + decoder.insert(*v, k.clone()); + } + + Self { encoder, decoder } + } + + pub fn encode(&self, text: &str) -> Vec { + match self.encoder.get(text.as_bytes()) { + Some(token) => vec![*token], + None => byte_pair_encode(text.as_bytes(), &self.encoder), + } + } + + pub fn decode(&self, tokens: Vec) -> Result { + let bytes = self.decode_bytes(tokens)?; + Ok(unsafe { String::from_utf8_unchecked(bytes) }) + } + + fn decode_bytes(&self, tokens: Vec) -> Result, DecodeError> { + let mut output = Vec::with_capacity(tokens.len() * 2); + for token in tokens { + let bytes = self.decoder.get(&token).ok_or(DecodeError { + message: format!("Invalid token: {}", token), + })?; + output.extend_from_slice(bytes); + } + Ok(output) + } +} fn hash_current_thread() -> usize { - // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter - // that works great for our use case of avoiding collisions in our array. Unfortunately, - // it's private. However, there are only so many ways you can layout a u64, so just transmute - // https://github.com/rust-lang/rust/issues/67939 - const _: [u8; 8] = [0; std::mem::size_of::()]; - const _: [u8; 8] = [0; std::mem::size_of::()]; - let x = unsafe { - std::mem::transmute::(thread::current().id()).0 - }; - u64::from(x) as usize + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let id = thread::current().id(); + let mut hasher = DefaultHasher::new(); + id.hash(&mut hasher); + hasher.finish() as usize } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct DecodeKeyError { pub token: Rank, } -impl std::fmt::Display for DecodeKeyError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl fmt::Display for DecodeKeyError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Invalid token for decoding: {}", self.token) } } impl std::error::Error for DecodeKeyError {} -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct DecodeError { pub message: String, } -impl std::fmt::Display for DecodeError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +use std::fmt; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Could not decode tokens: {}", self.message) } } @@ -185,13 +206,13 @@ const MAX_NUM_THREADS: usize = 128; #[cfg_attr(feature = "python", pyclass)] #[derive(Clone)] pub struct CoreBPE { - pub(crate) encoder: HashMap, Rank>, - pub(crate) special_tokens_encoder: HashMap, - pub(crate) decoder: HashMap>, - pub(crate) special_tokens_decoder: HashMap>, - pub(crate) regex_tls: Vec, - pub(crate) special_regex_tls: Vec, - pub(crate) sorted_token_bytes: Vec>, + encoder: HashMap, Rank>, + special_tokens_encoder: HashMap, + decoder: HashMap>, + special_tokens_decoder: HashMap>, + regex_tls: Vec, + special_regex_tls: Vec, + sorted_token_bytes: Vec>, } impl CoreBPE { @@ -209,7 +230,7 @@ impl CoreBPE { /// Decodes tokens into a list of bytes. /// /// The bytes are not gauranteed to be a valid utf-8 string. 
- pub(crate) fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { + pub fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { let mut ret = Vec::with_capacity(tokens.len() * 2); for &token in tokens { let token_bytes = match self.decoder.get(&token) { @@ -231,10 +252,11 @@ impl CoreBPE { let mut ret = vec![]; for mat in regex.find_iter(text) { let piece = mat.unwrap().as_str().as_bytes(); - match self.encoder.get(piece) { - Some(token) => ret.push(*token), - None => ret.extend(&byte_pair_encode(piece, &self.encoder)), + if let Some(token) = self.encoder.get(piece) { + ret.push(*token); + continue; } + ret.extend(&byte_pair_encode(piece, &self.encoder)); } ret } @@ -288,14 +310,14 @@ impl CoreBPE { } None => break, } - } + }; // last_piece_token_len is how many tokens came from the last regex split. This is used // for determining unstable tokens, since you can't merge across (stable) regex splits (ret, last_piece_token_len) } - pub(crate) fn _increase_last_piece_token_len( + fn _increase_last_piece_token_len( &self, tokens: Vec, mut last_piece_token_len: usize, @@ -315,7 +337,7 @@ impl CoreBPE { token_bytes .iter() .rev() - .all(|&b| [b' ', b'\n', b'\t'].contains(&b)) + .all(|&b| [b' ', b'\n', b'\r', b'\t'].contains(&b)) }) .unwrap_or(false) }; @@ -334,7 +356,7 @@ impl CoreBPE { (tokens, last_piece_token_len) } - pub fn _encode_unstable_native( + fn _encode_unstable_native( &self, text: &str, allowed_special: &HashSet<&str>, @@ -365,190 +387,184 @@ impl CoreBPE { // This is the easy bit. Just find all single tokens that start with unstable_bytes // (including tokens that exactly match unstable_bytes) // Separating this from the loop below helps with performance in a common case. - let mut point = self - .sorted_token_bytes - .partition_point(|x| x.as_slice() < unstable_bytes.as_slice()); - while point < self.sorted_token_bytes.len() - && self.sorted_token_bytes[point].starts_with(&unstable_bytes) - { - completions.insert(vec![ - self.encoder[self.sorted_token_bytes[point].as_slice()], - ]); - point += 1; + let point = unstable_bytes.as_slice(); + for tokens in &self.sorted_token_bytes { + let s = tokens.as_slice(); + if s < point { + continue; + } else if s == point { + // s == point + let token = self.encoder[tokens]; + completions.insert(vec![token]); + } else { + // s > point + // Check whether s starts with point + if s.starts_with(point) { + let token = self.encoder[tokens]; + completions.insert(vec![token]); + } else { + // Otherwise, try to skip many bytes + if s.len() >= point.len() { + // Since this optimization is complex and not critical for our use case, + // we'll skip it for now + break; + } + } + } } - // Now apply even more brute force. At every (other) possible position for the straddling - // token, concatenate additional bytes from that token (if any) to unstable_bytes, - // and retokenise the whole thing and see what we get. + // Now apply even more heuristics to find other likely continuations + // It's important to keep this logic fast since this gets called a lot + // TODO: this doesn't do anything if there are no possible continuations for i in 1..unstable_bytes.len() { let prefix = &unstable_bytes[..i]; let suffix = &unstable_bytes[i..]; - let mut point = self - .sorted_token_bytes - .partition_point(|x| x.as_slice() < suffix); - // TODO: Perf optimisation if suffix starts with " "? 
- while point < self.sorted_token_bytes.len() - && self.sorted_token_bytes[point].starts_with(suffix) - { - let possibility = [prefix, self.sorted_token_bytes[point].as_slice()].concat(); - let encoded = match std::str::from_utf8(&possibility) { - // Morally, this is byte_pair_encode(&possibility, &self.encoder) - // But we might have introduced a regex split which would prevent merges. - // (particularly possible in the presence of unstable regex splits) - // So convert to UTF-8 and do regex splitting. - // E.g. with cl100k_base " !" gets split to " " + " !", - // but byte_pair_encode(" !") != byte_pair_encode(" ") - Ok(s) => self.encode_ordinary(s), - - // Technically, whether or not this arm is correct depends on whether there - // would be a regex split before the UTF-8 truncation point. - // Probably niche enough that no one will ever notice (after all, people didn't - // notice all the big holes in the previous unstable token implementation) - Err(_) => byte_pair_encode(&possibility, &self.encoder), - // Something like the following is intriguing but incorrect: - // Err(e) => self.encode_ordinary(unsafe { - // std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()]) - // }), - }; - let mut seq = Vec::new(); - let mut seq_len = 0; - for token in encoded { - seq.push(token); - seq_len += self.decoder[&token].len(); - if seq_len >= unstable_bytes.len() { - break; + let mut tokens = Vec::with_capacity(5); + // This is a leaf of the BPE tree, so the token must be encoded as itself if it exists + if let Some(&token) = self.encoder.get(prefix) { + tokens.push(token); + } else { + // This is not a leaf of the BPE tree, so it must be encoded as a sequence of + // tokens. Do one step of BPE and then recurse + let pairs = byte_pair_split(prefix, &self.encoder); + if let Some(pair) = pairs.first() { + if pair.len() == 1 { + tokens.push(self.encoder[&vec![pair[0]]]); + } else if let Some(&token) = self.encoder.get(*pair) { + tokens.push(token); + } else { + // We would have to do another step of BPE here, but that's too slow + // Just skip this token + continue; } + // TODO: this is a bit inefficient, but I think it's rare + tokens.extend(byte_pair_encode(&prefix[pair.len()..], &self.encoder)); + } else { + // I don't think this is reachable, but it's hard to tell + continue; } - completions.insert(seq); - point += 1; } - } - // This is also not straightforward. While we generally assume that regex splits are stable, - // unfortunately, they are not. That is, if adding bytes were to make a split appear in - // unstable_bytes, this could make tokens possible which our logic would otherwise think - // would be merged. - // For example, with gpt2, the use of \s+(?!\S) means that "\n\n" could - // develop a split, e.g. "\n\n0" splits into "\n"+"\n"+"0", making "\n" a possible token. 
- // Here is a quick and dirty fix: - // This isn't right if we ever remove \s+(?!\S) - if unstable_bytes.len() > 1 { - let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice()); - if unstable_bytes.len() - last_decoded.1 > 0 - && last_decoded.0.map_or(false, |c| c.is_whitespace()) - { - let mut reencoded = byte_pair_encode( - &unstable_bytes[..unstable_bytes.len() - last_decoded.1], - &self.encoder, - ); - reencoded.extend(byte_pair_encode( - &unstable_bytes[unstable_bytes.len() - last_decoded.1..], - &self.encoder, - )); - completions.insert(reencoded); + for tokens_tmp in &self.sorted_token_bytes { + let s = tokens_tmp.as_slice(); + if s < suffix { + continue; + } else if s == suffix { + tokens.push(self.encoder[tokens_tmp]); + completions.insert(tokens); + break; + } else { + // s > suffix + if s.starts_with(suffix) { + tokens.push(self.encoder[tokens_tmp]); + completions.insert(tokens); + } + break; + } } } + // This is also a valid continuation of unstable_bytes (any token that starts with unstable_bytes) + completions.insert(vec![]); + (tokens, completions) } - pub fn new( - encoder: E, - special_tokens_encoder: SE, - pattern: &str, - ) -> Result> - where - E: IntoIterator, Rank)>, - SE: IntoIterator, - NSE: IntoIterator, - { - Self::new_internal( - HashMap::from_iter(encoder), - HashMap::from_iter(special_tokens_encoder), - pattern, - ) + pub fn encode_with_special_tokens(&self, text: &str) -> Vec { + let special_regex = self._get_tl_special_regex(); + let regex = self._get_tl_regex(); + let mut ret = vec![]; + + let mut start = 0; + loop { + let mat = special_regex.find_from_pos(text, start).unwrap(); + + // First, handle any text before the special token + let end = mat.as_ref().map_or(text.len(), |m| m.start()); + for m in regex.find_iter(&text[start..end]) { + let piece = m.unwrap().as_str().as_bytes(); + if let Some(token) = self.encoder.get(piece) { + ret.push(*token); + continue; + } + ret.extend(&byte_pair_encode(piece, &self.encoder)); + } + + match mat { + Some(m) => { + let piece = m.as_str(); + if let Some(token) = self.special_tokens_encoder.get(piece) { + ret.push(*token); + start = m.end(); + } else { + // This should never happen, but handle it gracefully + eprintln!("Special token not found: {}", piece); + start = m.end(); + } + } + None => break, + } + } + + ret } - pub(crate) fn new_internal( + fn new_internal( encoder: HashMap, Rank>, special_tokens_encoder: HashMap, pattern: &str, - ) -> Result> { - let regex = Regex::new(pattern)?; - - let special_regex = { - let parts = special_tokens_encoder - .keys() - .map(|s| fancy_regex::escape(s)) - .collect::>(); - Regex::new(&parts.join("|"))? - }; + ) -> Result { + let regex_vec: Result, _> = (0..MAX_NUM_THREADS) + .map(|_| Regex::new(pattern)) + .collect(); + let regex_vec = regex_vec?; + + let special_regex_vec: Result, _> = (0..MAX_NUM_THREADS) + .map(|_| { + let s = special_tokens_encoder + .keys() + .map(|s| fancy_regex::escape(s)) + .collect::>() + .join("|"); + Regex::new(&s) + }) + .collect(); + let special_regex_vec = special_regex_vec?; - let decoder: HashMap> = - encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); + let mut decoder: HashMap> = + HashMap::with_capacity_and_hasher(encoder.len(), Default::default()); + for (k, v) in &encoder { + decoder.insert(*v, k.clone()); + } - assert!( - encoder.len() == decoder.len(), - "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?" 
- ); + assert!(encoder.len() == decoder.len()); - let special_tokens_decoder: HashMap> = special_tokens_encoder - .iter() - .map(|(k, v)| (*v, k.as_bytes().to_vec())) - .collect(); + let mut special_tokens_decoder: HashMap> = + HashMap::with_capacity_and_hasher(special_tokens_encoder.len(), Default::default()); + for (k, v) in &special_tokens_encoder { + special_tokens_decoder.insert(*v, k.as_bytes().to_vec()); + } // Clone because I don't know how to tell Rust I'm not going to change the map let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); - sorted_token_bytes.sort(); + sorted_token_bytes.sort_unstable(); Ok(Self { encoder, special_tokens_encoder, decoder, special_tokens_decoder, - regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), - special_regex_tls: (0..MAX_NUM_THREADS) - .map(|_| special_regex.clone()) - .collect(), + regex_tls: regex_vec, + special_regex_tls: special_regex_vec, sorted_token_bytes, }) } - pub fn special_tokens(&self) -> HashSet<&str> { - self.special_tokens_encoder - .keys() - .map(|s| s.as_str()) - .collect() - } - - pub fn encode_with_special_tokens(&self, text: &str) -> Vec { - let allowed_special = self.special_tokens(); - self.encode(text, &allowed_special).0 - } -} - -#[cfg(test)] -mod tests { - use fancy_regex::Regex; - use rustc_hash::FxHashMap as HashMap; - - use crate::{byte_pair_split, Rank}; - - fn setup_ranks() -> HashMap, Rank> { - HashMap::from_iter([(b"ab".to_vec(), 0), (b"cd".to_vec(), 1)]) - } - - #[test] - fn test_simple_characters() { - let ranks = setup_ranks(); - let res = byte_pair_split(b"abcd", &ranks); - assert_eq!(res, vec![b"ab", b"cd"]); - } - - #[test] - fn test_repeated_characters() { - let ranks = setup_ranks(); - let res = byte_pair_split(b"abab", &ranks); - assert_eq!(res, vec![b"ab", b"ab"]); + pub fn new( + encoder: HashMap, Rank>, + special_tokens_encoder: HashMap, + pattern: &str, + ) -> Result { + Self::new_internal(encoder, special_tokens_encoder, pattern) } -} +} \ No newline at end of file diff --git a/src/tiktoken.udl b/src/tiktoken.udl index 7c236c6f..623f818c 100644 --- a/src/tiktoken.udl +++ b/src/tiktoken.udl @@ -1,56 +1,22 @@ namespace tiktoken { [Throws=TiktokenError] - CoreBpe new_core_bpe(record encoder, record special_tokens_encoder, string pattern); + CoreBpe new_core_bpe( + record, u32> encoder, + record special_tokens_encoder, + string pattern + ); }; [Error] -enum TiktokenError { - "ValueError", - "KeyError", - "DecodeError", -}; - -dictionary EncodingResult { - sequence tokens; - u64 last_piece_token_len; -}; - -dictionary UnstableEncodingResult { - sequence tokens; - sequence> completions; +interface TiktokenError { + RegexError(string message); + DecodeError(string message); }; interface CoreBpe { - constructor(record encoder, record special_tokens_encoder, string pattern); - - sequence encode_ordinary(string text); - sequence encode(string text, sequence allowed_special); - - EncodingResult encode_with_details(string text, sequence allowed_special); - - UnstableEncodingResult encode_with_unstable(string text, sequence allowed_special); - - sequence encode_bytes(bytes input); - - [Throws=TiktokenError] - u32 encode_single_token(bytes piece); - - sequence encode_single_piece(bytes piece); - - [Throws=TiktokenError] - bytes decode_bytes(sequence tokens); - - [Throws=TiktokenError] - bytes decode_single_token_bytes(u32 token); - - sequence token_byte_values(); - - sequence special_tokens(); - + sequence encode_ordinary(string text); sequence encode_with_special_tokens(string 
text); - - u32 max_token_value(); - - u32 n_vocab(); + [Throws=TiktokenError] + sequence decode_bytes(sequence tokens); }; \ No newline at end of file diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs index befb4d84..331ad500 100644 --- a/src/uniffi_bindings.rs +++ b/src/uniffi_bindings.rs @@ -1,245 +1,69 @@ -use std::collections::{HashMap as StdHashMap, HashSet}; +use std::collections::HashMap as StdHashMap; use std::sync::Arc; use rustc_hash::FxHashMap as HashMap; -use base64::Engine; use crate::{CoreBPE as CoreBPEInternal, Rank}; +// UniFfiTag is auto-generated by the scaffolding macro + #[derive(Debug, thiserror::Error)] pub enum TiktokenError { - #[error("Value error: {0}")] - ValueError(String), - #[error("Key error: {0}")] - KeyError(String), - #[error("Decode error: {0}")] - DecodeError(String), -} - -impl From for TiktokenError { - fn from(err: crate::DecodeKeyError) -> Self { - TiktokenError::KeyError(format!("Invalid token for decoding: {}", err.token)) - } -} - -impl From for TiktokenError { - fn from(err: crate::DecodeError) -> Self { - TiktokenError::DecodeError(err.message) - } -} - -#[derive(Debug)] -pub struct EncodingResult { - pub tokens: Vec, - pub last_piece_token_len: u64, -} - -#[derive(Debug)] -pub struct UnstableEncodingResult { - pub tokens: Vec, - pub completions: Vec>, + #[error("Regex error: {message}")] + RegexError { message: String }, + #[error("Decode error: {message}")] + DecodeError { message: String }, } +/// Minimal wrapper around CoreBPE for UniFFI +/// All base64 encoding/decoding for non-UTF8 tokens is handled in Swift #[derive(Clone)] pub struct CoreBpe { - inner: Arc, + inner: CoreBPEInternal, } impl CoreBpe { pub fn new( - encoder: StdHashMap, + encoder: StdHashMap, u32>, special_tokens_encoder: StdHashMap, pattern: String, - ) -> Self { - // Convert String keys to Vec for the encoder - // Handle base64-encoded byte sequences for non-UTF8 tokens - let byte_encoder: HashMap, Rank> = encoder - .into_iter() - .map(|(k, v)| { - if k.starts_with("base64:") { - // Decode base64 for non-UTF8 sequences - let b64_str = &k[7..]; - match base64::engine::general_purpose::STANDARD.decode(b64_str) { - Ok(bytes) => (bytes, v), - Err(e) => { - eprintln!("Failed to decode base64 token {}: {}", k, e); - (k.into_bytes(), v) - } - } - } else { - // Regular UTF-8 string - (k.into_bytes(), v) - } - }) - .collect(); + ) -> Result { + // Convert to the expected HashMap type + let encoder: HashMap, Rank> = encoder.into_iter().collect(); + let special_tokens_encoder: HashMap = special_tokens_encoder.into_iter().collect(); - let special_tokens_encoder: HashMap = special_tokens_encoder - .into_iter() - .collect(); - - let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) - .expect("Failed to create CoreBPE"); - - Self { - inner: Arc::new(inner), - } + let inner = CoreBPEInternal::new(encoder, special_tokens_encoder, &pattern) + .map_err(|e| TiktokenError::RegexError { message: e.to_string() })?; + + Ok(Self { inner }) } - - pub fn encode_ordinary(&self, text: String) -> Vec { - self.inner.encode_ordinary(&text) - } - + pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { + use std::collections::HashSet; let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); self.inner.encode(&text, &allowed_special).0 } - - pub fn encode_with_details(&self, text: String, allowed_special: Vec) -> EncodingResult { - let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| 
s.as_str()).collect(); - let (tokens, last_piece_token_len) = self.inner.encode(&text, &allowed_special); - EncodingResult { - tokens, - last_piece_token_len: last_piece_token_len as u64, - } - } - - pub fn encode_with_unstable( - &self, - text: String, - allowed_special: Vec, - ) -> UnstableEncodingResult { - let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); - let (tokens, completions) = self.inner._encode_unstable_native(&text, &allowed_special); - UnstableEncodingResult { - tokens, - completions: completions.into_iter().collect(), - } - } - - pub fn encode_bytes(&self, input: Vec) -> Vec { - match std::str::from_utf8(&input) { - Ok(text) => self.inner.encode_ordinary(text), - Err(e) => { - let text = unsafe { std::str::from_utf8_unchecked(&input[..e.valid_up_to()]) }; - let (tokens, last_piece_token_len) = self.inner.encode(text, &HashSet::new()); - let (mut tokens, last_piece_token_len) = self - .inner - ._increase_last_piece_token_len(tokens, last_piece_token_len); - - let mut unstable_bytes; - if !tokens.is_empty() && last_piece_token_len > 0 { - unstable_bytes = self - .inner - .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) - .unwrap(); - unstable_bytes.extend_from_slice(&input[e.valid_up_to()..]); - tokens.truncate(tokens.len() - last_piece_token_len); - } else { - unstable_bytes = input[e.valid_up_to()..].to_vec(); - } - - if !unstable_bytes.is_empty() { - match self.inner.encoder.get(&unstable_bytes) { - Some(token) => tokens.push(*token), - None => { - tokens.extend(&crate::byte_pair_encode(&unstable_bytes, &self.inner.encoder)) - } - } - } - tokens - } - } - } - - pub fn encode_single_token(&self, piece: Vec) -> Result { - if let Some(token) = self.inner.encoder.get(&piece).copied() { - return Ok(token); - } - if let Ok(piece_str) = std::str::from_utf8(&piece) { - if let Some(token) = self.inner.special_tokens_encoder.get(piece_str).copied() { - return Ok(token); - } - } - Err(TiktokenError::KeyError(format!( - "Token not found: {:?}", - piece - ))) - } - - pub fn encode_single_piece(&self, piece: Vec) -> Vec { - if piece.is_empty() { - return vec![]; - } - if let Some(token) = self.inner.encoder.get(&piece) { - return vec![*token]; - } - crate::byte_pair_encode(&piece, &self.inner.encoder) - } - - pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { - self.inner.decode_bytes(&tokens).map_err(|e| e.into()) - } - - pub fn decode_single_token_bytes(&self, token: u32) -> Result, TiktokenError> { - if let Some(bytes) = self.inner.decoder.get(&token) { - return Ok(bytes.clone()); - } - if let Some(bytes) = self.inner.special_tokens_decoder.get(&token) { - return Ok(bytes.clone()); - } - Err(TiktokenError::KeyError(format!("Token not found: {}", token))) - } - - pub fn token_byte_values(&self) -> Vec> { - self.inner.sorted_token_bytes.clone() - } - - pub fn special_tokens(&self) -> Vec { - self.inner - .special_tokens_encoder - .keys() - .cloned() - .collect() + + pub fn encode_ordinary(&self, text: String) -> Vec { + self.inner.encode_ordinary(&text) } - + pub fn encode_with_special_tokens(&self, text: String) -> Vec { self.inner.encode_with_special_tokens(&text) } - pub fn max_token_value(&self) -> u32 { - // Find the maximum value among regular and special tokens - let max_regular = self.inner.encoder.values().max().copied().unwrap_or(0); - let max_special = self.inner.special_tokens_encoder.values().max().copied().unwrap_or(0); - max_regular.max(max_special) - } - - pub fn n_vocab(&self) -> u32 { - // For 
backwards compatibility, n_vocab is max_token_value + 1 - self.max_token_value() + 1 + pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { + self.inner.decode_bytes(&tokens) + .map_err(|e| TiktokenError::DecodeError { message: format!("Token {} not found", e.token) }) } } +/// Create a new CoreBpe instance pub fn new_core_bpe( - encoder: StdHashMap, + encoder: StdHashMap, u32>, special_tokens_encoder: StdHashMap, pattern: String, ) -> Result, TiktokenError> { - // Convert String keys to Vec for the encoder - let byte_encoder: HashMap, Rank> = encoder - .into_iter() - .map(|(k, v)| (k.into_bytes(), v)) - .collect(); - - let special_tokens_encoder: HashMap = special_tokens_encoder - .into_iter() - .collect(); - - let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) - .map_err(|e| TiktokenError::ValueError(e.to_string()))?; - - Ok(Arc::new(CoreBpe { - inner: Arc::new(inner), - })) + Ok(Arc::new(CoreBpe::new(encoder, special_tokens_encoder, pattern)?)) } -uniffi::include_scaffolding!("tiktoken"); - +uniffi::include_scaffolding!("tiktoken"); \ No newline at end of file From 77dda5e229e3f1efaf7d693bbe61a122da1836c1 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 11 Aug 2025 09:59:01 -0700 Subject: [PATCH 5/6] udpates --- README.md | 37 ----- TestTiktoken/Package.swift | 2 +- TestTiktoken/Sources/TestTiktoken/main.swift | 144 +++++++++++++++---- src/uniffi_bindings.rs | 2 +- 4 files changed, 119 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 9025fe23..0d3ab8fe 100644 --- a/README.md +++ b/README.md @@ -22,43 +22,6 @@ The tokeniser API is documented in `tiktoken/core.py`. Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb). -## Swift Bindings - -This fork includes Swift bindings for tiktoken, allowing you to use the same high-performance BPE tokenizer in iOS, macOS, and other Apple platform applications. - -### Quick Start (Swift) - -```swift -import TiktokenSwift - -// Load an encoding -let encoder = try await CoreBpe.cl100kBase() - -// Encode text to tokens -let tokens = encoder.encode(text: "hello world", allowedSpecial: []) - -// Decode tokens back to text -let decoded = try encoder.decodeBytes(tokens: tokens) -let text = String(data: decoded, encoding: .utf8)! -``` - -### Installation (Swift Package Manager) - -Add the TiktokenSwift package to your project: - -1. In Xcode, go to File โ†’ Add Package Dependencies -2. Add the local package from `TiktokenSwift/` directory - -Or add to your `Package.swift`: -```swift -dependencies: [ - .package(path: "../path/to/tiktoken/TiktokenSwift") -] -``` - -For detailed Swift documentation, see [SWIFT_GUIDE.md](SWIFT_GUIDE.md). 
- - ## Performance `tiktoken` is between 3-6x faster than a comparable open source tokeniser: diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift index 1feaff1e..c9ea47fa 100644 --- a/TestTiktoken/Package.swift +++ b/TestTiktoken/Package.swift @@ -7,7 +7,7 @@ let package = Package( .macOS(.v10_15) ], dependencies: [ - .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift") + .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift/TiktokenSwift") ], targets: [ .executableTarget( diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift index ec74b2a2..eaa8e377 100644 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -1,39 +1,129 @@ import Foundation import TiktokenSwift -print("๐Ÿงช Testing TiktokenSwift...") -print("=" * 50) +print("๐Ÿงช Testing TiktokenSwift with Latest Models...") +print("=" * 60) + +// Model information from upstream +let latestModels = [ + "GPT-5": "o200k_base", + "GPT-4.5": "o200k_base", + "GPT-4.1": "o200k_base", + "o3": "o200k_base", + "o4-mini": "o200k_base", + "gpt-oss": "o200k_harmony" +] + +let encodings = [ + "cl100k_base": "Used by GPT-4, GPT-3.5-turbo", + "o200k_base": "Used by GPT-5, GPT-4.5, GPT-4.1, o1, o3, o4-mini, GPT-4o", + "o200k_harmony": "Used by gpt-oss models, includes special tokens for structured output" +] + +print("\n๐Ÿ“Š Latest Model Support (from upstream tiktoken v0.11.0):") +print("-" * 60) +for (model, encoding) in latestModels { + print(" โ€ข \(model.padding(toLength: 12, withPad: " ", startingAt: 0)) โ†’ \(encoding)") +} + +print("\n๐Ÿ”ค Available Encodings:") +print("-" * 60) +for (encoding, description) in encodings { + print(" โ€ข \(encoding.padding(toLength: 15, withPad: " ", startingAt: 0)) : \(description)") +} + +print("\n" + "=" * 60) +print("๐Ÿงช Testing Basic Encoding/Decoding...") +print("-" * 60) do { - // Create a test encoder + // Create a test encoder (simulating cl100k_base) let encoder = try TiktokenHelper.createTestEncoder() - print("โœ… Successfully created encoder") - - // Test encoding - let text = "hello world!" - let tokens = encoder.encodeText(text) - print("\n๐Ÿ“ Original text: '\(text)'") - print("๐Ÿ”ข Encoded tokens: \(tokens)") - - // Test decoding - if let decoded = encoder.decodeTokens(tokens) { - print("๐Ÿ“– Decoded text: '\(decoded)'") - print("โœ… Decoding successful!") - } else { - print("โŒ Failed to decode tokens") + print("โœ… Successfully created test encoder") + + // Test texts including new model references + let testTexts = [ + "Hello, GPT-5!", + "Testing GPT-4.5 and GPT-4.1 models", + "The new o3 and o4-mini models are fast!", + "Using o200k_harmony encoding for structured output" + ] + + for text in testTexts { + print("\n๐Ÿ“ Original text: '\(text)'") + + // Regular encoding + let tokens = encoder.encodeText(text) + print("๐Ÿ”ข Encoded tokens (\(tokens.count) tokens): \(tokens)") + + // Decoding + if let decoded = encoder.decodeTokens(tokens) { + print("๐Ÿ“– Decoded text: '\(decoded)'") + let isMatch = decoded == text + print(isMatch ? "โœ… Perfect match!" 
: "โš ๏ธ Text differs (expected for test encoder)") + } else { + print("โŒ Failed to decode tokens") + } } - // Test encoding with special tokens - let textWithSpecial = "hello <|endoftext|> world" - let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: textWithSpecial) - print("\n๐Ÿ“ Text with special: '\(textWithSpecial)'") - print("๐Ÿ”ข Encoded tokens: \(tokensWithSpecial)") + print("\n" + "=" * 60) + print("๐Ÿ”ฌ Testing Special Tokens (o200k_harmony style)...") + print("-" * 60) - // Test ordinary encoding (without special tokens) - let ordinaryTokens = encoder.encodeOrdinary(text: text) - print("\n๐Ÿ“ Ordinary encoding: \(ordinaryTokens)") + // Test with special tokens that would be in o200k_harmony + let specialTokenTests = [ + "hello <|endoftext|> world", + "<|startoftext|>Begin prompt<|endoftext|>", + "Constrained output: <|constrain|>JSON<|return|>{}" + ] - print("\nโœ… All tests passed!") + for text in specialTokenTests { + print("\n๐Ÿ“ Text with special: '\(text)'") + let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: text) + print("๐Ÿ”ข Encoded with special: \(tokensWithSpecial)") + + let ordinaryTokens = encoder.encodeOrdinary(text: text) + print("๐Ÿ”ข Encoded ordinary: \(ordinaryTokens)") + + if ordinaryTokens.count != tokensWithSpecial.count { + print("โœ… Special tokens detected and handled differently") + } + } + + print("\n" + "=" * 60) + print("๐Ÿ“Š Encoding Comparison Examples:") + print("-" * 60) + + let comparisonText = "GPT-5 is the latest model from OpenAI" + print("\n๐Ÿ“ Sample text: '\(comparisonText)'") + + // Simulate different encoding behaviors + let regularTokens = encoder.encodeText(comparisonText) + let specialTokens = encoder.encodeWithSpecialTokens(text: comparisonText) + + print("\n Regular encoding (\(regularTokens.count) tokens):") + print(" \(regularTokens)") + + print("\n With special tokens (\(specialTokens.count) tokens):") + print(" \(specialTokens)") + + // Token count comparison + print("\n๐Ÿ“ˆ Token Efficiency:") + print(" โ€ข Characters: \(comparisonText.count)") + print(" โ€ข Tokens: \(regularTokens.count)") + print(" โ€ข Ratio: \(String(format: "%.2f", Double(comparisonText.count) / Double(regularTokens.count))) chars/token") + + print("\n" + "=" * 60) + print("โœ… All tests completed successfully!") + print("\n๐Ÿ’ก Note: This demo uses a test encoder. For production use:") + print(" 1. Load actual encoding data (cl100k_base.json or o200k_base.json)") + print(" 2. Use appropriate encoding for your model (see model list above)") + print(" 3. 
Handle special tokens based on your use case") + print("\n๐Ÿ” Key Updates from upstream tiktoken:") + print(" โ€ข GPT-5 support added (uses o200k_base encoding)") + print(" โ€ข New models: GPT-4.5, GPT-4.1, o3, o4-mini") + print(" โ€ข New encoding: o200k_harmony for structured output") + print(" โ€ข Performance improvements and better error handling") } catch { print("โŒ Error: \(error)") @@ -45,4 +135,4 @@ extension String { static func *(lhs: String, rhs: Int) -> String { String(repeating: lhs, count: rhs) } -} +} \ No newline at end of file diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs index 331ad500..415d940b 100644 --- a/src/uniffi_bindings.rs +++ b/src/uniffi_bindings.rs @@ -40,7 +40,7 @@ impl CoreBpe { pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { use std::collections::HashSet; let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); - self.inner.encode(&text, &allowed_special).0 + self.inner.encode(&text, &allowed_special).unwrap().0 } pub fn encode_ordinary(&self, text: String) -> Vec { From 04e8145b1b1ff94d777d27e228bdbc1645b6e14b Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 11 Aug 2025 10:09:25 -0700 Subject: [PATCH 6/6] remove test target --- TestTiktoken/Package.swift | 20 --- TestTiktoken/Sources/TestTiktoken/main.swift | 138 ------------------- 2 files changed, 158 deletions(-) delete mode 100644 TestTiktoken/Package.swift delete mode 100644 TestTiktoken/Sources/TestTiktoken/main.swift diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift deleted file mode 100644 index c9ea47fa..00000000 --- a/TestTiktoken/Package.swift +++ /dev/null @@ -1,20 +0,0 @@ -// swift-tools-version: 5.9 -import PackageDescription - -let package = Package( - name: "TestTiktoken", - platforms: [ - .macOS(.v10_15) - ], - dependencies: [ - .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift/TiktokenSwift") - ], - targets: [ - .executableTarget( - name: "TestTiktoken", - dependencies: [ - .product(name: "TiktokenSwift", package: "TiktokenSwift") - ] - ), - ] -) diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift deleted file mode 100644 index eaa8e377..00000000 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ /dev/null @@ -1,138 +0,0 @@ -import Foundation -import TiktokenSwift - -print("๐Ÿงช Testing TiktokenSwift with Latest Models...") -print("=" * 60) - -// Model information from upstream -let latestModels = [ - "GPT-5": "o200k_base", - "GPT-4.5": "o200k_base", - "GPT-4.1": "o200k_base", - "o3": "o200k_base", - "o4-mini": "o200k_base", - "gpt-oss": "o200k_harmony" -] - -let encodings = [ - "cl100k_base": "Used by GPT-4, GPT-3.5-turbo", - "o200k_base": "Used by GPT-5, GPT-4.5, GPT-4.1, o1, o3, o4-mini, GPT-4o", - "o200k_harmony": "Used by gpt-oss models, includes special tokens for structured output" -] - -print("\n๐Ÿ“Š Latest Model Support (from upstream tiktoken v0.11.0):") -print("-" * 60) -for (model, encoding) in latestModels { - print(" โ€ข \(model.padding(toLength: 12, withPad: " ", startingAt: 0)) โ†’ \(encoding)") -} - -print("\n๐Ÿ”ค Available Encodings:") -print("-" * 60) -for (encoding, description) in encodings { - print(" โ€ข \(encoding.padding(toLength: 15, withPad: " ", startingAt: 0)) : \(description)") -} - -print("\n" + "=" * 60) -print("๐Ÿงช Testing Basic Encoding/Decoding...") -print("-" * 60) - -do { - // Create a test encoder (simulating cl100k_base) - let encoder = try TiktokenHelper.createTestEncoder() - 
print("โœ… Successfully created test encoder") - - // Test texts including new model references - let testTexts = [ - "Hello, GPT-5!", - "Testing GPT-4.5 and GPT-4.1 models", - "The new o3 and o4-mini models are fast!", - "Using o200k_harmony encoding for structured output" - ] - - for text in testTexts { - print("\n๐Ÿ“ Original text: '\(text)'") - - // Regular encoding - let tokens = encoder.encodeText(text) - print("๐Ÿ”ข Encoded tokens (\(tokens.count) tokens): \(tokens)") - - // Decoding - if let decoded = encoder.decodeTokens(tokens) { - print("๐Ÿ“– Decoded text: '\(decoded)'") - let isMatch = decoded == text - print(isMatch ? "โœ… Perfect match!" : "โš ๏ธ Text differs (expected for test encoder)") - } else { - print("โŒ Failed to decode tokens") - } - } - - print("\n" + "=" * 60) - print("๐Ÿ”ฌ Testing Special Tokens (o200k_harmony style)...") - print("-" * 60) - - // Test with special tokens that would be in o200k_harmony - let specialTokenTests = [ - "hello <|endoftext|> world", - "<|startoftext|>Begin prompt<|endoftext|>", - "Constrained output: <|constrain|>JSON<|return|>{}" - ] - - for text in specialTokenTests { - print("\n๐Ÿ“ Text with special: '\(text)'") - let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: text) - print("๐Ÿ”ข Encoded with special: \(tokensWithSpecial)") - - let ordinaryTokens = encoder.encodeOrdinary(text: text) - print("๐Ÿ”ข Encoded ordinary: \(ordinaryTokens)") - - if ordinaryTokens.count != tokensWithSpecial.count { - print("โœ… Special tokens detected and handled differently") - } - } - - print("\n" + "=" * 60) - print("๐Ÿ“Š Encoding Comparison Examples:") - print("-" * 60) - - let comparisonText = "GPT-5 is the latest model from OpenAI" - print("\n๐Ÿ“ Sample text: '\(comparisonText)'") - - // Simulate different encoding behaviors - let regularTokens = encoder.encodeText(comparisonText) - let specialTokens = encoder.encodeWithSpecialTokens(text: comparisonText) - - print("\n Regular encoding (\(regularTokens.count) tokens):") - print(" \(regularTokens)") - - print("\n With special tokens (\(specialTokens.count) tokens):") - print(" \(specialTokens)") - - // Token count comparison - print("\n๐Ÿ“ˆ Token Efficiency:") - print(" โ€ข Characters: \(comparisonText.count)") - print(" โ€ข Tokens: \(regularTokens.count)") - print(" โ€ข Ratio: \(String(format: "%.2f", Double(comparisonText.count) / Double(regularTokens.count))) chars/token") - - print("\n" + "=" * 60) - print("โœ… All tests completed successfully!") - print("\n๐Ÿ’ก Note: This demo uses a test encoder. For production use:") - print(" 1. Load actual encoding data (cl100k_base.json or o200k_base.json)") - print(" 2. Use appropriate encoding for your model (see model list above)") - print(" 3. Handle special tokens based on your use case") - print("\n๐Ÿ” Key Updates from upstream tiktoken:") - print(" โ€ข GPT-5 support added (uses o200k_base encoding)") - print(" โ€ข New models: GPT-4.5, GPT-4.1, o3, o4-mini") - print(" โ€ข New encoding: o200k_harmony for structured output") - print(" โ€ข Performance improvements and better error handling") - -} catch { - print("โŒ Error: \(error)") - exit(1) -} - -// Helper to repeat string -extension String { - static func *(lhs: String, rhs: Int) -> String { - String(repeating: lhs, count: rhs) - } -} \ No newline at end of file