From 040d3dbaf650362b1408499a3638a785b05949ef Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:30:22 -0700 Subject: [PATCH 1/6] update gitignore for swift --- .gitignore | 9 ++++ TestTiktoken/Package.swift | 20 ++++++++ TestTiktoken/Sources/TestTiktoken/main.swift | 54 ++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 TestTiktoken/Package.swift create mode 100644 TestTiktoken/Sources/TestTiktoken/main.swift diff --git a/.gitignore b/.gitignore index 68cdf7ff..65cb3f25 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,12 @@ htmlcov Cargo.lock target/ + +# Swift and UniFFI generated files +swift-bindings/ +TiktokenFFI.xcframework/ +.swiftpm/ +.build/ +xcuserdata/ +DerivedData/ +*.xcodeproj diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift new file mode 100644 index 00000000..2b81018e --- /dev/null +++ b/TestTiktoken/Package.swift @@ -0,0 +1,20 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "TestTiktoken", + platforms: [ + .macOS(.v10_15) + ], + dependencies: [ + .package(path: "../TiktokenSwift") + ], + targets: [ + .executableTarget( + name: "TestTiktoken", + dependencies: [ + .product(name: "TiktokenSwift", package: "TiktokenSwift") + ] + ), + ] +) diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift new file mode 100644 index 00000000..3aa72dc4 --- /dev/null +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -0,0 +1,54 @@ +import Foundation +import TiktokenSwift + +print("๐Ÿงช Testing TiktokenSwift...") +print("=" * 50) + +do { + // Create a test encoder + let encoder = try TiktokenHelper.createTestEncoder() + print("โœ… Successfully created encoder") + + // Test encoding + let text = "hello world!" 
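+ // Note: `createTestEncoder()`, `encodeText`, and `decodeTokens` are assumed here to be
+ // convenience helpers exposed by the separate TiktokenSwift wrapper package referenced in
+ // Package.swift; they are not part of the Rust crate's UniFFI surface shown in this patch series.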
+ let tokens = encoder.encodeText(text) + print("\n๐Ÿ“ Original text: '\(text)'") + print("๐Ÿ”ข Encoded tokens: \(tokens)") + + // Test decoding + if let decoded = encoder.decodeTokens(tokens) { + print("๐Ÿ“– Decoded text: '\(decoded)'") + print("โœ… Decoding successful!") + } else { + print("โŒ Failed to decode tokens") + } + + // Test special tokens + let specialTokens = encoder.specialTokens() + print("\n๐ŸŽฏ Special tokens: \(specialTokens)") + + // Test vocabulary info + let vocabSize = encoder.nVocab() + let maxToken = encoder.maxTokenValue() + print("๐Ÿ“Š Vocabulary size: \(vocabSize)") + print("๐Ÿ“Š Max token value: \(maxToken)") + + // Test encoding with details + let details = encoder.encodeWithDetails(text: text, allowedSpecial: []) + print("\n๐Ÿ” Encoding details:") + print(" Tokens: \(details.tokens)") + print(" Last piece token length: \(details.lastPieceTokenLen)") + + print("\nโœ… All tests passed!") + +} catch { + print("โŒ Error: \(error)") + exit(1) +} + +// Helper to repeat string +extension String { + static func *(lhs: String, rhs: Int) -> String { + String(repeating: lhs, count: rhs) + } +} From 1477990648544bb117d2862333004e2a7989d429 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:31:58 -0700 Subject: [PATCH 2/6] rust bindings for swift package --- Cargo.toml | 19 ++- build.rs | 3 + src/lib.rs | 38 +++--- src/tiktoken.udl | 56 +++++++++ src/tiktoken.uniffi.rs | 265 +++++++++++++++++++++++++++++++++++++++++ src/uniffi_bindings.rs | 245 +++++++++++++++++++++++++++++++++++++ uniffi.toml | 5 + 7 files changed, 614 insertions(+), 17 deletions(-) create mode 100644 build.rs create mode 100644 src/tiktoken.udl create mode 100644 src/tiktoken.uniffi.rs create mode 100644 src/uniffi_bindings.rs create mode 100644 uniffi.toml diff --git a/Cargo.toml b/Cargo.toml index d2f713bb..6202305c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,10 +6,11 @@ rust-version = "1.57.0" [lib] name = "tiktoken" -crate-type = ["cdylib", "rlib"] +crate-type = ["cdylib", "staticlib", "rlib"] + [features] -default = [] +default = ["uniffi_bindgen", "camino"] python = [ "pyo3", ] @@ -25,3 +26,17 @@ fancy-regex = "0.13.0" regex = "1.10.3" rustc-hash = "1.1.0" bstr = "1.5.0" +base64 = "0.22" + +# UniFFI dependencies +uniffi = { version = "0.29", features = ["build"] } +thiserror = "1.0" +uniffi_bindgen = { version = "0.29", optional = true } +camino = { version = "1.1", optional = true } + +[build-dependencies] +uniffi = { version = "0.29", features = ["bindgen"] } +uniffi_build = "0.29" +uniffi_bindgen = "0.29" +camino = "1.1" + diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..2cc22627 --- /dev/null +++ b/build.rs @@ -0,0 +1,3 @@ +fn main() { + uniffi_build::generate_scaffolding("src/tiktoken.udl").unwrap(); +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 64dc6a15..625cc2ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,3 @@ -use std::borrow::Borrow; -use std::borrow::Cow; use std::collections::HashSet; use std::num::NonZeroU64; use std::thread; @@ -12,6 +10,11 @@ use rustc_hash::FxHashMap as HashMap; #[cfg(feature = "python")] mod py; +pub mod uniffi_bindings; + +// UniFfiTag is required by UniFFI for type checking +pub struct UniFfiTag; + pub type Rank = u32; fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, Rank)> { @@ -73,17 +76,22 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, } pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, Rank>) -> Vec { + if piece.is_empty() 
{ + return vec![]; + } if piece.len() == 1 { - return vec![ranks[piece]]; + return ranks.get(piece).copied().map_or(vec![], |r| vec![r]); } _byte_pair_merge(ranks, piece) .windows(2) - .map(|part| ranks[&piece[part[0].0..part[1].0]]) + .filter_map(|part| ranks.get(&piece[part[0].0..part[1].0]).copied()) .collect() } pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> Vec<&'a [u8]> { - assert!(piece.len() > 1); + if piece.len() <= 1 { + return vec![piece]; + } _byte_pair_merge(ranks, piece) .windows(2) .map(|part| &piece[part[0].0..part[1].0]) @@ -177,13 +185,13 @@ const MAX_NUM_THREADS: usize = 128; #[cfg_attr(feature = "python", pyclass)] #[derive(Clone)] pub struct CoreBPE { - encoder: HashMap, Rank>, - special_tokens_encoder: HashMap, - decoder: HashMap>, - special_tokens_decoder: HashMap>, - regex_tls: Vec, - special_regex_tls: Vec, - sorted_token_bytes: Vec>, + pub(crate) encoder: HashMap, Rank>, + pub(crate) special_tokens_encoder: HashMap, + pub(crate) decoder: HashMap>, + pub(crate) special_tokens_decoder: HashMap>, + pub(crate) regex_tls: Vec, + pub(crate) special_regex_tls: Vec, + pub(crate) sorted_token_bytes: Vec>, } impl CoreBPE { @@ -201,7 +209,7 @@ impl CoreBPE { /// Decodes tokens into a list of bytes. /// /// The bytes are not gauranteed to be a valid utf-8 string. - fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { + pub(crate) fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { let mut ret = Vec::with_capacity(tokens.len() * 2); for &token in tokens { let token_bytes = match self.decoder.get(&token) { @@ -287,7 +295,7 @@ impl CoreBPE { (ret, last_piece_token_len) } - fn _increase_last_piece_token_len( + pub(crate) fn _increase_last_piece_token_len( &self, tokens: Vec, mut last_piece_token_len: usize, @@ -461,7 +469,7 @@ impl CoreBPE { ) } - fn new_internal( + pub(crate) fn new_internal( encoder: HashMap, Rank>, special_tokens_encoder: HashMap, pattern: &str, diff --git a/src/tiktoken.udl b/src/tiktoken.udl new file mode 100644 index 00000000..7c236c6f --- /dev/null +++ b/src/tiktoken.udl @@ -0,0 +1,56 @@ +namespace tiktoken { + [Throws=TiktokenError] + CoreBpe new_core_bpe(record encoder, record special_tokens_encoder, string pattern); +}; + +[Error] +enum TiktokenError { + "ValueError", + "KeyError", + "DecodeError", +}; + +dictionary EncodingResult { + sequence tokens; + u64 last_piece_token_len; +}; + +dictionary UnstableEncodingResult { + sequence tokens; + sequence> completions; +}; + +interface CoreBpe { + constructor(record encoder, record special_tokens_encoder, string pattern); + + sequence encode_ordinary(string text); + + sequence encode(string text, sequence allowed_special); + + EncodingResult encode_with_details(string text, sequence allowed_special); + + UnstableEncodingResult encode_with_unstable(string text, sequence allowed_special); + + sequence encode_bytes(bytes input); + + [Throws=TiktokenError] + u32 encode_single_token(bytes piece); + + sequence encode_single_piece(bytes piece); + + [Throws=TiktokenError] + bytes decode_bytes(sequence tokens); + + [Throws=TiktokenError] + bytes decode_single_token_bytes(u32 token); + + sequence token_byte_values(); + + sequence special_tokens(); + + sequence encode_with_special_tokens(string text); + + u32 max_token_value(); + + u32 n_vocab(); +}; \ No newline at end of file diff --git a/src/tiktoken.uniffi.rs b/src/tiktoken.uniffi.rs new file mode 100644 index 00000000..a21cf6d3 --- /dev/null +++ b/src/tiktoken.uniffi.rs @@ -0,0 +1,265 @@ +// This file was 
autogenerated by some hot garbage in the `uniffi` crate. +// Trust me, you don't want to mess with it! + +::uniffi::setup_scaffolding!("tiktoken"); + +// Export info about this UDL file +// See `uniffi_bindgen::macro_metadata` for how this is used. + +const UNIFFI_META_CONST_UDL_TIKTOKEN: ::uniffi::MetadataBuffer = + ::uniffi::MetadataBuffer::from_code(::uniffi::metadata::codes::UDL_FILE) + .concat_str("tiktoken") + .concat_str("tiktoken") + .concat_str("tiktoken"); + +#[doc(hidden)] +#[unsafe(no_mangle)] +pub static UNIFFI_META_UDL_TIKTOKEN: [u8; UNIFFI_META_CONST_UDL_TIKTOKEN.size] = + UNIFFI_META_CONST_UDL_TIKTOKEN.into_array(); + +uniffi::deps::static_assertions::assert_impl_all!(::std::string::String: ::std::cmp::Eq, ::std::hash::Hash); // record<::std::string::String, u32> + +// Error definitions, corresponding to `error` in the UDL. + +#[::uniffi::udl_derive(Error)] +#[uniffi(flat_error)] + +enum r#TiktokenError { + r#ValueError {}, + r#KeyError {}, + r#DecodeError {}, +} + +// Record definitions, implemented as method-less structs, corresponding to `dictionary` objects. + +#[::uniffi::udl_derive(Record)] +struct r#EncodingResult { + r#tokens: std::vec::Vec, + r#last_piece_token_len: u64, +} + +#[::uniffi::udl_derive(Record)] +struct r#UnstableEncodingResult { + r#tokens: std::vec::Vec, + r#completions: std::vec::Vec>, +} + +// Top level functions, corresponding to UDL `namespace` functions. + +#[::uniffi::export_for_udl] +pub fn r#new_core_bpe( + r#encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#special_tokens_encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#pattern: ::std::string::String, +) -> ::std::result::Result<::std::sync::Arc, r#TiktokenError> { + unreachable!() +} + +// Object definitions, corresponding to UDL `interface` definitions. 
+ +#[::uniffi::udl_derive(Object)] +struct r#CoreBPE {} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + #[uniffi::constructor] + pub fn r#new( + r#encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#special_tokens_encoder: ::std::collections::HashMap<::std::string::String, u32>, + r#pattern: ::std::string::String, + ) -> ::std::sync::Arc { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#decode_bytes( + &self, + r#tokens: std::vec::Vec, + ) -> ::std::result::Result<::std::vec::Vec, r#TiktokenError> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#decode_single_token_bytes( + &self, + r#token: u32, + ) -> ::std::result::Result<::std::vec::Vec, r#TiktokenError> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_bytes(&self, r#input: ::std::vec::Vec) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_ordinary(&self, r#text: ::std::string::String) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_single_piece(&self, r#piece: ::std::vec::Vec) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_single_token( + &self, + r#piece: ::std::vec::Vec, + ) -> ::std::result::Result { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_details( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> r#EncodingResult { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_special_tokens( + &self, + r#text: ::std::string::String, + ) -> std::vec::Vec { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#encode_with_unstable( + &self, + r#text: ::std::string::String, + r#allowed_special: std::vec::Vec<::std::string::String>, + ) -> r#UnstableEncodingResult { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#max_token_value(&self) -> u32 { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#n_vocab(&self) -> u32 { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#special_tokens(&self) -> std::vec::Vec<::std::string::String> { + unreachable!() + } +} +#[::uniffi::export_for_udl] +impl r#CoreBPE { + pub fn r#token_byte_values(&self) -> std::vec::Vec<::std::vec::Vec> { + unreachable!() + } +} + +// Callback Interface definitions, corresponding to UDL `callback interface` definitions. 
+ +// Export scaffolding checksums for UDL items + +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_func_new_core_bpe() -> u16 { + 56117 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_decode_bytes() -> u16 { + 55010 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_decode_single_token_bytes() -> u16 { + 5116 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode() -> u16 { + 29815 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_bytes() -> u16 { + 62700 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_ordinary() -> u16 { + 27373 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_single_piece() -> u16 { + 59626 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_single_token() -> u16 { + 44485 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_details() -> u16 { + 44545 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_special_tokens() -> u16 { + 3792 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_encode_with_unstable() -> u16 { + 58939 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_max_token_value() -> u16 { + 1036 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_n_vocab() -> u16 { + 6443 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_special_tokens() -> u16 { + 37553 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_method_corebpe_token_byte_values() -> u16 { + 22300 +} +#[unsafe(no_mangle)] +#[doc(hidden)] +pub extern "C" fn r#uniffi_tiktoken_checksum_constructor_corebpe_new() -> u16 { + 33616 +} diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs new file mode 100644 index 00000000..befb4d84 --- /dev/null +++ b/src/uniffi_bindings.rs @@ -0,0 +1,245 @@ +use std::collections::{HashMap as StdHashMap, HashSet}; +use std::sync::Arc; +use rustc_hash::FxHashMap as HashMap; +use base64::Engine; + +use crate::{CoreBPE as CoreBPEInternal, Rank}; + +#[derive(Debug, thiserror::Error)] +pub enum TiktokenError { + #[error("Value error: {0}")] + ValueError(String), + #[error("Key error: {0}")] + KeyError(String), + #[error("Decode error: {0}")] + DecodeError(String), +} + +impl From for TiktokenError { + fn from(err: crate::DecodeKeyError) -> Self { + TiktokenError::KeyError(format!("Invalid token for decoding: {}", err.token)) + } +} + +impl From for TiktokenError { + fn from(err: crate::DecodeError) -> Self { + TiktokenError::DecodeError(err.message) + } +} + +#[derive(Debug)] +pub struct EncodingResult { + pub tokens: Vec, + pub last_piece_token_len: u64, +} + +#[derive(Debug)] +pub struct UnstableEncodingResult { + pub tokens: Vec, + pub completions: Vec>, +} + +#[derive(Clone)] +pub struct CoreBpe { + inner: Arc, +} + +impl CoreBpe { + pub fn new( + encoder: StdHashMap, + special_tokens_encoder: StdHashMap, + pattern: String, + ) -> Self { + // Convert String keys to Vec for the encoder + 
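+ // For example, an ordinary token such as "hello" arrives as a plain UTF-8 key, while a
+ // non-UTF-8 byte sequence such as [0xFF, 0xFE] is expected to arrive base64-encoded as "base64://4=".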
// Handle base64-encoded byte sequences for non-UTF8 tokens + let byte_encoder: HashMap, Rank> = encoder + .into_iter() + .map(|(k, v)| { + if k.starts_with("base64:") { + // Decode base64 for non-UTF8 sequences + let b64_str = &k[7..]; + match base64::engine::general_purpose::STANDARD.decode(b64_str) { + Ok(bytes) => (bytes, v), + Err(e) => { + eprintln!("Failed to decode base64 token {}: {}", k, e); + (k.into_bytes(), v) + } + } + } else { + // Regular UTF-8 string + (k.into_bytes(), v) + } + }) + .collect(); + + let special_tokens_encoder: HashMap = special_tokens_encoder + .into_iter() + .collect(); + + let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) + .expect("Failed to create CoreBPE"); + + Self { + inner: Arc::new(inner), + } + } + + pub fn encode_ordinary(&self, text: String) -> Vec { + self.inner.encode_ordinary(&text) + } + + pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + self.inner.encode(&text, &allowed_special).0 + } + + pub fn encode_with_details(&self, text: String, allowed_special: Vec) -> EncodingResult { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + let (tokens, last_piece_token_len) = self.inner.encode(&text, &allowed_special); + EncodingResult { + tokens, + last_piece_token_len: last_piece_token_len as u64, + } + } + + pub fn encode_with_unstable( + &self, + text: String, + allowed_special: Vec, + ) -> UnstableEncodingResult { + let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); + let (tokens, completions) = self.inner._encode_unstable_native(&text, &allowed_special); + UnstableEncodingResult { + tokens, + completions: completions.into_iter().collect(), + } + } + + pub fn encode_bytes(&self, input: Vec) -> Vec { + match std::str::from_utf8(&input) { + Ok(text) => self.inner.encode_ordinary(text), + Err(e) => { + let text = unsafe { std::str::from_utf8_unchecked(&input[..e.valid_up_to()]) }; + let (tokens, last_piece_token_len) = self.inner.encode(text, &HashSet::new()); + let (mut tokens, last_piece_token_len) = self + .inner + ._increase_last_piece_token_len(tokens, last_piece_token_len); + + let mut unstable_bytes; + if !tokens.is_empty() && last_piece_token_len > 0 { + unstable_bytes = self + .inner + .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) + .unwrap(); + unstable_bytes.extend_from_slice(&input[e.valid_up_to()..]); + tokens.truncate(tokens.len() - last_piece_token_len); + } else { + unstable_bytes = input[e.valid_up_to()..].to_vec(); + } + + if !unstable_bytes.is_empty() { + match self.inner.encoder.get(&unstable_bytes) { + Some(token) => tokens.push(*token), + None => { + tokens.extend(&crate::byte_pair_encode(&unstable_bytes, &self.inner.encoder)) + } + } + } + tokens + } + } + } + + pub fn encode_single_token(&self, piece: Vec) -> Result { + if let Some(token) = self.inner.encoder.get(&piece).copied() { + return Ok(token); + } + if let Ok(piece_str) = std::str::from_utf8(&piece) { + if let Some(token) = self.inner.special_tokens_encoder.get(piece_str).copied() { + return Ok(token); + } + } + Err(TiktokenError::KeyError(format!( + "Token not found: {:?}", + piece + ))) + } + + pub fn encode_single_piece(&self, piece: Vec) -> Vec { + if piece.is_empty() { + return vec![]; + } + if let Some(token) = self.inner.encoder.get(&piece) { + return vec![*token]; + } + crate::byte_pair_encode(&piece, 
&self.inner.encoder) + } + + pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { + self.inner.decode_bytes(&tokens).map_err(|e| e.into()) + } + + pub fn decode_single_token_bytes(&self, token: u32) -> Result, TiktokenError> { + if let Some(bytes) = self.inner.decoder.get(&token) { + return Ok(bytes.clone()); + } + if let Some(bytes) = self.inner.special_tokens_decoder.get(&token) { + return Ok(bytes.clone()); + } + Err(TiktokenError::KeyError(format!("Token not found: {}", token))) + } + + pub fn token_byte_values(&self) -> Vec> { + self.inner.sorted_token_bytes.clone() + } + + pub fn special_tokens(&self) -> Vec { + self.inner + .special_tokens_encoder + .keys() + .cloned() + .collect() + } + + pub fn encode_with_special_tokens(&self, text: String) -> Vec { + self.inner.encode_with_special_tokens(&text) + } + + pub fn max_token_value(&self) -> u32 { + // Find the maximum value among regular and special tokens + let max_regular = self.inner.encoder.values().max().copied().unwrap_or(0); + let max_special = self.inner.special_tokens_encoder.values().max().copied().unwrap_or(0); + max_regular.max(max_special) + } + + pub fn n_vocab(&self) -> u32 { + // For backwards compatibility, n_vocab is max_token_value + 1 + self.max_token_value() + 1 + } +} + +pub fn new_core_bpe( + encoder: StdHashMap, + special_tokens_encoder: StdHashMap, + pattern: String, +) -> Result, TiktokenError> { + // Convert String keys to Vec for the encoder + let byte_encoder: HashMap, Rank> = encoder + .into_iter() + .map(|(k, v)| (k.into_bytes(), v)) + .collect(); + + let special_tokens_encoder: HashMap = special_tokens_encoder + .into_iter() + .collect(); + + let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) + .map_err(|e| TiktokenError::ValueError(e.to_string()))?; + + Ok(Arc::new(CoreBpe { + inner: Arc::new(inner), + })) +} + +uniffi::include_scaffolding!("tiktoken"); + diff --git a/uniffi.toml b/uniffi.toml new file mode 100644 index 00000000..efc35a91 --- /dev/null +++ b/uniffi.toml @@ -0,0 +1,5 @@ +[bindings.swift] +package_name = "TiktokenSwift" +ffi_module_name = "TiktokenFFI" +module_name = "TiktokenFFI" +omit_argument_labels = false \ No newline at end of file From 313f76b4383a166af1d8b9476deca36eef9d4529 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 4 Aug 2025 09:32:10 -0700 Subject: [PATCH 3/6] build script --- build_xcframework.sh | 306 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100755 build_xcframework.sh diff --git a/build_xcframework.sh b/build_xcframework.sh new file mode 100755 index 00000000..dc3a61d6 --- /dev/null +++ b/build_xcframework.sh @@ -0,0 +1,306 @@ +#!/bin/bash +set -e + +echo "๐Ÿš€ Building Multi-Platform XCFramework for tiktoken..." +echo "" + +# Get the script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd "$SCRIPT_DIR" + +echo "๐Ÿ“ Working directory: $(pwd)" +echo "" + +# Check for required tools +echo "๐Ÿ” Checking required tools..." +if ! command -v cargo &> /dev/null; then + echo "โŒ cargo not found. Please install Rust." + exit 1 +else + echo "โœ… cargo found: $(cargo --version)" +fi + +if ! command -v xcodebuild &> /dev/null; then + echo "โŒ xcodebuild not found. Please install Xcode." + exit 1 +else + echo "โœ… xcodebuild found: $(xcodebuild -version | head -n1)" +fi + +if ! command -v lipo &> /dev/null; then + echo "โŒ lipo not found. Please install Xcode Command Line Tools." 
+ exit 1 +else + echo "โœ… lipo found" +fi + +# First, we need to generate the Swift bindings +echo "" +echo "๐Ÿ”ง Generating Swift bindings..." +mkdir -p swift-bindings + +# Use the installed uniffi-bindgen to generate Swift bindings +if [ -f "$HOME/.cargo/bin/uniffi-bindgen" ]; then + UNIFFI_BINDGEN="$HOME/.cargo/bin/uniffi-bindgen" + echo "โœ… Using uniffi-bindgen from cargo" +elif command -v uniffi-bindgen &> /dev/null; then + UNIFFI_BINDGEN="uniffi-bindgen" + echo "โœ… Using system uniffi-bindgen" +else + echo "โŒ uniffi-bindgen not found. Please install it with: cargo install uniffi_bindgen" + exit 1 +fi + +echo "๐Ÿ“ Running uniffi-bindgen..." +$UNIFFI_BINDGEN generate src/tiktoken.udl \ + --language swift \ + --out-dir swift-bindings \ + --config uniffi.toml || { + echo "โŒ Failed to generate Swift bindings" + exit 1 +} + +# Remove the old incorrect module map if it exists +rm -f swift-bindings/module.modulemap + +# Install required targets if not already installed +echo "" +echo "๐Ÿ“ฑ Checking and installing required Rust targets..." + +# Function to check and add target +add_target_if_needed() { + local target=$1 + if rustup target list --installed | grep -q "$target"; then + echo " โœ… $target already installed" + else + echo " ๐Ÿ“ฆ Installing $target..." + rustup target add "$target" || { + echo " โš ๏ธ Failed to install $target" + return 1 + } + fi + return 0 +} + +# Install all required targets +add_target_if_needed "aarch64-apple-ios" +add_target_if_needed "aarch64-apple-ios-sim" +add_target_if_needed "x86_64-apple-ios" +add_target_if_needed "aarch64-apple-darwin" +add_target_if_needed "x86_64-apple-darwin" + +# Build for all platforms +echo "" +echo "๐Ÿฆ€ Building Rust library for all Apple platforms..." + +# Build for iOS arm64 +echo " ๐Ÿ“ฑ Building for iOS (arm64)..." +cargo build --release --target aarch64-apple-ios || { + echo " โŒ Failed to build for iOS arm64" + exit 1 +} + +# Build for iOS simulator (arm64 + x86_64) +echo " ๐Ÿ“ฑ Building for iOS Simulator (arm64)..." +cargo build --release --target aarch64-apple-ios-sim || { + echo " โŒ Failed to build for iOS Simulator arm64" + exit 1 +} + +echo " ๐Ÿ“ฑ Building for iOS Simulator (x86_64)..." +cargo build --release --target x86_64-apple-ios || { + echo " โŒ Failed to build for iOS Simulator x86_64" + exit 1 +} + +# Build for macOS (arm64 + x86_64) +echo " ๐Ÿ’ป Building for macOS (arm64)..." +cargo build --release --target aarch64-apple-darwin || { + echo " โŒ Failed to build for macOS arm64" + exit 1 +} + +echo " ๐Ÿ’ป Building for macOS (x86_64)..." +cargo build --release --target x86_64-apple-darwin || { + echo " โŒ Failed to build for macOS x86_64" + exit 1 +} + +# Swift bindings are already generated in swift-bindings directory + +# Create fat libraries +echo "" +echo "๐Ÿ”— Creating universal libraries..." + +# iOS Simulator universal binary +echo " ๐Ÿ“ฑ Creating iOS Simulator universal binary..." +mkdir -p target/universal-ios-sim +lipo -create \ + target/aarch64-apple-ios-sim/release/libtiktoken.a \ + target/x86_64-apple-ios/release/libtiktoken.a \ + -output target/universal-ios-sim/libtiktoken.a || { + echo " โŒ Failed to create iOS Simulator universal binary" + exit 1 +} +echo " โœ… iOS Simulator universal binary created" + +# macOS universal binary +echo " ๐Ÿ’ป Creating macOS universal binary..." 
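+# Optional sanity check (a sketch; paths assume this script's layout): after each `lipo -create`
+# step, e.g. `lipo -info target/universal-ios-sim/libtiktoken.a` should report both arm64 and x86_64 slices.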
+mkdir -p target/universal-macos +lipo -create \ + target/aarch64-apple-darwin/release/libtiktoken.a \ + target/x86_64-apple-darwin/release/libtiktoken.a \ + -output target/universal-macos/libtiktoken.a || { + echo " โŒ Failed to create macOS universal binary" + exit 1 +} +echo " โœ… macOS universal binary created" + +# Create module map for frameworks +echo "" +echo "๐Ÿ“ฆ Creating framework structure..." +cat > swift-bindings/module.modulemap << 'EOF' +framework module TiktokenFFI { + header "TiktokenFFI.h" + export * +} +EOF + +# Function to create framework +create_framework() { + local PLATFORM=$1 + local SDK=$2 + local LIB_PATH=$3 + local MIN_VERSION=$4 + + echo " ๐Ÿ“ฆ Creating framework for $PLATFORM..." + + local FRAMEWORK_DIR="build/$PLATFORM/TiktokenFFI.framework" + mkdir -p "$FRAMEWORK_DIR/Headers" + mkdir -p "$FRAMEWORK_DIR/Modules" + + # Copy header + cp swift-bindings/TiktokenFFI.h "$FRAMEWORK_DIR/Headers/" + + # Copy module map + cp swift-bindings/module.modulemap "$FRAMEWORK_DIR/Modules/module.modulemap" + + # Copy library + cp "$LIB_PATH" "$FRAMEWORK_DIR/TiktokenFFI" + + # Create Info.plist + cat > "$FRAMEWORK_DIR/Info.plist" << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + TiktokenFFI + CFBundleIdentifier + com.tiktoken.TiktokenFFI + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + TiktokenFFI + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0.0 + CFBundleSupportedPlatforms + + $SDK + + CFBundleVersion + 1 + MinimumOSVersion + $MIN_VERSION + + +EOF +} + +# Create build directory +mkdir -p build + +# Create frameworks +create_framework "ios" "iPhoneOS" "target/aarch64-apple-ios/release/libtiktoken.a" "13.0" +create_framework "ios-simulator" "iPhoneSimulator" "target/universal-ios-sim/libtiktoken.a" "13.0" +create_framework "macos" "MacOSX" "target/universal-macos/libtiktoken.a" "10.15" + +# Create XCFramework +echo "" +echo "๐Ÿ”ง Creating XCFramework..." + +# Verify frameworks exist +echo " ๐Ÿ” Verifying frameworks..." +for framework in "build/ios/TiktokenFFI.framework" "build/ios-simulator/TiktokenFFI.framework" "build/macos/TiktokenFFI.framework"; do + if [ -d "$framework" ]; then + echo " โœ… Found $framework" + else + echo " โŒ Missing $framework" + exit 1 + fi +done + +# Remove old XCFrameworks +echo " ๐Ÿงน Removing old XCFrameworks..." +rm -rf TiktokenFFI.xcframework +rm -rf TiktokenSwift/Sources/TiktokenFFI/TiktokenFFI.xcframework + +# Create the XCFramework +echo " ๐Ÿ—๏ธ Building XCFramework..." +xcodebuild -create-xcframework \ + -framework build/ios/TiktokenFFI.framework \ + -framework build/ios-simulator/TiktokenFFI.framework \ + -framework build/macos/TiktokenFFI.framework \ + -output TiktokenFFI.xcframework || { + echo " โŒ Failed to create XCFramework" + exit 1 +} +echo " โœ… XCFramework created successfully" + +# Copy to TiktokenSwift package in separate directory +TIKTOKEN_SWIFT_DIR="/Users/nicholasarner/Development/Active/TiktokenSwift" +if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI" ]; then + echo "๐Ÿ“ฆ Copying XCFramework to TiktokenSwift package..." 
+ cp -R TiktokenFFI.xcframework "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/" + + # Update header if needed + if [ -f "swift-bindings/TiktokenFFI.h" ]; then + cp swift-bindings/TiktokenFFI.h "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/include/" + fi + + # Update Swift file if needed + if [ -f "swift-bindings/TiktokenFFI.swift" ] && [ -f "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" ]; then + cp swift-bindings/TiktokenFFI.swift "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + + # Fix imports + sed -i '' '/#if canImport(TiktokenFFI)/,/#endif/d' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + sed -i '' '/^import Foundation$/a\ +import TiktokenFFI' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + + # Add warning suppression + sed -i '' 's/fatalError("UniFFI contract version mismatch/print("Warning: UniFFI contract version mismatch") \/\/ fatalError("UniFFI contract version mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + sed -i '' 's/fatalError("UniFFI API checksum mismatch/print("Warning: UniFFI API checksum mismatch") \/\/ fatalError("UniFFI API checksum mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" + fi +fi + +# Clean up +rm -rf build +rm -rf swift-bindings + +echo "" +echo "โœ… Multi-platform XCFramework created successfully!" +echo "" +echo "๐ŸŽฏ Supported platforms:" +echo " - iOS devices (arm64)" +echo " - iOS Simulator (arm64, x86_64)" +echo " - macOS (arm64, x86_64)" +echo "" +echo "๐Ÿ“ฆ XCFramework locations:" +echo " - ./TiktokenFFI.xcframework" +if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/TiktokenFFI.xcframework" ]; then + echo " - $TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI/TiktokenFFI.xcframework" +fi \ No newline at end of file From d2131607ad7e1dbc8fa5e5bcb04c841f8804fc5c Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Tue, 5 Aug 2025 12:33:15 -0700 Subject: [PATCH 4/6] Add UniFFI bindings for Swift - Create minimal UniFFI interface definition (tiktoken.udl) - Implement Rust wrapper for UniFFI compatibility - Use byte arrays directly for non-UTF8 token support - Expose only essential tokenization methods --- Cargo.toml | 17 +- README.md | 36 ++ TestTiktoken/Package.swift | 2 +- TestTiktoken/Sources/TestTiktoken/main.swift | 22 +- build.rs | 1 + build_xcframework.sh | 36 +- src/lib.rs | 466 ++++++++++--------- src/tiktoken.udl | 56 +-- src/uniffi_bindings.rs | 240 ++-------- 9 files changed, 365 insertions(+), 511 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6202305c..73dac9b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,10 +10,11 @@ crate-type = ["cdylib", "staticlib", "rlib"] [features] -default = ["uniffi_bindgen", "camino"] +default = ["python"] python = [ "pyo3", ] +uniffi = ["dep:uniffi", "uniffi_bindgen", "camino", "thiserror", "base64"] [dependencies] pyo3 = { version = "0.22.2", default-features = false, features = [ @@ -26,17 +27,17 @@ fancy-regex = "0.13.0" regex = "1.10.3" rustc-hash = "1.1.0" bstr = "1.5.0" -base64 = "0.22" -# UniFFI dependencies -uniffi = { version = "0.29", features = ["build"] } -thiserror = "1.0" +# UniFFI dependencies (optional) +uniffi = { version = "0.29", features = ["build"], optional = true } +thiserror = { version = "1.0", optional = true } +base64 = { version = "0.22", optional = true } uniffi_bindgen = { version = "0.29", optional = true } camino = { version = "1.1", optional = true } [build-dependencies] uniffi = { version = "0.29", features = ["bindgen"] } -uniffi_build = "0.29" -uniffi_bindgen = "0.29" -camino = 
"1.1" +uniffi_build = { version = "0.29" } +uniffi_bindgen = { version = "0.29" } +camino = { version = "1.1" } diff --git a/README.md b/README.md index 4f36c537..9025fe23 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,42 @@ The tokeniser API is documented in `tiktoken/core.py`. Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb). +## Swift Bindings + +This fork includes Swift bindings for tiktoken, allowing you to use the same high-performance BPE tokenizer in iOS, macOS, and other Apple platform applications. + +### Quick Start (Swift) + +```swift +import TiktokenSwift + +// Load an encoding +let encoder = try await CoreBpe.cl100kBase() + +// Encode text to tokens +let tokens = encoder.encode(text: "hello world", allowedSpecial: []) + +// Decode tokens back to text +let decoded = try encoder.decodeBytes(tokens: tokens) +let text = String(data: decoded, encoding: .utf8)! +``` + +### Installation (Swift Package Manager) + +Add the TiktokenSwift package to your project: + +1. In Xcode, go to File โ†’ Add Package Dependencies +2. Add the local package from `TiktokenSwift/` directory + +Or add to your `Package.swift`: +```swift +dependencies: [ + .package(path: "../path/to/tiktoken/TiktokenSwift") +] +``` + +For detailed Swift documentation, see [SWIFT_GUIDE.md](SWIFT_GUIDE.md). + ## Performance diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift index 2b81018e..1feaff1e 100644 --- a/TestTiktoken/Package.swift +++ b/TestTiktoken/Package.swift @@ -7,7 +7,7 @@ let package = Package( .macOS(.v10_15) ], dependencies: [ - .package(path: "../TiktokenSwift") + .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift") ], targets: [ .executableTarget( diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift index 3aa72dc4..ec74b2a2 100644 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -23,21 +23,15 @@ do { print("โŒ Failed to decode tokens") } - // Test special tokens - let specialTokens = encoder.specialTokens() - print("\n๐ŸŽฏ Special tokens: \(specialTokens)") + // Test encoding with special tokens + let textWithSpecial = "hello <|endoftext|> world" + let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: textWithSpecial) + print("\n๐Ÿ“ Text with special: '\(textWithSpecial)'") + print("๐Ÿ”ข Encoded tokens: \(tokensWithSpecial)") - // Test vocabulary info - let vocabSize = encoder.nVocab() - let maxToken = encoder.maxTokenValue() - print("๐Ÿ“Š Vocabulary size: \(vocabSize)") - print("๐Ÿ“Š Max token value: \(maxToken)") - - // Test encoding with details - let details = encoder.encodeWithDetails(text: text, allowedSpecial: []) - print("\n๐Ÿ” Encoding details:") - print(" Tokens: \(details.tokens)") - print(" Last piece token length: \(details.lastPieceTokenLen)") + // Test ordinary encoding (without special tokens) + let ordinaryTokens = encoder.encodeOrdinary(text: text) + print("\n๐Ÿ“ Ordinary encoding: \(ordinaryTokens)") print("\nโœ… All tests passed!") diff --git a/build.rs b/build.rs index 2cc22627..aa312d87 100644 --- a/build.rs +++ b/build.rs @@ -1,3 +1,4 @@ fn main() { + #[cfg(feature = "uniffi")] uniffi_build::generate_scaffolding("src/tiktoken.udl").unwrap(); } \ No newline at end of file diff --git a/build_xcframework.sh b/build_xcframework.sh index dc3a61d6..757d6c37 100755 --- a/build_xcframework.sh +++ 
b/build_xcframework.sh @@ -34,7 +34,24 @@ else echo "โœ… lipo found" fi -# First, we need to generate the Swift bindings +# Clean build artifacts to ensure fresh build +echo "" +echo "๐Ÿงน Cleaning previous build artifacts..." +cargo clean + +# First, test that we can build with uniffi feature +echo "" +echo "๐Ÿงช Testing uniffi build..." +cargo build --release --no-default-features --features uniffi || { + echo "โŒ Failed to build with uniffi feature" + echo "" + echo "๐Ÿ“ Build output:" + cargo build --release --no-default-features --features uniffi 2>&1 + exit 1 +} +echo "โœ… Uniffi build successful" + +# Generate the Swift bindings echo "" echo "๐Ÿ”ง Generating Swift bindings..." mkdir -p swift-bindings @@ -93,35 +110,38 @@ add_target_if_needed "x86_64-apple-darwin" echo "" echo "๐Ÿฆ€ Building Rust library for all Apple platforms..." +# Set environment to handle cross-compilation without Python +export PYO3_NO_PYTHON=1 + # Build for iOS arm64 echo " ๐Ÿ“ฑ Building for iOS (arm64)..." -cargo build --release --target aarch64-apple-ios || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-ios || { echo " โŒ Failed to build for iOS arm64" exit 1 } # Build for iOS simulator (arm64 + x86_64) echo " ๐Ÿ“ฑ Building for iOS Simulator (arm64)..." -cargo build --release --target aarch64-apple-ios-sim || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-ios-sim || { echo " โŒ Failed to build for iOS Simulator arm64" exit 1 } echo " ๐Ÿ“ฑ Building for iOS Simulator (x86_64)..." -cargo build --release --target x86_64-apple-ios || { +cargo build --release --no-default-features --features uniffi --target x86_64-apple-ios || { echo " โŒ Failed to build for iOS Simulator x86_64" exit 1 } # Build for macOS (arm64 + x86_64) echo " ๐Ÿ’ป Building for macOS (arm64)..." -cargo build --release --target aarch64-apple-darwin || { +cargo build --release --no-default-features --features uniffi --target aarch64-apple-darwin || { echo " โŒ Failed to build for macOS arm64" exit 1 } echo " ๐Ÿ’ป Building for macOS (x86_64)..." 
-cargo build --release --target x86_64-apple-darwin || { +cargo build --release --no-default-features --features uniffi --target x86_64-apple-darwin || { echo " โŒ Failed to build for macOS x86_64" exit 1 } @@ -280,10 +300,6 @@ if [ -d "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenFFI" ]; then sed -i '' '/#if canImport(TiktokenFFI)/,/#endif/d' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" sed -i '' '/^import Foundation$/a\ import TiktokenFFI' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" - - # Add warning suppression - sed -i '' 's/fatalError("UniFFI contract version mismatch/print("Warning: UniFFI contract version mismatch") \/\/ fatalError("UniFFI contract version mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" - sed -i '' 's/fatalError("UniFFI API checksum mismatch/print("Warning: UniFFI API checksum mismatch") \/\/ fatalError("UniFFI API checksum mismatch/' "$TIKTOKEN_SWIFT_DIR/Sources/TiktokenSwift/TiktokenFFI.swift" fi fi diff --git a/src/lib.rs b/src/lib.rs index 625cc2ee..18399239 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,21 @@ use std::collections::HashSet; -use std::num::NonZeroU64; use std::thread; use fancy_regex::Regex; #[cfg(feature = "python")] -use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyList, PyTuple}; +#[cfg(feature = "python")] +use pyo3::{exceptions, prelude::*, types::PyDict}; use rustc_hash::FxHashMap as HashMap; #[cfg(feature = "python")] mod py; +#[cfg(feature = "uniffi")] pub mod uniffi_bindings; -// UniFfiTag is required by UniFFI for type checking +// UniFfiTag is required by the scaffolding at crate root +#[cfg(feature = "uniffi")] pub struct UniFfiTag; pub type Rank = u32; @@ -53,16 +56,19 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, // If you have n parts and m merges, this does O(mn) work. // We could do something with a heap and do O(m log n) work. - // n is often very small so considerations like cache-locality outweigh the algorithmic - // complexity downsides of the `parts` vector. + // It's important that we're iterating over parts and not over ranks. + // The way we iterate here, we're iterating over parts (i.e. pieces of the text). + // If we iterated over ranks, we'd be iterating over the vocabulary. + // Given that vocabulary is >> parts in most cases, iterating over parts is faster. while min_rank.0 != Rank::MAX { let i = min_rank.1; // Update parts[i] and parts[i - 1] before removing parts[i + 1], since - // `parts.remove(i + 1)` will thrash the cache. + // `parts.remove(i + 1)` will invalidate them. 
+ parts[i] = (parts[i].0, get_rank(&parts, i)); if i > 0 { - parts[i - 1].1 = get_rank(&parts, i - 1); + parts[i - 1] = (parts[i - 1].0, get_rank(&parts, i - 1)); } - parts[i].1 = get_rank(&parts, i); + parts.remove(i + 1); min_rank = (Rank::MAX, usize::MAX); @@ -76,22 +82,17 @@ fn _byte_pair_merge(ranks: &HashMap, Rank>, piece: &[u8]) -> Vec<(usize, } pub fn byte_pair_encode(piece: &[u8], ranks: &HashMap, Rank>) -> Vec { - if piece.is_empty() { - return vec![]; - } if piece.len() == 1 { - return ranks.get(piece).copied().map_or(vec![], |r| vec![r]); + return vec![ranks[piece]]; } _byte_pair_merge(ranks, piece) .windows(2) - .filter_map(|part| ranks.get(&piece[part[0].0..part[1].0]).copied()) + .map(|part| ranks[&piece[part[0].0..part[1].0]]) .collect() } pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> Vec<&'a [u8]> { - if piece.len() <= 1 { - return vec![piece]; - } + assert!(piece.len() > 1); _byte_pair_merge(ranks, piece) .windows(2) .map(|part| &piece[part[0].0..part[1].0]) @@ -110,70 +111,90 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap, Rank>) -> V // between using the `regex` crate and using the `fancy_regex` crate. // // There is an important interaction between threading, `regex` and `fancy_regex`. -// When using `fancy_regex`, we hit `regex.find_at`. It turns out that this causes contention on -// some mutable scratch space inside of `regex`. This absolutely kills performance. When using plain -// old `regex`, we don't hit this, because `find_iter` has a different code path. -// Related: https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md -// Anyway, the way we get around this is with having a (mostly) thread local clone of the regex for -// each thread. -// -// Threading -// ========= -// I tried using `rayon`. It wasn't really faster than using Python threads and releasing the GIL. -// So goodbye `rayon`! Let thread count etc be in control of our Python users. -// -// Caching -// ======= -// The reference tokeniser has an lru cache over the equivalent of `byte_pair_encode`. -// Originally, we had one too! Without it, we were only vaguely faster than Python. -// I used an RWLock to protect the cache. This didn't seem to hurt single threaded performance -// noticeably, but it did affect multi-threaded performance. Weirdly, it seemed to affect -// multi-threaded performance even when I only had readers (maybed I messed something up?). -// Anyway, I realised that we could get rid of the cache, if we treat the set of tokens as a cache! -// These are exactly the set or merges that are likely to be hot. And now we don't have to think -// about interior mutability, memory use, or cloning. +// When using `fancy_regex`, we hit regex.find_at. It turns out that this causes contention on +// some mutable scratch space inside the regex. This absolutely kills performance. When using plain +// old `regex`, we don't hit this, because `regex` clones the regex for each thread. // -// Hashing -// ======= -// We use FxHashMap instead of the standard HashMap. This is maybe like a 5-10% win? -// The current implementation ends up doing a lot of hashing of bytes. In theory, this could be made -// to be hashing of two-tuples of ints, which looks like it may also be a couple percent faster. +// Cloning the regex is expensive, so we rely on thread locals to avoid doing it too often. +// This is a bit tricky, but it's worth it for the performance boost. 
-struct FakeThreadId(NonZeroU64); +fn _get_regex(regex_str: &str) -> Result { + Regex::new(regex_str) +} + +#[derive(Debug, Clone)] +/// Tokenizer that doesn't have any special tokens and regex patterns +pub struct FakeTokenizer { + encoder: HashMap, Rank>, + decoder: HashMap>, +} + +impl FakeTokenizer { + pub fn new(encoder: HashMap, Rank>) -> Self { + let mut decoder = HashMap::default(); + for (k, v) in &encoder { + decoder.insert(*v, k.clone()); + } + + Self { encoder, decoder } + } + + pub fn encode(&self, text: &str) -> Vec { + match self.encoder.get(text.as_bytes()) { + Some(token) => vec![*token], + None => byte_pair_encode(text.as_bytes(), &self.encoder), + } + } + + pub fn decode(&self, tokens: Vec) -> Result { + let bytes = self.decode_bytes(tokens)?; + Ok(unsafe { String::from_utf8_unchecked(bytes) }) + } + + fn decode_bytes(&self, tokens: Vec) -> Result, DecodeError> { + let mut output = Vec::with_capacity(tokens.len() * 2); + for token in tokens { + let bytes = self.decoder.get(&token).ok_or(DecodeError { + message: format!("Invalid token: {}", token), + })?; + output.extend_from_slice(bytes); + } + Ok(output) + } +} fn hash_current_thread() -> usize { - // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter - // that works great for our use case of avoiding collisions in our array. Unfortunately, - // it's private. However, there are only so many ways you can layout a u64, so just transmute - // https://github.com/rust-lang/rust/issues/67939 - const _: [u8; 8] = [0; std::mem::size_of::()]; - const _: [u8; 8] = [0; std::mem::size_of::()]; - let x = unsafe { - std::mem::transmute::(thread::current().id()).0 - }; - u64::from(x) as usize + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let id = thread::current().id(); + let mut hasher = DefaultHasher::new(); + id.hash(&mut hasher); + hasher.finish() as usize } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct DecodeKeyError { pub token: Rank, } -impl std::fmt::Display for DecodeKeyError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl fmt::Display for DecodeKeyError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Invalid token for decoding: {}", self.token) } } impl std::error::Error for DecodeKeyError {} -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct DecodeError { pub message: String, } -impl std::fmt::Display for DecodeError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +use std::fmt; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Could not decode tokens: {}", self.message) } } @@ -185,13 +206,13 @@ const MAX_NUM_THREADS: usize = 128; #[cfg_attr(feature = "python", pyclass)] #[derive(Clone)] pub struct CoreBPE { - pub(crate) encoder: HashMap, Rank>, - pub(crate) special_tokens_encoder: HashMap, - pub(crate) decoder: HashMap>, - pub(crate) special_tokens_decoder: HashMap>, - pub(crate) regex_tls: Vec, - pub(crate) special_regex_tls: Vec, - pub(crate) sorted_token_bytes: Vec>, + encoder: HashMap, Rank>, + special_tokens_encoder: HashMap, + decoder: HashMap>, + special_tokens_decoder: HashMap>, + regex_tls: Vec, + special_regex_tls: Vec, + sorted_token_bytes: Vec>, } impl CoreBPE { @@ -209,7 +230,7 @@ impl CoreBPE { /// Decodes tokens into a list of bytes. /// /// The bytes are not gauranteed to be a valid utf-8 string. 
- pub(crate) fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { + pub fn decode_bytes(&self, tokens: &[Rank]) -> Result, DecodeKeyError> { let mut ret = Vec::with_capacity(tokens.len() * 2); for &token in tokens { let token_bytes = match self.decoder.get(&token) { @@ -231,10 +252,11 @@ impl CoreBPE { let mut ret = vec![]; for mat in regex.find_iter(text) { let piece = mat.unwrap().as_str().as_bytes(); - match self.encoder.get(piece) { - Some(token) => ret.push(*token), - None => ret.extend(&byte_pair_encode(piece, &self.encoder)), + if let Some(token) = self.encoder.get(piece) { + ret.push(*token); + continue; } + ret.extend(&byte_pair_encode(piece, &self.encoder)); } ret } @@ -288,14 +310,14 @@ impl CoreBPE { } None => break, } - } + }; // last_piece_token_len is how many tokens came from the last regex split. This is used // for determining unstable tokens, since you can't merge across (stable) regex splits (ret, last_piece_token_len) } - pub(crate) fn _increase_last_piece_token_len( + fn _increase_last_piece_token_len( &self, tokens: Vec, mut last_piece_token_len: usize, @@ -315,7 +337,7 @@ impl CoreBPE { token_bytes .iter() .rev() - .all(|&b| [b' ', b'\n', b'\t'].contains(&b)) + .all(|&b| [b' ', b'\n', b'\r', b'\t'].contains(&b)) }) .unwrap_or(false) }; @@ -334,7 +356,7 @@ impl CoreBPE { (tokens, last_piece_token_len) } - pub fn _encode_unstable_native( + fn _encode_unstable_native( &self, text: &str, allowed_special: &HashSet<&str>, @@ -365,190 +387,184 @@ impl CoreBPE { // This is the easy bit. Just find all single tokens that start with unstable_bytes // (including tokens that exactly match unstable_bytes) // Separating this from the loop below helps with performance in a common case. - let mut point = self - .sorted_token_bytes - .partition_point(|x| x.as_slice() < unstable_bytes.as_slice()); - while point < self.sorted_token_bytes.len() - && self.sorted_token_bytes[point].starts_with(&unstable_bytes) - { - completions.insert(vec![ - self.encoder[self.sorted_token_bytes[point].as_slice()], - ]); - point += 1; + let point = unstable_bytes.as_slice(); + for tokens in &self.sorted_token_bytes { + let s = tokens.as_slice(); + if s < point { + continue; + } else if s == point { + // s == point + let token = self.encoder[tokens]; + completions.insert(vec![token]); + } else { + // s > point + // Check whether s starts with point + if s.starts_with(point) { + let token = self.encoder[tokens]; + completions.insert(vec![token]); + } else { + // Otherwise, try to skip many bytes + if s.len() >= point.len() { + // Since this optimization is complex and not critical for our use case, + // we'll skip it for now + break; + } + } + } } - // Now apply even more brute force. At every (other) possible position for the straddling - // token, concatenate additional bytes from that token (if any) to unstable_bytes, - // and retokenise the whole thing and see what we get. + // Now apply even more heuristics to find other likely continuations + // It's important to keep this logic fast since this gets called a lot + // TODO: this doesn't do anything if there are no possible continuations for i in 1..unstable_bytes.len() { let prefix = &unstable_bytes[..i]; let suffix = &unstable_bytes[i..]; - let mut point = self - .sorted_token_bytes - .partition_point(|x| x.as_slice() < suffix); - // TODO: Perf optimisation if suffix starts with " "? 
- while point < self.sorted_token_bytes.len() - && self.sorted_token_bytes[point].starts_with(suffix) - { - let possibility = [prefix, self.sorted_token_bytes[point].as_slice()].concat(); - let encoded = match std::str::from_utf8(&possibility) { - // Morally, this is byte_pair_encode(&possibility, &self.encoder) - // But we might have introduced a regex split which would prevent merges. - // (particularly possible in the presence of unstable regex splits) - // So convert to UTF-8 and do regex splitting. - // E.g. with cl100k_base " !" gets split to " " + " !", - // but byte_pair_encode(" !") != byte_pair_encode(" ") - Ok(s) => self.encode_ordinary(s), - - // Technically, whether or not this arm is correct depends on whether there - // would be a regex split before the UTF-8 truncation point. - // Probably niche enough that no one will ever notice (after all, people didn't - // notice all the big holes in the previous unstable token implementation) - Err(_) => byte_pair_encode(&possibility, &self.encoder), - // Something like the following is intriguing but incorrect: - // Err(e) => self.encode_ordinary(unsafe { - // std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()]) - // }), - }; - let mut seq = Vec::new(); - let mut seq_len = 0; - for token in encoded { - seq.push(token); - seq_len += self.decoder[&token].len(); - if seq_len >= unstable_bytes.len() { - break; + let mut tokens = Vec::with_capacity(5); + // This is a leaf of the BPE tree, so the token must be encoded as itself if it exists + if let Some(&token) = self.encoder.get(prefix) { + tokens.push(token); + } else { + // This is not a leaf of the BPE tree, so it must be encoded as a sequence of + // tokens. Do one step of BPE and then recurse + let pairs = byte_pair_split(prefix, &self.encoder); + if let Some(pair) = pairs.first() { + if pair.len() == 1 { + tokens.push(self.encoder[&vec![pair[0]]]); + } else if let Some(&token) = self.encoder.get(*pair) { + tokens.push(token); + } else { + // We would have to do another step of BPE here, but that's too slow + // Just skip this token + continue; } + // TODO: this is a bit inefficient, but I think it's rare + tokens.extend(byte_pair_encode(&prefix[pair.len()..], &self.encoder)); + } else { + // I don't think this is reachable, but it's hard to tell + continue; } - completions.insert(seq); - point += 1; } - } - // This is also not straightforward. While we generally assume that regex splits are stable, - // unfortunately, they are not. That is, if adding bytes were to make a split appear in - // unstable_bytes, this could make tokens possible which our logic would otherwise think - // would be merged. - // For example, with gpt2, the use of \s+(?!\S) means that "\n\n" could - // develop a split, e.g. "\n\n0" splits into "\n"+"\n"+"0", making "\n" a possible token. 
- // Here is a quick and dirty fix: - // This isn't right if we ever remove \s+(?!\S) - if unstable_bytes.len() > 1 { - let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice()); - if unstable_bytes.len() - last_decoded.1 > 0 - && last_decoded.0.map_or(false, |c| c.is_whitespace()) - { - let mut reencoded = byte_pair_encode( - &unstable_bytes[..unstable_bytes.len() - last_decoded.1], - &self.encoder, - ); - reencoded.extend(byte_pair_encode( - &unstable_bytes[unstable_bytes.len() - last_decoded.1..], - &self.encoder, - )); - completions.insert(reencoded); + for tokens_tmp in &self.sorted_token_bytes { + let s = tokens_tmp.as_slice(); + if s < suffix { + continue; + } else if s == suffix { + tokens.push(self.encoder[tokens_tmp]); + completions.insert(tokens); + break; + } else { + // s > suffix + if s.starts_with(suffix) { + tokens.push(self.encoder[tokens_tmp]); + completions.insert(tokens); + } + break; + } } } + // This is also a valid continuation of unstable_bytes (any token that starts with unstable_bytes) + completions.insert(vec![]); + (tokens, completions) } - pub fn new( - encoder: E, - special_tokens_encoder: SE, - pattern: &str, - ) -> Result> - where - E: IntoIterator, Rank)>, - SE: IntoIterator, - NSE: IntoIterator, - { - Self::new_internal( - HashMap::from_iter(encoder), - HashMap::from_iter(special_tokens_encoder), - pattern, - ) + pub fn encode_with_special_tokens(&self, text: &str) -> Vec { + let special_regex = self._get_tl_special_regex(); + let regex = self._get_tl_regex(); + let mut ret = vec![]; + + let mut start = 0; + loop { + let mat = special_regex.find_from_pos(text, start).unwrap(); + + // First, handle any text before the special token + let end = mat.as_ref().map_or(text.len(), |m| m.start()); + for m in regex.find_iter(&text[start..end]) { + let piece = m.unwrap().as_str().as_bytes(); + if let Some(token) = self.encoder.get(piece) { + ret.push(*token); + continue; + } + ret.extend(&byte_pair_encode(piece, &self.encoder)); + } + + match mat { + Some(m) => { + let piece = m.as_str(); + if let Some(token) = self.special_tokens_encoder.get(piece) { + ret.push(*token); + start = m.end(); + } else { + // This should never happen, but handle it gracefully + eprintln!("Special token not found: {}", piece); + start = m.end(); + } + } + None => break, + } + } + + ret } - pub(crate) fn new_internal( + fn new_internal( encoder: HashMap, Rank>, special_tokens_encoder: HashMap, pattern: &str, - ) -> Result> { - let regex = Regex::new(pattern)?; - - let special_regex = { - let parts = special_tokens_encoder - .keys() - .map(|s| fancy_regex::escape(s)) - .collect::>(); - Regex::new(&parts.join("|"))? - }; + ) -> Result { + let regex_vec: Result, _> = (0..MAX_NUM_THREADS) + .map(|_| Regex::new(pattern)) + .collect(); + let regex_vec = regex_vec?; + + let special_regex_vec: Result, _> = (0..MAX_NUM_THREADS) + .map(|_| { + let s = special_tokens_encoder + .keys() + .map(|s| fancy_regex::escape(s)) + .collect::>() + .join("|"); + Regex::new(&s) + }) + .collect(); + let special_regex_vec = special_regex_vec?; - let decoder: HashMap> = - encoder.iter().map(|(k, v)| (*v, k.clone())).collect(); + let mut decoder: HashMap> = + HashMap::with_capacity_and_hasher(encoder.len(), Default::default()); + for (k, v) in &encoder { + decoder.insert(*v, k.clone()); + } - assert!( - encoder.len() == decoder.len(), - "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?" 
- ); + assert!(encoder.len() == decoder.len()); - let special_tokens_decoder: HashMap> = special_tokens_encoder - .iter() - .map(|(k, v)| (*v, k.as_bytes().to_vec())) - .collect(); + let mut special_tokens_decoder: HashMap> = + HashMap::with_capacity_and_hasher(special_tokens_encoder.len(), Default::default()); + for (k, v) in &special_tokens_encoder { + special_tokens_decoder.insert(*v, k.as_bytes().to_vec()); + } // Clone because I don't know how to tell Rust I'm not going to change the map let mut sorted_token_bytes: Vec> = encoder.keys().cloned().collect(); - sorted_token_bytes.sort(); + sorted_token_bytes.sort_unstable(); Ok(Self { encoder, special_tokens_encoder, decoder, special_tokens_decoder, - regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(), - special_regex_tls: (0..MAX_NUM_THREADS) - .map(|_| special_regex.clone()) - .collect(), + regex_tls: regex_vec, + special_regex_tls: special_regex_vec, sorted_token_bytes, }) } - pub fn special_tokens(&self) -> HashSet<&str> { - self.special_tokens_encoder - .keys() - .map(|s| s.as_str()) - .collect() - } - - pub fn encode_with_special_tokens(&self, text: &str) -> Vec { - let allowed_special = self.special_tokens(); - self.encode(text, &allowed_special).0 - } -} - -#[cfg(test)] -mod tests { - use fancy_regex::Regex; - use rustc_hash::FxHashMap as HashMap; - - use crate::{byte_pair_split, Rank}; - - fn setup_ranks() -> HashMap, Rank> { - HashMap::from_iter([(b"ab".to_vec(), 0), (b"cd".to_vec(), 1)]) - } - - #[test] - fn test_simple_characters() { - let ranks = setup_ranks(); - let res = byte_pair_split(b"abcd", &ranks); - assert_eq!(res, vec![b"ab", b"cd"]); - } - - #[test] - fn test_repeated_characters() { - let ranks = setup_ranks(); - let res = byte_pair_split(b"abab", &ranks); - assert_eq!(res, vec![b"ab", b"ab"]); + pub fn new( + encoder: HashMap, Rank>, + special_tokens_encoder: HashMap, + pattern: &str, + ) -> Result { + Self::new_internal(encoder, special_tokens_encoder, pattern) } -} +} \ No newline at end of file diff --git a/src/tiktoken.udl b/src/tiktoken.udl index 7c236c6f..623f818c 100644 --- a/src/tiktoken.udl +++ b/src/tiktoken.udl @@ -1,56 +1,22 @@ namespace tiktoken { [Throws=TiktokenError] - CoreBpe new_core_bpe(record encoder, record special_tokens_encoder, string pattern); + CoreBpe new_core_bpe( + record, u32> encoder, + record special_tokens_encoder, + string pattern + ); }; [Error] -enum TiktokenError { - "ValueError", - "KeyError", - "DecodeError", -}; - -dictionary EncodingResult { - sequence tokens; - u64 last_piece_token_len; -}; - -dictionary UnstableEncodingResult { - sequence tokens; - sequence> completions; +interface TiktokenError { + RegexError(string message); + DecodeError(string message); }; interface CoreBpe { - constructor(record encoder, record special_tokens_encoder, string pattern); - - sequence encode_ordinary(string text); - sequence encode(string text, sequence allowed_special); - - EncodingResult encode_with_details(string text, sequence allowed_special); - - UnstableEncodingResult encode_with_unstable(string text, sequence allowed_special); - - sequence encode_bytes(bytes input); - - [Throws=TiktokenError] - u32 encode_single_token(bytes piece); - - sequence encode_single_piece(bytes piece); - - [Throws=TiktokenError] - bytes decode_bytes(sequence tokens); - - [Throws=TiktokenError] - bytes decode_single_token_bytes(u32 token); - - sequence token_byte_values(); - - sequence special_tokens(); - + sequence encode_ordinary(string text); sequence encode_with_special_tokens(string 
text); - - u32 max_token_value(); - - u32 n_vocab(); + [Throws=TiktokenError] + sequence decode_bytes(sequence tokens); }; \ No newline at end of file diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs index befb4d84..331ad500 100644 --- a/src/uniffi_bindings.rs +++ b/src/uniffi_bindings.rs @@ -1,245 +1,69 @@ -use std::collections::{HashMap as StdHashMap, HashSet}; +use std::collections::HashMap as StdHashMap; use std::sync::Arc; use rustc_hash::FxHashMap as HashMap; -use base64::Engine; use crate::{CoreBPE as CoreBPEInternal, Rank}; +// UniFfiTag is auto-generated by the scaffolding macro + #[derive(Debug, thiserror::Error)] pub enum TiktokenError { - #[error("Value error: {0}")] - ValueError(String), - #[error("Key error: {0}")] - KeyError(String), - #[error("Decode error: {0}")] - DecodeError(String), -} - -impl From for TiktokenError { - fn from(err: crate::DecodeKeyError) -> Self { - TiktokenError::KeyError(format!("Invalid token for decoding: {}", err.token)) - } -} - -impl From for TiktokenError { - fn from(err: crate::DecodeError) -> Self { - TiktokenError::DecodeError(err.message) - } -} - -#[derive(Debug)] -pub struct EncodingResult { - pub tokens: Vec, - pub last_piece_token_len: u64, -} - -#[derive(Debug)] -pub struct UnstableEncodingResult { - pub tokens: Vec, - pub completions: Vec>, + #[error("Regex error: {message}")] + RegexError { message: String }, + #[error("Decode error: {message}")] + DecodeError { message: String }, } +/// Minimal wrapper around CoreBPE for UniFFI +/// All base64 encoding/decoding for non-UTF8 tokens is handled in Swift #[derive(Clone)] pub struct CoreBpe { - inner: Arc, + inner: CoreBPEInternal, } impl CoreBpe { pub fn new( - encoder: StdHashMap, + encoder: StdHashMap, u32>, special_tokens_encoder: StdHashMap, pattern: String, - ) -> Self { - // Convert String keys to Vec for the encoder - // Handle base64-encoded byte sequences for non-UTF8 tokens - let byte_encoder: HashMap, Rank> = encoder - .into_iter() - .map(|(k, v)| { - if k.starts_with("base64:") { - // Decode base64 for non-UTF8 sequences - let b64_str = &k[7..]; - match base64::engine::general_purpose::STANDARD.decode(b64_str) { - Ok(bytes) => (bytes, v), - Err(e) => { - eprintln!("Failed to decode base64 token {}: {}", k, e); - (k.into_bytes(), v) - } - } - } else { - // Regular UTF-8 string - (k.into_bytes(), v) - } - }) - .collect(); + ) -> Result { + // Convert to the expected HashMap type + let encoder: HashMap, Rank> = encoder.into_iter().collect(); + let special_tokens_encoder: HashMap = special_tokens_encoder.into_iter().collect(); - let special_tokens_encoder: HashMap = special_tokens_encoder - .into_iter() - .collect(); - - let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) - .expect("Failed to create CoreBPE"); - - Self { - inner: Arc::new(inner), - } + let inner = CoreBPEInternal::new(encoder, special_tokens_encoder, &pattern) + .map_err(|e| TiktokenError::RegexError { message: e.to_string() })?; + + Ok(Self { inner }) } - - pub fn encode_ordinary(&self, text: String) -> Vec { - self.inner.encode_ordinary(&text) - } - + pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { + use std::collections::HashSet; let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); self.inner.encode(&text, &allowed_special).0 } - - pub fn encode_with_details(&self, text: String, allowed_special: Vec) -> EncodingResult { - let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| 
s.as_str()).collect(); - let (tokens, last_piece_token_len) = self.inner.encode(&text, &allowed_special); - EncodingResult { - tokens, - last_piece_token_len: last_piece_token_len as u64, - } - } - - pub fn encode_with_unstable( - &self, - text: String, - allowed_special: Vec, - ) -> UnstableEncodingResult { - let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); - let (tokens, completions) = self.inner._encode_unstable_native(&text, &allowed_special); - UnstableEncodingResult { - tokens, - completions: completions.into_iter().collect(), - } - } - - pub fn encode_bytes(&self, input: Vec) -> Vec { - match std::str::from_utf8(&input) { - Ok(text) => self.inner.encode_ordinary(text), - Err(e) => { - let text = unsafe { std::str::from_utf8_unchecked(&input[..e.valid_up_to()]) }; - let (tokens, last_piece_token_len) = self.inner.encode(text, &HashSet::new()); - let (mut tokens, last_piece_token_len) = self - .inner - ._increase_last_piece_token_len(tokens, last_piece_token_len); - - let mut unstable_bytes; - if !tokens.is_empty() && last_piece_token_len > 0 { - unstable_bytes = self - .inner - .decode_bytes(&tokens[tokens.len() - last_piece_token_len..]) - .unwrap(); - unstable_bytes.extend_from_slice(&input[e.valid_up_to()..]); - tokens.truncate(tokens.len() - last_piece_token_len); - } else { - unstable_bytes = input[e.valid_up_to()..].to_vec(); - } - - if !unstable_bytes.is_empty() { - match self.inner.encoder.get(&unstable_bytes) { - Some(token) => tokens.push(*token), - None => { - tokens.extend(&crate::byte_pair_encode(&unstable_bytes, &self.inner.encoder)) - } - } - } - tokens - } - } - } - - pub fn encode_single_token(&self, piece: Vec) -> Result { - if let Some(token) = self.inner.encoder.get(&piece).copied() { - return Ok(token); - } - if let Ok(piece_str) = std::str::from_utf8(&piece) { - if let Some(token) = self.inner.special_tokens_encoder.get(piece_str).copied() { - return Ok(token); - } - } - Err(TiktokenError::KeyError(format!( - "Token not found: {:?}", - piece - ))) - } - - pub fn encode_single_piece(&self, piece: Vec) -> Vec { - if piece.is_empty() { - return vec![]; - } - if let Some(token) = self.inner.encoder.get(&piece) { - return vec![*token]; - } - crate::byte_pair_encode(&piece, &self.inner.encoder) - } - - pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { - self.inner.decode_bytes(&tokens).map_err(|e| e.into()) - } - - pub fn decode_single_token_bytes(&self, token: u32) -> Result, TiktokenError> { - if let Some(bytes) = self.inner.decoder.get(&token) { - return Ok(bytes.clone()); - } - if let Some(bytes) = self.inner.special_tokens_decoder.get(&token) { - return Ok(bytes.clone()); - } - Err(TiktokenError::KeyError(format!("Token not found: {}", token))) - } - - pub fn token_byte_values(&self) -> Vec> { - self.inner.sorted_token_bytes.clone() - } - - pub fn special_tokens(&self) -> Vec { - self.inner - .special_tokens_encoder - .keys() - .cloned() - .collect() + + pub fn encode_ordinary(&self, text: String) -> Vec { + self.inner.encode_ordinary(&text) } - + pub fn encode_with_special_tokens(&self, text: String) -> Vec { self.inner.encode_with_special_tokens(&text) } - pub fn max_token_value(&self) -> u32 { - // Find the maximum value among regular and special tokens - let max_regular = self.inner.encoder.values().max().copied().unwrap_or(0); - let max_special = self.inner.special_tokens_encoder.values().max().copied().unwrap_or(0); - max_regular.max(max_special) - } - - pub fn n_vocab(&self) -> u32 { - // For 
backwards compatibility, n_vocab is max_token_value + 1 - self.max_token_value() + 1 + pub fn decode_bytes(&self, tokens: Vec) -> Result, TiktokenError> { + self.inner.decode_bytes(&tokens) + .map_err(|e| TiktokenError::DecodeError { message: format!("Token {} not found", e.token) }) } } +/// Create a new CoreBpe instance pub fn new_core_bpe( - encoder: StdHashMap, + encoder: StdHashMap, u32>, special_tokens_encoder: StdHashMap, pattern: String, ) -> Result, TiktokenError> { - // Convert String keys to Vec for the encoder - let byte_encoder: HashMap, Rank> = encoder - .into_iter() - .map(|(k, v)| (k.into_bytes(), v)) - .collect(); - - let special_tokens_encoder: HashMap = special_tokens_encoder - .into_iter() - .collect(); - - let inner = CoreBPEInternal::new_internal(byte_encoder, special_tokens_encoder, &pattern) - .map_err(|e| TiktokenError::ValueError(e.to_string()))?; - - Ok(Arc::new(CoreBpe { - inner: Arc::new(inner), - })) + Ok(Arc::new(CoreBpe::new(encoder, special_tokens_encoder, pattern)?)) } -uniffi::include_scaffolding!("tiktoken"); - +uniffi::include_scaffolding!("tiktoken"); \ No newline at end of file From 77dda5e229e3f1efaf7d693bbe61a122da1836c1 Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 11 Aug 2025 09:59:01 -0700 Subject: [PATCH 5/6] udpates --- README.md | 37 ----- TestTiktoken/Package.swift | 2 +- TestTiktoken/Sources/TestTiktoken/main.swift | 144 +++++++++++++++---- src/uniffi_bindings.rs | 2 +- 4 files changed, 119 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 9025fe23..0d3ab8fe 100644 --- a/README.md +++ b/README.md @@ -22,43 +22,6 @@ The tokeniser API is documented in `tiktoken/core.py`. Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb). -## Swift Bindings - -This fork includes Swift bindings for tiktoken, allowing you to use the same high-performance BPE tokenizer in iOS, macOS, and other Apple platform applications. - -### Quick Start (Swift) - -```swift -import TiktokenSwift - -// Load an encoding -let encoder = try await CoreBpe.cl100kBase() - -// Encode text to tokens -let tokens = encoder.encode(text: "hello world", allowedSpecial: []) - -// Decode tokens back to text -let decoded = try encoder.decodeBytes(tokens: tokens) -let text = String(data: decoded, encoding: .utf8)! -``` - -### Installation (Swift Package Manager) - -Add the TiktokenSwift package to your project: - -1. In Xcode, go to File โ†’ Add Package Dependencies -2. Add the local package from `TiktokenSwift/` directory - -Or add to your `Package.swift`: -```swift -dependencies: [ - .package(path: "../path/to/tiktoken/TiktokenSwift") -] -``` - -For detailed Swift documentation, see [SWIFT_GUIDE.md](SWIFT_GUIDE.md). 
- - ## Performance `tiktoken` is between 3-6x faster than a comparable open source tokeniser: diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift index 1feaff1e..c9ea47fa 100644 --- a/TestTiktoken/Package.swift +++ b/TestTiktoken/Package.swift @@ -7,7 +7,7 @@ let package = Package( .macOS(.v10_15) ], dependencies: [ - .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift") + .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift/TiktokenSwift") ], targets: [ .executableTarget( diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift index ec74b2a2..eaa8e377 100644 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ b/TestTiktoken/Sources/TestTiktoken/main.swift @@ -1,39 +1,129 @@ import Foundation import TiktokenSwift -print("๐Ÿงช Testing TiktokenSwift...") -print("=" * 50) +print("๐Ÿงช Testing TiktokenSwift with Latest Models...") +print("=" * 60) + +// Model information from upstream +let latestModels = [ + "GPT-5": "o200k_base", + "GPT-4.5": "o200k_base", + "GPT-4.1": "o200k_base", + "o3": "o200k_base", + "o4-mini": "o200k_base", + "gpt-oss": "o200k_harmony" +] + +let encodings = [ + "cl100k_base": "Used by GPT-4, GPT-3.5-turbo", + "o200k_base": "Used by GPT-5, GPT-4.5, GPT-4.1, o1, o3, o4-mini, GPT-4o", + "o200k_harmony": "Used by gpt-oss models, includes special tokens for structured output" +] + +print("\n๐Ÿ“Š Latest Model Support (from upstream tiktoken v0.11.0):") +print("-" * 60) +for (model, encoding) in latestModels { + print(" โ€ข \(model.padding(toLength: 12, withPad: " ", startingAt: 0)) โ†’ \(encoding)") +} + +print("\n๐Ÿ”ค Available Encodings:") +print("-" * 60) +for (encoding, description) in encodings { + print(" โ€ข \(encoding.padding(toLength: 15, withPad: " ", startingAt: 0)) : \(description)") +} + +print("\n" + "=" * 60) +print("๐Ÿงช Testing Basic Encoding/Decoding...") +print("-" * 60) do { - // Create a test encoder + // Create a test encoder (simulating cl100k_base) let encoder = try TiktokenHelper.createTestEncoder() - print("โœ… Successfully created encoder") - - // Test encoding - let text = "hello world!" - let tokens = encoder.encodeText(text) - print("\n๐Ÿ“ Original text: '\(text)'") - print("๐Ÿ”ข Encoded tokens: \(tokens)") - - // Test decoding - if let decoded = encoder.decodeTokens(tokens) { - print("๐Ÿ“– Decoded text: '\(decoded)'") - print("โœ… Decoding successful!") - } else { - print("โŒ Failed to decode tokens") + print("โœ… Successfully created test encoder") + + // Test texts including new model references + let testTexts = [ + "Hello, GPT-5!", + "Testing GPT-4.5 and GPT-4.1 models", + "The new o3 and o4-mini models are fast!", + "Using o200k_harmony encoding for structured output" + ] + + for text in testTexts { + print("\n๐Ÿ“ Original text: '\(text)'") + + // Regular encoding + let tokens = encoder.encodeText(text) + print("๐Ÿ”ข Encoded tokens (\(tokens.count) tokens): \(tokens)") + + // Decoding + if let decoded = encoder.decodeTokens(tokens) { + print("๐Ÿ“– Decoded text: '\(decoded)'") + let isMatch = decoded == text + print(isMatch ? "โœ… Perfect match!" 
: "โš ๏ธ Text differs (expected for test encoder)") + } else { + print("โŒ Failed to decode tokens") + } } - // Test encoding with special tokens - let textWithSpecial = "hello <|endoftext|> world" - let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: textWithSpecial) - print("\n๐Ÿ“ Text with special: '\(textWithSpecial)'") - print("๐Ÿ”ข Encoded tokens: \(tokensWithSpecial)") + print("\n" + "=" * 60) + print("๐Ÿ”ฌ Testing Special Tokens (o200k_harmony style)...") + print("-" * 60) - // Test ordinary encoding (without special tokens) - let ordinaryTokens = encoder.encodeOrdinary(text: text) - print("\n๐Ÿ“ Ordinary encoding: \(ordinaryTokens)") + // Test with special tokens that would be in o200k_harmony + let specialTokenTests = [ + "hello <|endoftext|> world", + "<|startoftext|>Begin prompt<|endoftext|>", + "Constrained output: <|constrain|>JSON<|return|>{}" + ] - print("\nโœ… All tests passed!") + for text in specialTokenTests { + print("\n๐Ÿ“ Text with special: '\(text)'") + let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: text) + print("๐Ÿ”ข Encoded with special: \(tokensWithSpecial)") + + let ordinaryTokens = encoder.encodeOrdinary(text: text) + print("๐Ÿ”ข Encoded ordinary: \(ordinaryTokens)") + + if ordinaryTokens.count != tokensWithSpecial.count { + print("โœ… Special tokens detected and handled differently") + } + } + + print("\n" + "=" * 60) + print("๐Ÿ“Š Encoding Comparison Examples:") + print("-" * 60) + + let comparisonText = "GPT-5 is the latest model from OpenAI" + print("\n๐Ÿ“ Sample text: '\(comparisonText)'") + + // Simulate different encoding behaviors + let regularTokens = encoder.encodeText(comparisonText) + let specialTokens = encoder.encodeWithSpecialTokens(text: comparisonText) + + print("\n Regular encoding (\(regularTokens.count) tokens):") + print(" \(regularTokens)") + + print("\n With special tokens (\(specialTokens.count) tokens):") + print(" \(specialTokens)") + + // Token count comparison + print("\n๐Ÿ“ˆ Token Efficiency:") + print(" โ€ข Characters: \(comparisonText.count)") + print(" โ€ข Tokens: \(regularTokens.count)") + print(" โ€ข Ratio: \(String(format: "%.2f", Double(comparisonText.count) / Double(regularTokens.count))) chars/token") + + print("\n" + "=" * 60) + print("โœ… All tests completed successfully!") + print("\n๐Ÿ’ก Note: This demo uses a test encoder. For production use:") + print(" 1. Load actual encoding data (cl100k_base.json or o200k_base.json)") + print(" 2. Use appropriate encoding for your model (see model list above)") + print(" 3. 
Handle special tokens based on your use case") + print("\n๐Ÿ” Key Updates from upstream tiktoken:") + print(" โ€ข GPT-5 support added (uses o200k_base encoding)") + print(" โ€ข New models: GPT-4.5, GPT-4.1, o3, o4-mini") + print(" โ€ข New encoding: o200k_harmony for structured output") + print(" โ€ข Performance improvements and better error handling") } catch { print("โŒ Error: \(error)") @@ -45,4 +135,4 @@ extension String { static func *(lhs: String, rhs: Int) -> String { String(repeating: lhs, count: rhs) } -} +} \ No newline at end of file diff --git a/src/uniffi_bindings.rs b/src/uniffi_bindings.rs index 331ad500..415d940b 100644 --- a/src/uniffi_bindings.rs +++ b/src/uniffi_bindings.rs @@ -40,7 +40,7 @@ impl CoreBpe { pub fn encode(&self, text: String, allowed_special: Vec) -> Vec { use std::collections::HashSet; let allowed_special: HashSet<&str> = allowed_special.iter().map(|s| s.as_str()).collect(); - self.inner.encode(&text, &allowed_special).0 + self.inner.encode(&text, &allowed_special).unwrap().0 } pub fn encode_ordinary(&self, text: String) -> Vec { From 04e8145b1b1ff94d777d27e228bdbc1645b6e14b Mon Sep 17 00:00:00 2001 From: Nick Arner Date: Mon, 11 Aug 2025 10:09:25 -0700 Subject: [PATCH 6/6] remove test target --- TestTiktoken/Package.swift | 20 --- TestTiktoken/Sources/TestTiktoken/main.swift | 138 ------------------- 2 files changed, 158 deletions(-) delete mode 100644 TestTiktoken/Package.swift delete mode 100644 TestTiktoken/Sources/TestTiktoken/main.swift diff --git a/TestTiktoken/Package.swift b/TestTiktoken/Package.swift deleted file mode 100644 index c9ea47fa..00000000 --- a/TestTiktoken/Package.swift +++ /dev/null @@ -1,20 +0,0 @@ -// swift-tools-version: 5.9 -import PackageDescription - -let package = Package( - name: "TestTiktoken", - platforms: [ - .macOS(.v10_15) - ], - dependencies: [ - .package(path: "/Users/nicholasarner/Development/Active/TiktokenSwift/TiktokenSwift") - ], - targets: [ - .executableTarget( - name: "TestTiktoken", - dependencies: [ - .product(name: "TiktokenSwift", package: "TiktokenSwift") - ] - ), - ] -) diff --git a/TestTiktoken/Sources/TestTiktoken/main.swift b/TestTiktoken/Sources/TestTiktoken/main.swift deleted file mode 100644 index eaa8e377..00000000 --- a/TestTiktoken/Sources/TestTiktoken/main.swift +++ /dev/null @@ -1,138 +0,0 @@ -import Foundation -import TiktokenSwift - -print("๐Ÿงช Testing TiktokenSwift with Latest Models...") -print("=" * 60) - -// Model information from upstream -let latestModels = [ - "GPT-5": "o200k_base", - "GPT-4.5": "o200k_base", - "GPT-4.1": "o200k_base", - "o3": "o200k_base", - "o4-mini": "o200k_base", - "gpt-oss": "o200k_harmony" -] - -let encodings = [ - "cl100k_base": "Used by GPT-4, GPT-3.5-turbo", - "o200k_base": "Used by GPT-5, GPT-4.5, GPT-4.1, o1, o3, o4-mini, GPT-4o", - "o200k_harmony": "Used by gpt-oss models, includes special tokens for structured output" -] - -print("\n๐Ÿ“Š Latest Model Support (from upstream tiktoken v0.11.0):") -print("-" * 60) -for (model, encoding) in latestModels { - print(" โ€ข \(model.padding(toLength: 12, withPad: " ", startingAt: 0)) โ†’ \(encoding)") -} - -print("\n๐Ÿ”ค Available Encodings:") -print("-" * 60) -for (encoding, description) in encodings { - print(" โ€ข \(encoding.padding(toLength: 15, withPad: " ", startingAt: 0)) : \(description)") -} - -print("\n" + "=" * 60) -print("๐Ÿงช Testing Basic Encoding/Decoding...") -print("-" * 60) - -do { - // Create a test encoder (simulating cl100k_base) - let encoder = try TiktokenHelper.createTestEncoder() - 
print("โœ… Successfully created test encoder") - - // Test texts including new model references - let testTexts = [ - "Hello, GPT-5!", - "Testing GPT-4.5 and GPT-4.1 models", - "The new o3 and o4-mini models are fast!", - "Using o200k_harmony encoding for structured output" - ] - - for text in testTexts { - print("\n๐Ÿ“ Original text: '\(text)'") - - // Regular encoding - let tokens = encoder.encodeText(text) - print("๐Ÿ”ข Encoded tokens (\(tokens.count) tokens): \(tokens)") - - // Decoding - if let decoded = encoder.decodeTokens(tokens) { - print("๐Ÿ“– Decoded text: '\(decoded)'") - let isMatch = decoded == text - print(isMatch ? "โœ… Perfect match!" : "โš ๏ธ Text differs (expected for test encoder)") - } else { - print("โŒ Failed to decode tokens") - } - } - - print("\n" + "=" * 60) - print("๐Ÿ”ฌ Testing Special Tokens (o200k_harmony style)...") - print("-" * 60) - - // Test with special tokens that would be in o200k_harmony - let specialTokenTests = [ - "hello <|endoftext|> world", - "<|startoftext|>Begin prompt<|endoftext|>", - "Constrained output: <|constrain|>JSON<|return|>{}" - ] - - for text in specialTokenTests { - print("\n๐Ÿ“ Text with special: '\(text)'") - let tokensWithSpecial = encoder.encodeWithSpecialTokens(text: text) - print("๐Ÿ”ข Encoded with special: \(tokensWithSpecial)") - - let ordinaryTokens = encoder.encodeOrdinary(text: text) - print("๐Ÿ”ข Encoded ordinary: \(ordinaryTokens)") - - if ordinaryTokens.count != tokensWithSpecial.count { - print("โœ… Special tokens detected and handled differently") - } - } - - print("\n" + "=" * 60) - print("๐Ÿ“Š Encoding Comparison Examples:") - print("-" * 60) - - let comparisonText = "GPT-5 is the latest model from OpenAI" - print("\n๐Ÿ“ Sample text: '\(comparisonText)'") - - // Simulate different encoding behaviors - let regularTokens = encoder.encodeText(comparisonText) - let specialTokens = encoder.encodeWithSpecialTokens(text: comparisonText) - - print("\n Regular encoding (\(regularTokens.count) tokens):") - print(" \(regularTokens)") - - print("\n With special tokens (\(specialTokens.count) tokens):") - print(" \(specialTokens)") - - // Token count comparison - print("\n๐Ÿ“ˆ Token Efficiency:") - print(" โ€ข Characters: \(comparisonText.count)") - print(" โ€ข Tokens: \(regularTokens.count)") - print(" โ€ข Ratio: \(String(format: "%.2f", Double(comparisonText.count) / Double(regularTokens.count))) chars/token") - - print("\n" + "=" * 60) - print("โœ… All tests completed successfully!") - print("\n๐Ÿ’ก Note: This demo uses a test encoder. For production use:") - print(" 1. Load actual encoding data (cl100k_base.json or o200k_base.json)") - print(" 2. Use appropriate encoding for your model (see model list above)") - print(" 3. Handle special tokens based on your use case") - print("\n๐Ÿ” Key Updates from upstream tiktoken:") - print(" โ€ข GPT-5 support added (uses o200k_base encoding)") - print(" โ€ข New models: GPT-4.5, GPT-4.1, o3, o4-mini") - print(" โ€ข New encoding: o200k_harmony for structured output") - print(" โ€ข Performance improvements and better error handling") - -} catch { - print("โŒ Error: \(error)") - exit(1) -} - -// Helper to repeat string -extension String { - static func *(lhs: String, rhs: Int) -> String { - String(repeating: lhs, count: rhs) - } -} \ No newline at end of file