Commit 7231900

Add CLI package
1 parent 2264980 commit 7231900

6 files changed: +299 −2 lines

.gitignore

Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,8 @@ build/
 target/
 dist/
 
+tests/models/*/*.kit
+
 .vscode/
 .idea/
 
Cargo.toml

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ members = [
     ".",
     "packages/python",
     "packages/javascript",
+    "packages/cli",
 ]
 
 
README.md

Lines changed: 4 additions & 2 deletions

@@ -32,12 +32,14 @@ Kitoken is a fast and versatile tokenizer for language models compatible with [S
 - **Compact data format**\
   Definitions are stored in an efficient binary format and without merge list.
 
+See also [`kitoken-cli`](./packages/cli) for Kitoken in the command line.
+
 ## Compatibility
 
 Kitoken can load and convert many existing tokenizer formats. Every supported format is [tested](./tests) against the original implementation across a variety of inputs to ensure correctness and compatibility.
 
 > [!NOTE]
-> Most models on [Hugging Face](https://huggingface.co) are supported. Just take the `tokenizer.json` or `spiece.model` of a model and load it into Kitoken.
+> Most models on [Hugging Face](https://huggingface.co) are supported. Just take the `tokenizer.json` or `spiece.model` and load it into Kitoken.
 
 Kitoken aims to be output-identical with existing implementations for all models. See the notes below for differences in specific cases.
 
@@ -58,7 +60,7 @@ If the model does not contain a trainer definition, `Unigram` is assumed as the
 <summary>Notes</summary>
 
 - SentencePiece uses [different `nfkc` normalization rules in the `nmt_nfkc` and `nmt_nfkc_cf` schemes](https://github.com/google/sentencepiece/blob/master/doc/normalization.md) than during regular `nfkc` normalization. This difference is not entirely additive and prevents the normalization of `～` to `~`. Kitoken uses the regular `nfkc` normalization rules for `nmt_nfkc` and `nmt_nfkc_cf` and normalizes `～` to `~`.
-- SentencePiece's implementation of Unigram merges pieces with the same merge priority differently depending on preceding non-encodable pieces. For example, with `xlnet_base_cased`, SentencePiece encodes `.nnn` and `Զnnn` as `.., 8705, 180` but `ԶԶnnn` as `.., 180, 8705`. Kitoken always merges pieces with the same merge priority in the same order, resulting in `.., 180, 8705` for either case in the example and matching the behavior of Tokenizers.
+- SentencePiece's implementation of Unigram merges pieces with the same merge priority in a different order depending on preceding non-encodable pieces. For example, with `xlnet_base_cased`, SentencePiece encodes `.nnn` and `Զnnn` as `.., 8705, 180` but `ԶԶnnn` as `.., 180, 8705`. Kitoken always merges pieces with the same merge priority in the same order, resulting in `.., 180, 8705` for either case in the example and matching the behavior of Tokenizers.
 
 </details>
 
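The loading flow the note above describes comes down to a couple of calls. A minimal sketch, using the `Definition::from_reader` and `Kitoken::from_definition` APIs the new CLI in this commit also uses; treating the errors as `Box<dyn Error>` and printing the token IDs with `{:?}` are assumptions for illustration:

```rust
use std::error::Error;
use std::fs::File;
use std::io::BufReader;

use kitoken::{Definition, Kitoken};

fn main() -> Result<(), Box<dyn Error>> {
    // Load a `tokenizer.json` (or `spiece.model`) and build the tokenizer.
    let mut reader = BufReader::new(File::open("tokenizer.json")?);
    let definition = Definition::from_reader(&mut reader)?;
    let tokenizer = Kitoken::from_definition(definition)?;
    // Encode with special tokens enabled, as the CLI's `encode` command does.
    let tokens = tokenizer.encode("Hello, world!", true)?;
    println!("{:?}", tokens);
    Ok(())
}
```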
packages/cli/Cargo.toml

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+[package]
+
+name = "kitoken-cli"
+description = "Command line interface for Kitoken, the fast and versatile tokenizer for language models"
+version = "0.10.1"
+edition = "2024"
+resolver = "2"
+publish = false
+workspace = "../.."
+
+
+[[bin]]
+
+name = "kitoken"
+path = "src/main.rs"
+
+
+[dependencies]
+
+kitoken = { path = "../..", features = ["all"] }
+
+log = { version = "0.4.27" }
+clap = { version = "4.5.36", features = [
+    "std",
+    "color",
+    "suggestions",
+    "derive",
+    "unicode",
+    "wrap_help",
+] }
+simple_logger = { version = "5.0" }

packages/cli/README.md

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+# kitoken-cli
+
+[![Crates.io](https://img.shields.io/crates/v/kitoken)](https://crates.io/crates/kitoken)
+[![NPM](https://img.shields.io/npm/v/kitoken)](https://www.npmjs.com/package/kitoken)
+[![PyPI](https://img.shields.io/pypi/v/kitoken)](https://pypi.org/project/kitoken)
+[![Tests & Checks](https://img.shields.io/github/actions/workflow/status/Systemcluster/kitoken/tests.yml?label=tests%20%26%20checks)](https://github.com/Systemcluster/kitoken/actions/workflows/tests.yml)
+
+**Tokenizer for language models.**
+
+<sup>**Tokenize text for Llama, Gemini, GPT-4, DeepSeek, Mistral and many others in the command line.**</sup>
+
+```bash
+# Encode
+kitoken encode deepseek.kit ./texts.txt
+# Decode
+kitoken decode deepseek.kit ./tokens.txt
+```
+
+```bash
+# Compare
+kitoken compare llama4.json llama4.model
+# Convert
+kitoken convert llama4.model
+# Inspect
+kitoken inspect llama4.kit
+```
+
+Install `kitoken-cli` with Cargo:
+
+```bash
+cargo install --git https://github.com/Systemcluster/kitoken
+```
+
+See the main [README](//github.com/Systemcluster/kitoken) for more information.
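The `convert` example above reads a model in any supported format and writes the compact definition next to the input with a `.kit` extension, mirroring the `convert` function in `packages/cli/src/main.rs` below. A minimal sketch of that step; `convert_to_kit` is a hypothetical helper name:

```rust
use std::fs::File;
use std::io::{BufReader, BufWriter};
use std::path::Path;

use kitoken::{Definition, DeserializationError};

/// Load a tokenizer definition from any supported format and write it
/// back in the compact `.kit` format next to the input file.
fn convert_to_kit(path: &Path) -> Result<Definition, DeserializationError> {
    let mut reader = BufReader::new(File::open(path)?);
    let definition = Definition::from_reader(&mut reader)?;
    let mut writer = BufWriter::new(File::create(path.with_extension("kit"))?);
    definition.to_writer(&mut writer)?;
    Ok(definition)
}
```

Called with `llama4.model`, this would produce `llama4.kit` alongside the input, matching the `Convert` example above.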

packages/cli/src/main.rs

Lines changed: 227 additions & 0 deletions

@@ -0,0 +1,227 @@
+use std::fs::File;
+use std::io::{BufReader, BufWriter, Read, Seek};
+use std::path::Path;
+use std::sync::Once;
+
+use clap::Parser;
+use kitoken::{Definition, DeserializationError, Kitoken};
+
+#[derive(Parser)]
+enum Command {
+    #[clap(name = "convert", about = "Convert a tokenizer model to a kitoken definition")]
+    Convert {
+        #[arg(name = "path", help = "Path to the tokenizer model")]
+        path: String,
+    },
+    #[clap(name = "compare", about = "Compare two tokenizer models")]
+    Compare {
+        #[arg(name = "one", help = "Path to the first tokenizer model")]
+        one: String,
+        #[arg(name = "two", help = "Path to the second tokenizer model")]
+        two: String,
+    },
+    #[clap(name = "inspect", about = "Inspect a tokenizer model")]
+    Inspect {
+        #[arg(name = "path", help = "Path to the tokenizer model")]
+        path: String,
+    },
+    #[clap(name = "encode", about = "Encode text into tokens")]
+    Encode {
+        #[arg(name = "model", help = "Path to the tokenizer model")]
+        model: String,
+        #[arg(name = "path", help = "Path to the input file")]
+        input: String,
+    },
+    #[clap(name = "decode", about = "Decode tokens into text")]
+    Decode {
+        #[arg(name = "model", help = "Path to the tokenizer model")]
+        model: String,
+        #[arg(name = "path", help = "Path to the input file")]
+        input: String,
+    },
+}
+
+#[derive(Parser)]
+struct Args {
+    #[clap(subcommand)]
+    command: Command,
+}
+
+static INIT_ENV: Once = Once::new();
+
+pub fn init_env() {
+    INIT_ENV.call_once(|| {
+        simple_logger::SimpleLogger::new()
+            .with_level(log::Level::Info.to_level_filter())
+            .env()
+            .init()
+            .unwrap();
+    });
+}
+
+pub fn main() {
+    init_env();
+
+    let args = Args::parse();
+    match args.command {
+        Command::Convert { path } => {
+            let path = Path::new(&path);
+            let mut paths = Vec::new();
+            if path.is_dir() {
+                for entry in std::fs::read_dir(path).unwrap() {
+                    let entry = entry.unwrap();
+                    let path = entry.path();
+                    if path.is_file() {
+                        paths.push(path);
+                    }
+                }
+            } else if path.is_file() {
+                paths.push(path.to_path_buf());
+            } else {
+                eprintln!("Invalid path: {}", path.display());
+                std::process::exit(1);
+            }
+            for path in paths {
+                convert(&path, true).unwrap_or_else(|error| {
+                    eprintln!("{}", error);
+                    std::process::exit(1);
+                });
+            }
+        }
+        Command::Compare { one, two } => {
+            let one = Path::new(&one);
+            let two = Path::new(&two);
+            let one = convert(one, false).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            let two = convert(two, false).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            if one != two {
+                eprintln!("Models are different");
+                if one.model.vocab() != two.model.vocab() {
+                    let num_diff = one
+                        .model
+                        .vocab()
+                        .iter()
+                        .zip(two.model.vocab())
+                        .filter(|(a, b)| a != b)
+                        .count();
+                    eprintln!("Vocabs are different: {} entries", num_diff);
+                }
+                if one.specials != two.specials {
+                    let num_diff = one
+                        .specials
+                        .iter()
+                        .zip(two.specials.iter())
+                        .filter(|(a, b)| a != b)
+                        .count();
+                    eprintln!("Specials are different: {} entries", num_diff);
+                }
+                if one.config != two.config {
+                    eprintln!("Configs are different");
+                }
+                std::process::exit(1);
+            } else {
+                println!("Models are the same");
+            }
+        }
+        Command::Inspect { path } => {
+            let path = Path::new(&path);
+            let model = convert(path, false).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            println!("Specials: {:#?}", model.specials);
+            println!("{:#?}", model);
+        }
+        Command::Encode { model, input } => {
+            let model = Path::new(&model);
+            let inputp = Path::new(&input);
+            let model = convert(model, false).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            let encoder = Kitoken::from_definition(model).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            let mut buffer = String::with_capacity(1024);
+            if inputp.is_file() {
+                let mut reader = BufReader::new(File::open(inputp).unwrap());
+                reader.read_to_string(&mut buffer).unwrap();
+            } else {
+                println!("No such file \"{}\", assuming literal input", input);
+                buffer.push_str(&input);
+            }
+            let result = encoder.encode(&buffer, true).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            for token in result {
+                print!("{} ", token);
+            }
+            println!()
+        }
+        Command::Decode { model, input } => {
+            let model = Path::new(&model);
+            let inputp = Path::new(&input);
+            let model = convert(model, false).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            let encoder = Kitoken::from_definition(model).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            let mut buffer = String::with_capacity(1024);
+            if inputp.is_file() {
+                let mut reader = BufReader::new(File::open(inputp).unwrap());
+                reader.read_to_string(&mut buffer).unwrap();
+            } else {
+                println!("No such file \"{}\", assuming literal input", input);
+                buffer.push_str(&input);
+            }
+            let tokens = buffer
+                .split(&[' ', ',', '\n'])
+                .filter(|s| !s.is_empty())
+                .map(str::parse)
+                .collect::<Result<Vec<_>, _>>()
+                .unwrap_or_else(|error| {
+                    eprintln!("{}", error);
+                    std::process::exit(1);
+                });
+            let result = encoder.decode(&tokens, true).unwrap_or_else(|error| {
+                eprintln!("{}", error);
+                std::process::exit(1);
+            });
+            println!("{}", String::from_utf8(result).unwrap());
+        }
+    }
+}
+
+pub fn convert(path: &Path, write: bool) -> Result<Definition, DeserializationError> {
+    let mut reader = BufReader::new(File::open(path)?);
+    let definition = Definition::from_reader(&mut reader)?;
+    eprintln!("Definition loaded from {}", path.display());
+    match definition.model {
+        kitoken::Model::BytePair { .. } => eprintln!("Model type: BPE"),
+        kitoken::Model::Unigram { .. } => eprintln!("Model type: Unigram"),
+        kitoken::Model::WordPiece { .. } => eprintln!("Model type: WordPiece"),
+        _ => {}
+    }
+    eprintln!("Vocab size: {}", definition.model.vocab().len());
+    eprintln!("Specials size: {}", definition.specials.len());
+    eprintln!("Input size: {} bytes", reader.stream_position()?);
+    if write {
+        let out = path.with_extension("kit");
+        let mut writer = BufWriter::new(File::create(&out)?);
+        definition.to_writer(&mut writer)?;
+        eprintln!("Definition written to {}", out.display());
+        eprintln!("Output size: {} bytes", writer.stream_position()?);
+    }
+    eprintln!();
+    Ok(definition)
+}
