|
| 1 | +use std::fs::File; |
| 2 | +use std::io::{BufReader, BufWriter, Read, Seek}; |
| 3 | +use std::path::Path; |
| 4 | +use std::sync::Once; |
| 5 | + |
| 6 | +use clap::Parser; |
| 7 | +use kitoken::{Definition, DeserializationError, Kitoken}; |
| 8 | + |
| 9 | +#[derive(Parser)] |
| 10 | +enum Command { |
| 11 | + #[clap(name = "convert", about = "Convert a tokenizer model to a kitoken definition")] |
| 12 | + Convert { |
| 13 | + #[arg(name = "path", help = "Path to the tokenizer model")] |
| 14 | + path: String, |
| 15 | + }, |
| 16 | + #[clap(name = "compare", about = "Compare two tokenizer models")] |
| 17 | + Compare { |
| 18 | + #[arg(name = "one", help = "Path to the first tokenizer model")] |
| 19 | + one: String, |
| 20 | + #[arg(name = "two", help = "Path to the second tokenizer model")] |
| 21 | + two: String, |
| 22 | + }, |
| 23 | + #[clap(name = "inspect", about = "Inspect a tokenizer model")] |
| 24 | + Inspect { |
| 25 | + #[arg(name = "path", help = "Path to the tokenizer model")] |
| 26 | + path: String, |
| 27 | + }, |
| 28 | + #[clap(name = "encode", about = "Encode text into tokens")] |
| 29 | + Encode { |
| 30 | + #[arg(name = "model", help = "Path to the tokenizer model")] |
| 31 | + model: String, |
| 32 | + #[arg(name = "path", help = "Path to the input file")] |
| 33 | + input: String, |
| 34 | + }, |
| 35 | + #[clap(name = "decode", about = "Decode tokens into text")] |
| 36 | + Decode { |
| 37 | + #[arg(name = "model", help = "Path to the tokenizer model")] |
| 38 | + model: String, |
| 39 | + #[arg(name = "path", help = "Path to the input file")] |
| 40 | + input: String, |
| 41 | + }, |
| 42 | +} |
| 43 | + |
| 44 | +#[derive(Parser)] |
| 45 | +struct Args { |
| 46 | + #[clap(subcommand)] |
| 47 | + command: Command, |
| 48 | +} |
| 49 | + |
| 50 | +static INIT_ENV: Once = Once::new(); |
| 51 | + |
| 52 | +pub fn init_env() { |
| 53 | + INIT_ENV.call_once(|| { |
| 54 | + simple_logger::SimpleLogger::new() |
| 55 | + .with_level(log::Level::Info.to_level_filter()) |
| 56 | + .env() |
| 57 | + .init() |
| 58 | + .unwrap(); |
| 59 | + }); |
| 60 | +} |
| 61 | + |
| 62 | +pub fn main() { |
| 63 | + init_env(); |
| 64 | + |
| 65 | + let args = Args::parse(); |
| 66 | + match args.command { |
| 67 | + Command::Convert { path } => { |
| 68 | + let path = Path::new(&path); |
| 69 | + let mut paths = Vec::new(); |
| 70 | + if path.is_dir() { |
| 71 | + for entry in std::fs::read_dir(path).unwrap() { |
| 72 | + let entry = entry.unwrap(); |
| 73 | + let path = entry.path(); |
| 74 | + if path.is_file() { |
| 75 | + paths.push(path); |
| 76 | + } |
| 77 | + } |
| 78 | + } else if path.is_file() { |
| 79 | + paths.push(path.to_path_buf()); |
| 80 | + } else { |
| 81 | + eprintln!("Invalid path: {}", path.display()); |
| 82 | + std::process::exit(1); |
| 83 | + } |
| 84 | + for path in paths { |
| 85 | + convert(&path, true).unwrap_or_else(|error| { |
| 86 | + eprintln!("{}", error); |
| 87 | + std::process::exit(1); |
| 88 | + }); |
| 89 | + } |
| 90 | + } |
| 91 | + Command::Compare { one, two } => { |
| 92 | + let one = Path::new(&one); |
| 93 | + let two = Path::new(&two); |
| 94 | + let one = convert(one, false).unwrap_or_else(|error| { |
| 95 | + eprintln!("{}", error); |
| 96 | + std::process::exit(1); |
| 97 | + }); |
| 98 | + let two = convert(two, false).unwrap_or_else(|error| { |
| 99 | + eprintln!("{}", error); |
| 100 | + std::process::exit(1); |
| 101 | + }); |
| 102 | + if one != two { |
| 103 | + eprintln!("Models are different"); |
| 104 | + if one.model.vocab() != two.model.vocab() { |
| 105 | + let num_diff = one |
| 106 | + .model |
| 107 | + .vocab() |
| 108 | + .iter() |
| 109 | + .zip(two.model.vocab()) |
| 110 | + .filter(|(a, b)| a != b) |
| 111 | + .count(); |
| 112 | + eprintln!("Vocabs are different: {} entries", num_diff); |
| 113 | + } |
| 114 | + if one.specials != two.specials { |
| 115 | + let num_diff = one |
| 116 | + .specials |
| 117 | + .iter() |
| 118 | + .zip(two.specials.iter()) |
| 119 | + .filter(|(a, b)| a != b) |
| 120 | + .count(); |
| 121 | + eprintln!("Specials are different: {} entries", num_diff); |
| 122 | + } |
| 123 | + if one.config != two.config { |
| 124 | + eprintln!("Configs are different"); |
| 125 | + } |
| 126 | + std::process::exit(1); |
| 127 | + } else { |
| 128 | + println!("Models are the same"); |
| 129 | + } |
| 130 | + } |
| 131 | + Command::Inspect { path } => { |
| 132 | + let path = Path::new(&path); |
| 133 | + let model = convert(path, false).unwrap_or_else(|error| { |
| 134 | + eprintln!("{}", error); |
| 135 | + std::process::exit(1); |
| 136 | + }); |
| 137 | + println!("Specials: {:#?}", model.specials); |
| 138 | + println!("{:#?}", model); |
| 139 | + } |
| 140 | + Command::Encode { model, input } => { |
| 141 | + let model = Path::new(&model); |
| 142 | + let inputp = Path::new(&input); |
| 143 | + let model = convert(model, false).unwrap_or_else(|error| { |
| 144 | + eprintln!("{}", error); |
| 145 | + std::process::exit(1); |
| 146 | + }); |
| 147 | + let encoder = Kitoken::from_definition(model).unwrap_or_else(|error| { |
| 148 | + eprintln!("{}", error); |
| 149 | + std::process::exit(1); |
| 150 | + }); |
| 151 | + let mut buffer = String::with_capacity(1024); |
| 152 | + if inputp.is_file() { |
| 153 | + let mut reader = BufReader::new(File::open(inputp).unwrap()); |
| 154 | + reader.read_to_string(&mut buffer).unwrap(); |
| 155 | + } else { |
| 156 | + println!("No such file \"{}\", assuming literal input", input); |
| 157 | + buffer.push_str(&input); |
| 158 | + } |
| 159 | + let result = encoder.encode(&buffer, true).unwrap_or_else(|error| { |
| 160 | + eprintln!("{}", error); |
| 161 | + std::process::exit(1); |
| 162 | + }); |
| 163 | + for token in result { |
| 164 | + print!("{} ", token); |
| 165 | + } |
| 166 | + println!() |
| 167 | + } |
| 168 | + Command::Decode { model, input } => { |
| 169 | + let model = Path::new(&model); |
| 170 | + let inputp = Path::new(&input); |
| 171 | + let model = convert(model, false).unwrap_or_else(|error| { |
| 172 | + eprintln!("{}", error); |
| 173 | + std::process::exit(1); |
| 174 | + }); |
| 175 | + let encoder = Kitoken::from_definition(model).unwrap_or_else(|error| { |
| 176 | + eprintln!("{}", error); |
| 177 | + std::process::exit(1); |
| 178 | + }); |
| 179 | + let mut buffer = String::with_capacity(1024); |
| 180 | + if inputp.is_file() { |
| 181 | + let mut reader = BufReader::new(File::open(input).unwrap()); |
| 182 | + reader.read_to_string(&mut buffer).unwrap(); |
| 183 | + } else { |
| 184 | + println!("No such file \"{}\", assuming literal input", input); |
| 185 | + buffer.push_str(&input); |
| 186 | + } |
| 187 | + let tokens = buffer |
| 188 | + .split(&[' ', ',', '\n']) |
| 189 | + .filter(|s| !s.is_empty()) |
| 190 | + .map(str::parse) |
| 191 | + .collect::<Result<Vec<_>, _>>() |
| 192 | + .unwrap_or_else(|error| { |
| 193 | + eprintln!("{}", error); |
| 194 | + std::process::exit(1); |
| 195 | + }); |
| 196 | + let result = encoder.decode(&tokens, true).unwrap_or_else(|error| { |
| 197 | + eprintln!("{}", error); |
| 198 | + std::process::exit(1); |
| 199 | + }); |
| 200 | + println!("{}", String::from_utf8(result).unwrap()); |
| 201 | + } |
| 202 | + } |
| 203 | +} |
| 204 | + |
| 205 | +pub fn convert(path: &Path, write: bool) -> Result<Definition, DeserializationError> { |
| 206 | + let mut reader = BufReader::new(File::open(path)?); |
| 207 | + let definition = Definition::from_reader(&mut reader)?; |
| 208 | + eprintln!("Definition loaded from {}", path.display()); |
| 209 | + match definition.model { |
| 210 | + kitoken::Model::BytePair { .. } => eprintln!("Model type: BPE"), |
| 211 | + kitoken::Model::Unigram { .. } => eprintln!("Model type: Unigram"), |
| 212 | + kitoken::Model::WordPiece { .. } => eprintln!("Model type: WordPiece"), |
| 213 | + _ => {} |
| 214 | + } |
| 215 | + eprintln!("Vocab size: {}", definition.model.vocab().len()); |
| 216 | + eprintln!("Specials size: {}", definition.specials.len()); |
| 217 | + eprintln!("Input size: {} bytes", reader.stream_position()?); |
| 218 | + if write { |
| 219 | + let out = path.with_extension("kit"); |
| 220 | + let mut writer = BufWriter::new(File::create(&out)?); |
| 221 | + definition.to_writer(&mut writer)?; |
| 222 | + eprintln!("Definition written to {}", out.display()); |
| 223 | + eprintln!("Output size: {} bytes", writer.stream_position()?); |
| 224 | + } |
| 225 | + eprintln!(); |
| 226 | + Ok(definition) |
| 227 | +} |
0 commit comments