diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 5c06bd4b3..874f2b901 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -171,6 +171,26 @@ impl WordPiece {
         Ok(vocab)
     }
 
+    /// Read an in-memory vocab file (one token per line) into a `Vocab`,
+    /// mapping each line's token (with trailing whitespace trimmed) to its
+    /// zero-based line index. In-memory counterpart of `read_file`.
+    pub fn read_bytes(vocab: &[u8]) -> Result<Vocab> {
+        let file = BufReader::new(vocab);
+
+        let mut vocab = HashMap::new();
+        for (index, line) in file.lines().enumerate() {
+            let line = line?;
+            vocab.insert(line.trim_end().to_owned(), index as u32);
+        }
+
+        Ok(vocab)
+    }
+
+    /// Deserialize a `WordPiece` model from its serialized JSON bytes.
+    pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self> {
+        let tokenizer = serde_json::from_slice(bytes.as_ref())?;
+        Ok(tokenizer)
+    }
+
     /// Initialize a `WordPiece` model from a vocab mapping file.
     pub fn from_file(vocab: &str) -> WordPieceBuilder {
         WordPiece::builder().files(vocab.to_owned())