Skip to content

Commit dc70fd6

Browse files
authored
Merge pull request #6 from ParisNeo/main
Added tokenize and detokenize functions
2 parents fd33930 + ccf3fc9 commit dc70fd6

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

pyllamacpp/model.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,22 @@ def reset(self) -> None:
111111
self._last_n_tokens = [0] * self._n_ctx # n_ctx elements
112112
self._n_past = 0
113113

114+
def tokenize(self, text:str):
    """
    Converts a text string into the model's token ids.

    :param text: text to be tokenized
    :return: List of tokens
    """
    # Third argument enables prepending the BOS token, matching llama.cpp's
    # expectation for prompt tokenization.
    token_ids = pp.llama_tokenize(self._ctx, text, True)
    return token_ids
121+
122+
def detokenize(self, tokens:list):
    """
    Converts a list of token ids back into the text they encode.

    :param tokens: list of token ids to convert
    :return: A string representing the text extracted from the tokens
    """
    # NOTE(review): the original docstring was copy-pasted from tokenize()
    # and wrongly claimed this returns a list of tokens; it returns a string.
    return pp.llama_tokens_to_str(self._ctx, tokens)
129+
114130
def generate(self,
115131
prompt: str,
116132
n_predict: Union[None, int] = None,

src/main.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,27 @@ std::vector<llama_token> llama_tokenize_wrapper(
8787
// return tokens;
8888
//}
8989

90+
std::string llama_tokens_to_str_wrapper(struct llama_context_wrapper* ctx_w, py::array_t<llama_token> tokens_array) {
91+
std::string result;
92+
struct llama_context * ctx = ctx_w->ptr;
93+
bool all_tokens_valid = true;
94+
95+
for (int i = 0; i < tokens_array.size(); i++) {
96+
llama_token token = tokens_array.at(i);
97+
if (token >= llama_n_vocab(ctx)) {
98+
all_tokens_valid = false;
99+
break;
100+
}
101+
102+
result += llama_token_to_str(ctx, token);
103+
}
90104

105+
if (all_tokens_valid) {
106+
return result;
107+
} else {
108+
return "";
109+
}
110+
}
91111

92112
int llama_n_vocab_wrapper(struct llama_context_wrapper * ctx_w){
93113
struct llama_context * ctx = ctx_w->ptr;
@@ -697,6 +717,8 @@ PYBIND11_MODULE(_pyllamacpp, m) {
697717
//@NOTE: to prevent implicit conversion of const char* to unicode on python side, leading to UnicodeDecodeError
698718
return py::bytes(llama_token_to_str_wrapper(ctx_w, token));
699719
});
720+
m.def("llama_tokens_to_str", &llama_tokens_to_str_wrapper);
721+
700722

701723
m.def("llama_token_bos", &llama_token_bos);
702724
m.def("llama_token_eos", &llama_token_eos);

0 commit comments

Comments
 (0)