diff --git a/CMakeLists.txt b/CMakeLists.txt
index de51c0a17b2f6..5afdcbf46f540 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,9 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
+# llamax
+option(LLAMAX "llama: enable the high-level C++ API" ON)
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -187,6 +190,14 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(pocs)
 endif()
 
+#
+# llamax
+#
+
+if (LLAMAX)
+    add_subdirectory(llamax)
+endif()
+
 #
 # install
 #
diff --git a/llamax/CMakeLists.txt b/llamax/CMakeLists.txt
new file mode 100644
index 0000000000000..deada8d23aa9d
--- /dev/null
+++ b/llamax/CMakeLists.txt
@@ -0,0 +1,48 @@
+#
+# Define version
+#
+
+set(LLAMAX_MAJOR_VERSION 2)
+set(LLAMAX_MINOR_VERSION 1)
+set(LLAMAX_PATCH_VERSION 0)
+set(LLAMAX_VERSION ${LLAMAX_MAJOR_VERSION}.${LLAMAX_MINOR_VERSION}.${LLAMAX_PATCH_VERSION})
+
+#
+# Build llamax
+#
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+set(LLAMAX_SRCS src/llamax.cpp)
+add_library(llamax SHARED ${LLAMAX_SRCS})
+target_link_libraries(llamax PRIVATE llama)
+
+set(LLAMAX_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llamax.h)
+
+set_target_properties(llamax
+    PROPERTIES
+    PUBLIC_HEADER "${LLAMAX_PUBLIC_HEADERS}")
+
+add_subdirectory(examples)
+
+#
+# install
+#
+
+install(TARGETS llamax LIBRARY PUBLIC_HEADER)
+
+#
+# Config files
+#
+
+include(CMakePackageConfigHelpers)
+# Note: the config template references @PACKAGE_LLAMA_LIB_INSTALL_DIR@ and
+# @PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@, so those variables must be listed in PATH_VARS.
+configure_package_config_file(llamaxConfig.cmake.in "${PROJECT_BINARY_DIR}/llamaxConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llamax
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR LLAMA_LIB_INSTALL_DIR
+    )
+write_basic_package_version_file("${PROJECT_BINARY_DIR}/llamaxConfigVersion.cmake" VERSION ${LLAMAX_VERSION} COMPATIBILITY SameMajorVersion)
+
+# Install the llamaxConfig.cmake and llamaxConfigVersion.cmake
+install(FILES
+    "${PROJECT_BINARY_DIR}/llamaxConfig.cmake"
+    "${PROJECT_BINARY_DIR}/llamaxConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/llamax" COMPONENT dev)
diff --git a/llamax/README.MD b/llamax/README.MD
new file mode 100644
index 0000000000000..3730adf3827ac
--- /dev/null
+++ b/llamax/README.MD
@@ -0,0 +1,13 @@
+llamax
+======
+
+`llamax` is an experimental high-level API for [llama](https://github.com/ggerganov/llama.cpp).
+
+Development occurs in the `dev/1` branch.
+
+The roadmap includes:
+
+* ~~support for text-based LLM models~~
+* support for multi-modal models
+* support for embeddings
+* ~~support for grammars~~
diff --git a/llamax/examples/CMakeLists.txt b/llamax/examples/CMakeLists.txt
new file mode 100644
index 0000000000000..7357af5d639d6
--- /dev/null
+++ b/llamax/examples/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(llamax_simple simple.cpp)
+target_link_libraries(llamax_simple llamax)
+
+add_executable(llamax_chat chat.cpp)
+target_link_libraries(llamax_chat llamax)
+
+add_executable(llamax_grammar grammar.cpp)
+target_link_libraries(llamax_grammar llamax)
diff --git a/llamax/examples/README.MD b/llamax/examples/README.MD
new file mode 100644
index 0000000000000..e05c8d8374b1b
--- /dev/null
+++ b/llamax/examples/README.MD
@@ -0,0 +1,6 @@
+Download a small GGUF model and run one of the examples:
+
+```bash
+wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.2-GGUF/resolve/main/ggml-model-q4_0.gguf
+./llamax_simple ggml-model-q4_0.gguf "What is up doctor?"
+```
\ No newline at end of file
diff --git a/llamax/examples/chat.cpp b/llamax/examples/chat.cpp
new file mode 100644
index 0000000000000..be1083735c2e6
--- /dev/null
+++ b/llamax/examples/chat.cpp
@@ -0,0 +1,49 @@
+#include <llamax.h>
+
+#include <cstdio>
+#include <iostream>
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 2) {
+        std::cerr << "llamax_chat [model]" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx =
+        model.create_context(llamax::context_params::default_params(),
+                             llamax::sampler_builder().min_p(0.05, 1).temp(0.8f).dist(llamax::default_seed()));
+    llamax::chat_template             ct = model.create_chat_template();
+    std::vector<llamax::chat_message> messages;
+
+    messages.push_back({ llamax::chat_message_role::system, "You are an assistant." });
+
+    int offset = 0;
+
+    while (true) {
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        messages.push_back({ llamax::chat_message_role::user, user });
+
+        std::string prompt = ct.generate(messages);
+
+        // Only feed the part of the formatted prompt that was not sent in a previous turn.
+        std::string      answer;
+        llamax::iterator it = ctx.prompt(prompt.substr(offset));
+        offset = prompt.size();
+
+        while (std::optional<std::string> s = it.next()) {
+            answer += s.value();
+            std::cout << s.value();
+        }
+        std::cout << std::endl;
+
+        messages.push_back({ llamax::chat_message_role::assistant, answer });
+    }
+
+    return 0;
+}
diff --git a/llamax/examples/grammar.cpp b/llamax/examples/grammar.cpp
new file mode 100644
index 0000000000000..b3e7fbea83843
--- /dev/null
+++ b/llamax/examples/grammar.cpp
@@ -0,0 +1,51 @@
+#include <llamax.h>
+
+#include <iostream>
+
+const char * json_grammar = R"(
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= | " " | "\n" [ \t]{0,20}
+)";
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 3) {
+        std::cerr << "llamax_grammar [model] \"What is up doctor?\"" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx   = model.create_context(
+        llamax::context_params::default_params().set_context_size(4096).set_batch_size(2048),
+        llamax::sampler_builder().grammar(json_grammar, "root").min_p(0.05, 1).temp(0.8f).dist(llamax::default_seed()));
+    llamax::iterator it = ctx.prompt(_argv[2]);
+
+    while (std::optional<std::string> s = it.next()) {
+        std::cout << s.value();
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/llamax/examples/simple.cpp b/llamax/examples/simple.cpp
new file mode 100644
index 0000000000000..b76f711a26cf8
--- /dev/null
+++ b/llamax/examples/simple.cpp
@@ -0,0 +1,22 @@
+#include <llamax.h>
+
+#include <iostream>
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 3) {
+        std::cerr << "llamax_simple [model] \"What is up doctor?\"" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx =
+        model.create_context(llamax::context_params::default_params(), llamax::sampler_builder().greedy());
+    llamax::iterator it = ctx.prompt(_argv[2]);
+
+    while (std::optional<std::string> s = it.next()) {
+        std::cout << s.value();
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/llamax/include/llamax.h b/llamax/include/llamax.h
new file mode 100644
index 0000000000000..11fb6dc99dd4e
--- /dev/null
+++ b/llamax/include/llamax.h
@@ -0,0 +1,144 @@
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace llamax {
+uint32_t default_seed();
+
+class context;
+class chat_template;
+class model;
+class sampler_builder;
+
+/// Exception class for errors in llamax
+class exception : public std::exception {
+    friend class chat_template;
+    friend class context;
+    friend class iterator;
+    friend class model;
+
+    exception(const std::string & what) : m_what(what) {}
+  public:
+    const char * what() const noexcept override { return m_what.c_str(); }
+  private:
+    std::string m_what;
+};
+
+/// Parameters for a llama model
+class model_params {
+    friend class model;
+  public:
+    static model_params default_params();
+    /// Set the number of layers offloaded to the GPU
+    model_params & set_n_gpu_layers(unsigned _n_gpu_layers);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+/// Parameters for the context
+class context_params {
+    friend class model;
+  public:
+    static context_params default_params();
+    /// Set the context size
+    context_params & set_context_size(unsigned _context_size);
+    /// Set the batch size, i.e. the maximum number of tokens that can be processed
+    /// in a single call to llama_decode
+    context_params & set_batch_size(unsigned _batch_size);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+class sampler_builder {
+    friend class model;
+  public:
+    sampler_builder();
+    ~sampler_builder();
+    sampler_builder & top_k(int32_t _k);
+    sampler_builder & top_p(float p, size_t min_keep);
+    sampler_builder & min_p(float p, size_t min_keep);
+    sampler_builder & grammar(const std::string & _grammar, const std::string & _root);
+    sampler_builder & temp(float t);
+    sampler_builder & greedy();
+    sampler_builder & dist(uint32_t seed);
+  private:
+    struct data;
+    std::unique_ptr<data> d;
+};
+
+class context;
+
+class model {
+    friend class iterator;
+    friend class context;
+    friend class chat_template;
+  public:
+    /**
+     * Attempt to load a model from a file.
+     *
+     * This function can trigger an exception.
+     */
+    static model load_from_file(const std::string & _name, const model_params & _params);
+    /**
+     * Create a context that can be used to generate text based on a prompt.
+     */
+    context create_context(const context_params & _context_params, const sampler_builder & _sampler_builder) const;
+    /**
+     * Create a chat template that can be used to generate the prompt for a chat bot.
+     */
+    chat_template create_chat_template(bool _add_assistant = true) const;
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+class iterator {
+    friend class context;
+    iterator();
+  public:
+    iterator(iterator && _rhs);
+    ~iterator();
+    /**
+     * Return the next piece of generated text, or std::nullopt if there are no more tokens.
+     *
+     * This function can trigger an exception.
+     */
+    std::optional<std::string> next();
+  private:
+    struct data;
+    std::unique_ptr<data> d;
+};
+
+class context {
+    friend class model;
+    friend class iterator;
+  public:
+    /**
+     * Prompt the llm.
+     */
+    iterator prompt(const std::string & _prompt);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+enum class chat_message_role { system, user, assistant };
+
+struct chat_message {
+    chat_message_role role;
+    std::string       content;
+};
+
+class chat_template {
+    friend class model;
+  public:
+    /**
+     * Generate a prompt based on a template and a set of messages.
+     */
+    std::string generate(const std::vector<chat_message> & _messages);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+}  // namespace llamax
diff --git a/llamax/llamaxConfig.cmake.in b/llamax/llamaxConfig.cmake.in
new file mode 100644
index 0000000000000..c5cf2c3da6e45
--- /dev/null
+++ b/llamax/llamaxConfig.cmake.in
@@ -0,0 +1,18 @@
+@PACKAGE_INIT@
+
+set_and_check(LLAMAX_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMAX_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
+
+find_library(llamax_LIBRARY llamax
+    REQUIRED
+    HINTS ${LLAMAX_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_library(llamax UNKNOWN IMPORTED)
+set_target_properties(llamax
+    PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${LLAMAX_INCLUDE_DIR}"
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${llamax_LIBRARY}"
+        POSITION_INDEPENDENT_CODE ON)
diff --git a/llamax/src/llamax.cpp b/llamax/src/llamax.cpp
new file mode 100644
index 0000000000000..a66d3c09eb437
--- /dev/null
+++ b/llamax/src/llamax.cpp
@@ -0,0 +1,325 @@
+#include "llamax.h"
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include "llama.h"
+
+using namespace llamax;
+
+uint32_t llamax::default_seed() {
+    return LLAMA_DEFAULT_SEED;
+}
+
+//
+// data
+//
+
+struct model_params::data {
+    data() : model_params(llama_model_default_params()) {}
+
+    llama_model_params model_params;
+};
+
+struct model::data {
+    ~data() { llama_free_model(model); }
+
+    llama_model * model = nullptr;
+};
+
+struct iterator::data {
+    data(const context & _ctx) : ctx(_ctx) {}
+
+    context ctx;
+};
+
+struct context_params::data {
+    data() : ctx_params(llama_context_default_params()) {}
+
+    llama_context_params ctx_params;
+};
+
+struct sampler_builder::data {
+    ~data() {}
+
+    std::vector<std::function<llama_sampler *(llama_model *)>>   builders;
+    std::optional<std::function<llama_sampler *(llama_model *)>> grammar_builder;
+    bool                                                          grammar_first = false;
+};
+
+struct context::data {
+    ~data() {
+        llama_free(ctx);
+        llama_sampler_free(sampler);
+    }
+
+    llamax::model        model;
+    llama_context_params ctx_params;
+    llama_context *      ctx     = nullptr;
+    llama_sampler *      sampler = nullptr;
+    llama_sampler *      grammar_sampler = nullptr;
+    bool                 grammar_first   = false;
+};
+
+struct chat_template::data {
+    llamax::model     model;
+    std::vector<char> buffer;
+};
+
+//
+// model_params
+//
+
+model_params model_params::default_params() {
+    model_params p;
+    p.d = std::make_unique<data>();
+    return p;
+}
+
+model_params & model_params::set_n_gpu_layers(unsigned _n_gpu_layers) {
+    d->model_params.n_gpu_layers = _n_gpu_layers;
+    return *this;
+}
+
+//
+// context_params
+//
+
+context_params context_params::default_params() {
+    context_params p;
+    p.d = std::make_unique<data>();
+    p.d->ctx_params.no_perf = false;
+    return p;
+}
+
+context_params & context_params::set_context_size(unsigned _context_size) {
+    d->ctx_params.n_ctx = _context_size;
+    return *this;
+}
+
+context_params & context_params::set_batch_size(unsigned _batch_size) {
+    d->ctx_params.n_batch = _batch_size;
+    return *this;
+}
+
+//
+// sampler_builder
+//
+
+sampler_builder::sampler_builder() : d(std::make_unique<data>()) {}
+
+sampler_builder::~sampler_builder() {}
+
+sampler_builder & sampler_builder::top_k(int32_t _k) {
+    d->builders.push_back([_k](llama_model *) { return llama_sampler_init_top_k(_k); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::top_p(float p, size_t min_keep) {
+    d->builders.push_back([p, min_keep](llama_model *) { return llama_sampler_init_top_p(p, min_keep); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::min_p(float p, size_t min_keep) {
+    d->builders.push_back([p, min_keep](llama_model *) { return llama_sampler_init_min_p(p, min_keep); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::grammar(const std::string & _grammar, const std::string & _root) {
+    d->builders.push_back([_grammar, _root](llama_model * _model) {
+        const llama_vocab * vocab = llama_model_get_vocab(_model);
+        return llama_sampler_init_grammar(vocab, _grammar.c_str(), _root.c_str());
+    });
+    return *this;
+}
+
+sampler_builder & sampler_builder::temp(float t) {
+    d->builders.push_back([t](llama_model *) { return llama_sampler_init_temp(t); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::greedy() {
+    d->builders.push_back([](llama_model *) { return llama_sampler_init_greedy(); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::dist(uint32_t seed) {
+    d->builders.push_back([seed](llama_model *) { return llama_sampler_init_dist(seed); });
+    return *this;
+}
+
+//
+// model
+//
+
+model model::load_from_file(const std::string & _name, const model_params & _params) {
+    model m;
+    m.d        = std::make_shared<data>();
+    m.d->model = llama_model_load_from_file(_name.c_str(), _params.d->model_params);
+    if (m.d->model) {
+        return m;
+    } else {
+        throw exception("Unable to load model from file: " + _name);
+    }
+}
+
+context model::create_context(const context_params & _context_params, const sampler_builder & _sampler_builder) const {
+    context ctx;
+    ctx.d             = std::make_shared<data>();
+    ctx.d->model      = *this;
+    ctx.d->ctx_params = _context_params.d->ctx_params;
+    ctx.d->ctx        = llama_new_context_with_model(d->model, ctx.d->ctx_params);
+
+    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    ctx.d->sampler  = llama_sampler_chain_init(sparams);
+
+    for (const std::function<llama_sampler *(llama_model *)> & f : _sampler_builder.d->builders) {
+        llama_sampler_chain_add(ctx.d->sampler, f(d->model));
+    }
+
+    return ctx;
+}
+
+chat_template model::create_chat_template(bool _add_assistant) const {
+    chat_template ct;
+    ct.d        = std::make_shared<data>();
+    ct.d->model = *this;
+    return ct;
+}
+
+//
+// iterator
+//
+
+iterator::iterator() : d(nullptr) {}
+
+iterator::iterator(iterator && _rhs) : d(std::move(_rhs.d)) {}
+
+iterator::~iterator() {}
+
+std::optional<std::string> iterator::next() {
+    llama_token         new_token_id = llama_sampler_sample(d->ctx.d->sampler, d->ctx.d->ctx, -1);
+    const llama_vocab * vocab        = llama_model_get_vocab(d->ctx.d->model.d->model);
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
+        return std::nullopt;
+    }
+    char buf[128];
+    int  n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        throw exception("Failed to convert token " + std::to_string(new_token_id) + " to piece.");
+    }
+    std::string s(buf, n);
+
+    // prepare the next batch with the sampled token
+    auto    batch         = llama_batch_get_one(&new_token_id, 1);
+    int32_t decode_result = llama_decode(d->ctx.d->ctx, batch);
+
+    if (decode_result) {
+        throw exception("Failed to eval, return code: " + std::to_string(decode_result));
+    }
+    return s;
+}
+
+//
+// context
+//
+
+iterator context::prompt(const std::string & _prompt) {
+    const llama_vocab * vocab = llama_model_get_vocab(d->model.d->model);
+
+    //// First tokenize the prompt
+    // Get the number of tokens
+    const int n_prompt = -llama_tokenize(vocab, _prompt.c_str(), _prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(vocab, _prompt.c_str(), _prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) <
+        0) {
+        throw exception("Failed to tokenize the prompt: " + _prompt);
+    }
+
+    //// Consume the prompt in chunks of at most n_batch tokens
+    const int32_t batch_size = (int32_t) d->ctx_params.n_batch;
+
+    for (int32_t batch_first_token = 0; batch_first_token < n_prompt; batch_first_token += batch_size) {
+        const int32_t remaining     = n_prompt - batch_first_token;
+        int32_t       count_to_eval = std::min(remaining, batch_size);
+
+        llama_batch batch = llama_batch_get_one(&prompt_tokens[batch_first_token], count_to_eval);
+
+        int32_t decode_result = llama_decode(d->ctx, batch);
+
+        if (decode_result) {
+            throw exception("Failed to eval, return code: " + std::to_string(decode_result));
+        }
+    }
+
+    iterator it;
+    it.d = std::make_unique<data>(*this);
+    return it;
+}
+
+//
+// chat_template
+//
+
+std::string chat_template::generate(const std::vector<chat_message> & _messages) {
+    std::vector<llama_chat_message> messages;
+    for (const chat_message & msg : _messages) {
+        const char * role = nullptr;
+        switch (msg.role) {
+            case chat_message_role::assistant:
+                role = "assistant";
+                break;
+            case chat_message_role::system:
+                role = "system";
+                break;
+            case chat_message_role::user:
+                role = "user";
+                break;
+        }
+        if (not role) {
+            throw exception("Unknown role.");
+        }
+        messages.push_back({ role, msg.content.c_str() });
+    }
+    const char * tmpl = llama_model_chat_template(d->model.d->model, /* name */ nullptr);
+
+    int new_len =
+        llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, d->buffer.data(), d->buffer.size());
+    if (new_len > (int) d->buffer.size()) {
+        d->buffer.resize(new_len);
+        new_len =
+            llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, d->buffer.data(), d->buffer.size());
+    }
+    if (new_len < 0) {
+        throw exception("Failed to apply template to messages.");
+    }
+
+    // only the first new_len bytes of the buffer are valid for this call
+    return std::string(d->buffer.data(), new_len);
+}
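
---

The patch installs `llamaxConfig.cmake` but does not show the consumer side. Below is a minimal sketch of how an out-of-tree project could link against the installed library; the project name, `main.cpp`, the C++17 requirement, and the assumption that the install prefix is on `CMAKE_PREFIX_PATH` are illustrative, not part of this patch.

```cmake
# Hypothetical downstream project (not part of this patch).
# Assumes llamax was installed (e.g. `cmake --install build --prefix <prefix>`)
# and that <prefix> is on CMAKE_PREFIX_PATH so find_package() can locate
# llamaxConfig.cmake under <prefix>/lib/cmake/llamax.
cmake_minimum_required(VERSION 3.14)
project(llamax_consumer CXX)

# Resolves the imported `llamax` target created by llamaxConfig.cmake.
# The requested version is checked against llamaxConfigVersion.cmake (SameMajorVersion).
find_package(llamax 2.1 REQUIRED)

add_executable(my_app main.cpp)                     # main.cpp could be a copy of llamax/examples/simple.cpp
target_compile_features(my_app PRIVATE cxx_std_17)  # llamax.h uses std::optional
target_link_libraries(my_app PRIVATE llamax)
```

With this layout, `my_app` only needs `#include <llamax.h>`; the `llama` library itself stays a private dependency of the shared `llamax` library, which is the point of the `PRIVATE` link in `llamax/CMakeLists.txt`.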