diff --git a/CMakeLists.txt b/CMakeLists.txt
index de51c0a17b2f6..5afdcbf46f540 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,9 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
+# llamax
+option(LLAMAX "llama: enable the high-level C++ API" ON)
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -187,6 +190,14 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
     add_subdirectory(pocs)
 endif()
 
+#
+# llamax
+#
+
+if (LLAMAX)
+    add_subdirectory(llamax)
+endif()
+
 #
 # install
 #
diff --git a/llamax/CMakeLists.txt b/llamax/CMakeLists.txt
new file mode 100644
index 0000000000000..deada8d23aa9d
--- /dev/null
+++ b/llamax/CMakeLists.txt
@@ -0,0 +1,48 @@
+#
+# Define version
+#
+
+set(LLAMAX_MAJOR_VERSION 2)
+set(LLAMAX_MINOR_VERSION 1)
+set(LLAMAX_PATCH_VERSION 0)
+set(LLAMAX_VERSION ${LLAMAX_MAJOR_VERSION}.${LLAMAX_MINOR_VERSION}.${LLAMAX_PATCH_VERSION})
+
+#
+# Build llamax
+#
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+set(LLAMAX_SRCS src/llamax.cpp)
+add_library(llamax SHARED ${LLAMAX_SRCS})
+target_link_libraries(llamax PRIVATE llama)
+
+set(LLAMAX_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llamax.h)
+
+set_target_properties(llamax
+    PROPERTIES
+    PUBLIC_HEADER "${LLAMAX_PUBLIC_HEADERS}")
+
+add_subdirectory(examples)
+
+#
+# install
+#
+
+install(TARGETS llamax LIBRARY PUBLIC_HEADER)
+
+#
+# Config files
+#
+
+include(CMakePackageConfigHelpers)
+# Note: the config template references @PACKAGE_LLAMA_LIB_INSTALL_DIR@ and
+# @PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@, so those variables must be listed in PATH_VARS.
+configure_package_config_file(llamaxConfig.cmake.in "${PROJECT_BINARY_DIR}/llamaxConfig.cmake"
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llamax
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR LLAMA_LIB_INSTALL_DIR
+    )
+write_basic_package_version_file("${PROJECT_BINARY_DIR}/llamaxConfigVersion.cmake" VERSION ${LLAMAX_VERSION} COMPATIBILITY SameMajorVersion)
+
+# Install the llamaxConfig.cmake and llamaxConfigVersion.cmake
+install(FILES
+    "${PROJECT_BINARY_DIR}/llamaxConfig.cmake"
+    "${PROJECT_BINARY_DIR}/llamaxConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/llamax" COMPONENT dev)
diff --git a/llamax/README.MD b/llamax/README.MD
new file mode 100644
index 0000000000000..3730adf3827ac
--- /dev/null
+++ b/llamax/README.MD
@@ -0,0 +1,13 @@
+llamax
+======
+
+`llamax` is an experimental high-level API for [llama](https://github.com/ggerganov/llama.cpp).
+
+Development occurs in the `dev/1` branch.
+
+The roadmap includes:
+
+* ~~support for text-based LLM models~~
+* support for multi-modal models
+* support for embeddings
+* ~~support for grammars~~
diff --git a/llamax/examples/CMakeLists.txt b/llamax/examples/CMakeLists.txt
new file mode 100644
index 0000000000000..7357af5d639d6
--- /dev/null
+++ b/llamax/examples/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(llamax_simple simple.cpp)
+target_link_libraries(llamax_simple llamax)
+
+add_executable(llamax_chat chat.cpp)
+target_link_libraries(llamax_chat llamax)
+
+add_executable(llamax_grammar grammar.cpp)
+target_link_libraries(llamax_grammar llamax)
diff --git a/llamax/examples/README.MD b/llamax/examples/README.MD
new file mode 100644
index 0000000000000..e05c8d8374b1b
--- /dev/null
+++ b/llamax/examples/README.MD
@@ -0,0 +1,6 @@
+Download a small GGUF model and run one of the examples:
+
+```bash
+wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.2-GGUF/resolve/main/ggml-model-q4_0.gguf
+./llamax_simple ggml-model-q4_0.gguf "What is up doctor?"
+```
\ No newline at end of file
diff --git a/llamax/examples/chat.cpp b/llamax/examples/chat.cpp
new file mode 100644
index 0000000000000..be1083735c2e6
--- /dev/null
+++ b/llamax/examples/chat.cpp
@@ -0,0 +1,49 @@
+#include <llamax.h>
+
+#include <cstdio>
+#include <iostream>
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 2) {
+        std::cerr << "llamax_chat [model]" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx =
+        model.create_context(llamax::context_params::default_params(),
+                             llamax::sampler_builder().min_p(0.05, 1).temp(0.8f).dist(llamax::default_seed()));
+    llamax::chat_template             ct = model.create_chat_template();
+    std::vector<llamax::chat_message> messages;
+
+    messages.push_back({ llamax::chat_message_role::system, "You are an assistant." });
+
+    int offset = 0;
+
+    while (true) {
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        messages.push_back({ llamax::chat_message_role::user, user });
+
+        std::string prompt = ct.generate(messages);
+
+        // Only feed the part of the formatted prompt that was not sent in a previous turn.
+        std::string      answer;
+        llamax::iterator it = ctx.prompt(prompt.substr(offset));
+        offset = prompt.size();
+
+        while (std::optional<std::string> s = it.next()) {
+            answer += s.value();
+            std::cout << s.value();
+        }
+        std::cout << std::endl;
+
+        messages.push_back({ llamax::chat_message_role::assistant, answer });
+    }
+
+    return 0;
+}
diff --git a/llamax/examples/grammar.cpp b/llamax/examples/grammar.cpp
new file mode 100644
index 0000000000000..b3e7fbea83843
--- /dev/null
+++ b/llamax/examples/grammar.cpp
@@ -0,0 +1,51 @@
+#include <llamax.h>
+
+#include <iostream>
+
+const char * json_grammar = R"(
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\\x7F\x00-\x1F] |
+    "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= | " " | "\n" [ \t]{0,20}
+)";
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 3) {
+        std::cerr << "llamax_grammar [model] \"What is up doctor?\"" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx   = model.create_context(
+        llamax::context_params::default_params().set_context_size(4096).set_batch_size(2048),
+        llamax::sampler_builder().grammar(json_grammar, "root").min_p(0.05, 1).temp(0.8f).dist(llamax::default_seed()));
+    llamax::iterator it = ctx.prompt(_argv[2]);
+
+    while (std::optional<std::string> s = it.next()) {
+        std::cout << s.value();
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/llamax/examples/simple.cpp b/llamax/examples/simple.cpp
new file mode 100644
index 0000000000000..b76f711a26cf8
--- /dev/null
+++ b/llamax/examples/simple.cpp
@@ -0,0 +1,22 @@
+#include <llamax.h>
+
+#include <iostream>
+
+int main(int _argc, const char ** _argv) {
+    if (_argc != 3) {
+        std::cerr << "llamax_simple [model] \"What is up doctor?\"" << std::endl;
+        return -1;
+    }
+
+    llamax::model   model = llamax::model::load_from_file(_argv[1], llamax::model_params::default_params());
+    llamax::context ctx =
+        model.create_context(llamax::context_params::default_params(), llamax::sampler_builder().greedy());
+    llamax::iterator it = ctx.prompt(_argv[2]);
+
+    while (std::optional<std::string> s = it.next()) {
+        std::cout << s.value();
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/llamax/include/llamax.h b/llamax/include/llamax.h
new file mode 100644
index 0000000000000..11fb6dc99dd4e
--- /dev/null
+++ b/llamax/include/llamax.h
@@ -0,0 +1,144 @@
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace llamax {
+uint32_t default_seed();
+
+class context;
+class chat_template;
+class model;
+class sampler_builder;
+
+/// Exception class for errors in llamax
+class exception : public std::exception {
+    friend class chat_template;
+    friend class context;
+    friend class iterator;
+    friend class model;
+
+    exception(const std::string & what) : m_what(what) {}
+  public:
+    const char * what() const noexcept override { return m_what.c_str(); }
+  private:
+    std::string m_what;
+};
+
+/// Parameters for a llama model
+class model_params {
+    friend class model;
+  public:
+    static model_params default_params();
+    /// Set the number of layers offloaded to the GPU
+    model_params & set_n_gpu_layers(unsigned _n_gpu_layers);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+/// Parameters for the context
+class context_params {
+    friend class model;
+  public:
+    static context_params default_params();
+    /// Set the context size
+    context_params & set_context_size(unsigned _context_size);
+    /// Set the batch size, i.e. the maximum number of tokens that can be processed
+    /// in a single call to llama_decode
+    context_params & set_batch_size(unsigned _batch_size);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+class sampler_builder {
+    friend class model;
+  public:
+    sampler_builder();
+    ~sampler_builder();
+    sampler_builder & top_k(int32_t _k);
+    sampler_builder & top_p(float p, size_t min_keep);
+    sampler_builder & min_p(float p, size_t min_keep);
+    sampler_builder & grammar(const std::string & _grammar, const std::string & _root);
+    sampler_builder & temp(float t);
+    sampler_builder & greedy();
+    sampler_builder & dist(uint32_t seed);
+  private:
+    struct data;
+    std::unique_ptr<data> d;
+};
+
+class context;
+
+class model {
+    friend class iterator;
+    friend class context;
+    friend class chat_template;
+  public:
+    /**
+     * Attempt to load a model from a file.
+     *
+     * This function can trigger an exception.
+     */
+    static model load_from_file(const std::string & _name, const model_params & _params);
+    /**
+     * Create a context that can be used to generate text based on a prompt.
+     */
+    context create_context(const context_params & _context_params, const sampler_builder & _sampler_builder) const;
+    /**
+     * Create a chat template that can be used to generate the prompt for a chat bot.
+     */
+    chat_template create_chat_template(bool _add_assistant = true) const;
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+class iterator {
+    friend class context;
+    iterator();
+  public:
+    iterator(iterator && _rhs);
+    ~iterator();
+    /**
+     * Return the next piece of generated text, or std::nullopt if there are no more tokens.
+     *
+     * This function can trigger an exception.
+     */
+    std::optional<std::string> next();
+  private:
+    struct data;
+    std::unique_ptr<data> d;
+};
+
+class context {
+    friend class model;
+    friend class iterator;
+  public:
+    /**
+     * Prompt the llm.
+     */
+    iterator prompt(const std::string & _prompt);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+
+enum class chat_message_role { system, user, assistant };
+
+struct chat_message {
+    chat_message_role role;
+    std::string       content;
+};
+
+class chat_template {
+    friend class model;
+  public:
+    /**
+     * Generate a prompt based on a template and a set of messages.
+     */
+    std::string generate(const std::vector<chat_message> & _messages);
+  private:
+    struct data;
+    std::shared_ptr<data> d;
+};
+}  // namespace llamax
diff --git a/llamax/llamaxConfig.cmake.in b/llamax/llamaxConfig.cmake.in
new file mode 100644
index 0000000000000..c5cf2c3da6e45
--- /dev/null
+++ b/llamax/llamaxConfig.cmake.in
@@ -0,0 +1,18 @@
+@PACKAGE_INIT@
+
+set_and_check(LLAMAX_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMAX_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
+
+find_library(llamax_LIBRARY llamax
+    REQUIRED
+    HINTS ${LLAMAX_LIB_DIR}
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_library(llamax UNKNOWN IMPORTED)
+set_target_properties(llamax
+    PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${LLAMAX_INCLUDE_DIR}"
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${llamax_LIBRARY}"
+        POSITION_INDEPENDENT_CODE ON)
diff --git a/llamax/src/llamax.cpp b/llamax/src/llamax.cpp
new file mode 100644
index 0000000000000..a66d3c09eb437
--- /dev/null
+++ b/llamax/src/llamax.cpp
@@ -0,0 +1,325 @@
+#include "llamax.h"
+
+#include <algorithm>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include "llama.h"
+
+using namespace llamax;
+
+uint32_t llamax::default_seed() {
+    return LLAMA_DEFAULT_SEED;
+}
+
+//
+// data
+//
+
+struct model_params::data {
+    data() : model_params(llama_model_default_params()) {}
+
+    llama_model_params model_params;
+};
+
+struct model::data {
+    ~data() { llama_free_model(model); }
+
+    llama_model * model = nullptr;
+};
+
+struct iterator::data {
+    data(const context & _ctx) : ctx(_ctx) {}
+
+    context ctx;
+};
+
+struct context_params::data {
+    data() : ctx_params(llama_context_default_params()) {}
+
+    llama_context_params ctx_params;
+};
+
+struct sampler_builder::data {
+    ~data() {}
+
+    std::vector<std::function<llama_sampler *(llama_model *)>>   builders;
+    std::optional<std::function<llama_sampler *(llama_model *)>> grammar_builder;
+    bool                                                          grammar_first = false;
+};
+
+struct context::data {
+    ~data() {
+        llama_free(ctx);
+        llama_sampler_free(sampler);
+    }
+
+    llamax::model        model;
+    llama_context_params ctx_params;
+    llama_context *      ctx     = nullptr;
+    llama_sampler *      sampler = nullptr;
+    llama_sampler *      grammar_sampler = nullptr;
+    bool                 grammar_first   = false;
+};
+
+struct chat_template::data {
+    llamax::model     model;
+    std::vector<char> buffer;
+};
+
+//
+// model_params
+//
+
+model_params model_params::default_params() {
+    model_params p;
+    p.d = std::make_unique<data>();
+    return p;
+}
+
+model_params & model_params::set_n_gpu_layers(unsigned _n_gpu_layers) {
+    d->model_params.n_gpu_layers = _n_gpu_layers;
+    return *this;
+}
+
+//
+// context_params
+//
+
+context_params context_params::default_params() {
+    context_params p;
+    p.d = std::make_unique<data>();
+    p.d->ctx_params.no_perf = false;
+    return p;
+}
+
+context_params & context_params::set_context_size(unsigned _context_size) {
+    d->ctx_params.n_ctx = _context_size;
+    return *this;
+}
+
+context_params & context_params::set_batch_size(unsigned _batch_size) {
+    d->ctx_params.n_batch = _batch_size;
+    return *this;
+}
+
+//
+// sampler_builder
+//
+
+sampler_builder::sampler_builder() : d(std::make_unique<data>()) {}
+
+sampler_builder::~sampler_builder() {}
+
+sampler_builder & sampler_builder::top_k(int32_t _k) {
+    d->builders.push_back([_k](llama_model *) { return llama_sampler_init_top_k(_k); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::top_p(float p, size_t min_keep) {
+    d->builders.push_back([p, min_keep](llama_model *) { return llama_sampler_init_top_p(p, min_keep); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::min_p(float p, size_t min_keep) {
+    d->builders.push_back([p, min_keep](llama_model *) { return llama_sampler_init_min_p(p, min_keep); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::grammar(const std::string & _grammar, const std::string & _root) {
+    d->builders.push_back([_grammar, _root](llama_model * _model) {
+        const llama_vocab * vocab = llama_model_get_vocab(_model);
+        return llama_sampler_init_grammar(vocab, _grammar.c_str(), _root.c_str());
+    });
+    return *this;
+}
+
+sampler_builder & sampler_builder::temp(float t) {
+    d->builders.push_back([t](llama_model *) { return llama_sampler_init_temp(t); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::greedy() {
+    d->builders.push_back([](llama_model *) { return llama_sampler_init_greedy(); });
+    return *this;
+}
+
+sampler_builder & sampler_builder::dist(uint32_t seed) {
+    d->builders.push_back([seed](llama_model *) { return llama_sampler_init_dist(seed); });
+    return *this;
+}
+
+//
+// model
+//
+
+model model::load_from_file(const std::string & _name, const model_params & _params) {
+    model m;
+    m.d        = std::make_shared<data>();
+    m.d->model = llama_model_load_from_file(_name.c_str(), _params.d->model_params);
+    if (m.d->model) {
+        return m;
+    } else {
+        throw exception("Unable to load model from file: " + _name);
+    }
+}
+
+context model::create_context(const context_params & _context_params, const sampler_builder & _sampler_builder) const {
+    context ctx;
+    ctx.d             = std::make_shared<data>();
+    ctx.d->model      = *this;
+    ctx.d->ctx_params = _context_params.d->ctx_params;
+    ctx.d->ctx        = llama_new_context_with_model(d->model, ctx.d->ctx_params);
+
+    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    ctx.d->sampler  = llama_sampler_chain_init(sparams);
+
+    for (const std::function<llama_sampler *(llama_model *)> & f : _sampler_builder.d->builders) {
+        llama_sampler_chain_add(ctx.d->sampler, f(d->model));
+    }
+
+    return ctx;
+}
+
+chat_template model::create_chat_template(bool _add_assistant) const {
+    chat_template ct;
+    ct.d        = std::make_shared<data>();
+    ct.d->model = *this;
+    return ct;
+}
+
+//
+// iterator
+//
+
+iterator::iterator() : d(nullptr) {}
+
+iterator::iterator(iterator && _rhs) : d(std::move(_rhs.d)) {}
+
+iterator::~iterator() {}
+
+std::optional<std::string> iterator::next() {
+    llama_token         new_token_id = llama_sampler_sample(d->ctx.d->sampler, d->ctx.d->ctx, -1);
+    const llama_vocab * vocab        = llama_model_get_vocab(d->ctx.d->model.d->model);
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
+        return std::nullopt;
+    }
+    char buf[128];
+    int  n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
+    if (n < 0) {
+        throw exception("Failed to convert token " + std::to_string(new_token_id) + " to piece.");
+    }
+    std::string s(buf, n);
+
+    // prepare the next batch with the sampled token
+    auto    batch         = llama_batch_get_one(&new_token_id, 1);
+    int32_t decode_result = llama_decode(d->ctx.d->ctx, batch);
+
+    if (decode_result) {
+        throw exception("Failed to eval, return code: " + std::to_string(decode_result));
+    }
+    return s;
+}
+
+//
+// context
+//
+
+iterator context::prompt(const std::string & _prompt) {
+    const llama_vocab * vocab = llama_model_get_vocab(d->model.d->model);
+
+    //// First tokenize the prompt
+    // Get the number of tokens
+    const int n_prompt = -llama_tokenize(vocab, _prompt.c_str(), _prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(vocab, _prompt.c_str(), _prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) <
+        0) {
+        throw exception("Failed to tokenize the prompt: " + _prompt);
+    }
+
+    //// Consume the prompt in chunks of at most n_batch tokens
+    const int32_t batch_size = (int32_t) d->ctx_params.n_batch;
+
+    for (int32_t batch_first_token = 0; batch_first_token < n_prompt; batch_first_token += batch_size) {
+        const int32_t remaining     = n_prompt - batch_first_token;
+        int32_t       count_to_eval = std::min(remaining, batch_size);
+
+        llama_batch batch = llama_batch_get_one(&prompt_tokens[batch_first_token], count_to_eval);
+
+        int32_t decode_result = llama_decode(d->ctx, batch);
+
+        if (decode_result) {
+            throw exception("Failed to eval, return code: " + std::to_string(decode_result));
+        }
+    }
+
+    iterator it;
+    it.d = std::make_unique<data>(*this);
+    return it;
+}
+
+//
+// chat_template
+//
+
+std::string chat_template::generate(const std::vector<chat_message> & _messages) {
+    std::vector<llama_chat_message> messages;
+    for (const chat_message & msg : _messages) {
+        const char * role = nullptr;
+        switch (msg.role) {
+            case chat_message_role::assistant:
+                role = "assistant";
+                break;
+            case chat_message_role::system:
+                role = "system";
+                break;
+            case chat_message_role::user:
+                role = "user";
+                break;
+        }
+        if (not role) {
+            throw exception("Unknown role.");
+        }
+        messages.push_back({ role, msg.content.c_str() });
+    }
+    const char * tmpl = llama_model_chat_template(d->model.d->model, /* name */ nullptr);
+
+    int new_len =
+        llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, d->buffer.data(), d->buffer.size());
+    if (new_len > (int) d->buffer.size()) {
+        d->buffer.resize(new_len);
+        new_len =
+            llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, d->buffer.data(), d->buffer.size());
+    }
+    if (new_len < 0) {
+        throw exception("Failed to apply template to messages.");
+    }
+
+    // only the first new_len bytes of the buffer are valid for this call
+    return std::string(d->buffer.data(), new_len);
+}
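
---

The patch installs `llamaxConfig.cmake` but does not show the consumer side. Below is a minimal sketch of how an out-of-tree project could link against the installed library; the project name, `main.cpp`, the C++17 requirement, and the assumption that the install prefix is on `CMAKE_PREFIX_PATH` are illustrative, not part of this patch.

```cmake
# Hypothetical downstream project (not part of this patch).
# Assumes llamax was installed (e.g. `cmake --install build --prefix <prefix>`)
# and that <prefix> is on CMAKE_PREFIX_PATH so find_package() can locate
# llamaxConfig.cmake under <prefix>/lib/cmake/llamax.
cmake_minimum_required(VERSION 3.14)
project(llamax_consumer CXX)

# Resolves the imported `llamax` target created by llamaxConfig.cmake.
# The requested version is checked against llamaxConfigVersion.cmake (SameMajorVersion).
find_package(llamax 2.1 REQUIRED)

add_executable(my_app main.cpp)                     # main.cpp could be a copy of llamax/examples/simple.cpp
target_compile_features(my_app PRIVATE cxx_std_17)  # llamax.h uses std::optional
target_link_libraries(my_app PRIVATE llamax)
```

With this layout, `my_app` only needs `#include <llamax.h>`; the `llama` library itself stays a private dependency of the shared `llamax` library, which is the point of the `PRIVATE` link in `llamax/CMakeLists.txt`.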