From bc0a20cc9f764fbaec53298512585b7c00c67c7d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Jul 2025 11:55:02 +0300 Subject: [PATCH] graph : refactor context to not pass gf explicitly ggml-ci --- src/llama-context.cpp | 4 +- src/llama-context.h | 4 +- src/llama-graph.cpp | 30 +-- src/llama-graph.h | 66 ++---- src/llama-model.cpp | 502 ++++++++++++++++++++---------------------- 5 files changed, 280 insertions(+), 326 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7ec6e24a94ada..d15c5242cb7ca 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -674,7 +674,7 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } -llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { +llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); ret = GGML_STATUS_FAILED; @@ -1324,7 +1324,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u } llm_graph_params llama_context::graph_params( - llm_graph_result_i * res, + llm_graph_result * res, const llama_ubatch & ubatch, const llama_memory_context_i * mctx, llm_graph_type gtype) const { diff --git a/src/llama-context.h b/src/llama-context.h index 21d23d83851d1..931168ada0523 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -94,7 +94,7 @@ struct llama_context { // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation // returns nullptr only if ret != GGML_STATUS_SUCCESS - llm_graph_result_i * process_ubatch( + llm_graph_result * process_ubatch( const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, @@ -196,7 +196,7 @@ struct llama_context { private: llm_graph_params graph_params( - llm_graph_result_i * res, + llm_graph_result * res, const llama_ubatch & ubatch, const llama_memory_context_i * mctx, llm_graph_type gtype) const; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 904ed3626f1d8..74bc33b137584 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -448,9 +448,10 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : mctx (params.mctx), cross (params.cross), cb_func (params.cb), - res (static_cast(params.res)), - ctx0 (res->get_ctx()) { - res->params = params; + res (params.res), + ctx0 (res->get_ctx()), + gf (res->get_gf()) { + res->set_params(params); } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -1040,7 +1041,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t } ggml_tensor * llm_graph_context::build_attn_mha( - ggml_cgraph * gf, ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, @@ -1170,7 +1170,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_no_cache * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1194,7 +1193,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1249,7 +1248,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1282,7 +1280,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1302,7 +1300,6 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_kv_unified_iswa * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1349,7 +1346,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = mctx_cur->get_k(ctx0, il); ggml_tensor * v = mctx_cur->get_v(ctx0, il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1382,7 +1379,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const { ggml_tensor * llm_graph_context::build_attn( llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -1404,7 +1400,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k = k_cur; ggml_tensor * v = v_cur; - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); + ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1460,7 +1456,6 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif } ggml_tensor * llm_graph_context::build_rs( - ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, int32_t state_size, @@ -1518,21 +1513,19 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const { ggml_tensor * llm_graph_context::build_rs( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * s, int32_t state_size, int32_t n_seqs, const llm_graph_get_rows_fn & get_state_rows) const { const auto * kv_state = inp->mctx; - return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows); + return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows); } ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( llm_graph_input_rs * inp, - ggml_cgraph * gf, const llama_ubatch & ubatch, - int il) const { + int il) const { const auto * mctx_cur = static_cast(mctx); const auto token_shift_count = hparams.token_shift_count; @@ -1542,7 +1535,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( ggml_tensor * token_shift_all = mctx_cur->get_r_l(il); ggml_tensor * token_shift = build_rs( - inp, gf, token_shift_all, + inp, token_shift_all, hparams.n_embd_r(), n_seqs); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -1582,7 +1575,6 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { } void llm_graph_context::build_pooling( - ggml_cgraph * gf, ggml_tensor * cls, ggml_tensor * cls_b, ggml_tensor * cls_out, diff --git a/src/llama-graph.h b/src/llama-graph.h index 9a9aac7a5a048..a761bc4fad957 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -371,31 +371,11 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i { // along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc. // these are used by the llama_context to extact the relevant data, based on the compute parameters -// TODO: this interface seems redundant - remove it -class llm_graph_result_i { -public: - virtual ~llm_graph_result_i() = default; - - virtual ggml_tensor * get_tokens() const = 0; - virtual ggml_tensor * get_logits() const = 0; - virtual ggml_tensor * get_embd() const = 0; - virtual ggml_tensor * get_embd_pooled() const = 0; - - virtual ggml_cgraph * get_gf() = 0; - virtual ggml_context * get_ctx() = 0; - - virtual void reset() = 0; - - virtual void set_inputs(const llama_ubatch * ubatch) = 0; - - virtual bool can_reuse(const llm_graph_params & params) = 0; -}; - -using llm_graph_result_ptr = std::unique_ptr; - // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) using llm_graph_cb = std::function; +class llm_graph_result; + struct llm_graph_params { llm_arch arch = LLM_ARCH_UNKNOWN; @@ -418,8 +398,7 @@ struct llm_graph_params { llm_graph_cb cb; - // TODO: temporary - llm_graph_result_i * res; + llm_graph_result * res; // return true if the "other" params would result in a graph with the same topology as with the current params // having the same topology allows us to reuse the graph in some cases @@ -462,7 +441,7 @@ struct llm_graph_params { } }; -class llm_graph_result : public llm_graph_result_i { +class llm_graph_result { public: llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) { reset(); @@ -470,19 +449,19 @@ class llm_graph_result : public llm_graph_result_i { virtual ~llm_graph_result() = default; - ggml_tensor * get_tokens() const override { return t_tokens; } - ggml_tensor * get_logits() const override { return t_logits; } - ggml_tensor * get_embd() const override { return t_embd; } - ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; } + ggml_tensor * get_tokens() const { return t_tokens; } + ggml_tensor * get_logits() const { return t_logits; } + ggml_tensor * get_embd() const { return t_embd; } + ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } - ggml_cgraph * get_gf() override { return gf; } - ggml_context * get_ctx() override { return ctx_compute.get(); } + ggml_cgraph * get_gf() { return gf; } + ggml_context * get_ctx() { return ctx_compute.get(); } void set_max_nodes(int64_t max_nodes) { this->max_nodes = max_nodes; } - void reset() override { + void reset() { t_tokens = nullptr; t_logits = nullptr; t_embd = nullptr; @@ -503,7 +482,7 @@ class llm_graph_result : public llm_graph_result_i { gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false); } - void set_inputs(const llama_ubatch * ubatch) override { + void set_inputs(const llama_ubatch * ubatch) { for (auto & input : inputs) { input->set_input(ubatch); } @@ -514,7 +493,7 @@ class llm_graph_result : public llm_graph_result_i { // would be identical to the existing graph. in that case, we simply have to update the memory // contexts of the input tensors of the graph and we can reuse it for another computation // return true if the graph was updated and can be reused - bool can_reuse(const llm_graph_params & params) override { + bool can_reuse(const llm_graph_params & params) { if (!this->params.allow_reuse(params)) { return false; } @@ -533,6 +512,10 @@ class llm_graph_result : public llm_graph_result_i { return inputs.back().get(); } + void set_params(const llm_graph_params & params) { + this->params = params; + } + // important graph nodes ggml_tensor * t_tokens = nullptr; ggml_tensor * t_logits = nullptr; @@ -550,12 +533,15 @@ class llm_graph_result : public llm_graph_result_i { int64_t max_nodes; +private: // keep a copy of the previous graph parameters // we will use this to determine whether the graph can be reused by comparing them with the new parameters // note: these are updated after constructing the new graph llm_graph_params params; }; +using llm_graph_result_ptr = std::unique_ptr; + // // llm_graph_context // @@ -613,6 +599,7 @@ struct llm_graph_context { llm_graph_result * res; ggml_context * ctx0 = nullptr; + ggml_cgraph * gf = nullptr; llm_graph_context(const llm_graph_params & params); virtual ~llm_graph_context() = default; @@ -698,7 +685,6 @@ struct llm_graph_context { // ggml_tensor * build_attn_mha( - ggml_cgraph * gf, ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false) @@ -711,7 +697,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_no_cache * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -726,7 +711,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -742,7 +726,6 @@ struct llm_graph_context { // note: if k_cur or v_cur are not provided, they will not be stored in the memory ggml_tensor * build_attn( llm_graph_input_attn_kv_unified_iswa * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -757,7 +740,6 @@ struct llm_graph_context { ggml_tensor * build_attn( llm_graph_input_attn_cross * inp, - ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] @@ -779,7 +761,6 @@ struct llm_graph_context { // implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in // `llama_memory_recurrent` ggml_tensor * build_rs( - ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, int32_t state_size, @@ -794,7 +775,6 @@ struct llm_graph_context { ggml_tensor * build_rs( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * s, int32_t state_size, int32_t n_seqs, @@ -802,9 +782,8 @@ struct llm_graph_context { ggml_tensor * build_rwkv_token_shift_load( llm_graph_input_rs * inp, - ggml_cgraph * gf, const llama_ubatch & ubatch, - int il) const; + int il) const; ggml_tensor * build_rwkv_token_shift_store( ggml_tensor * token_shift, @@ -821,7 +800,6 @@ struct llm_graph_context { // void build_pooling( - ggml_cgraph * gf, ggml_tensor * cls, ggml_tensor * cls_b, ggml_tensor * cls_out, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index cb008e09cbeea..d95129ca83a32 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5381,7 +5381,7 @@ ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int i } struct llm_build_llama : public llm_graph_context { - llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5457,7 +5457,7 @@ struct llm_build_llama : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -5537,7 +5537,7 @@ struct llm_build_llama : public llm_graph_context { }; struct llm_build_llama_iswa : public llm_graph_context { - llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5631,7 +5631,7 @@ struct llm_build_llama_iswa : public llm_graph_context { cb(Kcur, "Kcur_normed", il); } - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -5720,7 +5720,7 @@ struct llm_build_llama_iswa : public llm_graph_context { }; struct llm_build_deci : public llm_graph_context { - llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5808,7 +5808,7 @@ struct llm_build_deci : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -5876,7 +5876,7 @@ struct llm_build_deci : public llm_graph_context { }; struct llm_build_baichuan : public llm_graph_context { - llm_build_baichuan(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -5940,7 +5940,7 @@ struct llm_build_baichuan : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -5998,7 +5998,7 @@ struct llm_build_baichuan : public llm_graph_context { }; struct llm_build_xverse : public llm_graph_context { - llm_build_xverse(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6055,7 +6055,7 @@ struct llm_build_xverse : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6111,7 +6111,7 @@ struct llm_build_xverse : public llm_graph_context { }; struct llm_build_falcon : public llm_graph_context { - llm_build_falcon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6178,7 +6178,7 @@ struct llm_build_falcon : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6233,7 +6233,7 @@ struct llm_build_falcon : public llm_graph_context { }; struct llm_build_grok : public llm_graph_context { - llm_build_grok(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6308,7 +6308,7 @@ struct llm_build_grok : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -6395,7 +6395,7 @@ struct llm_build_grok : public llm_graph_context { }; struct llm_build_dbrx : public llm_graph_context { - llm_build_dbrx(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6457,7 +6457,7 @@ struct llm_build_dbrx : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6520,7 +6520,7 @@ struct llm_build_dbrx : public llm_graph_context { }; struct llm_build_starcoder : public llm_graph_context { - llm_build_starcoder(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6571,7 +6571,7 @@ struct llm_build_starcoder : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6629,7 +6629,7 @@ struct llm_build_starcoder : public llm_graph_context { }; struct llm_build_refact : public llm_graph_context { - llm_build_refact(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6670,7 +6670,7 @@ struct llm_build_refact : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -6728,7 +6728,7 @@ struct llm_build_refact : public llm_graph_context { }; struct llm_build_bert : public llm_graph_context { - llm_build_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6827,7 +6827,7 @@ struct llm_build_bert : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -6914,7 +6914,7 @@ struct llm_build_bert : public llm_graph_context { }; struct llm_build_neo_bert : public llm_graph_context { - llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -6972,7 +6972,7 @@ struct llm_build_neo_bert : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); @@ -7024,7 +7024,7 @@ struct llm_build_neo_bert : public llm_graph_context { }; struct llm_build_bloom : public llm_graph_context { - llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7072,7 +7072,7 @@ struct llm_build_bloom : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7130,7 +7130,7 @@ struct llm_build_bloom : public llm_graph_context { }; struct llm_build_mpt : public llm_graph_context { - llm_build_mpt(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7219,7 +7219,7 @@ struct llm_build_mpt : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7278,7 +7278,7 @@ struct llm_build_mpt : public llm_graph_context { }; struct llm_build_stablelm : public llm_graph_context { - llm_build_stablelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7365,7 +7365,7 @@ struct llm_build_stablelm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7430,7 +7430,7 @@ struct llm_build_stablelm : public llm_graph_context { }; struct llm_build_qwen : public llm_graph_context { - llm_build_qwen(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7486,7 +7486,7 @@ struct llm_build_qwen : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7544,7 +7544,7 @@ struct llm_build_qwen : public llm_graph_context { }; struct llm_build_qwen2 : public llm_graph_context { - llm_build_qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7606,7 +7606,7 @@ struct llm_build_qwen2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7662,7 +7662,7 @@ struct llm_build_qwen2 : public llm_graph_context { }; struct llm_build_qwen2vl : public llm_graph_context { - llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7727,7 +7727,7 @@ struct llm_build_qwen2vl : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7783,7 +7783,7 @@ struct llm_build_qwen2vl : public llm_graph_context { }; struct llm_build_qwen2moe : public llm_graph_context { - llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7854,7 +7854,7 @@ struct llm_build_qwen2moe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -7942,7 +7942,7 @@ struct llm_build_qwen2moe : public llm_graph_context { }; struct llm_build_qwen3 : public llm_graph_context { - llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8007,7 +8007,7 @@ struct llm_build_qwen3 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8063,7 +8063,7 @@ struct llm_build_qwen3 : public llm_graph_context { }; struct llm_build_qwen3moe : public llm_graph_context { - llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8128,7 +8128,7 @@ struct llm_build_qwen3moe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8191,7 +8191,7 @@ struct llm_build_qwen3moe : public llm_graph_context { }; struct llm_build_phi2 : public llm_graph_context { - llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8268,7 +8268,7 @@ struct llm_build_phi2 : public llm_graph_context { // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -8322,7 +8322,7 @@ struct llm_build_phi2 : public llm_graph_context { template struct llm_build_phi3 : public llm_graph_context { - llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8405,7 +8405,7 @@ struct llm_build_phi3 : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -8480,7 +8480,7 @@ struct llm_build_phi3 : public llm_graph_context { }; struct llm_build_plamo : public llm_graph_context { - llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8539,7 +8539,7 @@ struct llm_build_plamo : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8595,7 +8595,7 @@ struct llm_build_plamo : public llm_graph_context { }; struct llm_build_gpt2 : public llm_graph_context { - llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8647,7 +8647,7 @@ struct llm_build_gpt2 : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8705,7 +8705,7 @@ struct llm_build_gpt2 : public llm_graph_context { }; struct llm_build_codeshell : public llm_graph_context { - llm_build_codeshell(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -8761,7 +8761,7 @@ struct llm_build_codeshell : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8819,7 +8819,7 @@ struct llm_build_codeshell : public llm_graph_context { }; struct llm_build_orion : public llm_graph_context { - llm_build_orion(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8890,7 +8890,7 @@ struct llm_build_orion : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -8946,7 +8946,7 @@ struct llm_build_orion : public llm_graph_context { }; struct llm_build_internlm2 : public llm_graph_context { - llm_build_internlm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9017,7 +9017,7 @@ struct llm_build_internlm2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -9073,7 +9073,7 @@ struct llm_build_internlm2 : public llm_graph_context { }; struct llm_build_minicpm3 : public llm_graph_context { - llm_build_minicpm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_minicpm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { //TODO: if the model varies, these parameters need to be read from the model const int64_t n_embd_base = 256; const float scale_embd = 12.0f; @@ -9205,7 +9205,7 @@ struct llm_build_minicpm3 : public llm_graph_context { ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } @@ -9277,7 +9277,7 @@ struct llm_build_minicpm3 : public llm_graph_context { }; struct llm_build_gemma : public llm_graph_context { - llm_build_gemma(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -9335,7 +9335,7 @@ struct llm_build_gemma : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); cb(Qcur, "Qcur_scaled", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9393,7 +9393,7 @@ struct llm_build_gemma : public llm_graph_context { }; struct llm_build_gemma2_iswa : public llm_graph_context { - llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; ggml_tensor * cur; @@ -9450,7 +9450,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9523,7 +9523,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { }; struct llm_build_gemma3_iswa : public llm_graph_context { - llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k; ggml_tensor * cur; @@ -9592,7 +9592,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/model.py#L315 Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } @@ -9661,7 +9661,6 @@ struct llm_build_gemma3_iswa : public llm_graph_context { struct llm_build_gemma3n_iswa : public llm_graph_context { const llama_model & model; - ggml_cgraph * gf; const int64_t n_embd_head; const int64_t n_embd_altup; @@ -9671,10 +9670,9 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { const int n_layer_sparsity = 10; // number of layers using activation sparsity const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) - llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) + llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model), - gf(gf), n_embd_head(model.hparams.n_embd_head_k), n_embd_altup(model.hparams.n_embd_altup), n_altup(model.hparams.n_altup), @@ -9775,7 +9773,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { cb(Qcur, "Qcur_pos", il); cb(Kcur, "Kcur_pos", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); } else { @@ -9793,7 +9791,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur_pos", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); } @@ -10087,7 +10085,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { // TODO: move up next to build_starcoder struct llm_build_starcoder2 : public llm_graph_context { - llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10158,7 +10156,7 @@ struct llm_build_starcoder2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10219,7 +10217,6 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * build_mamba_layer( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, @@ -10250,7 +10247,7 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -10331,7 +10328,7 @@ struct llm_graph_context_mamba : public llm_graph_context { return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); }; - ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // store last states ggml_build_forward_expand(gf, @@ -10358,11 +10355,10 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * build_mamba2_layer( llm_graph_input_rs * inp, - ggml_cgraph * gf, - ggml_tensor * cur, - const llama_model & model, - const llama_ubatch & ubatch, - int il) const { + ggml_tensor * cur, + const llama_model & model, + const llama_ubatch & ubatch, + int il) const { const auto * mctx_cur = inp->mctx; @@ -10385,7 +10381,7 @@ struct llm_graph_context_mamba : public llm_graph_context { ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} @@ -10455,7 +10451,7 @@ struct llm_graph_context_mamba : public llm_graph_context { return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); }; - ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); // store last states ggml_build_forward_expand(gf, @@ -10491,7 +10487,7 @@ struct llm_graph_context_mamba : public llm_graph_context { }; struct llm_build_mamba : public llm_graph_context_mamba { - llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_mamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -10510,9 +10506,9 @@ struct llm_build_mamba : public llm_graph_context_mamba { cb(cur, "attn_norm", il); if (model.arch == LLM_ARCH_MAMBA2) { - cur = build_mamba2_layer(rs_inp, gf, cur, model, ubatch, il); + cur = build_mamba2_layer(rs_inp, cur, model, ubatch, il); } else { - cur = build_mamba_layer(rs_inp, gf, cur, model, ubatch, il); + cur = build_mamba_layer(rs_inp, cur, model, ubatch, il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10548,7 +10544,7 @@ struct llm_build_mamba : public llm_graph_context_mamba { }; struct llm_build_jamba : public llm_graph_context_mamba { - llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -10568,7 +10564,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { cb(cur, "attn_norm", il); if (n_head_kv == 0) { - cur = build_mamba_layer(inp_hybrid->get_recr(), gf, cur, model, ubatch, il); + cur = build_mamba_layer(inp_hybrid->get_recr(), cur, model, ubatch, il); } else { // Attention @@ -10589,7 +10585,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { cb(Vcur, "Vcur", il); // No RoPE :) - cur = build_attn(inp_hybrid->get_attn(), gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); + cur = build_attn(inp_hybrid->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1 && inp_out_ids) { @@ -10657,7 +10653,7 @@ struct llm_build_jamba : public llm_graph_context_mamba { }; struct llm_build_command_r : public llm_graph_context { - llm_build_command_r(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10745,7 +10741,7 @@ struct llm_build_command_r : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10804,7 +10800,7 @@ struct llm_build_command_r : public llm_graph_context { }; struct llm_build_cohere2_iswa : public llm_graph_context { - llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10880,7 +10876,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -10940,7 +10936,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { // * removed bias // * removed MoE struct llm_build_olmo : public llm_graph_context { - llm_build_olmo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11011,7 +11007,7 @@ struct llm_build_olmo : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11068,7 +11064,7 @@ struct llm_build_olmo : public llm_graph_context { }; struct llm_build_olmo2 : public llm_graph_context { - llm_build_olmo2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11131,7 +11127,7 @@ struct llm_build_olmo2 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11197,7 +11193,7 @@ struct llm_build_olmo2 : public llm_graph_context { // * removed bias // * added q, k norm struct llm_build_olmoe : public llm_graph_context { - llm_build_olmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11264,7 +11260,7 @@ struct llm_build_olmoe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11325,7 +11321,7 @@ struct llm_build_olmoe : public llm_graph_context { }; struct llm_build_openelm : public llm_graph_context { - llm_build_openelm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11397,7 +11393,7 @@ struct llm_build_openelm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Qcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11454,7 +11450,7 @@ struct llm_build_openelm : public llm_graph_context { }; struct llm_build_gptneox : public llm_graph_context { - llm_build_gptneox(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11509,7 +11505,7 @@ struct llm_build_gptneox : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11600,7 +11596,7 @@ struct llm_build_gptneox : public llm_graph_context { }; struct llm_build_arctic : public llm_graph_context { - llm_build_arctic(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11659,7 +11655,7 @@ struct llm_build_arctic : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -11738,7 +11734,7 @@ struct llm_build_arctic : public llm_graph_context { }; struct llm_build_deepseek : public llm_graph_context { - llm_build_deepseek(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11814,7 +11810,7 @@ struct llm_build_deepseek : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -11900,7 +11896,7 @@ struct llm_build_deepseek : public llm_graph_context { }; struct llm_build_deepseek2 : public llm_graph_context { - llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { bool is_lite = (hparams.n_layer == 27); const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); @@ -12042,7 +12038,7 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(Vcur, "Vcur", il); // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il); } else { @@ -12076,7 +12072,7 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(Kcur, "Kcur", il); // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } @@ -12163,7 +12159,7 @@ struct llm_build_deepseek2 : public llm_graph_context { }; struct llm_build_bitnet : public llm_graph_context { - llm_build_bitnet(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12243,7 +12239,7 @@ struct llm_build_bitnet : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, NULL, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); @@ -12323,7 +12319,7 @@ struct llm_build_bitnet : public llm_graph_context { }; struct llm_build_t5_enc : public llm_graph_context { - llm_build_t5_enc(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12366,7 +12362,7 @@ struct llm_build_t5_enc : public llm_graph_context { ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; ggml_tensor * kq_b = build_pos_bias(pos_bucket_enc, attn_rel_b); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo_enc, nullptr, Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12424,7 +12420,7 @@ struct llm_build_t5_enc : public llm_graph_context { }; struct llm_build_t5_dec : public llm_graph_context { - llm_build_t5_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12472,7 +12468,7 @@ struct llm_build_t5_dec : public llm_graph_context { ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b); - cur = build_attn(inp_attn_self, gf, + cur = build_attn(inp_attn_self, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12504,7 +12500,7 @@ struct llm_build_t5_dec : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_outputs_enc); - cur = build_attn(inp_attn_cross, gf, + cur = build_attn(inp_attn_cross, model.layers[il].wo_cross, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); @@ -12594,7 +12590,7 @@ struct llm_build_t5_dec : public llm_graph_context { }; struct llm_build_jais : public llm_graph_context { - llm_build_jais(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12636,7 +12632,7 @@ struct llm_build_jais : public llm_graph_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il); } @@ -12689,7 +12685,7 @@ struct llm_build_jais : public llm_graph_context { }; struct llm_build_chatglm : public llm_graph_context { - llm_build_chatglm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12768,7 +12764,7 @@ struct llm_build_chatglm : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12822,7 +12818,7 @@ struct llm_build_chatglm : public llm_graph_context { }; struct llm_build_glm4 : public llm_graph_context { - llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12901,7 +12897,7 @@ struct llm_build_glm4 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -12973,7 +12969,7 @@ struct llm_build_glm4 : public llm_graph_context { }; struct llm_build_nemotron : public llm_graph_context { - llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13045,7 +13041,7 @@ struct llm_build_nemotron : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -13102,7 +13098,7 @@ struct llm_build_nemotron : public llm_graph_context { }; struct llm_build_exaone : public llm_graph_context { - llm_build_exaone(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13176,7 +13172,7 @@ struct llm_build_exaone : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -13269,7 +13265,6 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * build_rwkv6_time_mix( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, const llama_ubatch & ubatch, @@ -13396,7 +13391,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { } ggml_tensor * wkv_state = build_rs( - inp, gf, mctx_cur->get_s_l(il), + inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output; @@ -13442,7 +13437,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { }; struct llm_build_rwkv6 : public llm_build_rwkv6_base { - llm_build_rwkv6(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) { + llm_build_rwkv6(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) { GGML_ASSERT(hparams.token_shift_count == 2); ggml_tensor * cur; @@ -13463,7 +13458,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); @@ -13478,7 +13473,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { 1 ); - cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il); + cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -13543,7 +13538,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { - llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv6_base(model, params) { + llm_build_rwkv6qwen2(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv6_base(model, params) { GGML_ASSERT(n_embd == hparams.n_embd_r()); ggml_tensor * cur; @@ -13563,7 +13558,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); cb(att_norm, "attn_norm", il); @@ -13575,7 +13570,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { 1 ); - cur = build_rwkv6_time_mix(rs_inp, gf, att_norm, x_prev, ubatch, il); + cur = build_rwkv6_time_mix(rs_inp, att_norm, x_prev, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); @@ -13665,7 +13660,6 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor * build_rwkv7_time_mix( llm_graph_input_rs * inp, - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor *& first_layer_value, @@ -13751,7 +13745,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens); ggml_tensor * wkv_state = build_rs( - inp, gf, mctx_cur->get_s_l(il), + inp, mctx_cur->get_s_l(il), hparams.n_embd_s(), n_seqs); ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state); @@ -13798,7 +13792,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { }; struct llm_build_rwkv7 : public llm_build_rwkv7_base { - llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) { + llm_build_rwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) { GGML_ASSERT(hparams.token_shift_count == 2); ggml_tensor * cur; @@ -13820,7 +13814,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); @@ -13835,7 +13829,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { 1 ); - cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il); + cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -13894,7 +13888,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base { struct llm_build_arwkv7 : public llm_build_rwkv7_base { - llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) { + llm_build_arwkv7(const llama_model & model, const llm_graph_params & params) : llm_build_rwkv7_base(model, params) { GGML_ASSERT(n_embd == hparams.n_embd_r()); ggml_tensor * cur; @@ -13915,7 +13909,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { const llama_layer * layer = &model.layers[il]; inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, gf, ubatch, il); + ggml_tensor * token_shift = build_rwkv_token_shift_load(rs_inp, ubatch, il); ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); cb(att_norm, "attn_norm", il); @@ -13927,7 +13921,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { 1 ); - cur = build_rwkv7_time_mix(rs_inp, gf, att_norm, x_prev, v_first, ubatch, il); + cur = build_rwkv7_time_mix(rs_inp, att_norm, x_prev, v_first, ubatch, il); token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); @@ -13984,8 +13978,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base { struct llm_build_granite : public llm_graph_context { llm_build_granite( const llama_model & model, - const llm_graph_params & params, - ggml_cgraph * gf) + const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14019,7 +14012,7 @@ struct llm_build_granite : public llm_graph_context { // self-attention cur = build_attention_layer( - gf, cur, inp_pos, inp_attn, + cur, inp_pos, inp_attn, model, n_embd_head, il); if (il == n_layer - 1 && inp_out_ids) { @@ -14055,7 +14048,6 @@ struct llm_build_granite : public llm_graph_context { } ggml_tensor * build_attention_layer( - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, @@ -14110,7 +14102,7 @@ struct llm_build_granite : public llm_graph_context { cb(Vcur, "Vcur", il); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -14198,11 +14190,9 @@ struct llm_build_granite : public llm_graph_context { }; struct llm_build_granite_hybrid : public llm_graph_context_mamba { - llm_build_granite_hybrid( const llama_model & model, - const llm_graph_params & params, - ggml_cgraph * gf) : + const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14234,11 +14224,11 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { if (hparams.is_recurrent(il)) { // ssm layer // - cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else { // attention layer // cur = build_attention_layer( - gf, cur, inp_pos, inp->get_attn(), model, + cur, inp_pos, inp->get_attn(), model, n_embd_head, il); } @@ -14277,7 +14267,6 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { } ggml_tensor * build_attention_layer( - ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, @@ -14332,7 +14321,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { cb(Vcur, "Vcur", il); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -14426,7 +14415,7 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { // * removed bias // * removed MoE struct llm_build_chameleon : public llm_graph_context { - llm_build_chameleon(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14517,7 +14506,7 @@ struct llm_build_chameleon : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -14603,7 +14592,7 @@ struct llm_build_chameleon : public llm_graph_context { }; struct llm_build_wavtokenizer_dec : public llm_graph_context { - llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -14755,7 +14744,7 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context { }; struct llm_build_plm : public llm_graph_context { - llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); const uint32_t n_embd_head_qk_rope = hparams.n_rot; @@ -14873,7 +14862,7 @@ struct llm_build_plm : public llm_graph_context { ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } @@ -14927,7 +14916,7 @@ struct llm_build_plm : public llm_graph_context { }; struct llm_build_bailingmoe : public llm_graph_context { - llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { ggml_tensor * cur; ggml_tensor * inpL; @@ -14996,7 +14985,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); } @@ -15071,7 +15060,7 @@ struct llm_build_bailingmoe : public llm_graph_context { }; struct llm_build_dots1 : public llm_graph_context { - llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15136,7 +15125,7 @@ struct llm_build_dots1 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -15221,7 +15210,7 @@ struct llm_build_dots1 : public llm_graph_context { }; struct llm_build_ernie4_5 : public llm_graph_context { - llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15291,7 +15280,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } @@ -15351,7 +15340,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { }; struct llm_build_falcon_h1 : public llm_graph_context_mamba { - llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) { + llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) { const int64_t n_embd_head = hparams.n_embd_head_v; ggml_tensor * cur; @@ -15407,7 +15396,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { cb(Kcur, "Kcur-post-rope", il); cb(Vcur, "Vcur-post-rope", il); - ggml_tensor * attn_out = build_attn(inp->get_attn(), gf, + ggml_tensor * attn_out = build_attn(inp->get_attn(), model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(attn_out, "attn_out", il); @@ -15418,7 +15407,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { // Mamba2 layer cb(cur, "ssm_in", il); - ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il); + ggml_tensor * ssm_out = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); cb(ssm_out, "ssm_out", il); // // Aggregation @@ -15477,7 +15466,7 @@ struct llm_build_falcon_h1 : public llm_graph_context_mamba { }; struct llm_build_arcee : public llm_graph_context { - llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15553,7 +15542,7 @@ struct llm_build_arcee : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15612,7 +15601,7 @@ struct llm_build_arcee : public llm_graph_context { }; struct llm_build_hunyuan_moe : public llm_graph_context { - llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15698,7 +15687,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { LLM_NORM_RMS, il); cb(Qcur, "Qcur_norm", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15773,7 +15762,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { }; struct llm_build_smollm3 : public llm_graph_context { - llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -15850,7 +15839,7 @@ struct llm_build_smollm3 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, + cur = build_attn(inp_attn, model.layers[il].wo, model.layers[il].bo, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -15912,7 +15901,7 @@ struct llm_build_smollm3 : public llm_graph_context { struct llm_build_lfm2 : public llm_graph_context { const llama_model & model; - llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model) { ggml_tensor * cur = build_inp_embd(model.tok_embd); cb(cur, "model.embed_tokens", -1); @@ -15927,8 +15916,8 @@ struct llm_build_lfm2 : public llm_graph_context { cb(cur, "model.layers.{}.operator_norm", il); cur = hparams.is_recurrent(il) ? - build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) : - build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ; + build_shortconv_block(cur, inp_hybrid->get_recr(), il) : + build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il) ; if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); @@ -15971,8 +15960,7 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_attn_block(ggml_cgraph * gf, - ggml_tensor * cur, + ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv_unified * inp_attn, int il) const { @@ -16009,7 +15997,7 @@ struct llm_build_lfm2 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "model.layers.{}.self_attn.out_proj", il); @@ -16017,8 +16005,7 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_shortconv_block(ggml_cgraph * gf, - ggml_tensor * cur, + ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) { const auto * mctx_cur = static_cast(mctx)->get_recr(); @@ -16038,7 +16025,7 @@ struct llm_build_lfm2 : public llm_graph_context { // read conv state directly, with build_rs generation is slower ggml_tensor * conv_state = mctx_cur->get_r_l(il); const int64_t n_seqs = ubatch.n_seqs; - ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs); + ggml_tensor * conv = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs); conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); bx = ggml_concat(ctx0, conv, bx, 0); @@ -16160,226 +16147,223 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { - // TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere - auto * gf = params.res->get_gf(); - std::unique_ptr llm; switch (arch) { case LLM_ARCH_LLAMA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_LLAMA4: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DECI: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BAICHUAN: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_FALCON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GROK: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STARCODER: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_REFACT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_NEO_BERT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BLOOM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MPT: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STABLELM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2VL: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN3MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PHI2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { - llm = std::make_unique> (*this, params, gf); + llm = std::make_unique> (*this, params); } else { - llm = std::make_unique>(*this, params, gf); + llm = std::make_unique>(*this, params); } } break; case LLM_ARCH_PLAMO: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GPT2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CODESHELL: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ORION: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_INTERNLM2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MINICPM3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GEMMA3N: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_STARCODER2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_MAMBA: case LLM_ARCH_MAMBA2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_JAMBA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_XVERSE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_COMMAND_R: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_COHERE2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DBRX: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMO: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMO2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OLMOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_OPENELM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GPTNEOX: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARCTIC: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CHATGLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GLM4: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BITNET: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_T5: { switch (params.gtype) { case LLM_GRAPH_TYPE_ENCODER: - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); break; case LLM_GRAPH_TYPE_DEFAULT: case LLM_GRAPH_TYPE_DECODER: - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); break; default: GGML_ABORT("invalid graph type"); @@ -16387,97 +16371,97 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_T5ENCODER: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_JAIS: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_NEMOTRON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_EXAONE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_RWKV6: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_RWKV6QWEN2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_RWKV7: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARWKV7: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_MINICPM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GRANITE_HYBRID: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_CHAMELEON: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_PLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BAILINGMOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_DOTS1: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ARCEE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_ERNIE4_5: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_HUNYUAN_MOE: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_SMOLLM3: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_FALCON_H1: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_LFM2: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; default: GGML_ABORT("fatal error"); } // add on pooling layer - llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b); + llm->build_pooling(cls, cls_b, cls_out, cls_out_b); return llm->res->get_gf(); }