diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 392d45dd6bc8f..8472f41a56797 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -27,7 +27,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -39,7 +38,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
                              ggml_cgraph * cgraph,
                              bool is_static,
-                             bool is_first_token,
                              int context_size,
                              int context_size_swa,
                              int num_heads,
@@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
     m_num_heads(num_heads),
     m_num_heads_kv(num_heads_kv),
     m_head_size(head_size),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
+    m_is_static(is_static) {
     set_input_output(node);
 }
 
 GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                              std::map> & model_weights,
-                             bool is_static,
-                             bool is_first_token) :
+                             bool is_static) :
     m_cgraph(cgraph),
     m_op_name(m_node ? std::string(m_node->name) : ""),
     m_model_weights(model_weights),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
-    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+    m_is_static(is_static) {
+    if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
+        unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
         print_tensor_address_map(cgraph);
     }
 
     set_llm_params();
+    validate_cgraph();
 
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto * cur_node = cgraph->nodes[node_n];
@@ -160,8 +157,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
     // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
     static std::set debug_output_names = {};
     // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
-    if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 ||
-        debug_output_names.count(node_name)) {
+    if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
+        node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) {
         if (node->op == GGML_OP_SET_ROWS) {
             assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0);
             if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) {
@@ -285,53 +282,54 @@ void GgmlOvDecoder::set_llm_params() {
         } else {
             m_context_size = cache_k->ne[1];
         }
-    } else if (node->op == GGML_OP_ROPE &&
-               (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) {
-        m_head_size = node->ne[0];
-        m_num_heads = node->ne[1];
-        m_rope_params = node->op_params;
-    } else if (node->op == GGML_OP_ROPE &&
-               (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) {
-        m_num_heads_kv = node->ne[1];
+    } else if (node->op == GGML_OP_ROPE) {
+        if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
+            m_head_size = node->ne[0];
+            m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
+            auto * inp_pos = node->src[1];
+            m_input_len = inp_pos->ne[0];
+            m_past_kv_len = *(int32_t *) inp_pos->data;
+        } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
+            m_num_heads_kv = node->ne[1];
+        }
     }
 }
 
+void GgmlOvDecoder::validate_cgraph() const {
+    if (m_is_static && m_input_len != 1) {
+        throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " +
+                                 std::to_string(m_input_len) + ", try setting -ub 1");
+    }
+}
+
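Note on the two new fields: set_llm_params() derives both from the RoPE node's `inp_pos` source. Since ggml fills `inp_pos` with the contiguous positions of the ubatch tokens, `ne[0]` is the number of new tokens and element 0 is the number of tokens already in the KV cache. A minimal standalone sketch of that convention (plain C++ with a hypothetical `toy_inp_pos` stand-in, not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Stand-in for a ggml "inp_pos" tensor: int32 positions of the ubatch tokens.
    struct toy_inp_pos {
        std::vector<int32_t> pos;  // e.g. {7, 8, 9}: three new tokens after 7 cached ones
        int64_t ne0() const { return (int64_t) pos.size(); }
        const int32_t * data() const { return pos.data(); }
    };

    int main() {
        toy_inp_pos inp_pos{{7, 8, 9}};
        int input_len   = (int) inp_pos.ne0();  // mirrors m_input_len = inp_pos->ne[0]
        int past_kv_len = *inp_pos.data();      // mirrors m_past_kv_len = *(int32_t *) inp_pos->data
        assert(input_len == 3 && past_kv_len == 7);
        // validate_cgraph() then only accepts input_len == 1 in the static (NPU)
        // case, which is what running with -ub 1 guarantees.
        return 0;
    }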
+
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
     auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (name == "inp_tokens" || name == "inp_pos") {
-        if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, 1};
-            }
-        } else {
-            input_shape = ov::PartialShape{1, 1, -1};
-        }
-    } else if (name == "inp_out_ids" && !m_is_static) {
-        input_shape = ov::PartialShape{1, 1, -1};
+
+    if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, m_context_size, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            }
+            input_shape = ov::PartialShape{1, 1, m_context_size};
         } else {
             input_shape = ov::PartialShape{1, -1, -1};
         }
+
     } else if (name.find("cache_") == 0) {
+        auto past_token_len = -1;
         if (m_is_static) {
             int layer = extract_layer_from_name(name);
             bool is_swa = is_swa_layer(layer);
-            input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
-        } else {
-            input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
+            past_token_len = is_swa ? m_context_size_swa : m_context_size;
         }
+        input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
+
     } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
         input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
@@ -745,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
 
 void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const {
     for (const auto & node : m_nodes) {
-        auto decoder =
-            std::make_shared(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
-                             m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
+        auto decoder = std::make_shared(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
+                                        m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
         node_visitor(decoder);
     }
 }
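The shape policy in get_graph_input_shape() above is the core of the static/dynamic split: on NPU every per-step input is pinned to one token and the KV cache to the full context, while on CPU/GPU the token and KV dimensions stay dynamic (-1). A compact sketch of the same decision table (plain C++, illustrative only; the SWA case is omitted and -1 marks a dynamic dimension):

    #include <cassert>
    #include <string>
    #include <vector>

    std::vector<long> graph_input_shape(const std::string & name, bool is_static,
                                        long ctx, long n_heads_kv, long head_size) {
        if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
            return {1, 1, is_static ? 1L : -1L};   // one token per step on NPU
        }
        if (name.rfind("KQ_mask", 0) == 0) {
            return is_static ? std::vector<long>{1, 1, ctx} : std::vector<long>{1, -1, -1};
        }
        if (name.rfind("cache_", 0) == 0) {        // [past_token_len, n_heads_kv, head_size]
            return {is_static ? ctx : -1L, n_heads_kv, head_size};
        }
        return {};
    }

    int main() {
        assert((graph_input_shape("cache_k_l0", true, 1024, 8, 64) == std::vector<long>{1024, 8, 64}));
        assert((graph_input_shape("KQ_mask", false, 0, 0, 0) == std::vector<long>{1, -1, -1}));
    }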
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 884151d32ef3c..fe30bde445200 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -16,14 +16,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     // Graph decoder
     GgmlOvDecoder(ggml_cgraph * cgraph,
                   std::map> & model_weights,
-                  bool is_static,
-                  bool is_first_token);
+                  bool is_static);
 
     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(ggml_tensor * node,
                   ggml_cgraph * cgraph,
                   bool is_static,
-                  bool is_first_token,
                   int context_size,
                   int context_size_swa,
                   int num_heads,
@@ -81,9 +79,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     virtual void visit_subgraph(std::function)> node_visitor) const override;
 
-    const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
+    ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
 
-    const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
+    ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
 
     virtual int get_op_case() const override { return m_op_case; }
 
@@ -119,14 +117,16 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual int get_head_size() const override { return m_head_size; }
 
+    int get_past_kv_len() const { return m_past_kv_len; }
+
+    int get_input_len() const { return m_input_len; }
+
     virtual int32_t * get_rope_params() const override { return m_rope_params; }
 
     virtual std::map get_kv_param_res_names() const override;
 
     virtual bool is_static() const override { return m_is_static; }
 
-    virtual bool is_first_token() const override { return m_is_first_token; }
-
     ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
 
     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -153,6 +153,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     // set context_size, num_heads, etc
     void set_llm_params();
+    void validate_cgraph() const;
 
     ggml_cgraph * m_cgraph = nullptr;
     ggml_tensor * m_node = nullptr;
@@ -176,10 +177,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int m_past_kv_len;
+    int m_input_len;
     int32_t * m_rope_params;
     std::vector m_kv_names;
     bool m_is_static = false;
-    bool m_is_first_token;
 };
 
 void print_tensor_address_map(const ggml_cgraph * cgraph);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 6f11ff1283e37..a3cb995a3c91e 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -65,7 +65,6 @@ class GgmlDecoder : public DecoderBase {
     virtual std::map get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;
-    virtual bool is_first_token() const = 0;
     virtual int get_context_size() const = 0;
     virtual int get_context_size_swa() const = 0;
     virtual int is_swa_layer(int layer) const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index a64ae098ab3e9..0d76dc83e0590 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -97,12 +97,7 @@ class NodeContext : public frontend::NodeContext {
 
     int get_op_case() const { return m_decoder->get_op_case(); }
 
-    bool is_static() const {
-        return m_decoder->is_static();
-    }
-    bool is_first_token() const {
-        return m_decoder->is_first_token();
-    }
+    bool is_static() const { return m_decoder->is_static(); }
 
 private:
     std::shared_ptr m_decoder;
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
index 029023637a47c..de2af85aa88c9 100644
--- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -2,9 +2,11 @@
 
 #include "../op_table.hpp"
 #include "../utils.hpp"
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -51,43 +53,25 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
         auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0);
         mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes);
-        mask_sliced = std::make_shared(mask_sliced, zero_1d);
     }
 
     if (mask_sliced.get_element_type() != ov::element::f16) {
         mask_sliced = std::make_shared(mask_sliced, ov::element::f16);
     }
 
-    auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output kv, bool is_static) {
-        int64_t factor = q_batch / kv_batch;
+    auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output kv) {
+        int64_t factor = num_heads / num_heads_kv;
         if (factor > 1) {
-            auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{q_batch});
-            auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_batch});
-            auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor});
             ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
-            if (is_static) {
-                auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-                kv_unsqueezed = std::make_shared(kv, unsqueeze_axes);
-
-                auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2});
-                kv_broadcast_shape = std::make_shared(
-                    ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
-                new_kv_shape =
-                    std::make_shared(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0);
-            } else {
-                auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-                auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-                kv_unsqueezed = std::make_shared(kv, unsqueeze_axes);
+            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+            kv_unsqueezed = std::make_shared(kv, unsqueeze_axes);
 
-                auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3});
-                kv_broadcast_shape = std::make_shared(
-                    ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
-                new_kv_shape =
-                    std::make_shared(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0);
-            }
+            kv_broadcast_shape =
+                ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size});
+            new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size});
 
-            kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape);
+            kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape,
+                                  ov::op::BroadcastType::BIDIRECTIONAL);
             kv = std::make_shared(kv, new_kv_shape, false);
         }
         return kv;
@@ -95,18 +79,12 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
     auto q_shape = context.get_input_shape(0).to_shape();
     auto k_shape = context.get_input_shape(1).to_shape();
 
-    k = tile_kv(q_shape[0], k_shape[0], k, context.is_static());
-    v = tile_kv(q_shape[0], k_shape[0], v, context.is_static());
+    k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k);
+    v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v);
 
     auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false);
-    auto sdpa_f32 = std::make_shared(sdpa, ov::element::f32);
-    if (context.is_static()) {
-        res = std::make_shared(sdpa_f32,
-                               ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-    } else {
-        res = std::make_shared(
-            sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-    }
+    res = std::make_shared(sdpa, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+    res = std::make_shared(res, ov::element::f32);
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
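The rewritten tile_kv handles grouped-query attention: with num_heads query heads and num_heads_kv KV heads, each KV head is reused factor = num_heads / num_heads_kv times, via Unsqueeze to [num_heads_kv, 1, kv_len, head_size], a broadcast of the inserted axis to factor, and a Reshape back to [num_heads, kv_len, head_size]. A host-side sketch of the same shape arithmetic on a flat buffer (plain C++, illustrative only):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // [num_heads_kv, kv_len, head_size] -> [num_heads_kv * factor, kv_len, head_size]
    std::vector<float> tile_kv_heads(const std::vector<float> & kv, int num_heads,
                                     int num_heads_kv, int kv_len, int head_size) {
        int factor = num_heads / num_heads_kv;  // e.g. 32 Q heads / 8 KV heads = 4
        std::vector<float> out;
        out.reserve(kv.size() * (size_t) factor);
        size_t head_sz = (size_t) kv_len * head_size;
        for (int h = 0; h < num_heads_kv; ++h) {
            for (int r = 0; r < factor; ++r) {  // the Broadcast over the unsqueezed axis
                out.insert(out.end(), kv.begin() + h * head_sz, kv.begin() + (h + 1) * head_sz);
            }
        }
        return out;  // the final Reshape is just this flat layout
    }

    int main() {
        std::vector<float> kv(2 * 3 * 4, 1.0f);  // 2 KV heads, kv_len 3, head_size 4
        assert(tile_kv_heads(kv, 8, 2, 3, 4).size() == 8u * 3u * 4u);  // 8 query heads
    }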
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 128ffb29335b6..cf651a084b01e 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -26,40 +26,8 @@ OutputVector translate_permute(const NodeContext & context) {
 
     ov::Output res;
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-    if (op_case == 1) {
-        if (context.is_static()) {
-            res = std::make_shared(
-                context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-        } else {
-            auto src = context.get_input(0);
-            if (src.get_partial_shape().rank() == 3) {
-                src = std::make_shared(src, zero);
-            }
-            res = std::make_shared(
-                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-        }
-    } else {
-        auto src = context.get_input(0);
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-
-        if (context.is_static()) {
-            auto src_shape_ = context.get_input_shape(0).to_shape();
-            std::vector src_shape(src_shape_.begin(), src_shape_.end());
-            auto src_reshaped = std::make_shared(
-                src,
-                ov::op::v0::Constant::create(ov::element::i64, {3},
-                                             std::vector{-1, src_shape[1], src_shape[2]}),
-                false);
-            res = std::make_shared(
-                src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-        } else {
-            if (src.get_partial_shape().rank() == 3) {
-                src = std::make_shared(src, zero);
-            }
-            res = std::make_shared(
-                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-        }
-    }
+    auto src = context.get_input(0);
+    res = std::make_shared(src, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 362ccce17f289..9ad2e25284633 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -84,10 +84,6 @@ OutputVector translate_rope(const NodeContext & context) {
                                                        ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
         auto stack = std::make_shared(OutputVector{first_half, second_half}, 3);
         res = std::make_shared(stack, std::make_shared(data_node), false);
-        if (!(context.is_static())) {
-            res =
-                std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
-        }
     } else if (mode == ROPE_TYPE_NEOX) {
         auto data_split = std::make_shared(
             data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
index 643ba7bffa541..8d0277ce86edd 100644
--- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -33,10 +33,6 @@ OutputVector translate_set_rows(const NodeContext & context) {
     auto dst_shape = context.get_output_shape(0).to_shape();
     FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
 
-    if (context.is_static() && context.is_first_token()) {
-        return rename_outputs_with_suffix({data}, context.get_name());
-    }
-
     auto indices = context.get_input(1);
     auto dst = context.get_input(context.get_output_name());
 
@@ -54,13 +50,11 @@ OutputVector translate_set_rows(const NodeContext & context) {
         auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero);
         res = std::make_shared(updated, std::make_shared(dst), false);
     } else {
-        assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() &&
-               dst.get_partial_shape()[3].is_static());
+        int64_t dim1 = dst.get_partial_shape()[1].get_length();
         int64_t dim2 = dst.get_partial_shape()[2].get_length();
-        int64_t dim3 = dst.get_partial_shape()[3].get_length();
         data = std::make_shared(
-            data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false);
-        res = std::make_shared(OutputVector{dst, data}, 1);
+            data, ov::op::v0::Constant::create(ov::element::i64, {3}, {(int64_t) -1, dim1, dim2}), false);
+        res = std::make_shared(OutputVector{dst, data}, 0);
     }
     return rename_outputs_with_suffix({res}, context.get_name());
 }
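With the cache exposed as a [past_token_len, num_heads_kv, head_size] graph input and ggml writing the new rows at indices past_kv_len .. past_kv_len + input_len - 1, the scatter in SET_ROWS degenerates to an append, which is why the translation above can use Concat on axis 0 instead of a ScatterUpdate. A toy illustration of that equivalence (plain C++, illustrative only):

    #include <cassert>
    #include <vector>

    int main() {
        // One row per cached token; ggml would scatter the new rows at the
        // indices past_kv_len .. past_kv_len + input_len - 1.
        std::vector<std::vector<float>> cache    = {{0, 0}, {1, 1}, {2, 2}};  // past_kv_len == 3
        std::vector<std::vector<float>> new_rows = {{3, 3}, {4, 4}};          // input_len  == 2

        // Because those indices are exactly the next free slots, the scatter
        // is equivalent to a concatenation along axis 0.
        cache.insert(cache.end(), new_rows.begin(), new_rows.end());

        assert(cache.size() == 5 && cache[3][0] == 3.0f && cache[4][0] == 4.0f);
    }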
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 67c5b4a51bf25..def1f3946073a 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -27,7 +27,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
@@ -112,7 +111,6 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
         auto stop = std::make_shared(ov::OutputVector{token_len, kv_len}, 0);
         mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes);
-        mask_sliced = std::make_shared(mask_sliced, zero_1d);
         mask_sliced = std::make_shared(mask_sliced, ov::element::f16);
         mask_sliced->set_friendly_name(sliced_name);
     }
@@ -243,11 +241,11 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr
         manager.register_pass();
-        if (!ggml_model_decoder->is_static()) {
-            const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
-            const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
-            manager.register_pass(kv_param_res_pairs);
-        }
+        // if (!ggml_model_decoder->is_static()) {
+        //     const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
+        //     const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
+        //     manager.register_pass(kv_param_res_pairs);
+        // }
 
         // if (ggml_model_decoder->is_static()) {
         manager.register_pass();
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index eb9ea9fee9eb5..50e3ef20bc6dd 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -12,12 +12,14 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -26,60 +28,29 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
-ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) {
-    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
-    auto * input_data = ggml_tensor->data;
-    ov::Shape input_shape;
-    if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
-        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
-    } else if (ggml_tensor->op == GGML_OP_VIEW) {
-        // This case is added to make test-backend-ops work
-        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
-    } else {
-        input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    }
-    auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
-    return input_tensor;
-}
-
-std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) {
-    std::map output_tensors;
-
-    auto output_names = ggml_decoder->get_model_output_names();
-    for (size_t inp = 0; inp < output_names.size(); ++inp) {
-        auto name = output_names[inp];
-        const auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
-        auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
-        output_tensors[name] = output_data;
-    }
-    return output_tensors;
-}
-
-static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
-    auto fem = ov::frontend::FrontEndManager();
-    auto front_end = fem.load_by_framework("ggml");
-    return front_end;
-}
+// Suppress deprecation warning for ov::Tensor::data()
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     static ov::Core core;
 
-    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device.empty()) {
-        const std::vector preferred_device = {"GPU", "CPU", "NPU"};
-        const auto available_devices = core.get_available_devices();
-        for (const auto & dev : preferred_device) {
-            if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
-                device = dev;
-                break;
-            }
+    auto get_device = [&] {
+        std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
+        auto available_devices = core.get_available_devices();
+        if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) {
+            GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, falling back to CPU\n", device.c_str());
+            device = "CPU";
         }
-    }
-
+        return device;
+    };
+    static std::string device = get_device();
     bool is_static = device == "NPU" ? true : false;
+
     ov::AnyMap config;
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
@@ -102,11 +73,9 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     static std::unordered_map> infer_request_cache;
     static std::unordered_map> ov_input_names_cache;
     static std::unordered_map> ov_output_names_cache;
-    // For NPU, store the kvcache model, since we cannot create two infer_request
-    static std::unordered_map compiled_model_cache;
 
     std::shared_ptr ggml_decoder;
-    ov::InferRequest infer_request;
+    std::shared_ptr infer_request;
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
@@ -118,83 +87,36 @@
     auto it = infer_request_cache.find(cgraph);
     if (it != infer_request_cache.end()) {
         std::map> model_weights;
-        ggml_decoder = std::make_shared(cgraph, model_weights, is_static, false);
+        ggml_decoder = std::make_shared(cgraph, model_weights, is_static);
         decoder_end_time = ggml_time_us();
 
-        // For NPU for the first time we call kvcache modle, pop the compiled kvcache model from cache
-        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
-            infer_request_cache[cgraph] =
-                std::make_shared(compiled_model_cache[cgraph].create_infer_request());
-            compiled_model_cache.erase(cgraph);
-        }
-        infer_request = *infer_request_cache[cgraph];
-
+        infer_request = infer_request_cache[cgraph];
         conversion_end_time = ggml_time_us();
         compile_end_time = conversion_end_time;
     } else {
         std::shared_ptr model;
         auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
 
-        if (is_static) {
-            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
-            auto ggml_decoder_kvcache = std::make_shared(cgraph, model_weights, is_static, false);
-            decoder_end_time = ggml_time_us();
-
-            auto input_model = std::make_shared(ggml_decoder);
-            auto input_model_kvcache = std::make_shared(ggml_decoder_kvcache);
-
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
-            ggml_decoder_kvcache->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
-                ov::serialize(model_kvcache, timestamped_filename);
-            }
-
-            auto compiled_model = core.compile_model(model, device, get_npu_prefill_config());
-            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, get_npu_generate_config());
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-            compile_end_time = ggml_time_us();
-
-            infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-        } else {
-            ggml_decoder = std::make_shared(cgraph, model_weights, is_static, true);
-            decoder_end_time = ggml_time_us();
-
-            auto input_model = std::make_shared(ggml_decoder);
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
-            }
-
-            auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
-            if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
-                config = {
-                    {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
-                };
-            }
-
-            auto compiled_model = core.compile_model(model, device, config);
-            compile_end_time = ggml_time_us();
-            infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
+        ggml_decoder = std::make_shared(cgraph, model_weights, is_static);
+        decoder_end_time = ggml_time_us();
+
+        auto input_model = std::make_shared(ggml_decoder);
+        model = ov::frontend::ggml::FrontEnd::convert(input_model);
+        ggml_decoder->clear_model_weights();
+        conversion_end_time = ggml_time_us();
+
+        if (getenv("GGML_OPENVINO_DUMP_IR")) {
+            char timestamped_filename[64];
+            auto timestamp = (long long) ggml_time_us();
+            snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+            ov::serialize(model, timestamped_filename);
         }
 
+        auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device));
+        compile_end_time = ggml_time_us();
+        infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request());
+        infer_request = infer_request_cache[cgraph];
+
         std::vector ov_input_names;
         std::vector ov_output_names;
         for (const auto & ov_param : model->get_parameters()) {
@@ -210,72 +132,66 @@
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
+
     for (size_t i = 0; i < ov_input_names.size(); i++) {
         auto param_name = ov_input_names[i];
         auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-        infer_request.set_input_tensor(i, input_tensor);
+        infer_request->set_input_tensor(i, input_tensor);
         if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
             print_input_tensor_info(param_name, input_tensor);
         }
     }
+
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+        infer_request->set_output_tensor(i, output_tensor);
+    }
+
     auto input_end_time = ggml_time_us();
 
-    infer_request.infer();
+    infer_request->infer();
+
     auto infer_end_time = ggml_time_us();
 
-    auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
     for (size_t i = 0; i < ov_output_names.size(); i++) {
-        auto & result_name = ov_output_names[i];
-        const auto output_tensor = infer_request.get_output_tensor(i);
-
-        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
-
+        const auto output_tensor = infer_request->get_output_tensor(i);
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs);
+            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
        }
     }
-    auto end_time = ggml_time_us();
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
-        GGML_LOG_INFO("GGML OpenVINO Backend: \n");
+        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO("  - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO("  - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO("  - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
-        GGML_LOG_INFO("  - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
 
     return GGML_STATUS_SUCCESS;
 
     GGML_UNUSED(backend);
 }
 
-namespace {
-ov::AnyMap get_npu_base_config() {
-    return {
-        {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
-        {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"},
-        {"NPU_USE_NPUW", "YES"},
-        {"NPUW_DEVICES", "NPU"},
-        {"NPUW_FOLD", "YES"},
-        {"NPUW_WEIGHTS_BANK", "shared"},
-        {"NPUW_FUNCALL_FOR_ALL", "YES"},
-        {"NPUW_FUNCALL_ASYNC", "YES"},
-        {"NPUW_DQ", "YES"},
-        {"NPUW_DQ_FULL", "NO"},
-        {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
-    };
-}
-}  // namespace
-
-ov::AnyMap get_npu_prefill_config() {
-    auto config = get_npu_base_config();
-    return config;
-}
-
-ov::AnyMap get_npu_generate_config() {
-    auto config = get_npu_base_config();
+ov::AnyMap get_ov_compile_config(const std::string & device) {
+    ov::AnyMap config;
+    if (device == "NPU") {
+        config = {
+            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"},
+            {"NPU_USE_NPUW", "YES"},
+            {"NPUW_DEVICES", "NPU"},
+            {"NPUW_FOLD", "YES"},
+            {"NPUW_WEIGHTS_BANK", "shared"},
+            {"NPUW_FUNCALL_FOR_ALL", "YES"},
+            {"NPUW_FUNCALL_ASYNC", "YES"},
+            {"NPUW_DQ", "YES"},
+            {"NPUW_DQ_FULL", "NO"},
+        };
+        if (auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); cache_dir) {
+            config["NPUW_CACHE_DIR"] = cache_dir;
+        }
+    }
     return config;
 }
 
@@ -291,7 +207,7 @@ std::map get_types_to_requant(const std::string & dev
     }
     if (device == "GPU") {
         return {
-            // gs16 is WIP
+            // gs16 will be supported in OpenVINO 2025.4
             {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32},
         };
     }
@@ -331,68 +247,89 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
         infer_request.set_input_tensor(i, input_tensor);
     }
 
-    infer_request.infer();
-
-    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
     auto ov_results = model->get_results();
     for (size_t i = 0; i < ov_results.size(); i++) {
         auto result_name = ov_results[i]->get_friendly_name();
-        const auto output_tensor = infer_request.get_output_tensor(i);
-
-        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+        auto output_tensor = get_ov_output_tensor(decoder, result_name);
+        infer_request.set_output_tensor(i, output_tensor);
     }
+
+    infer_request.infer();
     return GGML_STATUS_SUCCESS;
 }
 
+namespace {
+ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) {
+    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
+    auto * input_data = ggml_tensor->data;
+    ov::Shape input_shape;
+    if (ggml_tensor->op == GGML_OP_VIEW) {
+        // This case is added to make test-backend-ops work
+        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
+    } else {
+        input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    }
+    auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    return input_tensor;
+}
+}  // namespace
+
 ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name) {
     bool is_static = ggml_decoder->is_static();
-    bool is_first_token = ggml_decoder->is_first_token();
 
     ov::Tensor input_tensor;
     if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
         input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
 
-    } else if (!is_static) {
+    } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+        void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data;
+        size_t past_kv_len =
+            ggml_decoder->is_static() ? ggml_decoder->get_context_size() : ggml_decoder->get_past_kv_len();
+        ov::Shape input_shape = {past_kv_len, (size_t) ggml_decoder->get_num_heads_kv(),
+                                 (size_t) ggml_decoder->get_head_size()};
+        input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data);
+
+    } else if (is_static && param_name.find("KQ_mask") == 0) {
+        size_t context_size = ggml_decoder->get_context_size();
+        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+        std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY);
+        input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
+        auto * data_ptr = input_tensor.data();
+        std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+
+    } else if (is_static && param_name.find("inp_out_ids") == 0) {
         input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+        if (input_tensor.get_size() == 0) {
+            input_tensor = ov::Tensor(input_tensor.get_element_type(), ov::Shape{1, 1, 1});
+            *input_tensor.data() = 0;
+        }
 
     } else {
-        if (param_name == "inp_tokens" || param_name == "inp_pos") {
-            if (is_first_token) {
-                size_t context_size = ggml_decoder->get_context_size();
-                const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-                std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, 0);
-                input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, context_size});
-                auto * data_ptr = input_tensor.data();
-                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-            } else {
-                input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
-            }
-
-        } else if (param_name.find("KQ_mask") == 0) {
-            size_t context_size = ggml_decoder->get_context_size();
-            const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-            if (is_first_token) {
-                std::vector padded_data =
-                    pad_input(input_tensor_ggml, context_size, context_size, -INFINITY);
-                set_zero_diagonal(padded_data, context_size);
-                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, context_size, context_size});
-                auto * data_ptr = input_tensor.data();
-                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-            } else {
-                std::vector padded_data = pad_input(input_tensor_ggml, 1, context_size, -INFINITY);
-                input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
-                auto * data_ptr = input_tensor.data();
-                std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-            }
+        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+    }
+    return input_tensor;
+}
 
-        } else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
-                   op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) {
-            input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1});
-        } else {
-            input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name) {
+    auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name);
+    auto output_type = ggml_decoder->get_output_type(result_name);
+    ov::Shape output_shape;
+    if (result_name.find("cache") == std::string::npos) {
+        output_shape = ggml_decoder->get_output_shape(result_name).to_shape();
+        if (ggml_decoder->is_static() && result_name == "result_output") {
+            output_shape[1] = 1;
         }
+    } else {
+        size_t total_token_len = ggml_decoder->get_past_kv_len() + ggml_decoder->get_input_len();
+        size_t num_heads_kv = ggml_decoder->get_num_heads_kv();
+        size_t head_size = ggml_decoder->get_head_size();
+        if (ggml_decoder->is_static()) {
+            total_token_len = ggml_decoder->get_context_size();
+        }
+        output_shape = ov::Shape{total_token_len, num_heads_kv, head_size};
     }
 
-    return input_tensor;
+    ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
+    return output_tensor;
 }
 
 size_t checksum(const void * data, size_t size) {
@@ -405,10 +342,6 @@
     return sum;
 }
 
-// Suppress deprecation warning for ov::Tensor::data()
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
     std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
               << std::endl;
@@ -433,11 +366,9 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor
     }
 }
 
-void print_output_tensor_info(const std::string & name,
-                              const ov::Tensor & tensor,
-                              std::map & output_dst) {
-    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
-              << ", Address: " << output_dst[name] << std::endl;
+void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst) {
+    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst
+              << std::endl;
 
     auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
         if (size == 0) {
@@ -485,15 +416,13 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor
     }
 }
 
-#pragma GCC diagnostic pop
-
 void set_zero_diagonal(std::vector & matrix, size_t dim) {
     for (size_t i = 0; i < dim; ++i) {
         matrix[i * dim + i] = 0.0f;
     }
 }
 
-bool is_prefill(ggml_cgraph * cgraph) {
+const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         auto * op = cgraph->nodes[i];
         for (int j = 0; j < GGML_MAX_SRC; ++j) {
             auto * src = op->src[j];
             if (src == nullptr) {
                 break;
             }
-            if (std::string(src->name) == "inp_tokens") {
-                return src->ne[0] != 1;
+            if (std::string(src->name) == "inp_pos") {
+                return src;
             }
         }
     }
-    GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph");
-    throw std::runtime_error("is_prefill: inp_tokens not found in cgraph");
+    GGML_LOG_ERROR("get_inp_pos_tensor: inp_pos not found in cgraph");
+    throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph");
+}
+
+bool get_is_first_token(const ggml_tensor * inp_pos) {
+    return *(int32_t *) inp_pos->data == 0;
 }
+
+#pragma GCC diagnostic pop
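On the static (NPU) path the KQ_mask input above is always presented as a full [1, 1, context_size] row: the live prefix comes from the ggml tensor and the tail is filled with -INFINITY so the padded KV slots never receive attention weight. A self-contained sketch of that padding step (plain C++; `pad_row` is a simplified stand-in for the pad_input template declared in utils.h below):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Pad a mask row of n live entries out to padded_cols entries; the tail is
    // -INFINITY so padded KV positions vanish after softmax.
    std::vector<float> pad_row(const float * src, size_t n, size_t padded_cols) {
        std::vector<float> out(padded_cols, -INFINITY);
        for (size_t i = 0; i < n; ++i) {
            out[i] = src[i];
        }
        return out;
    }

    int main() {
        const float live[3] = {0.0f, 0.0f, -INFINITY};  // 3 KV slots, last one masked
        auto row = pad_row(live, 3, 8);                 // context_size == 8
        assert(row.size() == 8 && std::isinf(row[5]));
        return 0;
    }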
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 22f5cc8c34531..352f67aa12e7c 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -7,19 +7,11 @@
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, bool is_static, bool is_first_token);
-
-ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name);
-
-std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder);
-
 size_t checksum(const void * data, size_t size);
 
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
 
-void print_output_tensor_info(const std::string & name,
-                              const ov::Tensor & tensor,
-                              std::map & output_dst);
+void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst);
 
 template
 std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
@@ -38,15 +30,18 @@ std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t
 
 void set_zero_diagonal(std::vector & matrix, size_t dim);
 
-bool is_prefill(struct ggml_cgraph * cgraph);
+const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
 
-ov::AnyMap get_npu_prefill_config();
-ov::AnyMap get_npu_generate_config();
+bool get_is_first_token(const ggml_tensor * inp_pos);
+
+ov::AnyMap get_ov_compile_config(const std::string & device);
 
 std::map get_types_to_requant(const std::string & device);
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name);
 
+ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name);
+
 bool is_naive(struct ggml_cgraph * cgraph);
 
 enum ggml_status naive_compute(struct ggml_cgraph * cgraph,