diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index 6d77ecea3cc0a..d631bc6c01d1e 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -5,6 +5,10 @@ AlignConsecutiveDeclarations: false ReferenceAlignment: Left PointerAlignment: Left Cpp11BracedListStyle: true +AccessModifierOffset: -4 +BinPackArguments: false +BinPackParameters: false +BreakBeforeBraces: Attach Language: Cpp AlignAfterOpenBracket: Align @@ -27,29 +31,7 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: true -BinPackArguments: true -BinPackParameters: true # OnePerLine BitFieldColonSpacing: Both -BreakBeforeBraces: Custom # Attach -BraceWrapping: - AfterCaseLabel: true - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false # BreakAdjacentStringLiterals: true BreakAfterAttributes: Never BreakBeforeBinaryOperators: None diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index a94a7ddf9c1c5..8ce9354c69ecc 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,9 +74,19 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap } } +GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) { + m_cgraph = cgraph; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + auto* cur_node = 
cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + set_input_output(cur_node, true); + } +} + // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; -// 2. constructing a decoder for a node. -void GgmlOvDecoder::set_input_output(ggml_tensor* node) { +// 2. constructing a decoder for a node; +// 3. constructing a decoder for the whole graph naively (op test case) +void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -98,8 +111,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_inputs[src_name] = src; m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); - // If called for the whole graph, create constant nodes for weights and param nodes for inputs - if (!m_node && !src->view_src) { + // Add model inputs and weights constants, if called for the whole graph + if (naive) { + auto param_node = std::make_shared(get_ov_type(src), get_graph_input_shape(src)); + param_node->set_friendly_name(src_name); + param_node->output(0).get_tensor().set_names({src_name}); + m_model_inputs[src_name] = param_node; + + } else if (!m_node && !src->view_src) { ggml_backend_buffer* buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -118,7 +137,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } } - if (!m_node) { + // Add model outputs, if called for the whole graph + if (naive) { + m_model_output_names.push_back(node->name); + } else if (!m_node) { static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || @@ -164,17 +186,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { m_op_case = 2; } break; - } 
- case GGML_OP_MUL_MAT: { - if (node->src[0]->view_src == nullptr) { - m_op_case = 1; - } else if (std::string(node->src[0]->name).find("cache_k") == 0) { - m_op_case = 2; - } else if (std::string(node->src[0]->name).find("cache_v") == 0) { - m_op_case = 3; } - break; - } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -188,6 +200,23 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) { } break; } + case GGML_OP_GET_ROWS: + { + if (node->src[1]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; + } + break; + } + case GGML_OP_ROPE: + { + if (node->src[0]->op == GGML_OP_VIEW) { + m_op_case = 2; + } else { + m_op_case = 1; + } + } default: break; } @@ -237,6 +266,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (std::string(src->name).find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; + } else if (src->op == GGML_OP_VIEW) { + // This case is added to make test-backend-ops work + input_shape = ov::PartialShape{get_shape(src->view_src)}; } else { input_shape = ov::PartialShape{get_shape(src)}; } @@ -373,6 +405,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) weight_node = std::make_shared(node_type, node_shape, data_f16); break; } + case GGML_TYPE_BF16: + { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_bf16; + data_bf16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_bf16.push_back(ov::bfloat16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_bf16); + break; + } default: throw std::invalid_argument("Unsupported tensor type"); } @@ -496,6 +539,9 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { case GGML_TYPE_F16: type = ov::element::f16; break; + case GGML_TYPE_BF16: + type = ov::element::bf16; + break; case 
GGML_TYPE_I64: type = ov::element::i64; break; @@ -576,6 +622,7 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { + {GGML_OP_NONE, "GGML_OP_NONE" }, {GGML_OP_ACC, "GGML_OP_ACC" }, {GGML_OP_ADD, "GGML_OP_ADD" }, {GGML_OP_ADD1, "GGML_OP_ADD1" }, diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 428edef3ae628..f4fe9c402d53b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -15,6 +15,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token, int context_size, int num_heads, int num_heads_kv, int head_size); + // Naive decoder + GgmlOvDecoder(struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; GGML_UNUSED(name); @@ -111,7 +113,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void clear_model_weights() { m_model_weights.clear(); } private: - void set_input_output(ggml_tensor* node); + void set_input_output(ggml_tensor* node, bool naive = false); void add_extra_inputs(); static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static std::vector get_shape(const ggml_tensor* tensor); @@ -124,13 +126,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { static std::shared_ptr create_weight_node(ggml_tensor* tensor); void add_weight_const_parallel(std::map>& model_weights); - struct ggml_cgraph* m_cgraph; + struct ggml_cgraph* m_cgraph = nullptr; + ggml_tensor* m_node = nullptr; + std::vector m_nodes; std::map m_inputs; std::vector m_input_names; std::map m_outputs; std::vector m_output_names; - ggml_tensor* m_node; - std::vector m_nodes; std::string m_op_name; mutable std::string m_name; int m_op_case; diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 167453b215657..2bc9d5199c6df 
100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,15 +1,17 @@ -#include "ggml-backend-impl.h" -#include "ggml-impl.h" #include "ggml-openvino.h" -#include "ggml-openvino/utils.h" -#include "ggml.h" +#include #include #include #include #include #include +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml-openvino/utils.h" +#include "ggml.h" + #define GGML_OPENVINO_MAX_STREAMS 8 struct ggml_backend_openvino_context { @@ -234,9 +236,85 @@ static ggml_backend_buffer_t ggml_backend_openvino_device_buffer_from_host_ptr(g return nullptr; } +static bool is_op_unsupported_case(const ggml_tensor* op) { + if (op->op == GGML_OP_SOFT_MAX) { + float scale = 1.0f; + float max_bias = 0.0f; + const auto* op_params = op->op_params; + memcpy(&scale, (const float*) op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float*) op_params + 1, sizeof(float)); + const uint32_t h = op->src[0]->ne[2]; + const uint32_t n_head = op->src[0]->ne[0]; + const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + const float slope = + (max_bias > 0.0f) ? h < n_head_log2 ? 
powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; + + if (slope != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n"); + return true; + } + } + + if (op->op == GGML_OP_MUL_MAT) { + if ((op->src[0]->view_src && op->src[0]->op != GGML_OP_PERMUTE) || + (op->src[1]->view_src && op->src[1]->op != GGML_OP_PERMUTE)) { + GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with view_src tensors that are not PERMUTE\n"); + return true; + } + } + + if (op->op == GGML_OP_ROPE) { + const int32_t* op_params = op->op_params; + const int n_dims = op_params[1]; + const int mode = op_params[2]; + if (mode == GGML_ROPE_TYPE_MROPE || mode == GGML_ROPE_TYPE_VISION) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with mode %d\n", mode); + return true; + } + if (n_dims != op->src[0]->ne[0]) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with n_dims %d != src[0]->ne[0] %ld\n", + n_dims, + op->src[0]->ne[0]); + return true; + } + if (op->type != GGML_TYPE_F32) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); + return true; + } + float freq_scale; + memcpy(&freq_scale, op_params + 6, sizeof(float)); + if (freq_scale != 1.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale); + return true; + } + float ext_factor; + memcpy(&ext_factor, op_params + 7, sizeof(float)); + if (ext_factor != 0.0f) { + GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor); + return true; + } + if (op->src[0]->op == GGML_OP_VIEW) { + if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) { + GGML_LOG_WARN( + "OpenVINO backend does not support ROPE with src[0]->view_src->ne[1] %ld != src[0]->ne[2] %ld\n", + op->src[0]->view_src->ne[1], + op->src[0]->ne[2]); + return true; + } + } + } + return false; +} + static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor 
* op) { GGML_ASSERT(dev->reg != nullptr); + static const std::set supported_types{ + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32}; + static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, @@ -248,18 +326,60 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_GLU_OP_SWIGLU, }; - auto res = false; switch (op->op) { case GGML_OP_UNARY: - res = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); - break; + { + auto supported = supported_unary_ops.find(ggml_get_unary_op(op)) != supported_unary_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", + ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } + break; + } case GGML_OP_GLU: - res = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); - break; + { + auto supported = supported_glu_ops.find(ggml_get_glu_op(op)) != supported_glu_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", + ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } + break; + } default: - res = supported_ops.find(op->op) != supported_ops.end(); + { + auto supported = supported_ops.find(op->op) != supported_ops.end(); + if (!supported) { + GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); + return false; + } + } + } + + if (supported_types.find(op->type) == supported_types.end()) { + GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type)); + return false; + } + if (op->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] != nullptr && supported_types.find(op->src[i]->type) == supported_types.end()) { + 
GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->src[i]->type)); + return false; + } + if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) { + GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n"); + return false; + } + } + + if (is_op_unsupported_case(op)) { + return false; } - return res; + return true; } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/openvino/frontend.cpp b/ggml/src/ggml-openvino/openvino/frontend.cpp index ff7f0e8392b0f..dbdae1ed45ca1 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.cpp +++ b/ggml/src/ggml-openvino/openvino/frontend.cpp @@ -10,13 +10,13 @@ namespace ggml { FrontEnd::FrontEnd() {} -std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model) { +std::shared_ptr FrontEnd::convert(const InputModel::Ptr& model, bool naive) { auto ggml_model = std::dynamic_pointer_cast(model); FRONT_END_GENERAL_CHECK(ggml_model, "Invalid input model"); std::shared_ptr converted_model; const auto& supported_ops = get_supported_ops(); { - TranslateSession translate_session(model, supported_ops); + TranslateSession translate_session(model, supported_ops, naive); converted_model = translate_session.get_converted_model(); } return converted_model; } diff --git a/ggml/src/ggml-openvino/openvino/frontend.hpp b/ggml/src/ggml-openvino/openvino/frontend.hpp index 5cc7ff1773216..f1c6f0c3e3ce3 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.hpp +++ b/ggml/src/ggml-openvino/openvino/frontend.hpp @@ -15,7 +15,7 @@ class FrontEnd { using Ptr = std::shared_ptr; FrontEnd(); - static std::shared_ptr convert(const InputModel::Ptr& model); + static std::shared_ptr convert(const InputModel::Ptr& model, bool naive = false); }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp index b5f0f37406ac8..ceba64227523b 100644 --- 
a/ggml/src/ggml-openvino/openvino/node_context.hpp +++ b/ggml/src/ggml-openvino/openvino/node_context.hpp @@ -77,6 +77,10 @@ class NodeContext : public frontend::NodeContext { return m_tensor_map->at(name); } + bool has_input(const std::string& name) const { + return m_tensor_map->find(name) != m_tensor_map->end(); + } + const std::string& get_name() const override { return m_decoder->get_op_name(); } diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 5c6953caffe27..f83c0e62df77b 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -34,19 +34,7 @@ OutputVector translate_cont(const NodeContext& context) { false); } else { // The input comes from a VIEW - // Currently all cases are slicing at lowest dim - int32_t* op_params = context.get_input_op_params(0); - auto output_stride = context.get_output_stride(0); - - int64_t split_addr = op_params[0] / output_stride[2]; - std::vector begin = {0, 0, split_addr}; - std::vector end = {(int64_t)src_shape[0], INT_MAX, split_addr + (int64_t)src_shape[2]}; - std::vector strides = {1, 1, 1}; - - auto begin_const = ov::op::v0::Constant::create(element::i64, {begin.size()}, begin); - auto end_const = ov::op::v0::Constant::create(ov::element::i64, {end.size()}, end); - auto strides_const = ov::op::v0::Constant::create(ov::element::i64, {strides.size()}, strides); - res = std::make_shared(context.get_input(0), begin_const, end_const, strides_const); + res = process_view_input(context, 0); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 9ed5f4deaf047..c97bbbf5a3657 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,10 +1,12 @@ +#include #include #include #include #include #include #include -#include +#include +#include 
#include "../node_context.hpp" #include "../op_table.hpp" @@ -18,19 +20,32 @@ namespace op { OutputVector translate_get_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); - auto data_node = context.get_input(0); - auto indices_node = context.get_input(1); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported GET_ROWS case"); - auto indices_shape = get_dimensions(indices_node.get_node_shared_ptr(), {2}); - Output indice_reshaped = std::make_shared(indices_node, indices_shape, false); + Output res; + auto data = context.get_input(0); + auto indices = context.get_input(1); - auto axis_node = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (op_case == 2) { + // The input comes from a VIEW + indices = process_view_input(context, 1); + } + + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + if (indices.get_partial_shape()[1].get_length() == 1) { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + res = std::make_shared(data, indices, axis); + } else { + indices = + std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = std::make_shared(data, indices, axis, 1); + } - Output res = std::make_shared(data_node, indice_reshaped, axis_node); if (res.get_element_type() != context.get_output_type(0)) { res = std::make_shared(res, context.get_output_type(0)); } - return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 139498939542e..52d1e575dbd65 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -26,48 +26,46 @@ namespace op { OutputVector translate_mulmat(const NodeContext& context) { num_inputs_check(context, 2, 2); - int op_case = context.get_op_case(); - 
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported MULMAT case"); - ov::Output res; + ov::Output B = context.get_input(0); + ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); - if (op_case == 1) { - auto src0 = context.get_input(0); - auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0)); - auto result_lp = std::make_shared(src1, src0, false, true); - res = std::make_shared(result_lp, context.get_output_type(0)); - } else { - ov::Output B = context.get_input(0); - ov::Output A = std::make_shared(context.get_input(1), context.get_input_type(0)); + auto B_shape = context.get_input_shape(0).to_shape(); + auto A_shape = context.get_input_shape(1).to_shape(); + int64_t A_batch = A_shape[0]; + int64_t B_batch = B_shape[0]; + auto A_batch_larger = A_batch > B_batch; + Output Z = A_batch_larger ? B : A; + int64_t factor = A_batch_larger ? A_batch / B_batch : B_batch / A_batch; + if (factor > 1) { + auto A_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{A_batch}); + auto B_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{B_batch}); + auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{factor}); - int64_t num_heads = context.get_num_heads(); - int64_t num_heads_kv = context.get_num_heads_kv(); - int64_t kv_num_heads_factor = num_heads / num_heads_kv; - if (kv_num_heads_factor > 1) { - auto num_heads_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads}); - auto num_heads_kv_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{num_heads_kv}); - auto factor_node = - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{kv_num_heads_factor}); - auto B_shape_last_two = get_dimensions(B.get_node_shared_ptr(), {1, 2}); + auto Z_last_two_dim = get_dimensions(Z.get_node_shared_ptr(), {1, 2}); - auto unsqueeze_axes = 
ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); - auto B_unsqueezed = std::make_shared(B, unsqueeze_axes); + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1}); + auto Z_unsqueezed = std::make_shared(Z, unsqueeze_axes); - auto broadcast_shape = std::make_shared( - ov::OutputVector{num_heads_kv_node, factor_node, B_shape_last_two}, 0); - auto B_broadcasted = std::make_shared(B_unsqueezed, broadcast_shape); + Output batch_small = A_batch_larger ? B_batch_node : A_batch_node; + Output batch_large = A_batch_larger ? A_batch_node : B_batch_node; + auto broadcast_shape = + std::make_shared(ov::OutputVector{batch_small, factor_node, Z_last_two_dim}, 0); + auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape); - auto new_B_shape = - std::make_shared(ov::OutputVector{num_heads_node, B_shape_last_two}, 0); - B = std::make_shared(B_broadcasted, new_B_shape, false); + auto new_Z_shape = std::make_shared(ov::OutputVector{batch_large, Z_last_two_dim}, 0); + Z = std::make_shared(Z_broadcasted, new_Z_shape, false); + } + if (A_batch_larger) { + B = Z; + } else { + A = Z; } auto result_lp = std::make_shared(A, B, false, true); res = std::make_shared(result_lp, context.get_output_type(0)); - } - return rename_outputs_with_suffix({res}, context.get_name()); + return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index f5736fefc87f5..7951a1e012c54 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "../node_context.hpp" @@ -25,37 +26,66 @@ namespace op { OutputVector translate_rope(const NodeContext& context) { num_inputs_check(context, 2, 3); + int op_case = context.get_op_case(); + FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported ROPE case"); + 
ov::Output res; auto data_node = context.get_input(0).get_node_shared_ptr(); - auto cos_theta_node = context.get_input("rope_cos"); - auto sin_theta_node = context.get_input("rope_sin"); - + auto output_shape = context.get_output_shape(0).to_shape(); int32_t* op_params = context.get_output_op_params(0); - const int mode = op_params[2]; - constexpr int GGML_ROPE_TYPE_NEOX = 2; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - if (!is_neox) { - auto input_shape = context.get_input_shape(0); + Output cos_theta_node; + Output sin_theta_node; + if (context.has_input("rope_cos")) { + cos_theta_node = context.get_input("rope_cos"); + sin_theta_node = context.get_input("rope_sin"); + } else { + auto inp_pos = context.get_input(1).get_node_shared_ptr(); + std::shared_ptr rope_freqs_weight; + if (context.get_input_size() == 3) { + rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); + } + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight); + sin_theta_node = sin_cos.first; + cos_theta_node = sin_cos.second; + } + + if (op_case == 2) { + // The input comes from a VIEW + int slice_len = output_shape[1] * output_shape[2]; + data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); + auto data_shape = ov::op::v0::Constant::create( + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[1], (int64_t) output_shape[2]}); + data_node = std::make_shared(data_node, data_shape, false); + } - auto begin_even = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 0}); - auto begin_odd = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {0, 0, 1}); - auto end = std::make_shared(data_node); - auto stride = ov::op::v0::Constant::create(ov::element::i64, Shape{3}, {1, 1, 2}); - auto even_slice = std::make_shared(data_node, begin_even, end, stride); - auto odd_slice = std::make_shared(data_node, begin_odd, end, stride); + const int mode = op_params[2]; + constexpr int ROPE_TYPE_NEOX = 2; + constexpr int ROPE_TYPE_NORM 
= 0; - auto first_half = + if (mode == ROPE_TYPE_NORM) { + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); + auto even_slice = std::make_shared(data_node, zero, end, two, two); + auto odd_slice = std::make_shared(data_node, one, end, two, two); + + Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), std::make_shared(odd_slice, sin_theta_node)); - auto second_half = + Output second_half = std::make_shared(std::make_shared(even_slice, sin_theta_node), std::make_shared(odd_slice, cos_theta_node)); - auto stack = std::make_shared(OutputVector{first_half, second_half}, 2); + first_half = std::make_shared(first_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + second_half = std::make_shared(second_half, + ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + auto stack = std::make_shared(OutputVector{first_half, second_half}, 3); res = std::make_shared(stack, std::make_shared(data_node), false); - } else { + } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2); Output slice_data_node_0 = data_split->outputs()[0]; diff --git a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp index d59f4499a3592..001a62be8b5e2 100644 --- a/ggml/src/ggml-openvino/openvino/op/soft_max.cpp +++ b/ggml/src/ggml-openvino/openvino/op/soft_max.cpp @@ -33,9 +33,9 @@ OutputVector translate_soft_max(const NodeContext& context) { auto* op_params = context.get_output_op_params(0); memcpy(&scale, (float*) op_params + 0, sizeof(float)); memcpy(&max_bias, (float*) op_params + 1, sizeof(float)); - const uint32_t h = context.get_head_size(); - - const uint32_t 
n_head = context.get_input_shape(0)[0].get_length(); + auto src0_shape = context.get_input_shape(0).get_shape(); + const uint32_t h = src0_shape[2]; + const uint32_t n_head = src0_shape[0]; const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); const float m0 = powf(2.0f, -(max_bias) / n_head_log2); @@ -46,23 +46,30 @@ OutputVector translate_soft_max(const NodeContext& context) { auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); auto scaled_input = std::make_shared(input_node, scale_node); + if (context.get_input_size() < 2) { + res = std::make_shared(scaled_input, 2); + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto mask_node = context.get_input(1); - // Use Q-cur to retrieve the token length, so that the translation of SOFT_MAX + std::shared_ptr token_len = get_dimensions(input_node, {1}); + // Try using Q-cur to retrieve the token length, so that the translation of SOFT_MAX // does not depend on the result of the QK MatMul, so that QK matmul + softmax + qkv matmul // can be fused into SDPA. 
- if (input_node->get_type_info() != ov::op::v0::Convert::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); + if (input_node->get_type_info() == ov::op::v0::Convert::get_type_info_static()) { + auto qk = input_node->get_input_node_shared_ptr(0); + if (qk->get_type_info() == ov::op::v0::MatMul::get_type_info_static()) { + token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); + } } - auto qk = input_node->get_input_node_shared_ptr(0); - if (qk->get_type_info() != ov::op::v0::MatMul::get_type_info_static()) { - throw std::runtime_error("Input of SOFT_MAX should be MatMul of qk followed by a Convert"); - } - auto token_len = get_dimensions(qk->get_input_node_shared_ptr(0), {1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); + std::shared_ptr mask_node_sliced = + std::make_shared(mask_node, zero, token_len, one, one); + if (mask_node_sliced->get_element_type() != context.get_output_type(0)) { + mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type(0)); + } Output slope_mask; if (slope != 1.0f) { diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index d122497e63d6f..129c3592c903c 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -145,69 +145,18 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { int32_t* rope_params = ggml_model_decoder.get_rope_params(); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); std::shared_ptr rope_freqs_weight; - - inp_pos = std::make_shared(inp_pos, ov::element::f32); - auto pos_perm = - std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); - inp_pos = 
std::make_shared(inp_pos, pos_perm); if (tensor_map.find("rope_freqs_weight") != tensor_map.end()) { rope_freqs_weight = tensor_map.at("rope_freqs.weight").get_node_shared_ptr(); } - float freq_base; - float freq_scale; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - const int n_dims = rope_params[1]; - const int n_ctx_orig = rope_params[4]; - memcpy(&freq_base, rope_params + 5, sizeof(float)); - memcpy(&freq_scale, rope_params + 6, sizeof(float)); - memcpy(&ext_factor, rope_params + 7, sizeof(float)); - memcpy(&attn_factor, rope_params + 8, sizeof(float)); - memcpy(&beta_fast, rope_params + 9, sizeof(float)); - memcpy(&beta_slow, rope_params + 10, sizeof(float)); - - const float theta_scale = powf(freq_base, -2.0f / n_dims); - - // TODO: corr_dims is not used in the current implementation - float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - - // TODO: GGML_OP_ROPE_BACK -> false - // bool forward = true; - // const float sin_sign = forward ? 
1.0f : -1.0f; - - const int64_t half_head_size = ggml_model_decoder.get_head_size() / 2; - std::vector factor(half_head_size); - factor[0] = freq_scale; - for (int64_t i = 1; i < half_head_size; i++) { - factor[i] = theta_scale * factor[i - 1]; - } - - Output factor_node = - std::make_shared(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor); - if (rope_freqs_weight) { - factor_node = std::make_shared(factor_node, rope_freqs_weight); - } + auto sin_cos = make_sin_cos(rope_params, inp_pos, rope_freqs_weight); + auto sin_theta = sin_cos.first; + auto cos_theta = sin_cos.second; - auto half_head_size_node = ov::op::v0::Constant::create(ov::element::i64, Shape{1}, {half_head_size}); - Output cos_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - Output sin_factor = - std::make_shared(std::make_shared(factor_node, inp_pos)); - - float mscale = attn_factor; - Output mscale_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{mscale}); - - auto cos_theta = std::make_shared(cos_factor, mscale_node); - auto sin_theta = std::make_shared(sin_factor, mscale_node); - cos_theta->set_friendly_name("rope_cos"); - sin_theta->set_friendly_name("rope_sin"); - tensor_map.insert({"rope_cos", cos_theta->output(0)}); - tensor_map.insert({"rope_sin", sin_theta->output(0)}); + cos_theta.get_node_shared_ptr()->set_friendly_name("rope_cos"); + sin_theta.get_node_shared_ptr()->set_friendly_name("rope_sin"); + tensor_map.insert({"rope_cos", cos_theta}); + tensor_map.insert({"rope_sin", sin_theta}); } // Create common patterns @@ -220,10 +169,12 @@ void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) { } // namespace TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map) - : m_input_model(input_model), - m_translator_map(translator_map), - m_ov_model(nullptr) {} + const std::unordered_map& translator_map, + bool naive) : + m_input_model(input_model), + 
m_translator_map(translator_map),
+    m_ov_model(nullptr),
+    m_naive(naive) {}
 
 std::shared_ptr<Model> TranslateSession::get_converted_model() {
     if (m_ov_model) {
@@ -258,6 +209,10 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
     auto node_visitor = [&](std::shared_ptr<GgmlDecoder> node) {
         auto operation_type = node->get_op_type();
 
+        if (operation_type == "GGML_OP_NONE") {
+            return;
+        }
+
         ov::OutputVector converted_outputs;
         auto it = m_translator_map.find(operation_type);
         FRONT_END_OP_CONVERSION_CHECK(it != m_translator_map.end(),
@@ -285,7 +240,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
         }
     };
 
-    preprocess(*tensor_map, *ggml_model_decoder);
+    if (!m_naive) {
+        preprocess(*tensor_map, *ggml_model_decoder);
+    }
     ggml_model_decoder->visit_subgraph(node_visitor);
 
     for (const auto& name : ggml_model_decoder->get_model_output_names()) {
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp
index 9167b55fe52ea..9eea5fd11cb01 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.hpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp
@@ -10,7 +10,7 @@ namespace ggml {
 class TranslateSession {
 public:
     TranslateSession(const frontend::InputModel::Ptr& input_model,
-                     const std::unordered_map<std::string, CreatorFunction>& translator_map);
+                     const std::unordered_map<std::string, CreatorFunction>& translator_map, bool naive = false);
 
     std::shared_ptr<Model> get_converted_model();
     std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);
@@ -20,6 +20,7 @@ class TranslateSession {
     const frontend::InputModel::Ptr m_input_model;
     const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
     std::shared_ptr<Model> m_ov_model;
+    bool m_naive;
 };
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
index 69e26f05ca095..9634900753224 100644
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -1,9 +1,20 @@
 #include 
"utils.hpp"
 
+#include <algorithm>
 #include <cmath>
 #include <cstdint>
+#include <cstring>
+#include <memory>
+#include <numeric>
+#include <openvino/op/add.hpp>
+#include <openvino/op/clamp.hpp>
 #include <openvino/op/constant.hpp>
+#include <openvino/op/cos.hpp>
+#include <openvino/op/maximum.hpp>
 #include <openvino/op/multiply.hpp>
+#include <openvino/op/sin.hpp>
+#include <openvino/op/slice.hpp>
+#include <openvino/op/subtract.hpp>
 #include <vector>
 
 namespace ov {
@@ -58,6 +69,134 @@ OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::
     return outputs;
 }
 
+namespace {
+ov::Output<ov::Node> rope_yarn_ramp_mix(int n_dims, const float corr_dims[2], float ext_factor) {
+    int half_n_dims = n_dims / 2;
+    std::vector<float> dim_ids_vec(half_n_dims);
+    std::iota(dim_ids_vec.begin(), dim_ids_vec.end(), 0);
+    auto dim_ids = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, (size_t) half_n_dims}, dim_ids_vec);
+    auto corr_low = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[0]});
+    auto corr_high = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {corr_dims[1]});
+    auto denom =
+        std::make_shared<ov::op::v1::Maximum>(std::make_shared<ov::op::v1::Subtract>(corr_high, corr_low),
+                                              ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {0.001f}));
+    auto ramp_y =
+        std::make_shared<ov::op::v1::Divide>(std::make_shared<ov::op::v1::Subtract>(dim_ids, corr_low), denom);
+    auto ramp_clamped = std::make_shared<ov::op::v0::Clamp>(ramp_y, 0.0f, 1.0f);
+    auto ext_factor_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {ext_factor});
+    auto ramp_mix = std::make_shared<ov::op::v1::Multiply>(ramp_clamped, ext_factor_node);
+    return ramp_mix;
+}
+
+float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
+#ifndef M_PI
+#    define M_PI 3.14159265358979323846
+#endif
+    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
+}
+
+void ggml_rope_yarn_corr_dims(int n_dims,
+                              int n_ctx_orig,
+                              float freq_base,
+                              float beta_fast,
+                              float beta_slow,
+                              float dims[2]) {
+    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
+    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
+    dims[0] = std::max(0.0f, start);
+    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+}
+}  // namespace
+
+std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> make_sin_cos(int32_t* rope_params,
+                                                                   std::shared_ptr<ov::Node> inp_pos,
+                                                                   std::shared_ptr<ov::Node> rope_freqs_weight) {
+    inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
+    auto pos_perm =
+        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
+    inp_pos = std::make_shared<ov::op::v1::Transpose>(inp_pos, pos_perm);
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    const int n_dims = rope_params[1];
+    const int n_ctx_orig = rope_params[4];
+    memcpy(&freq_base, rope_params + 5, sizeof(float));
+    memcpy(&freq_scale, rope_params + 6, sizeof(float));
+    memcpy(&ext_factor, rope_params + 7, sizeof(float));
+    memcpy(&attn_factor, rope_params + 8, sizeof(float));
+    memcpy(&beta_fast, rope_params + 9, sizeof(float));
+    memcpy(&beta_slow, rope_params + 10, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f / n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    std::vector<float> factor(n_dims / 2);
+    factor[0] = freq_scale;
+    for (size_t i = 1; i < factor.size(); i++) {
+        factor[i] = theta_scale * factor[i - 1];
+    }
+
+    Output<Node> freq_factors =
+        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, factor.size()}, factor);
+    if (rope_freqs_weight) {
+        freq_factors = std::make_shared<ov::op::v1::Multiply>(freq_factors, rope_freqs_weight);
+    }
+
+    auto theta_extrap = std::make_shared<ov::op::v1::Multiply>(freq_factors, inp_pos);
+    auto theta_interp = std::make_shared<ov::op::v1::Multiply>(
+        theta_extrap, ov::op::v0::Constant::create(ov::element::f32, {1}, {freq_scale}));
+
+    Output<Node> theta;
+    float mscale = attn_factor;
+    if (ext_factor == 0.0f) {
+        theta = theta_interp;
+    } else {
+        auto ramp_mix = rope_yarn_ramp_mix(n_dims, corr_dims, ext_factor);
+        auto one = ov::op::v0::Constant::create(ov::element::f32, Shape{1, 1, 1}, {1.0f});
+        auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
+
+        theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
                                                  std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
+        mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
+    }
+
+    Output<Node> cos_theta = std::make_shared<ov::op::v0::Cos>(theta);
+    Output<Node> sin_theta = std::make_shared<ov::op::v0::Sin>(theta);
+
+    auto mscale_node = ov::op::v0::Constant::create(ov::element::f32, Shape{}, {mscale});
+
+    cos_theta = std::make_shared<ov::op::v1::Multiply>(cos_theta, mscale_node);
+    sin_theta = std::make_shared<ov::op::v1::Multiply>(sin_theta, mscale_node);
+    return std::make_pair(sin_theta, cos_theta);
+}
+
+ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len) {
+    // Only works for VIEW operations that slice at the lowest dimension
+    // If the VIEW also reshape the result, `slice_len` should be provided
+    auto input = context.get_input(input_index);
+    int32_t* op_params = context.get_input_op_params(input_index);
+    auto src1_stride = context.get_input_stride(input_index);
+
+    int64_t split_addr = op_params[0] / src1_stride[2];
+    if (slice_len == 0) {
+        slice_len = context.get_input_shape(input_index)[2].get_length();
+    }
+    int64_t slice_end = split_addr + slice_len;
+
+    auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr});
+    auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end});
+    auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+    auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
+    return sliced;
+}
+
 }  // namespace ggml
 }  // namespace frontend
 }  // namespace ov
diff --git a/ggml/src/ggml-openvino/openvino/utils.hpp b/ggml/src/ggml-openvino/openvino/utils.hpp
index b54b2b92c9dac..6c6d2ae8d4f23 100644
--- a/ggml/src/ggml-openvino/openvino/utils.hpp
+++ b/ggml/src/ggml-openvino/openvino/utils.hpp
@@ -1,6 +1,10 @@
 #pragma once
 
+#include <cstdint>
+#include <memory>
 #include <openvino/openvino.hpp>
+#include <string>
+#include <utility>
 
 #include "node_context.hpp"
 
@@ -60,6 +64,12 @@ std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node,
 OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const 
std::string& suffix);
 
+std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> make_sin_cos(int32_t* rope_params,
+                                                                   std::shared_ptr<ov::Node> inp_pos,
+                                                                   std::shared_ptr<ov::Node> rope_freqs_weight = nullptr);
+
+ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
+
 namespace op {
 template <typename T>
 OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index e5a4401fec2b9..fcfd3639a7136 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -21,6 +21,7 @@
 #include <openvino/openvino.hpp>
 
 #include "ggml-impl.h"
+#include "ggml-openvino/ggml-decoder.h"
 #include "ggml.h"
 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
@@ -35,6 +36,9 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     ov::Shape input_shape;
     if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
         input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
+    } else if (ggml_tensor->op == GGML_OP_VIEW) {
+        // This case is added to make test-backend-ops work
+        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
     } else {
         input_shape = ggml_decoder->get_input_shape(name).to_shape();
     }
@@ -81,6 +85,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         config = get_npu_config();
     }
 
+    if (cgraph->n_nodes == 1) {
+        return naive_compute(cgraph, core, device, config);
+    }
+
     auto start_time = ggml_time_us();
 
     auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
@@ -242,6 +250,42 @@ ov::AnyMap get_npu_config() {
     return config;
 }
 
+enum ggml_status naive_compute(struct ggml_cgraph* cgraph,
+                               ov::Core& core,
+                               const std::string& device,
+                               const ov::AnyMap& config) {
+    if (cgraph->nodes[0]->op == GGML_OP_NONE) {
+        return GGML_STATUS_SUCCESS;
+    }
+
+    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph);
+    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
+    auto naive = true;
+    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
+    auto infer_request = core.compile_model(model, device, config).create_infer_request();
+
+    ov::serialize(model, "IR.xml");
+
+    auto ov_params = model->get_parameters();
+    for (size_t i = 0; i < ov_params.size(); i++) {
+        auto param_name = ov_params[i]->get_friendly_name();
+        auto input_tensor = get_ov_input_tensor(decoder, param_name);
+        infer_request.set_input_tensor(i, input_tensor);
+    }
+
+    infer_request.infer();
+
+    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
+    auto ov_results = model->get_results();
+    for (size_t i = 0; i < ov_results.size(); i++) {
+        auto result_name = ov_results[i]->get_friendly_name();
+        const auto output_tensor = infer_request.get_output_tensor(i);
+
+        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+    }
+    return GGML_STATUS_SUCCESS;
+}
+
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name) {
     bool is_static = ggml_decoder->is_static();
     bool is_first_token = ggml_decoder->is_first_token();
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 1d23e285227e6..367b2829bec3b 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -1,4 +1,5 @@
 #include <openvino/openvino.hpp>
+#include <openvino/runtime/core.hpp>
 
 #include "ggml-backend-impl.h"
 #include "ggml-decoder.h"
@@ -42,3 +43,6 @@ bool is_prefill(struct ggml_cgraph * cgraph);
 ov::AnyMap get_npu_config();
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& param_name);
+
+enum ggml_status naive_compute(struct ggml_cgraph* cgraph, ov::Core& core, const std::string& device,
+                               const ov::AnyMap& config);