81 changes: 39 additions & 42 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -27,7 +27,6 @@
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/parameter.hpp>
-#include <openvino/op/unsqueeze.hpp>
 #include <openvino/runtime/tensor.hpp>
 #include <optional>
 #include <ostream>
@@ -39,7 +38,6 @@
 GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
                              ggml_cgraph * cgraph,
                              bool is_static,
-                             bool is_first_token,
                              int context_size,
                              int context_size_swa,
                              int num_heads,
@@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
     m_num_heads(num_heads),
     m_num_heads_kv(num_heads_kv),
     m_head_size(head_size),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
+    m_is_static(is_static) {
     set_input_output(node);
 }
 
 GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                              std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
-                             bool is_static,
-                             bool is_first_token) :
+                             bool is_static) :
     m_cgraph(cgraph),
     m_op_name(m_node ? std::string(m_node->name) : ""),
     m_model_weights(model_weights),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
-    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+    m_is_static(is_static) {
+    if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
         unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
         print_tensor_address_map(cgraph);
     }
 
     set_llm_params();
+    validate_cgraph();
 
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto * cur_node = cgraph->nodes[node_n];
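Note on the gate above: the one-shot debug dump used to key on `is_first_token`; the rewritten check makes the environment variable itself the one-shot flag, consuming it via `unsetenv` so `print_tensor_address_map` runs at most once per process (and `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS=0` now explicitly disables it). A minimal sketch of that consume-on-read pattern, with a hypothetical helper name that is not part of the PR:

```cpp
#include <cstdlib>  // getenv / POSIX unsetenv
#include <string>

// Returns true at most once per process, and only when `var` is set
// to something other than "0"; the variable is consumed on first use.
static bool consume_debug_flag(const char * var) {
    const char * env = std::getenv(var);
    if (env == nullptr || std::string(env) == "0") {
        return false;  // unset or explicitly disabled
    }
    unsetenv(var);     // consume, so later calls return false
    return true;
}
```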
@@ -160,8 +157,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
     // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
     static std::set<std::string> debug_output_names = {};
     // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
-    if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 ||
-        debug_output_names.count(node_name)) {
+    if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
+        node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) {
         if (node->op == GGML_OP_SET_ROWS) {
             assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0);
             if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) {
@@ -285,53 +282,54 @@ void GgmlOvDecoder::set_llm_params() {
         } else {
             m_context_size = cache_k->ne[1];
         }
-    } else if (node->op == GGML_OP_ROPE &&
-               (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) {
-        m_head_size = node->ne[0];
-        m_num_heads = node->ne[1];
-        m_rope_params = node->op_params;
-    } else if (node->op == GGML_OP_ROPE &&
-               (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) {
-        m_num_heads_kv = node->ne[1];
+    } else if (node->op == GGML_OP_ROPE) {
+        if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
+            m_head_size = node->ne[0];
+            m_num_heads = node->ne[1];
+            m_rope_params = node->op_params;
+            auto * inp_pos = node->src[1];
+            m_input_len = inp_pos->ne[0];
+            m_past_kv_len = *(int32_t *) inp_pos->data;
+        } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
+            m_num_heads_kv = node->ne[1];
+        }
     }
 }
 
+void GgmlOvDecoder::validate_cgraph() const {
+    if (m_is_static && m_input_len != 1) {
+        throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) +
+                                 ", try set -ub 1");
+    }
+}
+
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
     auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (name == "inp_tokens" || name == "inp_pos") {
-        if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, 1};
-            }
-        } else {
-            input_shape = ov::PartialShape{1, 1, -1};
-        }
-    } else if (name == "inp_out_ids" && !m_is_static) {
-        input_shape = ov::PartialShape{1, 1, -1};
+
+    if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, m_context_size, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            }
+            input_shape = ov::PartialShape{1, 1, m_context_size};
         } else {
             input_shape = ov::PartialShape{1, -1, -1};
         }
+
     } else if (name.find("cache_") == 0) {
+        auto past_token_len = -1;
         if (m_is_static) {
             int layer = extract_layer_from_name(name);
             bool is_swa = is_swa_layer(layer);
-            input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
-        } else {
-            input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
+            past_token_len = is_swa ? m_context_size_swa : m_context_size;
         }
+        input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
+
     } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
         input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
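The rewritten `get_graph_input_shape` collapses the old first-token/decode split into one rule per input: a static (NPU) graph pins every token-dependent dimension to a constant, a dynamic graph marks it `-1`, and a KV-cache input gets a fixed past length only in the static case. A hedged sketch of those two rules, illustrative only and not code from the PR:

```cpp
#include <cstdint>
#include <openvino/core/partial_shape.hpp>

// inp_tokens / inp_pos / inp_out_ids: one token slot when static, dynamic otherwise.
ov::PartialShape token_input_shape(bool is_static) {
    return ov::PartialShape{1, 1, is_static ? 1 : -1};
}

// cache_k / cache_v: the past-token dimension is fixed to the (SWA) context
// size when static and left dynamic (-1) otherwise, as in the branch above.
ov::PartialShape kv_cache_shape(bool is_static, int64_t ctx, int64_t n_head_kv, int64_t head_size) {
    int64_t past_token_len = is_static ? ctx : -1;
    return ov::PartialShape{past_token_len, n_head_kv, head_size};
}
```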
Expand Down Expand Up @@ -745,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {

void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
for (const auto & node : m_nodes) {
auto decoder =
std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
node_visitor(decoder);
}
}
Expand Down
18 changes: 10 additions & 8 deletions ggml/src/ggml-openvino/ggml-decoder.h
@@ -16,14 +16,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     // Graph decoder
     GgmlOvDecoder(ggml_cgraph * cgraph,
                   std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
-                  bool is_static,
-                  bool is_first_token);
+                  bool is_static);
 
     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(ggml_tensor * node,
                   ggml_cgraph * cgraph,
                   bool is_static,
-                  bool is_first_token,
                   int context_size,
                   int context_size_swa,
                   int num_heads,
@@ -81,9 +79,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;
 
-    const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
+    ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
 
-    const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
+    ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
 
     virtual int get_op_case() const override { return m_op_case; }
 
@@ -119,14 +117,16 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual int get_head_size() const override { return m_head_size; }
 
+    int get_past_kv_len() const { return m_past_kv_len; }
+
+    int get_input_len() const { return m_input_len; }
+
     virtual int32_t * get_rope_params() const override { return m_rope_params; }
 
     virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
 
     virtual bool is_static() const override { return m_is_static; }
 
-    virtual bool is_first_token() const override { return m_is_first_token; }
-
     ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
 
     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -153,6 +153,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     // set context_size, num_heads, etc
     void set_llm_params();
+    void validate_cgraph() const;
 
     ggml_cgraph * m_cgraph = nullptr;
     ggml_tensor * m_node = nullptr;
@@ -176,10 +177,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int m_past_kv_len;
+    int m_input_len;
     int32_t * m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static = false;
-    bool m_is_first_token;
 };
 
 void print_tensor_address_map(const ggml_cgraph * cgraph);
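The new `m_input_len` / `m_past_kv_len` members cached here are filled in `set_llm_params` from the `inp_pos` tensor: its element count is the current input length, and its first value equals the number of tokens already in the KV cache. A small self-contained illustration of that invariant, using made-up values:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    // Hypothetical decode step: 7 tokens already cached, 3-token ubatch,
    // so the positions fed to RoPE are {7, 8, 9}.
    std::vector<int32_t> inp_pos = {7, 8, 9};
    int input_len   = (int) inp_pos.size();  // mirrors inp_pos->ne[0]
    int past_kv_len = inp_pos[0];            // mirrors *(int32_t *) inp_pos->data
    assert(input_len == 3 && past_kv_len == 7);
    return 0;
}
```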
1 change: 0 additions & 1 deletion ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -65,7 +65,6 @@ class GgmlDecoder : public DecoderBase {
     virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;
-    virtual bool is_first_token() const = 0;
     virtual int get_context_size() const = 0;
     virtual int get_context_size_swa() const = 0;
     virtual int is_swa_layer(int layer) const = 0;
7 changes: 1 addition & 6 deletions ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -97,12 +97,7 @@ class NodeContext : public frontend::NodeContext {
     int get_op_case() const {
         return m_decoder->get_op_case();
     }
-    bool is_static() const {
-        return m_decoder->is_static();
-    }
-    bool is_first_token() const {
-        return m_decoder->is_first_token();
-    }
+    bool is_static() const { return m_decoder->is_static(); }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
52 changes: 15 additions & 37 deletions ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -2,9 +2,11 @@
 #include "../op_table.hpp"
 #include "../utils.hpp"
 
+#include <cstdint>
 #include <memory>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
+#include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scaled_dot_product_attention.hpp>
@@ -51,62 +53,38 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
 
         auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
         mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
-        mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
     }
 
     if (mask_sliced.get_element_type() != ov::element::f16) {
         mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
     }
 
-    auto tile_kv = [](int64_t q_batch, int64_t kv_batch, ov::Output<Node> kv, bool is_static) {
-        int64_t factor = q_batch / kv_batch;
+    auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
+        int64_t factor = num_heads / num_heads_kv;
         if (factor > 1) {
-            auto q_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{q_batch});
-            auto kv_batch_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{kv_batch});
             auto factor_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{factor});
 
             ov::Output<ov::Node> kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
-            if (is_static) {
-                auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-                kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
-
-                auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {1, 2});
-                kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
-                    ov::OutputVector{kv_batch_node, factor_node, kv_last_two_dims}, 0);
-                new_kv_shape =
-                    std::make_shared<ov::op::v0::Concat>(ov::OutputVector{q_batch_node, kv_last_two_dims}, 0);
-            } else {
-                auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-                auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-                kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
+            auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+            kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
 
-                auto kv_last_two_dims = get_dimensions(kv.get_node_shared_ptr(), {2, 3});
-                kv_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
-                    ov::OutputVector{one_1d, kv_batch_node, factor_node, kv_last_two_dims}, 0);
-                new_kv_shape =
-                    std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, q_batch_node, kv_last_two_dims}, 0);
-            }
+            kv_broadcast_shape =
+                ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size});
+            new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size});
 
-            kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape);
+            kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape,
+                                                         ov::op::BroadcastType::BIDIRECTIONAL);
             kv = std::make_shared<ov::op::v1::Reshape>(kv, new_kv_shape, false);
         }
        return kv;
     };
 
     auto q_shape = context.get_input_shape(0).to_shape();
     auto k_shape = context.get_input_shape(1).to_shape();
-    k = tile_kv(q_shape[0], k_shape[0], k, context.is_static());
-    v = tile_kv(q_shape[0], k_shape[0], v, context.is_static());
+    k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k);
+    v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v);
 
     auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
-    auto sdpa_f32 = std::make_shared<ov::op::v0::Convert>(sdpa, ov::element::f32);
-    if (context.is_static()) {
-        res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
-                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-    } else {
-        res = std::make_shared<ov::op::v1::Transpose>(
-            sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-    }
+    res = std::make_shared<ov::op::v1::Transpose>(sdpa, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+    res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
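The reworked `tile_kv` performs grouped-query-attention head repetition with constant target shapes: each of the `num_heads_kv` K/V heads is unsqueezed, broadcast bidirectionally by `factor = num_heads / num_heads_kv`, then reshaped to `{num_heads, -1, head_size}` so K/V match the query head count. A plain-loop sketch of the same repetition; the data layout is illustrative and not taken from the PR:

```cpp
#include <cstddef>
#include <vector>

// kv[h] holds one K or V head (tokens * head_size floats). Repeating each
// head `factor` times makes head h serve queries h*factor .. h*factor+factor-1.
std::vector<std::vector<float>> repeat_kv_heads(const std::vector<std::vector<float>> & kv,
                                                size_t num_heads) {
    const size_t num_heads_kv = kv.size();
    const size_t factor = num_heads / num_heads_kv;  // assumes num_heads % num_heads_kv == 0
    std::vector<std::vector<float>> out;
    out.reserve(num_heads);
    for (size_t h = 0; h < num_heads_kv; ++h) {
        for (size_t f = 0; f < factor; ++f) {
            out.push_back(kv[h]);  // copy of head h; the graph version broadcasts instead
        }
    }
    return out;
}
```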
36 changes: 2 additions & 34 deletions ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -26,40 +26,8 @@ OutputVector translate_permute(const NodeContext & context) {
     ov::Output<Node> res;
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
 
-    if (op_case == 1) {
-        if (context.is_static()) {
-            res = std::make_shared<ov::op::v1::Transpose>(
-                context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-        } else {
-            auto src = context.get_input(0);
-            if (src.get_partial_shape().rank() == 3) {
-                src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
-            }
-            res = std::make_shared<ov::op::v1::Transpose>(
-                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-        }
-    } else {
-        auto src = context.get_input(0);
-        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-
-        if (context.is_static()) {
-            auto src_shape_ = context.get_input_shape(0).to_shape();
-            std::vector<int64_t> src_shape(src_shape_.begin(), src_shape_.end());
-            auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
-                src,
-                ov::op::v0::Constant::create(ov::element::i64, {3},
-                                             std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
-                false);
-            res = std::make_shared<ov::op::v1::Transpose>(
-                src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-        } else {
-            if (src.get_partial_shape().rank() == 3) {
-                src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
-            }
-            res = std::make_shared<ov::op::v1::Transpose>(
-                src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
-        }
-    }
+    auto src = context.get_input(0);
+    res = std::make_shared<ov::op::v1::Transpose>(src, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     return rename_outputs_with_suffix({res}, context.get_name());
 }
 
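After this simplification `translate_permute` always emits a single rank-3 `Transpose` with order `{1, 0, 2}`, i.e. it swaps the first two axes (for example `[heads, tokens, head_size]` becomes `[tokens, heads, head_size]`). An index-level sketch of that permutation, illustrative only:

```cpp
#include <cstddef>
#include <vector>

// out[j][i][k] = in[i][j][k]: a [d0, d1, d2] tensor becomes [d1, d0, d2],
// matching the Transpose order {1, 0, 2} used above.
std::vector<float> transpose_102(const std::vector<float> & in, size_t d0, size_t d1, size_t d2) {
    std::vector<float> out(in.size());
    for (size_t i = 0; i < d0; ++i) {
        for (size_t j = 0; j < d1; ++j) {
            for (size_t k = 0; k < d2; ++k) {
                out[(j * d0 + i) * d2 + k] = in[(i * d1 + j) * d2 + k];
            }
        }
    }
    return out;
}
```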
4 changes: 0 additions & 4 deletions ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -84,10 +84,6 @@ OutputVector translate_rope(const NodeContext & context) {
                                                          ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
         auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
         res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
-        if (!(context.is_static())) {
-            res =
-                std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
-        }
     } else if (mode == ROPE_TYPE_NEOX) {
         auto data_split = std::make_shared<ov::op::v1::Split>(
             data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);