Skip to content

Commit b806886

Browse files
committed
NPU: unify prefill and decode (PD); unify dynamic and static dims
1 parent e9abf1c commit b806886

File tree

11 files changed: 145 additions and 298 deletions.

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 26 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@
2727
#include <openvino/op/constant.hpp>
2828
#include <openvino/op/convert.hpp>
2929
#include <openvino/op/parameter.hpp>
30-
#include <openvino/op/unsqueeze.hpp>
3130
#include <openvino/runtime/tensor.hpp>
3231
#include <optional>
3332
#include <ostream>
@@ -39,7 +38,6 @@
3938
GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
4039
ggml_cgraph * cgraph,
4140
bool is_static,
42-
bool is_first_token,
4341
int context_size,
4442
int context_size_swa,
4543
int num_heads,
@@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
5553
m_num_heads(num_heads),
5654
m_num_heads_kv(num_heads_kv),
5755
m_head_size(head_size),
58-
m_is_static(is_static),
59-
m_is_first_token(is_first_token) {
56+
m_is_static(is_static) {
6057
set_input_output(node);
6158
}
6259

6360
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
6461
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
65-
bool is_static,
66-
bool is_first_token) :
62+
bool is_static) :
6763
m_cgraph(cgraph),
6864
m_op_name(m_node ? std::string(m_node->name) : ""),
6965
m_model_weights(model_weights),
70-
m_is_static(is_static),
71-
m_is_first_token(is_first_token) {
72-
if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
66+
m_is_static(is_static) {
67+
if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
68+
unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
7369
print_tensor_address_map(cgraph);
7470
}
7571

7672
set_llm_params();
73+
validate_cgraph();
7774

7875
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
7976
auto * cur_node = cgraph->nodes[node_n];
@@ -300,41 +297,39 @@ void GgmlOvDecoder::set_llm_params() {
300297
}
301298
}
302299

300+
// Checks the decoder's configuration before the graph nodes are processed.
// Throws std::runtime_error when m_is_static is set and m_input_len is not 1; the message
// labels the static graph as the NPU case and suggests rerunning with "-ub 1".
// Does nothing otherwise and does not modify the decoder (const member).
void GgmlOvDecoder::validate_cgraph() const {
301+
if (m_is_static && m_input_len != 1) {
302+
throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) +
303+
", try set -ub 1");
304+
}
305+
}
306+
303307
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
304308
auto name = std::string(src->name);
305309
ov::PartialShape input_shape;
306-
if (name == "inp_tokens" || name == "inp_pos") {
307-
if (m_is_static) {
308-
if (m_is_first_token) {
309-
input_shape = ov::PartialShape{1, 1, m_context_size};
310-
} else {
311-
input_shape = ov::PartialShape{1, 1, 1};
312-
}
313-
} else {
314-
input_shape = ov::PartialShape{1, 1, -1};
315-
}
316-
} else if (name == "inp_out_ids" && !m_is_static) {
317-
input_shape = ov::PartialShape{1, 1, -1};
310+
311+
if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
312+
input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
313+
318314
} else if (name.find("KQ_mask") == 0) {
319315
if (m_is_static) {
320-
if (m_is_first_token) {
321-
input_shape = ov::PartialShape{1, m_context_size, m_context_size};
322-
} else {
323-
input_shape = ov::PartialShape{1, 1, m_context_size};
324-
}
316+
input_shape = ov::PartialShape{1, 1, m_context_size};
325317
} else {
326318
input_shape = ov::PartialShape{1, -1, -1};
327319
}
320+
328321
} else if (name.find("cache_") == 0) {
322+
auto past_token_len = -1;
329323
if (m_is_static) {
330324
int layer = extract_layer_from_name(name);
331325
bool is_swa = is_swa_layer(layer);
332-
input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
333-
} else {
334-
input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
326+
past_token_len = is_swa ? m_context_size_swa : m_context_size;
335327
}
328+
input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
329+
336330
} else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
337331
input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
332+
338333
} else if (src->op == GGML_OP_VIEW) {
339334
// This case is added to make test-backend-ops work
340335
input_shape = ov::PartialShape{get_shape(src->view_src)};
@@ -748,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
748743

749744
// Calls node_visitor once for every entry of m_nodes, in iteration order.
// Each call receives a newly created node-level GgmlOvDecoder (held in a std::shared_ptr)
// built from the node plus this decoder's members: m_cgraph, m_is_static, the context sizes,
// head counts, head size and m_swa_layers.
// NOTE(review): this is a diff view; the "-" lines show the earlier call that also passed
// m_is_first_token, the "+" lines show the call without it.
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
750745
for (const auto & node : m_nodes) {
751-
auto decoder =
752-
std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
753-
m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
746+
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
747+
m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
754748
node_visitor(decoder);
755749
}
756750
}

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
1616
// Graph decoder
1717
GgmlOvDecoder(ggml_cgraph * cgraph,
1818
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
19-
bool is_static,
20-
bool is_first_token);
19+
bool is_static);
2120

2221
// Node decoder, called in GgmlOvDecoder::visit_subgraph
2322
GgmlOvDecoder(ggml_tensor * node,
2423
ggml_cgraph * cgraph,
2524
bool is_static,
26-
bool is_first_token,
2725
int context_size,
2826
int context_size_swa,
2927
int num_heads,
@@ -129,8 +127,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
129127

130128
// Returns the m_is_static flag stored on this decoder.
virtual bool is_static() const override { return m_is_static; }
131129

132-
virtual bool is_first_token() const override { return m_is_first_token; }
133-
134130
ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
135131

136132
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -157,6 +153,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
157153

158154
// set context_size, num_heads, etc
159155
void set_llm_params();
156+
void validate_cgraph() const;
160157

161158
ggml_cgraph * m_cgraph = nullptr;
162159
ggml_tensor * m_node = nullptr;
@@ -185,7 +182,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
185182
int32_t * m_rope_params;
186183
std::vector<std::string> m_kv_names;
187184
bool m_is_static = false;
188-
bool m_is_first_token;
189185
};
190186

191187
void print_tensor_address_map(const ggml_cgraph * cgraph);

ggml/src/ggml-openvino/openvino/decoder.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ class GgmlDecoder : public DecoderBase {
6565
virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
6666

6767
virtual bool is_static() const = 0;
68-
virtual bool is_first_token() const = 0;
6968
virtual int get_context_size() const = 0;
7069
virtual int get_context_size_swa() const = 0;
7170
virtual int is_swa_layer(int layer) const = 0;

ggml/src/ggml-openvino/openvino/node_context.hpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,7 @@ class NodeContext : public frontend::NodeContext {
9797
// Returns the op case reported by the wrapped decoder (m_decoder->get_op_case()).
int get_op_case() const {
9898
return m_decoder->get_op_case();
9999
}
100-
bool is_static() const {
101-
return m_decoder->is_static();
102-
}
103-
bool is_first_token() const {
104-
return m_decoder->is_first_token();
105-
}
100+
// Forwards to the wrapped decoder: returns m_decoder->is_static().
bool is_static() const { return m_decoder->is_static(); }
106101

107102
private:
108103
std::shared_ptr<GgmlDecoder> m_decoder;

ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp

Lines changed: 10 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -53,35 +53,22 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
5353

5454
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
5555
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
56-
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
5756
}
5857

5958
if (mask_sliced.get_element_type() != ov::element::f16) {
6059
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
6160
}
6261

63-
auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv,
64-
bool is_static) {
62+
auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
6563
int64_t factor = num_heads / num_heads_kv;
6664
if (factor > 1) {
6765
ov::Output<ov::Node> kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
68-
if (is_static) {
69-
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
70-
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
66+
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
67+
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
7168

72-
kv_broadcast_shape =
73-
ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size});
74-
new_kv_shape =
75-
ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size});
76-
} else {
77-
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
78-
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
79-
80-
kv_broadcast_shape = ov::op::v0::Constant::create(
81-
ov::element::i64, {5}, {(int64_t) 1, num_heads_kv, factor, (int64_t) 1, head_size});
82-
new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
83-
{(int64_t) 1, num_heads, (int64_t) -1, head_size});
84-
}
69+
kv_broadcast_shape =
70+
ov::op::v0::Constant::create(ov::element::i64, {4}, {num_heads_kv, factor, (int64_t) 1, head_size});
71+
new_kv_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, {num_heads, (int64_t) -1, head_size});
8572

8673
kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape,
8774
ov::op::BroadcastType::BIDIRECTIONAL);
@@ -92,18 +79,12 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
9279

9380
auto q_shape = context.get_input_shape(0).to_shape();
9481
auto k_shape = context.get_input_shape(1).to_shape();
95-
k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k, context.is_static());
96-
v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v, context.is_static());
82+
k = tile_kv(q_shape[0], k_shape[0], q_shape[2], k);
83+
v = tile_kv(q_shape[0], k_shape[0], q_shape[2], v);
9784

9885
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
99-
auto sdpa_f32 = std::make_shared<ov::op::v0::Convert>(sdpa, ov::element::f32);
100-
if (context.is_static()) {
101-
res = std::make_shared<ov::op::v1::Transpose>(sdpa_f32,
102-
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
103-
} else {
104-
res = std::make_shared<ov::op::v1::Transpose>(
105-
sdpa_f32, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
106-
}
86+
res = std::make_shared<ov::op::v1::Transpose>(sdpa, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
87+
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
10788
return rename_outputs_with_suffix({res}, context.get_name());
10889
}
10990

ggml/src/ggml-openvino/openvino/op/permute.cpp

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,40 +26,8 @@ OutputVector translate_permute(const NodeContext & context) {
2626
ov::Output<Node> res;
2727
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
2828

29-
if (op_case == 1) {
30-
if (context.is_static()) {
31-
res = std::make_shared<ov::op::v1::Transpose>(
32-
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
33-
} else {
34-
auto src = context.get_input(0);
35-
if (src.get_partial_shape().rank() == 3) {
36-
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
37-
}
38-
res = std::make_shared<ov::op::v1::Transpose>(
39-
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
40-
}
41-
} else {
42-
auto src = context.get_input(0);
43-
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
44-
45-
if (context.is_static()) {
46-
auto src_shape_ = context.get_input_shape(0).to_shape();
47-
std::vector<int64_t> src_shape(src_shape_.begin(), src_shape_.end());
48-
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
49-
src,
50-
ov::op::v0::Constant::create(ov::element::i64, {3},
51-
std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
52-
false);
53-
res = std::make_shared<ov::op::v1::Transpose>(
54-
src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
55-
} else {
56-
if (src.get_partial_shape().rank() == 3) {
57-
src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);
58-
}
59-
res = std::make_shared<ov::op::v1::Transpose>(
60-
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
61-
}
62-
}
29+
auto src = context.get_input(0);
30+
res = std::make_shared<ov::op::v1::Transpose>(src, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
6331
return rename_outputs_with_suffix({res}, context.get_name());
6432
}
6533

ggml/src/ggml-openvino/openvino/op/rope.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,6 @@ OutputVector translate_rope(const NodeContext & context) {
8484
ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
8585
auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, 3);
8686
res = std::make_shared<ov::op::v1::Reshape>(stack, std::make_shared<ov::op::v0::ShapeOf>(data_node), false);
87-
if (!(context.is_static())) {
88-
res =
89-
std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
90-
}
9187
} else if (mode == ROPE_TYPE_NEOX) {
9288
auto data_split = std::make_shared<ov::op::v1::Split>(
9389
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}), 2);

ggml/src/ggml-openvino/openvino/op/set_rows.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,6 @@ OutputVector translate_set_rows(const NodeContext & context) {
3333
auto dst_shape = context.get_output_shape(0).to_shape();
3434
FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
3535

36-
if (context.is_static() && context.is_first_token()) {
37-
return rename_outputs_with_suffix({data}, context.get_name());
38-
}
39-
4036
auto indices = context.get_input(1);
4137
auto dst = context.get_input(context.get_output_name());
4238

@@ -54,13 +50,11 @@ OutputVector translate_set_rows(const NodeContext & context) {
5450
auto updated = std::make_shared<ov::op::v3::ScatterUpdate>(dst_reshaped, indices_reshaped, data_reshaped, zero);
5551
res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v0::ShapeOf>(dst), false);
5652
} else {
57-
assert(dst.get_partial_shape().rank() == 4 && dst.get_partial_shape()[2].is_static() &&
58-
dst.get_partial_shape()[3].is_static());
53+
int64_t dim1 = dst.get_partial_shape()[1].get_length();
5954
int64_t dim2 = dst.get_partial_shape()[2].get_length();
60-
int64_t dim3 = dst.get_partial_shape()[3].get_length();
6155
data = std::make_shared<ov::op::v1::Reshape>(
62-
data, ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 1, (int64_t) -1, dim2, dim3}), false);
63-
res = std::make_shared<ov::op::v0::Concat>(OutputVector{dst, data}, 1);
56+
data, ov::op::v0::Constant::create(ov::element::i64, {3}, {(int64_t) -1, dim1, dim2}), false);
57+
res = std::make_shared<ov::op::v0::Concat>(OutputVector{dst, data}, 0);
6458
}
6559
return rename_outputs_with_suffix({res}, context.get_name());
6660
}

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
#include <openvino/op/squeeze.hpp>
2828
#include <openvino/op/strided_slice.hpp>
2929
#include <openvino/op/transpose.hpp>
30-
#include <openvino/op/unsqueeze.hpp>
3130
#include <openvino/pass/constant_folding.hpp>
3231
#include <openvino/pass/make_stateful.hpp>
3332

@@ -112,7 +111,6 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
112111
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
113112

114113
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
115-
mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
116114
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
117115
mask_sliced->set_friendly_name(sliced_name);
118116
}

0 commit comments

Comments
 (0)