Commit b650ca3

Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size (iSWA models not working)
Parent: f61a50f

6 files changed, +38 −57 lines


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 4 additions & 3 deletions
@@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
         set_input_output(cur_node);
     }

-    add_extra_inputs();
+    // add_extra_inputs();
 }

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
@@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co

 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    // 1. `attention_size`, used in FLASH_ATTN, where the shapes of the matmuls are 256-aligned;
     //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    // Not used for NPU
+    // Not used for NPU.
+    // Update: no longer used after the optimization that makes the kv cache dynamic (but that breaks iSWA models).
     int64_t attention_size = -1;
     int64_t attention_size_swa = -1;
     for (const auto& node : m_nodes) {
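The 256-alignment the updated comment refers to comes from llama.cpp's unified kv cache, which pads the usable kv span so kernel shapes change rarely (llama_kv_cache_unified::get_padding yields 256 with flash attention, hence the 32 → 256 correction above). A minimal sketch of that rounding rule, using a hypothetical standalone helper rather than the real implementation:

    #include <cstdint>

    // Round n up to the next multiple of `padding`, mirroring how
    // llama_kv_cache_unified::get_n_kv pads the attention span.
    // Hypothetical helper for illustration only.
    static int64_t padded_attention_size(int64_t n, int64_t padding = 256) {
        return ((n + padding - 1) / padding) * padding;
    }

    // padded_attention_size(1)   == 256
    // padded_attention_size(300) == 512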

ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scaled_dot_product_attention.hpp>
 #include <openvino/op/transpose.hpp>

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 5 additions & 15 deletions
@@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) {

     auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});

+    auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+    auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
+
     Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
     Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
-
-    ov::Output<Node> broadcast_shape;
-    ov::Output<Node> Z_unsqueezed;
-    if (context.is_static()) {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
-    } else {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0);
-    }
+    auto broadcast_shape =
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
     auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);

     auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
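The revert collapses the previous static/dynamic split back into a single broadcast path: the operand with the smaller batch is unsqueezed at axis 1 and broadcast by `factor_node` so both matmul inputs share the larger batch. A shape-only sketch of that Unsqueeze → Broadcast → Reshape sequence (plain C++, hypothetical helper):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // z_shape is {batch_small, rows, cols}; factor == batch_large / batch_small.
    // Unsqueeze axis 1:  {b, r, c}         -> {b, 1, r, c}
    // Broadcast axis 1:  {b, 1, r, c}      -> {b, factor, r, c}
    // Reshape (merge):   {b, factor, r, c} -> {b * factor, r, c}, i.e. new_Z_shape.
    std::vector<int64_t> broadcast_small_batch(const std::vector<int64_t>& z_shape, int64_t factor) {
        assert(z_shape.size() == 3);
        return {z_shape[0] * factor, z_shape[1], z_shape[2]};
    }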

ggml/src/ggml-openvino/openvino/op/permute.cpp

Lines changed: 2 additions & 12 deletions
@@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) {
         }
     } else {
         auto src = context.get_input(0);
-        Output<Node> attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
-            attention_size = context.get_input("attention_size");
-        } else {
-            attention_size = context.get_input("attention_size_swa");
-        }
-
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});

         if (context.is_static()) {
@@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) {
                 src,
                 ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
                 false);
-            auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, zero);
-            res = std::make_shared<ov::op::v1::Transpose>(src_slice,
-                                                          ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+            res = std::make_shared<ov::op::v1::Transpose>(
+                src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
         } else {
             if (src.get_partial_shape().rank() == 3) {
                 src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);

ggml/src/ggml-openvino/openvino/op/softmax.cpp

Lines changed: 3 additions & 16 deletions
@@ -7,10 +7,8 @@
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/matmul.hpp>
 #include <openvino/op/multiply.hpp>
-#include <openvino/op/unsqueeze.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/softmax.hpp>
 #include <vector>
@@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
     } else {
         auto token_len = get_dimensions(input_node, {1});
         auto mask_node = context.get_input(1);
-        auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-        auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
-        auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-        auto inp_pos = context.get_input("inp_pos");
-        auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
-        mask_node_sliced =
-            std::make_shared<ov::op::v8::Slice>(mask_node, zero_2d, stop, one_2d, axes);
-        if (!(context.is_static())) {
-            mask_node_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_node_sliced, zero_1d);
-        }
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    }

     if (mask_node_sliced.get_element_type() != context.get_output_type(0)) {
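After this revert, translate_soft_max trims only one mask axis to token_len and presumably relies on the mask that add_sliced_mask in translate_session.cpp prepares (see below) to bound the kv extent. A shape-only sketch of the single remaining Slice (hypothetical helper; the axis meaning is an assumption):

    #include <cstdint>
    #include <vector>

    // Model of Slice(mask, start=0, stop=token_len, step=1, axes={1}):
    // keep the first token_len entries along axis 1, assumed here to be
    // the per-token row axis of the attention mask.
    std::vector<int64_t> slice_mask_rows(std::vector<int64_t> mask_shape, int64_t token_len) {
        mask_shape.at(1) = token_len;
        return mask_shape;
    }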

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 24 additions & 10 deletions
@@ -11,14 +11,15 @@
 #include <openvino/op/convert.hpp>
 #include <openvino/op/cos.hpp>
 #include <openvino/op/divide.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/result.hpp>
 #include <openvino/op/sin.hpp>
+#include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/strided_slice.hpp>
 #include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
@@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         if (is_static) {
             mask_sliced = mask;
         } else {
-            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
+            auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
             auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-            auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-            auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-            auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
+            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
+
+            std::shared_ptr<ov::Node> kv_len;
+            {
+                auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1});
+                auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1});
+                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+                kv_len = std::make_shared<ov::op::v1::StridedSlice>(
+                    inp_pos, start, start, stride, std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{1, 1, 1});
+            }
+            kv_len = std::make_shared<ov::op::v0::Squeeze>(
+                kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+            kv_len = std::make_shared<ov::op::v0::Convert>(kv_len, ov::element::i64);
+            kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
+            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
+
             mask_sliced =
                 std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
             mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
@@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
    };

    create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
-   create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
+   // SWA is not working because the derived `kv_len` is not correct for it.
+   // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }

 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    // add_sliced_mask(tensor_map, ggml_model_decoder);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
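The new dynamic path derives kv_len from the last element of inp_pos: a StridedSlice with begin {0, 0, -1} (end ignored via end_mask) takes the final position, which is then squeezed to a one-element tensor and incremented. That equals the number of live kv cells only while cache positions grow contiguously from zero; with a sliding-window (iSWA) cache the window start moves, so last-position-plus-one is wrong there, which is presumably why the KQ_mask_swa slice stays commented out. A scalar sketch of the derivation (plain C++, hypothetical names):

    #include <cstdint>
    #include <vector>

    // Model of the StridedSlice -> Squeeze -> Convert -> Add(1) chain:
    // treat (last token position + 1) as the kv length used to slice the mask.
    int64_t derive_kv_len(const std::vector<int64_t>& inp_pos) {
        return inp_pos.back() + 1;  // valid only for a cache filled from position 0
    }

    // Decoding one token at position 41 -> kv_len == 42, so the mask is sliced
    // to {token_len, 42} instead of the 256-padded attention_size.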
