Skip to content

Commit a079242

Browse files
committed
Fix NPU accuracy
1 parent 500aead commit a079242

File tree

2 files changed

+16
-14
lines changed

2 files changed

+16
-14
lines changed

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -77,23 +77,28 @@ void add_token_len(TensorMap& tensor_map) {
7777
tensor_map.insert({"token_len", token_len->output(0)});
7878
}
7979

80-
void add_sliced_mask(TensorMap& tensor_map) {
80+
void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
8181
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
8282

83-
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) {
83+
auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name, bool is_static) {
8484
if (tensor_map.find(mask_name) != tensor_map.end()) {
85-
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
86-
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
8785
auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
88-
std::shared_ptr<ov::Node> mask_sliced =
89-
std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
90-
mask_sliced->set_friendly_name(sliced_name);
86+
std::shared_ptr<ov::Node> mask_sliced;
87+
if (is_static) {
88+
mask_sliced = mask;
89+
} else {
90+
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
91+
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
92+
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
93+
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
94+
mask_sliced->set_friendly_name(sliced_name);
95+
}
9196
tensor_map.insert({sliced_name, mask_sliced->output(0)});
9297
}
9398
};
9499

95-
create_sliced_mask("KQ_mask", "KQ_mask_sliced");
96-
create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced");
100+
create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
101+
create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
97102
}
98103

99104
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -117,7 +122,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
117122
// Create common patterns
118123
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
119124
add_token_len(tensor_map);
120-
add_sliced_mask(tensor_map);
125+
add_sliced_mask(tensor_map, ggml_model_decoder);
121126
add_rope_sin_cos(tensor_map, ggml_model_decoder);
122127
}
123128

ggml/src/ggml-openvino/utils.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ ov::AnyMap get_npu_base_config() {
253253
{"NPUW_FOLD", "YES" },
254254
{"NPUW_WEIGHTS_BANK", "shared" },
255255
{"NPUW_FUNCALL_FOR_ALL", "YES" },
256+
{"NPUW_FUNCALL_ASYNC", "YES" },
256257
{"NPUW_DQ", "YES" },
257258
{"NPUW_DQ_FULL", "NO" },
258259
{"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
@@ -262,15 +263,11 @@ ov::AnyMap get_npu_base_config() {
262263

263264
ov::AnyMap get_npu_prefill_config() {
264265
auto config = get_npu_base_config();
265-
config.emplace("NPUW_FUNCALL_ASYNC", "NO");
266-
config.emplace("NPUW_ACC_CHECK", "YES");
267-
config.emplace("NPUW_ACC_DEVICE", "CPU");
268266
return config;
269267
}
270268

271269
ov::AnyMap get_npu_generate_config() {
272270
auto config = get_npu_base_config();
273-
config.emplace("NPUW_FUNCALL_ASYNC", "YES");
274271
return config;
275272
}
276273

0 commit comments

Comments
 (0)