Skip to content

Commit 129dc0b

Browse files
committed
NPU support version 2: prefill + kvcache
1 parent 8934f73 commit 129dc0b

File tree

5 files changed: 52 additions and 28 deletions

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,11 +222,11 @@ void GgmlOvDecoder::add_extra_inputs() {
222222
past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
223223

224224
std::string name = "past_token_len";
225-
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{});
225+
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
226226
param_node->set_friendly_name(name);
227227
m_model_extra_inputs[name] = param_node;
228228

229-
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{});
229+
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
230230
*tensor->data<int64_t>() = past_token_len;
231231
m_model_extra_input_values[name] = tensor;
232232
break;

ggml/src/ggml-openvino/openvino/op/cpy.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ OutputVector translate_cpy(const NodeContext& context) {
3434

3535
auto src0 = context.get_input(0);
3636
auto src1 = context.get_input(1);
37-
auto past_token_len_scalar = context.get_input("past_token_len");
37+
auto past_token_len = context.get_input("past_token_len");
3838

3939
src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
4040
ov::Output<Node> res;
@@ -68,18 +68,16 @@ OutputVector translate_cpy(const NodeContext& context) {
6868

6969
std::shared_ptr<ov::Node> indices;
7070
if (context.is_static()) {
71-
indices = past_token_len_scalar.get_node_shared_ptr();
72-
indices = std::make_shared<ov::op::v0::Unsqueeze>(
73-
indices,
74-
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{0, 1}));
71+
indices = past_token_len.get_node_shared_ptr();
7572
} else {
73+
auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
7674
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
7775
indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
7876
total_token_len_scalar,
7977
one_scalar,
8078
ov::element::i64);
81-
indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
8279
}
80+
indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
8381

8482
res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
8583
} else {
@@ -108,11 +106,9 @@ OutputVector translate_cpy(const NodeContext& context) {
108106
// 1D tensor of shape [token_len], values starting from past_token_len
109107
std::shared_ptr<ov::Node> range_col;
110108
if (context.is_static()) {
111-
range_col = past_token_len_scalar.get_node_shared_ptr();
112-
range_col = std::make_shared<ov::op::v0::Unsqueeze>(
113-
range_col,
114-
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{0}));
109+
range_col = past_token_len.get_node_shared_ptr();
115110
} else {
111+
auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
116112
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
117113
range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
118114
total_token_len_scalar,

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <climits>
12
#include <cstdint>
23
#include <memory>
34
#include <openvino/core/node.hpp>
@@ -68,7 +69,7 @@ OutputVector translate_mulmat(const NodeContext& context) {
6869
std::vector<int64_t> src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
6970

7071
if (context.is_static()) {
71-
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
72+
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
7273
}
7374
src0_original_shape[token_dim] = -1;
7475

ggml/src/ggml-openvino/utils.cpp

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "utils.h"
22

33
#include <algorithm>
4+
#include <cassert>
45
#include <cmath>
56
#include <cstddef>
67
#include <cstdint>
@@ -70,15 +71,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
7071
ov::AnyMap config;
7172
if (device == "NPU") {
7273
config = {
73-
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
74-
{"NPU_USE_NPUW", "YES"},
75-
{"NPUW_DEVICES", "NPU"},
76-
{"NPUW_FOLD", "YES"},
77-
{"NPUW_DQ", "YES"},
78-
{"NPUW_FUNCALL_ASYNC", "YES"},
79-
{"NPUW_HOST_GATHER", "YES"},
80-
{"NPUW_WEIGHTS_BANK", "shared"},
81-
// {"NPU_COMPILER_TYPE", "MLIR"},
74+
{ "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" },
75+
{ "NPU_USE_NPUW", "YES" },
76+
{ "NPUW_DEVICES", "NPU" },
77+
{ "NPUW_FOLD", "YES" },
78+
{ "NPUW_HOST_GATHER", "YES" },
79+
{ "NPUW_DQ", "YES" },
80+
{ "NPUW_FUNCALL_ASYNC", "YES" },
81+
{ "NPUW_WEIGHTS_BANK", "shared" },
82+
// Option 'CACHE_DIR' is not supported with MLIR compiler type
83+
// {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
84+
{ "NPU_COMPILER_TYPE", "MLIR" },
8285
};
8386
}
8487

@@ -102,15 +105,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
102105
int64_t conversion_end_time;
103106
int64_t compile_end_time;
104107

108+
bool is_first_token = is_prefill(cgraph);
109+
105110
auto it = compiled_cache_prefill.find(cgraph);
106-
bool is_first_token = it == compiled_cache_prefill.end();
107-
if (!is_first_token) {
111+
if (it != compiled_cache_prefill.end()) {
108112
ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
109113
decoder_end_time = ggml_time_us();
110114

111115
if (is_static) {
112-
model = compiled_cache_kvcache[cgraph].first;
113-
compiled_model = compiled_cache_kvcache[cgraph].second;
116+
if (is_first_token) {
117+
model = compiled_cache_prefill[cgraph].first;
118+
compiled_model = compiled_cache_prefill[cgraph].second;
119+
} else {
120+
model = compiled_cache_kvcache[cgraph].first;
121+
compiled_model = compiled_cache_kvcache[cgraph].second;
122+
}
114123
} else {
115124
model = it->second.first;
116125
compiled_model = it->second.second;
@@ -235,8 +244,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
235244
}
236245
auto end_time = ggml_time_us();
237246

238-
is_first_token = false;
239-
240247
if (getenv("GGML_OPENVINO_PROFILING")) {
241248
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
242249
GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
@@ -305,3 +312,20 @@ void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
305312
matrix[i * dim + i] = 0.0f;
306313
}
307314
}
315+
316+
// Reports whether the graph's token-input tensor holds anything other than a
// single element along its first dimension.
//
// Walks every node of `cgraph` and each node's source tensors until it finds a
// source whose name is exactly "inp_tokens", then returns `ne[0] != 1` for that
// tensor. The caller in openvino_frontend_compute stores the result as
// `is_first_token` and uses it to pick the prefill vs. kvcache compiled model.
//
// @param cgraph  graph to inspect; its `nodes` / `n_nodes` fields are read.
// @return        true when the "inp_tokens" tensor has ne[0] != 1, false when
//                ne[0] == 1.
// @throws std::runtime_error  when no source tensor named "inp_tokens" exists
//                in any node of the graph (an error is logged first).
bool is_prefill(struct ggml_cgraph * cgraph) {
317+
for (int i = 0; i < cgraph->n_nodes; ++i) {
318+
auto * op = cgraph->nodes[i];
319+
for (int j = 0; j < GGML_MAX_SRC; ++j) {
320+
auto* src = op->src[j];
321+
if (src == nullptr) {
322+
// The first empty slot ends the scan of this node's sources.
// NOTE(review): this assumes source slots are filled contiguously; a
// populated slot after a null one would never be examined -- confirm.
break;
323+
}
324+
if (std::string(src->name) == "inp_tokens") {
325+
// The first match decides the result; later nodes are not examined.
return src->ne[0] != 1;
326+
}
327+
}
328+
}
329+
// No node referenced "inp_tokens": log, then throw the same message.
// NOTE(review): the log message has no trailing "\n", unlike the
// GGML_LOG_INFO calls elsewhere in this file -- confirm whether intended.
GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph");
330+
throw std::runtime_error("is_prefill: inp_tokens not found in cgraph");
331+
}

ggml/src/ggml-openvino/utils.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "ggml-backend-impl.h"
44
#include "ggml-decoder.h"
5+
#include "ggml-impl.h"
56

67
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
78

@@ -35,3 +36,5 @@ std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p
3536
}
3637

3738
void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
39+
40+
bool is_prefill(struct ggml_cgraph * cgraph);

0 commit comments

Comments
 (0)