Commit b16c04b

NPU unify PD (handled internally)
1 parent d5038aa commit b16c04b

3 files changed, +121 -45 lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 1 addition & 6 deletions
```diff
@@ -297,12 +297,7 @@ void GgmlOvDecoder::set_llm_params() {
     }
 }
 
-void GgmlOvDecoder::validate_cgraph() const {
-    if (m_is_static && m_input_len != 1) {
-        throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) +
-                                 ", try set -ub 1");
-    }
-}
+void GgmlOvDecoder::validate_cgraph() const {}
 
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
     auto name = std::string(src->name);
```
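The `input_len == 1` check (and the `-ub 1` workaround it suggested) is gone: static NPU graphs no longer reject multi-token inputs, because `openvino_frontend_compute` in utils.cpp below now splits them into single-token inferences internally.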

ggml/src/ggml-openvino/utils.cpp

Lines changed: 116 additions & 39 deletions
```diff
@@ -80,6 +80,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
+    int64_t infer_end_time;
 
     {
         std::lock_guard<std::mutex> lock(cache_mutex);
@@ -127,47 +128,87 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
             }
             ov_input_names_cache[cgraph] = ov_input_names;
             ov_output_names_cache[cgraph] = ov_output_names;
+
+            // Set output tensors and kvcache address for NPU once and for all since the graph is static
+            if (is_static) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                    auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+                    infer_request->set_output_tensor(i, output_tensor);
+                }
+                for (size_t i = 0; i < ov_input_names.size(); i++) {
+                    auto param_name = ov_input_names[i];
+                    if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+                        auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+                        infer_request->set_input_tensor(i, input_tensor);
+                    }
+                }
+            }
         }
     }
 
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
 
-    for (size_t i = 0; i < ov_input_names.size(); i++) {
-        auto param_name = ov_input_names[i];
-        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-        infer_request->set_input_tensor(i, input_tensor);
+    if (!is_static) {
+        for (size_t i = 0; i < ov_input_names.size(); i++) {
+            auto param_name = ov_input_names[i];
+            auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
+            infer_request->set_input_tensor(i, input_tensor);
 
-        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
-            print_input_tensor_info(param_name, input_tensor);
+            if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+                print_input_tensor_info(param_name, input_tensor);
+            }
         }
-    }
 
-    for (size_t i = 0; i < ov_output_names.size(); i++) {
-        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-        infer_request->set_output_tensor(i, output_tensor);
-    }
+        for (size_t i = 0; i < ov_output_names.size(); i++) {
+            auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+            infer_request->set_output_tensor(i, output_tensor);
+        }
 
-    auto input_end_time = ggml_time_us();
+        infer_request->infer();
+        infer_end_time = ggml_time_us();
 
-    infer_request->infer();
+        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
+            for (size_t i = 0; i < ov_output_names.size(); i++) {
+                const auto output_tensor = infer_request->get_output_tensor(i);
+                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+            }
+        }
+    } else {
+        auto input_len = ggml_decoder->get_input_len();
+        for (int j = 0; j < input_len; j++) {
+            for (size_t i = 0; i < ov_input_names.size(); i++) {
+                auto param_name = ov_input_names[i];
+                if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+                    continue;
+                }
+                auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len);
+                infer_request->set_input_tensor(i, input_tensor);
+
+                if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+                    const auto input_tensor = infer_request->get_input_tensor(i);
+                    print_input_tensor_info(param_name, input_tensor);
+                }
+            }
 
-    auto infer_end_time = ggml_time_us();
+            infer_request->infer();
 
-    for (size_t i = 0; i < ov_output_names.size(); i++) {
-        const auto output_tensor = infer_request->get_output_tensor(i);
-        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                    const auto output_tensor = infer_request->get_output_tensor(i);
+                    print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+                }
+            }
         }
+        infer_end_time = ggml_time_us();
     }
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
    }
 
     return GGML_STATUS_SUCCESS;
```
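The shape of the change: the dynamic (non-static) path keeps the old set-inputs/set-outputs/infer sequence, while the static NPU path loops over the `input_len` tokens and runs one single-token inference per step. Below is a minimal, self-contained sketch of that loop; `StaticModel` and `run_single_token` are hypothetical stand-ins for the compiled model and `infer_request->infer()`, and only the control flow mirrors the commit:

```cpp
// Minimal, self-contained sketch of the per-token loop used by the static
// (NPU) branch above. StaticModel and run_single_token are hypothetical
// stand-ins for the compiled model and infer_request->infer().
#include <cstdio>
#include <vector>

struct StaticModel {
    // Compiled for exactly one token per inference (input_len == 1).
    int run_single_token(int token, int pos) {
        // A real backend would set the per-token input tensors here and
        // call infer(); return a dummy value so the sketch runs.
        return token + pos;
    }
};

int main() {
    StaticModel model;  // KV cache and outputs are bound once, outside the loop
    std::vector<int> prompt = {101, 42, 7};  // a 3-token prompt (input_len == 3)

    // Instead of rejecting input_len != 1, feed the prompt one token at a
    // time; each step advances the position and reuses the persistent cache.
    int last = 0;
    for (size_t j = 0; j < prompt.size(); ++j) {
        last = model.run_single_token(prompt[j], (int) j);
    }
    std::printf("final step result: %d\n", last);
    return 0;
}
```

The KV-cache and output tensors sit outside the loop on purpose: for a static graph their addresses never change, so the commit binds them once, right after compilation.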
```diff
@@ -275,39 +316,75 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
 } // namespace
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
-    bool is_static = ggml_decoder->is_static();
-
     ov::Tensor input_tensor;
     if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
         input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
 
     } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
         void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data;
-        size_t past_kv_len =
-            ggml_decoder->is_static() ? ggml_decoder->get_context_size() : ggml_decoder->get_past_kv_len();
-        ov::Shape input_shape = {past_kv_len, (size_t) ggml_decoder->get_num_heads_kv(),
+        ov::Shape input_shape = {(size_t) ggml_decoder->get_past_kv_len(), (size_t) ggml_decoder->get_num_heads_kv(),
                                  (size_t) ggml_decoder->get_head_size()};
         input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data);
 
-    } else if (is_static && param_name.find("KQ_mask") == 0) {
-        size_t context_size = ggml_decoder->get_context_size();
-        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-        std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, context_size, -INFINITY);
-        input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
-        auto * data_ptr = input_tensor.data<float>();
-        std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-
-    } else if (is_static && param_name.find("inp_out_ids") == 0) {
+    } else {
         input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
-        if (input_tensor.get_size() == 0) {
-            input_tensor = ov::Tensor(input_tensor.get_element_type(), ov::Shape{1, 1, 1});
+    }
+    return input_tensor;
+}
+
+ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                                      const std::string & param_name,
+                                      int j,
+                                      int input_len) {
+    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
+    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
+
+    if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+        void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data;
+        ov::Shape input_shape = {(size_t) ggml_decoder->get_context_size(), (size_t) ggml_decoder->get_num_heads_kv(),
+                                 (size_t) ggml_decoder->get_head_size()};
+        return ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data);
+    }
+
+    if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) {
+        ov::Shape input_shape = {1, 1, 1};
+        ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
+        // copy the j-th value from ggml_tensor
+        size_t element_size = ggml_type_size(ggml_tensor->type);
+        void * input_data = (char *) ggml_tensor->data + j * element_size;
+        std::memcpy(input_tensor.data(), input_data, element_size);
+        return input_tensor;
+    }
+
+    if (param_name == "inp_out_ids") {
+        ov::Shape input_shape = {1, 1, 1};
+        ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
+        if (ggml_tensor->ne[0] == 0) {
             *input_tensor.data<int32_t>() = 0;
+        } else if (ggml_tensor->ne[0] == 1) {
+            if (j == input_len - 1) {
+                *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
+            } else {
+                *input_tensor.data<int32_t>() = 0;
+            }
+        } else {
+            throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1");
         }
+        return input_tensor;
+    }
 
-    } else {
-        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
+    if (param_name.find("KQ_mask") == 0) {
+        size_t context_size = ggml_decoder->get_context_size();
+        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+        std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, input_len, context_size, -INFINITY);
+        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, context_size});
+        // copy the j-th row of padded_data
+        auto * data_ptr = input_tensor.data<float>();
+        std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr);
+        return input_tensor;
     }
-    return input_tensor;
+
+    return get_ov_input_tensor(ggml_decoder, param_name);
 }
 
 ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {
```
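For `KQ_mask`, the static helper pads all `input_len` mask rows to the full `context_size` with `-INFINITY` and copies only the j-th row into a `{1, 1, context_size}` tensor for step j. A self-contained sketch of that pad-then-slice step, where `pad_rows` is a hypothetical stand-in for the backend's `pad_input<float>` helper:

```cpp
// Sketch of the KQ_mask handling in get_ov_input_tensor_static: pad each of
// the input_len mask rows out to context_size with -INFINITY, then copy only
// the j-th row for the j-th single-token inference.
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> pad_rows(const std::vector<std::vector<float>> & mask, size_t context_size) {
    std::vector<float> padded;
    for (const auto & row : mask) {
        padded.insert(padded.end(), row.begin(), row.end());
        padded.insert(padded.end(), context_size - row.size(), -INFINITY);  // mask out the tail
    }
    return padded;
}

int main() {
    const size_t context_size = 8;
    // Causal mask for a 3-token prompt: row j lets token j see tokens 0..j.
    std::vector<std::vector<float>> mask = {{0}, {0, 0}, {0, 0, 0}};

    std::vector<float> padded = pad_rows(mask, context_size);

    size_t j = 1;  // second single-token step
    // Equivalent of the std::copy(padded.begin() + j * context_size, ...) above:
    std::vector<float> row(padded.begin() + j * context_size,
                           padded.begin() + (j + 1) * context_size);
    for (float v : row) {
        std::printf("%g ", v);  // prints: 0 0 -inf -inf -inf -inf -inf -inf
    }
    std::printf("\n");
    return 0;
}
```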

ggml/src/ggml-openvino/utils.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -39,6 +39,10 @@ ov::AnyMap get_ov_compile_config(const std::string & device);
 std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
+ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                                      const std::string & param_name,
+                                      int j,
+                                      int input_len);
 
 ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name);
```
