
Commit e9abf1c

Replace get_output_tensor+memcpy with set_output_tensor

1 parent 0cfe69c · commit e9abf1c

2 files changed: +30 -39 lines
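The commit replaces the copy-out path in the OpenVINO backend: rather than calling infer(), fetching each runtime-owned result with get_output_tensor, and memcpy-ing it into the destination ggml buffer, it now wraps each destination buffer in an ov::Tensor and binds it with set_output_tensor before inference, so the plugin writes results directly into ggml memory. A minimal sketch of that pattern against the public OpenVINO 2.0 C++ API (the function, buffer, and names here are illustrative, not taken from this backend):

#include <openvino/openvino.hpp>

// Illustrative sketch, not the backend's code: bind a caller-owned buffer
// as the output so inference writes into it directly, instead of copying
// the result out afterwards. Assumes `dst` points to a live, contiguous
// host allocation large enough for `type`/`shape`.
void infer_into(ov::InferRequest & request, void * dst,
                ov::element::Type type, const ov::Shape & shape) {
    // Old pattern this commit removes:
    //     request.infer();
    //     auto out = request.get_output_tensor(0);
    //     std::memcpy(dst, out.data(), out.get_byte_size());

    // New pattern: the ov::Tensor aliases dst (no allocation, no copy) ...
    ov::Tensor output(type, shape, dst);
    // ... and is bound before infer() so the plugin targets it directly.
    request.set_output_tensor(0, output);
    request.infer();
}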

ggml/src/ggml-openvino/utils.cpp

Lines changed: 28 additions & 37 deletions
@@ -48,19 +48,6 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     return input_tensor;
 }
 
-std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
-    std::map<std::string, ggml_tensor *> output_tensors;
-
-    auto output_names = ggml_decoder->get_model_output_names();
-    for (size_t inp = 0; inp < output_names.size(); ++inp) {
-        auto name = output_names[inp];
-        auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
-        tensor = tensor->view_src ? tensor->view_src : tensor;
-        output_tensors[name] = tensor;
-    }
-    return output_tensors;
-}
-
 static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
     auto fem = ov::frontend::FrontEndManager();
     auto front_end = fem.load_by_framework("ggml");
@@ -210,6 +197,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
 
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
+
     for (size_t i = 0; i < ov_input_names.size(); i++) {
         auto param_name = ov_input_names[i];
         auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
@@ -219,43 +207,32 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
             print_input_tensor_info(param_name, input_tensor);
         }
     }
+
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+        infer_request->set_output_tensor(i, output_tensor);
+    }
+
     auto input_end_time = ggml_time_us();
 
     infer_request->infer();
 
     auto infer_end_time = ggml_time_us();
 
-    auto ggml_tensors = get_ggml_graph_output_dst(ggml_decoder);
-    auto kv_size = ggml_decoder->get_input_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
-    auto kv_offset = ggml_decoder->get_past_kv_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
     for (size_t i = 0; i < ov_output_names.size(); i++) {
         const auto output_tensor = infer_request->get_output_tensor(i);
-        auto & result_name = ov_output_names[i];
-        auto * ggml_tensor = ggml_tensors[result_name];
-
-        if (result_name.find("cache") == std::string::npos) {
-            std::memcpy(ggml_tensors[result_name]->data, output_tensor.data(), output_tensor.get_byte_size());
-        } else {
-            auto offset = kv_offset * ggml_type_size(ggml_tensor->type);
-            auto size = kv_size * ggml_type_size(ggml_tensor->type);
-            std::memcpy((char *) ggml_tensor->data + offset, (char *) output_tensor.data() + offset, size);
-        }
-
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(result_name, output_tensor, ggml_tensor->data);
+            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
         }
     }
 
-    auto end_time = ggml_time_us();
-
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
 
     return GGML_STATUS_SUCCESS;
@@ -342,16 +319,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
         infer_request.set_input_tensor(i, input_tensor);
     }
 
-    infer_request.infer();
-
-    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
     auto ov_results = model->get_results();
     for (size_t i = 0; i < ov_results.size(); i++) {
         auto result_name = ov_results[i]->get_friendly_name();
-        const auto output_tensor = infer_request.get_output_tensor(i);
-
-        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+        auto output_tensor = get_ov_output_tensor(decoder, result_name);
+        infer_request.set_output_tensor(i, output_tensor);
     }
+
+    infer_request.infer();
     return GGML_STATUS_SUCCESS;
 }
 
@@ -406,6 +381,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name)
     return input_tensor;
 }
 
+ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {
+    auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name);
+    auto output_type = ggml_decoder->get_output_type(result_name);
+    ov::Shape output_shape;
+    if (result_name.find("cache") == std::string::npos) {
+        output_shape = ggml_decoder->get_output_shape(result_name).to_shape();
+    } else {
+        size_t total_token_len = ggml_decoder->get_past_kv_len() + ggml_decoder->get_input_len();
+        size_t num_heads_kv = ggml_decoder->get_num_heads_kv();
+        size_t head_size = ggml_decoder->get_head_size();
+        output_shape = ov::Shape{1, total_token_len, num_heads_kv, head_size};
+    }
+    ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
+    return output_tensor;
+}
+
 size_t checksum(const void * data, size_t size) {
     const uint8_t * bytes = static_cast<const uint8_t *>(data);
     size_t sum = 0;
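The key detail in the new get_ov_output_tensor is the three-argument ov::Tensor constructor: passing ggml_tensor->data as the host pointer makes the tensor alias the ggml buffer rather than own its own allocation, so binding it with set_output_tensor lets the plugin write results straight into ggml memory. For KV-cache outputs the shape is widened to {1, past_kv_len + input_len, num_heads_kv, head_size}, covering the whole cache region in place, which is what makes the old offset/size memcpy arithmetic (and the separate "Graph Output Time" measurement) unnecessary.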

ggml/src/ggml-openvino/utils.h

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,6 @@ std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph * cgraph, boo
 
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name);
 
-std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
-
 size_t checksum(const void * data, size_t size);
 
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);

@@ -47,6 +45,8 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & dev
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
 
+ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name);
+
 bool is_naive(struct ggml_cgraph * cgraph);
 
 enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
