@@ -48,19 +48,6 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     return input_tensor;
 }
 
-std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
-    std::map<std::string, ggml_tensor *> output_tensors;
-
-    auto output_names = ggml_decoder->get_model_output_names();
-    for (size_t inp = 0; inp < output_names.size(); ++inp) {
-        auto name = output_names[inp];
-        auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
-        tensor = tensor->view_src ? tensor->view_src : tensor;
-        output_tensors[name] = tensor;
-    }
-    return output_tensors;
-}
-
 static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
     auto fem = ov::frontend::FrontEndManager();
     auto front_end = fem.load_by_framework("ggml");
@@ -210,6 +197,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
 
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
+
     for (size_t i = 0; i < ov_input_names.size(); i++) {
         auto param_name = ov_input_names[i];
         auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
@@ -219,43 +207,32 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
             print_input_tensor_info(param_name, input_tensor);
         }
     }
+
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+        infer_request->set_output_tensor(i, output_tensor);
+    }
+
     auto input_end_time = ggml_time_us();
 
     infer_request->infer();
 
     auto infer_end_time = ggml_time_us();
 
-    auto ggml_tensors = get_ggml_graph_output_dst(ggml_decoder);
-    auto kv_size = ggml_decoder->get_input_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
-    auto kv_offset = ggml_decoder->get_past_kv_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
     for (size_t i = 0; i < ov_output_names.size(); i++) {
         const auto output_tensor = infer_request->get_output_tensor(i);
-        auto & result_name = ov_output_names[i];
-        auto * ggml_tensor = ggml_tensors[result_name];
-
-        if (result_name.find("cache") == std::string::npos) {
-            std::memcpy(ggml_tensors[result_name]->data, output_tensor.data(), output_tensor.get_byte_size());
-        } else {
-            auto offset = kv_offset * ggml_type_size(ggml_tensor->type);
-            auto size = kv_size * ggml_type_size(ggml_tensor->type);
-            std::memcpy((char *) ggml_tensor->data + offset, (char *) output_tensor.data() + offset, size);
-        }
-
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(result_name, output_tensor, ggml_tensor->data);
+            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
         }
     }
 
-    auto end_time = ggml_time_us();
-
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
 
     return GGML_STATUS_SUCCESS;
@@ -342,16 +319,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
         infer_request.set_input_tensor(i, input_tensor);
     }
 
-    infer_request.infer();
-
-    auto gguf_tensor_addrs = get_ggml_graph_output_dst(decoder);
     auto ov_results = model->get_results();
     for (size_t i = 0; i < ov_results.size(); i++) {
         auto result_name = ov_results[i]->get_friendly_name();
-        const auto output_tensor = infer_request.get_output_tensor(i);
-
-        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+        auto output_tensor = get_ov_output_tensor(decoder, result_name);
+        infer_request.set_output_tensor(i, output_tensor);
     }
+
+    infer_request.infer();
     return GGML_STATUS_SUCCESS;
 }
 
@@ -406,6 +381,22 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
     return input_tensor;
 }
 
+ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {
+    auto * ggml_tensor = ggml_decoder->get_output_ggml_tensor(result_name);
+    auto output_type = ggml_decoder->get_output_type(result_name);
+    ov::Shape output_shape;
+    if (result_name.find("cache") == std::string::npos) {
+        output_shape = ggml_decoder->get_output_shape(result_name).to_shape();
+    } else {
+        size_t total_token_len = ggml_decoder->get_past_kv_len() + ggml_decoder->get_input_len();
+        size_t num_heads_kv = ggml_decoder->get_num_heads_kv();
+        size_t head_size = ggml_decoder->get_head_size();
+        output_shape = ov::Shape{1, total_token_len, num_heads_kv, head_size};
+    }
+    ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data);
+    return output_tensor;
+}
+
 size_t checksum(const void * data, size_t size) {
     const uint8_t * bytes = static_cast<const uint8_t *>(data);
     size_t sum = 0;
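
Note on the change: instead of copying each OpenVINO result back into its ggml destination after infer() (the old memcpy path, including the partial copy for KV-cache outputs), the backend now wraps the destination ggml buffer in an ov::Tensor via get_ov_output_tensor() and binds it with set_output_tensor() before inference, so results are written in place. A minimal sketch of that pattern follows, assuming an already compiled ov::InferRequest; the wrapper function and its name are hypothetical, not part of this diff.

// Illustrative only: bind a caller-owned buffer as an inference output so
// OpenVINO writes results directly into it and no post-infer memcpy is needed.
#include <openvino/openvino.hpp>

static ov::Tensor bind_output_in_place(ov::InferRequest & infer_request,
                                       size_t output_index,
                                       ov::element::Type type,
                                       const ov::Shape & shape,
                                       void * dst) {
    // An ov::Tensor constructed over an external host pointer does not copy or
    // own the memory, so the request writes straight into dst during infer().
    ov::Tensor output(type, shape, dst);
    infer_request.set_output_tensor(output_index, output);
    return output;  // keep it alive until infer() has completed
}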