@@ -80,6 +80,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
+    int64_t infer_end_time;
 
     {
         std::lock_guard<std::mutex> lock(cache_mutex);
@@ -127,47 +128,87 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
             }
             ov_input_names_cache[cgraph] = ov_input_names;
             ov_output_names_cache[cgraph] = ov_output_names;
+
+            // Set output tensors and KV cache addresses for NPU once, since the graph is static
+            if (is_static) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                    auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+                    infer_request->set_output_tensor(i, output_tensor);
+                }
+                for (size_t i = 0; i < ov_input_names.size(); i++) {
+                    auto param_name = ov_input_names[i];
+                    if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+                        auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+                        infer_request->set_input_tensor(i, input_tensor);
+                    }
+                }
+            }
         }
     }
 
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
 
-    for (size_t i = 0; i < ov_input_names.size(); i++) {
-        auto param_name = ov_input_names[i];
-        auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-        infer_request->set_input_tensor(i, input_tensor);
+    if (!is_static) {
+        for (size_t i = 0; i < ov_input_names.size(); i++) {
+            auto param_name = ov_input_names[i];
+            auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
+            infer_request->set_input_tensor(i, input_tensor);
 
-        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
-            print_input_tensor_info(param_name, input_tensor);
+            if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+                print_input_tensor_info(param_name, input_tensor);
+            }
         }
-    }
 
-    for (size_t i = 0; i < ov_output_names.size(); i++) {
-        auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-        infer_request->set_output_tensor(i, output_tensor);
-    }
+        for (size_t i = 0; i < ov_output_names.size(); i++) {
+            auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+            infer_request->set_output_tensor(i, output_tensor);
+        }
 
-    auto input_end_time = ggml_time_us();
+        infer_request->infer();
+        infer_end_time = ggml_time_us();
 
-    infer_request->infer();
+        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
+            for (size_t i = 0; i < ov_output_names.size(); i++) {
+                const auto output_tensor = infer_request->get_output_tensor(i);
+                print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+            }
+        }
+    } else {
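+        // Static graph (NPU): input shapes are fixed to a single token, so feed the prompt token by token and run one inference per token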
+        auto input_len = ggml_decoder->get_input_len();
+        for (int j = 0; j < input_len; j++) {
+            for (size_t i = 0; i < ov_input_names.size(); i++) {
+                auto param_name = ov_input_names[i];
+                if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+                    continue;
+                }
+                auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len);
+                infer_request->set_input_tensor(i, input_tensor);
+
+                if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+                    const auto input_tensor = infer_request->get_input_tensor(i);
+                    print_input_tensor_info(param_name, input_tensor);
+                }
+            }
 
-    auto infer_end_time = ggml_time_us();
+            infer_request->infer();
 
-    for (size_t i = 0; i < ov_output_names.size(); i++) {
-        const auto output_tensor = infer_request->get_output_tensor(i);
-        if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+            if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                    const auto output_tensor = infer_request->get_output_tensor(i);
+                    print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
+                }
+            }
         }
+        infer_end_time = ggml_time_us();
     }
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
     }
 
     return GGML_STATUS_SUCCESS;
@@ -275,39 +316,75 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
 } // namespace
 
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name) {
-    bool is_static = ggml_decoder->is_static();
-
     ov::Tensor input_tensor;
     if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
         input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
 
     } else if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
         void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data;
-        size_t past_kv_len =
-            ggml_decoder->is_static() ? ggml_decoder->get_context_size() : ggml_decoder->get_past_kv_len();
-        ov::Shape input_shape = {past_kv_len, (size_t) ggml_decoder->get_num_heads_kv(),
+        ov::Shape input_shape = {(size_t) ggml_decoder->get_past_kv_len(), (size_t) ggml_decoder->get_num_heads_kv(),
                                  (size_t) ggml_decoder->get_head_size()};
         input_tensor = ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data);
 
-    } else if (is_static && param_name.find("KQ_mask") == 0) {
-        size_t context_size = ggml_decoder->get_context_size();
-        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
-        std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, context_size, -INFINITY);
-        input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, context_size});
-        auto * data_ptr = input_tensor.data<float>();
-        std::copy(padded_data.begin(), padded_data.end(), data_ptr);
-
-    } else if (is_static && param_name.find("inp_out_ids") == 0) {
+    } else {
         input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
-        if (input_tensor.get_size() == 0) {
-            input_tensor = ov::Tensor(input_tensor.get_element_type(), ov::Shape{1, 1, 1});
+    }
+    return input_tensor;
+}
+
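+// Build the input tensor for token j (0-based) of a prompt of length input_len when running the statically shaped (NPU) graph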
+ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
+                                      const std::string & param_name,
+                                      int j,
+                                      int input_len) {
+    const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
+    const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
+
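+    // KV cache inputs wrap the ggml buffer directly and span the full context size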
+    if (param_name.find("cache_k") == 0 || param_name.find("cache_v") == 0) {
+        void * input_data = ggml_decoder->get_input_ggml_tensor(param_name)->data;
+        ov::Shape input_shape = {(size_t) ggml_decoder->get_context_size(), (size_t) ggml_decoder->get_num_heads_kv(),
+                                 (size_t) ggml_decoder->get_head_size()};
+        return ov::Tensor(ggml_decoder->get_input_type(param_name), input_shape, input_data);
+    }
+
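+    // Token-indexed inputs (inp_pos, inp_tokens, inputs consumed by GGML_OP_SET_ROWS) become 1x1x1 tensors holding the j-th element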
+    if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) {
+        ov::Shape input_shape = {1, 1, 1};
+        ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
+        // copy the j-th value from ggml_tensor
+        size_t element_size = ggml_type_size(ggml_tensor->type);
+        void * input_data = (char *) ggml_tensor->data + j * element_size;
+        std::memcpy(input_tensor.data(), input_data, element_size);
+        return input_tensor;
+    }
+
+    if (param_name == "inp_out_ids") {
+        ov::Shape input_shape = {1, 1, 1};
+        ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
+        if (ggml_tensor->ne[0] == 0) {
             *input_tensor.data<int32_t>() = 0;
+        } else if (ggml_tensor->ne[0] == 1) {
+            if (j == input_len - 1) {
+                *input_tensor.data<int32_t>() = *((int32_t *) ggml_tensor->data);
+            } else {
+                *input_tensor.data<int32_t>() = 0;
+            }
+        } else {
+            throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1");
         }
+        return input_tensor;
+    }
 
-    } else {
-        input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
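+    // KQ_mask: pad each row to the full context size and pass only the row for token j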
+    if (param_name.find("KQ_mask") == 0) {
+        size_t context_size = ggml_decoder->get_context_size();
+        const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+        std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, input_len, context_size, -INFINITY);
+        ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, context_size});
+        // copy the j-th row of padded_data
+        auto * data_ptr = input_tensor.data<float>();
+        std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr);
+        return input_tensor;
     }
-    return input_tensor;
+
+    return get_ov_input_tensor(ggml_decoder, param_name);
 }
 
 ov::Tensor get_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & result_name) {