2727#include < openvino/runtime/intel_npu/properties.hpp>
2828#include < openvino/runtime/properties.hpp>
2929#include < openvino/runtime/tensor.hpp>
30+ #include < string>
3031#include < unordered_map>
3132#include < vector>
3233
@@ -35,7 +36,8 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
3536 auto * input_data = ggml_tensor->data ;
3637 ov::Shape input_shape;
3738 if (name.find (" cache_k" ) == 0 || name.find (" cache_v" ) == 0 ) {
38- input_shape = ggml_decoder->get_graph_input_shape (ggml_tensor).to_shape ();
39+ input_shape = {1 , (size_t ) ggml_decoder->get_past_kv_len (), (size_t ) ggml_decoder->get_num_heads_kv (),
40+ (size_t ) ggml_decoder->get_head_size ()};
3941 } else if (ggml_tensor->op == GGML_OP_VIEW) {
4042 // This case is added to make test-backend-ops work
4143 input_shape = ggml_decoder->get_graph_input_shape (ggml_tensor->view_src ).to_shape ();
@@ -46,15 +48,15 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
4648 return input_tensor;
4749}
4850
49- std::map<std::string, void *> get_ggml_graph_output_dst (std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
50- std::map<std::string, void *> output_tensors;
51+ std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst (std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
52+ std::map<std::string, ggml_tensor *> output_tensors;
5153
5254 auto output_names = ggml_decoder->get_model_output_names ();
5355 for (size_t inp = 0 ; inp < output_names.size (); ++inp) {
5456 auto name = output_names[inp];
55- const auto * tensor = ggml_decoder->get_output_ggml_tensor (name);
56- auto * output_data = tensor->view_src ? tensor->view_src -> data : tensor-> data ;
57- output_tensors[name] = output_data ;
57+ auto * tensor = ggml_decoder->get_output_ggml_tensor (name);
58+ tensor = tensor->view_src ? tensor->view_src : tensor;
59+ output_tensors[name] = tensor ;
5860 }
5961 return output_tensors;
6062}
@@ -110,7 +112,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
110112 // For NPU
111113 static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
112114
113- auto kv_tensors = get_kv_tensors (cgraph);
114115 std::shared_ptr<GgmlOvDecoder> ggml_decoder;
115116 std::shared_ptr<ov::InferRequest> infer_request;
116117
@@ -208,45 +209,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
208209 }
209210 }
210211
211- if (!is_static) {
212- auto states = infer_request->query_state ();
213- int32_t kv_len = *(int32_t *) inp_pos->data ;
214- int32_t kv_len_in_state = states[0 ].get_state ().get_shape ()[1 ];
215-
216- // outdated if:
217- // 1. kv_len != kv_len_in_state
218- // 2. last row has different values
219- bool state_outdated = kv_len != kv_len_in_state;
220- if (!state_outdated && kv_len > 0 ) {
221- auto state_tensor = states[0 ].get_state ();
222- auto state_name = states[0 ].get_name ();
223- state_name = state_name.substr (0 , state_name.size () / 2 );
224- auto state_shape = state_tensor.get_shape ();
225- auto * ggml_tensor = kv_tensors[state_name];
226- auto offset = (kv_len - 1 ) * state_shape[2 ] * state_shape[3 ] * ggml_type_size (ggml_tensor->type );
227- auto size = state_shape[2 ] * state_shape[3 ] * ggml_type_size (ggml_tensor->type );
228- state_outdated =
229- std::memcmp ((char *) ggml_tensor->data + offset, (char *) state_tensor.data () + offset, size) != 0 ;
230- }
231-
232- if (state_outdated) {
233- GGML_LOG_DEBUG (
234- " GGML OpenVINO Backend: updating kv cache states from ggml tensors (kv_len: %d, kv_len_in_state: %d)\n " ,
235- kv_len, kv_len_in_state);
236- for (auto & state : states) {
237- auto state_name = state.get_name ();
238- state_name = state_name.substr (0 , state_name.size () / 2 );
239- auto * ggml_tensor = kv_tensors[state_name];
240- auto state_shape = state.get_state ().get_shape ();
241- state_shape[1 ] = kv_len;
242- ov::Tensor state_tensor (state.get_state ().get_element_type (), state_shape, ggml_tensor->data );
243- state.set_state (state_tensor);
244- }
245- }
246- }
247-
248- auto state_update_end_time = ggml_time_us ();
249-
250212 auto ov_input_names = ov_input_names_cache[cgraph];
251213 auto ov_output_names = ov_output_names_cache[cgraph];
252214 for (size_t i = 0 ; i < ov_input_names.size (); i++) {
@@ -264,39 +226,35 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
264226
265227 auto infer_end_time = ggml_time_us ();
266228
267- auto gguf_tensor_addrs = get_ggml_graph_output_dst (ggml_decoder);
229+ auto ggml_tensors = get_ggml_graph_output_dst (ggml_decoder);
230+ auto kv_size = ggml_decoder->get_input_len () * ggml_decoder->get_num_heads_kv () * ggml_decoder->get_head_size ();
231+ auto kv_offset = ggml_decoder->get_past_kv_len () * ggml_decoder->get_num_heads_kv () * ggml_decoder->get_head_size ();
268232 for (size_t i = 0 ; i < ov_output_names.size (); i++) {
269- auto & result_name = ov_output_names[i];
270233 const auto output_tensor = infer_request->get_output_tensor (i);
234+ auto & result_name = ov_output_names[i];
235+ auto * ggml_tensor = ggml_tensors[result_name];
271236
272- std::memcpy (gguf_tensor_addrs[result_name], output_tensor.data (), output_tensor.get_byte_size ());
237+ if (result_name.find (" cache" ) == std::string::npos) {
238+ std::memcpy (ggml_tensors[result_name]->data , output_tensor.data (), output_tensor.get_byte_size ());
239+ } else {
240+ auto offset = kv_offset * ggml_type_size (ggml_tensor->type );
241+ auto size = kv_size * ggml_type_size (ggml_tensor->type );
242+ std::memcpy ((char *) ggml_tensor->data + offset, (char *) output_tensor.data () + offset, size);
243+ }
273244
274245 if (getenv (" GGML_OPENVINO_DEBUG_OUTPUT" )) {
275- print_output_tensor_info (result_name, output_tensor, gguf_tensor_addrs );
246+ print_output_tensor_info (result_name, output_tensor, ggml_tensor-> data );
276247 }
277248 }
278249
279- for (auto & state : infer_request->query_state ()) {
280- auto state_name = state.get_name ();
281- state_name = state_name.substr (0 , state_name.size () / 2 );
282- auto state_tensor = state.get_state ();
283- auto state_shape = state_tensor.get_shape ();
284- auto * ggml_tensor = kv_tensors[state_name];
285- auto size = state_shape[2 ] * state_shape[3 ] * inp_pos->ne [0 ] * ggml_type_size (ggml_tensor->type );
286- auto offset =
287- state_shape[2 ] * state_shape[3 ] * (*(int32_t *) inp_pos->data ) * ggml_type_size (ggml_tensor->type );
288- std::memcpy ((char *) ggml_tensor->data + offset, (char *) state_tensor.data () + offset, size);
289- }
290-
291250 auto end_time = ggml_time_us ();
292251
293252 if (getenv (" GGML_OPENVINO_PROFILING" )) {
294253 GGML_LOG_INFO (" \n GGML OpenVINO Backend: \n " );
295254 GGML_LOG_INFO (" - Graph decoder Time: %ld ms \n " , (decoder_end_time - start_time) / 1000 );
296255 GGML_LOG_INFO (" - Graph conversion Time: %ld ms \n " , (conversion_end_time - decoder_end_time) / 1000 );
297256 GGML_LOG_INFO (" - Graph compile Time: %ld ms \n " , (compile_end_time - conversion_end_time) / 1000 );
298- GGML_LOG_INFO (" - Graph State Update Time: %ld ms \n " , (state_update_end_time - compile_end_time) / 1000 );
299- GGML_LOG_INFO (" - Graph Input Time: %ld ms \n " , (input_end_time - state_update_end_time) / 1000 );
257+ GGML_LOG_INFO (" - Graph Input Time: %ld ms \n " , (input_end_time - compile_end_time) / 1000 );
300258 GGML_LOG_INFO (" - Graph Inference Time: %ld ms \n " , (infer_end_time - input_end_time) / 1000 );
301259 GGML_LOG_INFO (" - Graph Output Time: %ld ms \n " , (end_time - infer_end_time) / 1000 );
302260 }
@@ -487,11 +445,9 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor
487445 }
488446}
489447
490- void print_output_tensor_info (const std::string & name,
491- const ov::Tensor & tensor,
492- std::map<std::string, void *> & output_dst) {
493- std::cout << " Output name: " << name << " , Output shape: " << tensor.get_shape ()
494- << " , Address: " << output_dst[name] << std::endl;
448+ void print_output_tensor_info (const std::string & name, const ov::Tensor & tensor, void * output_dst) {
449+ std::cout << " Output name: " << name << " , Output shape: " << tensor.get_shape () << " , Address: " << output_dst
450+ << std::endl;
495451
496452 auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
497453 if (size == 0 ) {
@@ -567,23 +523,3 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
567523bool get_is_first_token (const ggml_tensor * inp_pos) {
568524 return *(int32_t *) inp_pos->data == 0 ;
569525}
570-
571- std::unordered_map<std::string, ggml_tensor *> get_kv_tensors (struct ggml_cgraph * cgraph) {
572- static std::unordered_map<struct ggml_cgraph *, std::unordered_map<std::string, ggml_tensor *>> kv_tensors_cache;
573-
574- auto it = kv_tensors_cache.find (cgraph);
575- if (it != kv_tensors_cache.end ()) {
576- return it->second ;
577- }
578-
579- std::unordered_map<std::string, ggml_tensor *> kv_tensors;
580- for (int i = 0 ; i < cgraph->n_nodes ; ++i) {
581- auto * op = cgraph->nodes [i];
582- if (op->op == GGML_OP_SET_ROWS) {
583- assert (std::string (op->src [2 ]->name ).find (" cache_" ) == 0 );
584- kv_tensors[std::string (op->src [2 ]->name )] = op->src [2 ];
585- }
586- }
587- kv_tensors_cache[cgraph] = kv_tensors;
588- return kv_tensors;
589- }
0 commit comments