66#include < cstddef>
77#include < cstdint>
88#include < cstdlib>
9+ #include < cstring>
910#include < iomanip>
1011#include < iostream>
1112#include < memory>
@@ -101,10 +102,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
101102 static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
102103 static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
103104 static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
104- static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors (cgraph);
105105 // For NPU
106106 static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
107107
108+ auto kv_tensors = get_kv_tensors (cgraph);
108109 std::shared_ptr<GgmlOvDecoder> ggml_decoder;
109110 std::shared_ptr<ov::InferRequest> infer_request;
110111
@@ -224,6 +225,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
224225 }
225226 }
226227
228+ auto state_update_end_time = ggml_time_us ();
229+
227230 auto ov_input_names = ov_input_names_cache[cgraph];
228231 auto ov_output_names = ov_output_names_cache[cgraph];
229232 for (size_t i = 0 ; i < ov_input_names.size (); i++) {
@@ -238,6 +241,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
238241 auto input_end_time = ggml_time_us ();
239242
240243 infer_request->infer ();
244+
241245 auto infer_end_time = ggml_time_us ();
242246
243247 auto gguf_tensor_addrs = get_ggml_graph_output_dst (ggml_decoder);
@@ -254,11 +258,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
254258 auto end_time = ggml_time_us ();
255259
256260 if (getenv (" GGML_OPENVINO_PROFILING" )) {
257- GGML_LOG_INFO (" GGML OpenVINO Backend: \n " );
261+ GGML_LOG_INFO (" \n GGML OpenVINO Backend: \n " );
258262 GGML_LOG_INFO (" - Graph decoder Time: %ld ms \n " , (decoder_end_time - start_time) / 1000 );
259263 GGML_LOG_INFO (" - Graph conversion Time: %ld ms \n " , (conversion_end_time - decoder_end_time) / 1000 );
260264 GGML_LOG_INFO (" - Graph compile Time: %ld ms \n " , (compile_end_time - conversion_end_time) / 1000 );
261- GGML_LOG_INFO (" - Graph Input Time: %ld ms \n " , (input_end_time - compile_end_time) / 1000 );
265+ GGML_LOG_INFO (" - Graph State Update Time: %ld ms \n " , (state_update_end_time - compile_end_time) / 1000 );
266+ GGML_LOG_INFO (" - Graph Input Time: %ld ms \n " , (input_end_time - state_update_end_time) / 1000 );
262267 GGML_LOG_INFO (" - Graph Inference Time: %ld ms \n " , (infer_end_time - input_end_time) / 1000 );
263268 GGML_LOG_INFO (" - Graph Output Time: %ld ms \n " , (end_time - infer_end_time) / 1000 );
264269 }
@@ -529,14 +534,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
529534 return *(int32_t *) inp_pos->data == 0 ;
530535}
531536
// get_kv_tensors: collects the src[2] tensor of every GGML_OP_SET_ROWS node in `cgraph`.
// NOTE(review): this span is a rendered diff — lines marked `-` appear to be the removed
// version and lines marked `+` the added version; unmarked lines are shared context.
//
// Removed version: returns a vector of (name, tensor) pairs, one entry per SET_ROWS node.
// Added version: returns a name -> tensor map and memoizes the result per cgraph pointer.
532- std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors (struct ggml_cgraph * cgraph) {
533- std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
537+ std::unordered_map<std::string, ggml_tensor*> get_kv_tensors (struct ggml_cgraph * cgraph) {
// Function-local static cache, keyed by the raw cgraph pointer.
// NOTE(review): nothing visible here ever erases an entry; if a graph address is reused
// for a graph with different nodes, the stale map would be returned — confirm with callers.
538+ static std::unordered_map<struct ggml_cgraph *, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
539+
// Cache hit: hand back a copy of the stored map (the function returns by value).
540+ auto it = kv_tensors_cache.find (cgraph);
541+ if (it != kv_tensors_cache.end ()) {
542+ return it->second ;
543+ }
544+
// Cache miss: build the map by scanning all nodes of the graph.
545+ std::unordered_map<std::string, ggml_tensor*> kv_tensors;
534546 for (int i = 0 ; i < cgraph->n_nodes ; ++i) {
535547 auto * op = cgraph->nodes [i];
536548 if (op->op == GGML_OP_SET_ROWS) {
// Sanity check that the tensor name starts with the expected prefix (find(...) == 0).
// An assert only: it is compiled out when NDEBUG is defined.
537549 assert (std::string (op->src [2 ]->name ).find (" cache_" ) == 0 );
// Removed version appended one pair per SET_ROWS node, so repeated names were kept.
538- kv_tensors. emplace_back (op->src [2 ]->name , op->src [2 ]) ;
// Added version assigns through operator[], so a later node with the same name
// overwrites the earlier entry.
550+ kv_tensors[ std::string (op->src [2 ]->name )] = op->src [2 ];
539551 }
540552 }
// Store a copy in the cache before returning the freshly built map.
553+ kv_tensors_cache[cgraph] = kv_tensors;
541554 return kv_tensors;
542555}
0 commit comments