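Minor updates to the OpenVINO backend

- ggml-decoder.cpp: treat any node whose name contains "output" as a model output (previously: names starting with "result").
- utils.cpp: get_kv_tensors() now returns an unordered_map and caches its result per cgraph; openvino_frontend_compute() fetches it on every call instead of keeping a single function-static copy.
- utils.cpp: report a separate "Graph State Update Time" entry in the GGML_OPENVINO_PROFILING output.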

wine99 · wine99 · commit 0dee07f2b7ac · 2025-10-21T10:57:03.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -153,8 +153,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
         // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
         static std::set<std::string> debug_output_names = {};
         // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
-        if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 ||
-            debug_output_names.count(node_name)) {
+        if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
+            node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) {
             if (node->op == GGML_OP_SET_ROWS) {
                 assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0);
                 if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
@@ -6,6 +6,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <iomanip>
 #include <iostream>
 #include <memory>
@@ -101,10 +102,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
     static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
-    static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors(cgraph);
     // For NPU
     static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
 
+    auto kv_tensors = get_kv_tensors(cgraph);
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::InferRequest> infer_request;
 
@@ -224,6 +225,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         }
     }
 
+    auto state_update_end_time = ggml_time_us();
+
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
     for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -238,6 +241,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto input_end_time = ggml_time_us();
 
     infer_request->infer();
+
     auto infer_end_time = ggml_time_us();
 
     auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
@@ -254,11 +258,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto end_time = ggml_time_us();
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
-        GGML_LOG_INFO("GGML OpenVINO Backend: \n");
+        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO("  - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO("  - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO("  - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO("  - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO("  - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO("  - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
@@ -529,14 +534,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
     return *(int32_t*) inp_pos->data == 0;
 }
 
-std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph) {
-    std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
+std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph) {
+    static std::unordered_map<struct ggml_cgraph*, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
+
+    auto it = kv_tensors_cache.find(cgraph);
+    if (it != kv_tensors_cache.end()) {
+        return it->second;
+    }
+
+    std::unordered_map<std::string, ggml_tensor*> kv_tensors;
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         auto* op = cgraph->nodes[i];
         if (op->op == GGML_OP_SET_ROWS) {
             assert(std::string(op->src[2]->name).find("cache_") == 0);
-            kv_tensors.emplace_back(op->src[2]->name, op->src[2]);
+            kv_tensors[std::string(op->src[2]->name)] = op->src[2];
         }
     }
+    kv_tensors_cache[cgraph] = kv_tensors;
     return kv_tensors;
 }
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
@@ -42,7 +42,7 @@ const ggml_tensor* get_inp_pos_tensor(struct ggml_cgraph* cgraph);
 
 bool get_is_first_token(const ggml_tensor* inp_pos);
 
-std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph);
+std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph);
 
 ov::AnyMap get_npu_prefill_config();
 ov::AnyMap get_npu_generate_config();