 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <iomanip>
 #include <iostream>
 #include <memory>
@@ -66,20 +67,24 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     static ov::Core core;
 
-    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device.empty()) {
-        const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
-        const auto available_devices = core.get_available_devices();
-        for (const auto & dev : preferred_device) {
-            if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
-                device = dev;
-                break;
-            }
-        }
+    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
+    static const auto available_devices = core.get_available_devices();
+    if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) {
+        GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str());
+        device = "CPU";
     }
 
     bool is_static = device == "NPU" ? true : false;
+
     ov::AnyMap config;
+    if (device == "GPU") {
+        auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
+        if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
+            config = {
+                {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
+            };
+        }
+    }
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
         std::string filename = "cgraph.txt";
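For reference, the behavior introduced by the hunk above is: `GGML_OPENVINO_DEVICE` selects the device (defaulting to `CPU`), and any device the runtime does not report as available falls back to `CPU` with a warning. The snippet below is only an illustrative, standalone sketch of that same selection rule, not code from this patch; it assumes an installed OpenVINO runtime and uses a plain `main` instead of the backend entry point.

```cpp
// Illustrative sketch of the device-selection rule shown in the hunk above; not patch code.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

int main() {
    ov::Core core;

    // Environment-variable override with a CPU default.
    const char * env = std::getenv("GGML_OPENVINO_DEVICE");
    std::string device = env ? env : "CPU";

    // Fall back to CPU when the requested device is not reported by the runtime.
    const std::vector<std::string> available = core.get_available_devices();
    if (std::find(available.begin(), available.end(), device) == available.end()) {
        std::cerr << "device " << device << " is not available, falling back to CPU\n";
        device = "CPU";
    }

    std::cout << "selected device: " << device << std::endl;
    return 0;
}
```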
@@ -101,10 +106,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
-    static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors(cgraph);
     // For NPU
     static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
 
+    auto kv_tensors = get_kv_tensors(cgraph);
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::InferRequest> infer_request;
 
@@ -183,13 +188,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
             ov::serialize(model, timestamped_filename);
         }
 
-        auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
-        if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
-            config = {
-                {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
-            };
-        }
-
         auto compiled_model = core.compile_model(model, device, config);
         compile_end_time = ggml_time_us();
         infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
@@ -224,6 +222,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         }
     }
 
+    auto state_update_end_time = ggml_time_us();
+
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
     for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -238,6 +238,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto input_end_time = ggml_time_us();
 
     infer_request->infer();
+
     auto infer_end_time = ggml_time_us();
 
     auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
@@ -254,11 +255,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto end_time = ggml_time_us();
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
-        GGML_LOG_INFO("GGML OpenVINO Backend: \n");
+        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
@@ -529,14 +531,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
     return *(int32_t *) inp_pos->data == 0;
 }
 
-std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph * cgraph) {
-    std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
+std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph * cgraph) {
+    static std::unordered_map<struct ggml_cgraph *, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
+
+    auto it = kv_tensors_cache.find(cgraph);
+    if (it != kv_tensors_cache.end()) {
+        return it->second;
+    }
+
+    std::unordered_map<std::string, ggml_tensor*> kv_tensors;
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         auto * op = cgraph->nodes[i];
         if (op->op == GGML_OP_SET_ROWS) {
             assert(std::string(op->src[2]->name).find("cache_") == 0);
-            kv_tensors.emplace_back(op->src[2]->name, op->src[2]);
+            kv_tensors[std::string(op->src[2]->name)] = op->src[2];
         }
     }
+    kv_tensors_cache[cgraph] = kv_tensors;
     return kv_tensors;
 }
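For readers skimming the last hunk: `get_kv_tensors` now memoizes its result per `ggml_cgraph` pointer, so the node scan for `GGML_OP_SET_ROWS` runs once per graph and later calls return the cached map. The snippet below is a self-contained illustration of that caching shape only; the `graph` type, `collect` function, and the placeholder entries are invented for the example and are not symbols from this codebase.

```cpp
// Standalone illustration of the per-graph memoization pattern used by get_kv_tensors.
// "graph" and "collect" are placeholders for this sketch, not project symbols.
#include <iostream>
#include <string>
#include <unordered_map>

struct graph { int id; };  // stand-in for struct ggml_cgraph

static std::unordered_map<std::string, int> collect(graph * g) {
    // One cache entry per graph pointer, populated on first use.
    static std::unordered_map<graph *, std::unordered_map<std::string, int>> cache;

    auto it = cache.find(g);
    if (it != cache.end()) {
        return it->second;  // cache hit: skip the scan entirely
    }

    // Placeholder for the real work (walking the graph nodes).
    std::unordered_map<std::string, int> result;
    result["cache_k_l0"] = 0;
    result["cache_v_l0"] = 1;

    cache[g] = result;
    return result;
}

int main() {
    graph g{0};
    collect(&g);                                              // first call scans and caches
    std::cout << collect(&g).size() << " cached entries\n";   // second call returns the cached map
    return 0;
}
```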