 #include <cstdint>
 #include <cstdlib>
 #include <memory>
+#include <mutex>
 #include <openvino/core/any.hpp>
 #include <openvino/core/graph_util.hpp>
 #include <openvino/core/type/float16.hpp>
@@ -96,6 +97,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }

+    static std::mutex cache_mutex;
     static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
@@ -109,89 +111,93 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     int64_t conversion_end_time;
     int64_t compile_end_time;

-    auto it = infer_request_cache.find(cgraph);
-    if (it != infer_request_cache.end()) {
-        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
-        ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
-        decoder_end_time = ggml_time_us();
-
-        // For NPU, on the first call to the kvcache model, pop the compiled kvcache model from the cache
-        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
-            infer_request_cache[cgraph] =
-                std::make_shared<ov::InferRequest>(compiled_model_cache[cgraph].create_infer_request());
-            compiled_model_cache.erase(cgraph);
-        }
-        infer_request = *infer_request_cache[cgraph];
-
-        conversion_end_time = ggml_time_us();
-        compile_end_time = conversion_end_time;
-    } else {
-        std::shared_ptr<ov::Model> model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+    {
+        std::lock_guard<std::mutex> lock(cache_mutex);

-        if (is_static) {
-            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
-            auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
+        auto it = infer_request_cache.find(cgraph);
+        if (it != infer_request_cache.end()) {
+            std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
+            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
             decoder_end_time = ggml_time_us();

-            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
-            auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
-
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
-            ggml_decoder_kvcache->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            auto compiled_model = core.compile_model(model, device, config);
-            auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-            compile_end_time = ggml_time_us();
-
-            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
-            infer_request = *infer_request_cache[cgraph];
-            compiled_model_cache[cgraph] = compiled_model_kvcache;
-
-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
-                ov::serialize(model_kvcache, timestamped_filename);
+            // For NPU, on the first call to the kvcache model, pop the compiled kvcache model from the cache
+            if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+                infer_request_cache[cgraph] =
+                    std::make_shared<ov::InferRequest>(compiled_model_cache[cgraph].create_infer_request());
+                compiled_model_cache.erase(cgraph);
             }
-        } else {
-            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
-            decoder_end_time = ggml_time_us();
-
-            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
-            model = ov::frontend::ggml::FrontEnd::convert(input_model);
-            ggml_decoder->clear_model_weights();
-            conversion_end_time = ggml_time_us();
-
-            auto compiled_model = core.compile_model(model, device, config);
-            compile_end_time = ggml_time_us();
-            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
             infer_request = *infer_request_cache[cgraph];

-            if (getenv("GGML_OPENVINO_DUMP_IR")) {
-                char timestamped_filename[64];
-                auto timestamp = (long long) ggml_time_us();
-                snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
-                ov::serialize(model, timestamped_filename);
+            conversion_end_time = ggml_time_us();
+            compile_end_time = conversion_end_time;
+        } else {
+            std::shared_ptr<ov::Model> model;
+            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+
+            if (is_static) {
+                ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+                auto ggml_decoder_kvcache = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, false);
+                decoder_end_time = ggml_time_us();
+
+                auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+                auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
+
+                model = ov::frontend::ggml::FrontEnd::convert(input_model);
+                ggml_decoder->clear_model_weights();
+                auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
+                ggml_decoder_kvcache->clear_model_weights();
+                conversion_end_time = ggml_time_us();
+
+                auto compiled_model = core.compile_model(model, device, config);
+                auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+                compiled_model_cache[cgraph] = compiled_model_kvcache;
+                compile_end_time = ggml_time_us();
+
+                infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+                infer_request = *infer_request_cache[cgraph];
+                compiled_model_cache[cgraph] = compiled_model_kvcache;
+
+                if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                    char timestamped_filename[64];
+                    auto timestamp = (long long) ggml_time_us();
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
+                    ov::serialize(model, timestamped_filename);
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
+                    ov::serialize(model_kvcache, timestamped_filename);
+                }
+            } else {
+                ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
+                decoder_end_time = ggml_time_us();
+
+                auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
+                model = ov::frontend::ggml::FrontEnd::convert(input_model);
+                ggml_decoder->clear_model_weights();
+                conversion_end_time = ggml_time_us();
+
+                auto compiled_model = core.compile_model(model, device, config);
+                compile_end_time = ggml_time_us();
+                infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+                infer_request = *infer_request_cache[cgraph];
+
+                if (getenv("GGML_OPENVINO_DUMP_IR")) {
+                    char timestamped_filename[64];
+                    auto timestamp = (long long) ggml_time_us();
+                    snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+                    ov::serialize(model, timestamped_filename);
+                }
             }
-        }

-        std::vector<std::string> ov_input_names;
-        std::vector<std::string> ov_output_names;
-        for (const auto & ov_param : model->get_parameters()) {
-            ov_input_names.push_back(ov_param->get_friendly_name());
-        }
-        for (const auto & ov_output : model->get_results()) {
-            ov_output_names.push_back(ov_output->get_friendly_name());
+            std::vector<std::string> ov_input_names;
+            std::vector<std::string> ov_output_names;
+            for (const auto & ov_param : model->get_parameters()) {
+                ov_input_names.push_back(ov_param->get_friendly_name());
+            }
+            for (const auto & ov_output : model->get_results()) {
+                ov_output_names.push_back(ov_output->get_friendly_name());
+            }
+            ov_input_names_cache[cgraph] = ov_input_names;
+            ov_output_names_cache[cgraph] = ov_output_names;
         }
-        ov_input_names_cache[cgraph] = ov_input_names;
-        ov_output_names_cache[cgraph] = ov_output_names;
     }

     auto ov_input_names = ov_input_names_cache[cgraph];
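
For context, this is the pattern the hunk introduces, reduced to a minimal sketch: the function-local static caches are looked up and populated only while a static std::mutex is held, so concurrent compute calls cannot race on the unordered_maps. Names such as Graph, CompiledThing, and get_or_compile below are illustrative placeholders, not part of the backend code.

// Minimal sketch of the mutex-guarded static-cache pattern (illustrative types).
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct Graph;           // stand-in for ggml_cgraph
struct CompiledThing {  // stand-in for a compiled model / infer request
    std::string name;
};

std::shared_ptr<CompiledThing> get_or_compile(Graph * graph) {
    static std::mutex cache_mutex;
    static std::unordered_map<Graph *, std::shared_ptr<CompiledThing>> cache;

    std::lock_guard<std::mutex> lock(cache_mutex);      // held for lookup and insert
    auto it = cache.find(graph);
    if (it != cache.end()) {
        return it->second;                               // cache hit: reuse
    }
    auto compiled = std::make_shared<CompiledThing>();   // expensive "compile" step
    cache[graph] = compiled;                             // publish under the same lock
    return compiled;
}

In the actual change the lock scope also covers model conversion and compilation, so first-time compiles of the same graph are serialized across threads rather than duplicated.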