 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <iomanip>
 #include <iostream>
 #include <memory>
@@ -66,20 +67,24 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     static ov::Core core;
 
-    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device.empty()) {
-        const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
-        const auto available_devices = core.get_available_devices();
-        for (const auto & dev : preferred_device) {
-            if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
-                device = dev;
-                break;
-            }
-        }
+    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
+    static const auto available_devices = core.get_available_devices();
+    if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) {
+        GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str());
+        device = "CPU";
     }
 
     bool is_static = device == "NPU" ? true : false;
+
     ov::AnyMap config;
+    if (device == "GPU") {
+        auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
+        if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
+            config = {
+                {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
+            };
+        }
+    }
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
         std::string filename = "cgraph.txt";
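For reference, the behavior introduced by the hunk above is: `GGML_OPENVINO_DEVICE` selects the device (defaulting to `CPU`), and any device the runtime does not report as available falls back to `CPU` with a warning. The snippet below is only an illustrative, standalone sketch of that same selection rule, not code from this patch; it assumes an installed OpenVINO runtime and uses a plain `main` instead of the backend entry point.

```cpp
// Illustrative sketch of the device-selection rule shown in the hunk above; not patch code.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

int main() {
    ov::Core core;

    // Environment-variable override with a CPU default.
    const char * env = std::getenv("GGML_OPENVINO_DEVICE");
    std::string device = env ? env : "CPU";

    // Fall back to CPU when the requested device is not reported by the runtime.
    const std::vector<std::string> available = core.get_available_devices();
    if (std::find(available.begin(), available.end(), device) == available.end()) {
        std::cerr << "device " << device << " is not available, falling back to CPU\n";
        device = "CPU";
    }

    std::cout << "selected device: " << device << std::endl;
    return 0;
}
```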
@@ -101,10 +106,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> infer_request_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_input_names_cache;
     static std::unordered_map<struct ggml_cgraph *, std::vector<std::string>> ov_output_names_cache;
-    static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors(cgraph);
     // For NPU
     static std::unordered_map<struct ggml_cgraph *, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
 
+    auto kv_tensors = get_kv_tensors(cgraph);
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::InferRequest> infer_request;
 
@@ -183,13 +188,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
             ov::serialize(model, timestamped_filename);
         }
 
-        auto * disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
-        if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
-            config = {
-                {"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
-            };
-        }
-
         auto compiled_model = core.compile_model(model, device, config);
         compile_end_time = ggml_time_us();
         infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
@@ -224,6 +222,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         }
     }
 
+    auto state_update_end_time = ggml_time_us();
+
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
     for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -238,6 +238,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto input_end_time = ggml_time_us();
 
     infer_request->infer();
+
     auto infer_end_time = ggml_time_us();
 
     auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
@@ -254,11 +255,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     auto end_time = ggml_time_us();
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
-        GGML_LOG_INFO("GGML OpenVINO Backend: \n");
+        GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
@@ -529,14 +531,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
     return *(int32_t *) inp_pos->data == 0;
 }
 
-std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph * cgraph) {
-    std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
+std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph * cgraph) {
+    static std::unordered_map<struct ggml_cgraph *, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
+
+    auto it = kv_tensors_cache.find(cgraph);
+    if (it != kv_tensors_cache.end()) {
+        return it->second;
+    }
+
+    std::unordered_map<std::string, ggml_tensor*> kv_tensors;
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         auto * op = cgraph->nodes[i];
         if (op->op == GGML_OP_SET_ROWS) {
             assert(std::string(op->src[2]->name).find("cache_") == 0);
-            kv_tensors.emplace_back(op->src[2]->name, op->src[2]);
+            kv_tensors[std::string(op->src[2]->name)] = op->src[2];
         }
     }
+    kv_tensors_cache[cgraph] = kv_tensors;
     return kv_tensors;
 }
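For readers skimming the last hunk: `get_kv_tensors` now memoizes its result per `ggml_cgraph` pointer, so the node scan for `GGML_OP_SET_ROWS` runs once per graph and later calls return the cached map. The snippet below is a self-contained illustration of that caching shape only; the `graph` type, `collect` function, and the placeholder entries are invented for the example and are not symbols from this codebase.

```cpp
// Standalone illustration of the per-graph memoization pattern used by get_kv_tensors.
// "graph" and "collect" are placeholders for this sketch, not project symbols.
#include <iostream>
#include <string>
#include <unordered_map>

struct graph { int id; };  // stand-in for struct ggml_cgraph

static std::unordered_map<std::string, int> collect(graph * g) {
    // One cache entry per graph pointer, populated on first use.
    static std::unordered_map<graph *, std::unordered_map<std::string, int>> cache;

    auto it = cache.find(g);
    if (it != cache.end()) {
        return it->second;  // cache hit: skip the scan entirely
    }

    // Placeholder for the real work (walking the graph nodes).
    std::unordered_map<std::string, int> result;
    result["cache_k_l0"] = 0;
    result["cache_v_l0"] = 1;

    cache[g] = result;
    return result;
}

int main() {
    graph g{0};
    collect(&g);                                              // first call scans and caches
    std::cout << collect(&g).size() << " cached entries\n";   // second call returns the cached map
    return 0;
}
```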