Skip to content

Commit 5ea2158

Browse files
committed
Minor updates
1 parent 9fd135b commit 5ea2158

File tree

3 files changed

+36
-26
lines changed

3 files changed

+36
-26
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
153153
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
154154
static std::set<std::string> debug_output_names = {};
155155
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
156-
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 ||
157-
debug_output_names.count(node_name)) {
156+
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
157+
node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) {
158158
if (node->op == GGML_OP_SET_ROWS) {
159159
assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0);
160160
if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) {

ggml/src/ggml-openvino/utils.cpp

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <cstddef>
77
#include <cstdint>
88
#include <cstdlib>
9+
#include <cstring>
910
#include <iomanip>
1011
#include <iostream>
1112
#include <memory>
@@ -66,20 +67,24 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
6667
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
6768
static ov::Core core;
6869

69-
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
70-
if (device.empty()) {
71-
const std::vector<std::string> preferred_device = { "GPU", "CPU", "NPU" };
72-
const auto available_devices = core.get_available_devices();
73-
for (const auto& dev : preferred_device) {
74-
if (std::find(available_devices.begin(), available_devices.end(), dev) != available_devices.end()) {
75-
device = dev;
76-
break;
77-
}
78-
}
70+
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
71+
static const auto available_devices = core.get_available_devices();
72+
if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) {
73+
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str());
74+
device = "CPU";
7975
}
8076

8177
bool is_static = device == "NPU" ? true : false;
78+
8279
ov::AnyMap config;
80+
if (device == "GPU") {
81+
auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
82+
if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
83+
config = {
84+
{"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
85+
};
86+
}
87+
}
8388

8489
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
8590
std::string filename = "cgraph.txt";
@@ -101,10 +106,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
101106
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
102107
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
103108
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
104-
static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors(cgraph);
105109
// For NPU
106110
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
107111

112+
auto kv_tensors = get_kv_tensors(cgraph);
108113
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
109114
std::shared_ptr<ov::InferRequest> infer_request;
110115

@@ -183,13 +188,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
183188
ov::serialize(model, timestamped_filename);
184189
}
185190

186-
auto* disable_sdpa_optimization = getenv("GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION");
187-
if (disable_sdpa_optimization && std::string(disable_sdpa_optimization) != "0") {
188-
config = {
189-
{"GPU_ENABLE_SDPA_OPTIMIZATION", "0"}
190-
};
191-
}
192-
193191
auto compiled_model = core.compile_model(model, device, config);
194192
compile_end_time = ggml_time_us();
195193
infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
@@ -224,6 +222,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
224222
}
225223
}
226224

225+
auto state_update_end_time = ggml_time_us();
226+
227227
auto ov_input_names = ov_input_names_cache[cgraph];
228228
auto ov_output_names = ov_output_names_cache[cgraph];
229229
for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -238,6 +238,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
238238
auto input_end_time = ggml_time_us();
239239

240240
infer_request->infer();
241+
241242
auto infer_end_time = ggml_time_us();
242243

243244
auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
@@ -254,11 +255,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
254255
auto end_time = ggml_time_us();
255256

256257
if (getenv("GGML_OPENVINO_PROFILING")) {
257-
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
258+
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
258259
GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
259260
GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
260261
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
261-
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
262+
GGML_LOG_INFO(" - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
263+
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
262264
GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
263265
GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
264266
}
@@ -529,14 +531,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
529531
return *(int32_t*) inp_pos->data == 0;
530532
}
531533

532-
std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph) {
533-
std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
534+
std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph) {
535+
static std::unordered_map<struct ggml_cgraph*, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
536+
537+
auto it = kv_tensors_cache.find(cgraph);
538+
if (it != kv_tensors_cache.end()) {
539+
return it->second;
540+
}
541+
542+
std::unordered_map<std::string, ggml_tensor*> kv_tensors;
534543
for (int i = 0; i < cgraph->n_nodes; ++i) {
535544
auto* op = cgraph->nodes[i];
536545
if (op->op == GGML_OP_SET_ROWS) {
537546
assert(std::string(op->src[2]->name).find("cache_") == 0);
538-
kv_tensors.emplace_back(op->src[2]->name, op->src[2]);
547+
kv_tensors[std::string(op->src[2]->name)] = op->src[2];
539548
}
540549
}
550+
kv_tensors_cache[cgraph] = kv_tensors;
541551
return kv_tensors;
542552
}

ggml/src/ggml-openvino/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ const ggml_tensor* get_inp_pos_tensor(struct ggml_cgraph* cgraph);
4242

4343
bool get_is_first_token(const ggml_tensor* inp_pos);
4444

45-
std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph);
45+
std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph);
4646

4747
ov::AnyMap get_npu_prefill_config();
4848
ov::AnyMap get_npu_generate_config();

0 commit comments

Comments
 (0)