Skip to content

Commit 0dee07f

Browse files
committed
Minor updates
1 parent 9fd135b commit 0dee07f

File tree

3 files changed

+22
-9
lines changed

3 files changed

+22
-9
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -153,8 +153,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
153153
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
154154
static std::set<std::string> debug_output_names = {};
155155
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
156-
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name.find("result") == 0 ||
157-
debug_output_names.count(node_name)) {
156+
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
157+
node_name.find("output") != std::string::npos || debug_output_names.count(node_name)) {
158158
if (node->op == GGML_OP_SET_ROWS) {
159159
assert(node_name.find("cache_k") == 0 || node_name.find("cache_v") == 0);
160160
if (auto it = std::find(m_kv_names.begin(), m_kv_names.end(), node_name); it == m_kv_names.end()) {

ggml/src/ggml-openvino/utils.cpp

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <cstddef>
77
#include <cstdint>
88
#include <cstdlib>
9+
#include <cstring>
910
#include <iomanip>
1011
#include <iostream>
1112
#include <memory>
@@ -101,10 +102,10 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
101102
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
102103
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
103104
static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
104-
static std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors = get_kv_tensors(cgraph);
105105
// For NPU
106106
static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
107107

108+
auto kv_tensors = get_kv_tensors(cgraph);
108109
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
109110
std::shared_ptr<ov::InferRequest> infer_request;
110111

@@ -224,6 +225,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
224225
}
225226
}
226227

228+
auto state_update_end_time = ggml_time_us();
229+
227230
auto ov_input_names = ov_input_names_cache[cgraph];
228231
auto ov_output_names = ov_output_names_cache[cgraph];
229232
for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -238,6 +241,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
238241
auto input_end_time = ggml_time_us();
239242

240243
infer_request->infer();
244+
241245
auto infer_end_time = ggml_time_us();
242246

243247
auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
@@ -254,11 +258,12 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
254258
auto end_time = ggml_time_us();
255259

256260
if (getenv("GGML_OPENVINO_PROFILING")) {
257-
GGML_LOG_INFO("GGML OpenVINO Backend: \n");
261+
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
258262
GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
259263
GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
260264
GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
261-
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
265+
GGML_LOG_INFO(" - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
266+
GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
262267
GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
263268
GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
264269
}
@@ -529,14 +534,22 @@ bool get_is_first_token(const ggml_tensor* inp_pos) {
529534
return *(int32_t*) inp_pos->data == 0;
530535
}
531536

532-
std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph) {
533-
std::vector<std::pair<std::string, ggml_tensor*>> kv_tensors;
537+
std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph) {
538+
static std::unordered_map<struct ggml_cgraph*, std::unordered_map<std::string, ggml_tensor*>> kv_tensors_cache;
539+
540+
auto it = kv_tensors_cache.find(cgraph);
541+
if (it != kv_tensors_cache.end()) {
542+
return it->second;
543+
}
544+
545+
std::unordered_map<std::string, ggml_tensor*> kv_tensors;
534546
for (int i = 0; i < cgraph->n_nodes; ++i) {
535547
auto* op = cgraph->nodes[i];
536548
if (op->op == GGML_OP_SET_ROWS) {
537549
assert(std::string(op->src[2]->name).find("cache_") == 0);
538-
kv_tensors.emplace_back(op->src[2]->name, op->src[2]);
550+
kv_tensors[std::string(op->src[2]->name)] = op->src[2];
539551
}
540552
}
553+
kv_tensors_cache[cgraph] = kv_tensors;
541554
return kv_tensors;
542555
}

ggml/src/ggml-openvino/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ const ggml_tensor* get_inp_pos_tensor(struct ggml_cgraph* cgraph);
4242

4343
bool get_is_first_token(const ggml_tensor* inp_pos);
4444

45-
std::vector<std::pair<std::string, ggml_tensor*>> get_kv_tensors(struct ggml_cgraph* cgraph);
45+
std::unordered_map<std::string, ggml_tensor*> get_kv_tensors(struct ggml_cgraph* cgraph);
4646

4747
ov::AnyMap get_npu_prefill_config();
4848
ov::AnyMap get_npu_generate_config();

0 commit comments

Comments
 (0)