Commit 77d2195

committed
stateless
1 parent 1b55304 commit 77d2195


5 files changed, +51 -110 lines changed


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 11 additions & 8 deletions
@@ -285,14 +285,17 @@ void GgmlOvDecoder::set_llm_params() {
             } else {
                 m_context_size = cache_k->ne[1];
             }
-        } else if (node->op == GGML_OP_ROPE &&
-                   (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) {
-            m_head_size = node->ne[0];
-            m_num_heads = node->ne[1];
-            m_rope_params = node->op_params;
-        } else if (node->op == GGML_OP_ROPE &&
-                   (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0)) {
-            m_num_heads_kv = node->ne[1];
+        } else if (node->op == GGML_OP_ROPE) {
+            if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
+                m_head_size = node->ne[0];
+                m_num_heads = node->ne[1];
+                m_rope_params = node->op_params;
+                auto * inp_pos = node->src[1];
+                m_input_len = inp_pos->ne[0];
+                m_past_kv_len = *(int32_t *) inp_pos->data;
+            } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
+                m_num_heads_kv = node->ne[1];
+            }
         }
     }
 }
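
Note (added for clarity, not part of the commit): the new GGML_OP_ROPE branch derives the KV window directly from the RoPE node's position input. In llama.cpp graphs, src[1] of the RoPE op is the inp_pos tensor holding the absolute position of each token in the batch, so its first element equals the number of tokens already in the cache and ne[0] is the number of tokens processed by this graph. A minimal sketch of that arithmetic:

    #include <cstdint>
    #include "ggml.h"

    // Hedged sketch: read the KV window from a ggml inp_pos tensor.
    // For a decode step after 7 cached tokens inp_pos holds {7}; for a first
    // prefill of 5 tokens it holds {0, 1, 2, 3, 4}.
    static void read_kv_window(const ggml_tensor * inp_pos, int & past_kv_len, int & input_len) {
        input_len   = (int) inp_pos->ne[0];              // tokens processed by this graph
        past_kv_len = *(const int32_t *) inp_pos->data;  // first position == tokens already cached
        // rows [past_kv_len, past_kv_len + input_len) of cache_k/cache_v belong to this call
    }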

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 8 additions & 2 deletions
@@ -81,9 +81,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const override;
 
-    const ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
+    ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
 
-    const ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
+    ggml_tensor * get_output_ggml_tensor(const std::string & name) const { return m_outputs.at(name); }
 
     virtual int get_op_case() const override { return m_op_case; }
 
@@ -119,6 +119,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual int get_head_size() const override { return m_head_size; }
 
+    int get_past_kv_len() const { return m_past_kv_len; }
+
+    int get_input_len() const { return m_input_len; }
+
     virtual int32_t * get_rope_params() const override { return m_rope_params; }
 
     virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
@@ -176,6 +180,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
+    int m_past_kv_len;
+    int m_input_len;
     int32_t * m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static = false;

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 5 additions & 5 deletions
@@ -243,11 +243,11 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
     manager.set_per_pass_validation(true);
     manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
 
-    if (!ggml_model_decoder->is_static()) {
-        const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
-        const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
-        manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
-    }
+    // if (!ggml_model_decoder->is_static()) {
+    //     const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
+    //     const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
+    //     manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+    // }
 
     // if (ggml_model_decoder->is_static()) {
     manager.register_pass<pass::EliminateZeroPoints>();
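
Context note (mine, not the author's): ov::pass::MakeStateful folds matching Parameter/Result pairs into internal ReadValue/Assign state variables. With the pass commented out, the KV cache stays as ordinary model inputs and outputs, so every infer() call must be handed the past cache explicitly and the freshly produced rows copied back afterwards, which is what the utils.cpp changes below do. A rough sketch of that calling pattern, with the tensor name purely illustrative:

    #include <cstddef>
    #include <openvino/openvino.hpp>

    // Hedged sketch: drive a stateless KV-cache model by hand.
    // "cache_k_l0" is an assumed input name, not taken from this commit.
    void run_stateless_step(ov::InferRequest & req, void * ggml_cache_k,
                            size_t past_kv_len, size_t n_heads_kv, size_t head_size) {
        // feed only the already-filled rows of ggml's cache buffer as a plain input
        ov::Tensor past_k(ov::element::f16, ov::Shape{1, past_kv_len, n_heads_kv, head_size}, ggml_cache_k);
        req.set_tensor("cache_k_l0", past_k);
        req.infer();
        // afterwards, copy only the rows produced for the new tokens back into
        // ggml's buffer (see the offset/size arithmetic in the utils.cpp hunks below)
    }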

ggml/src/ggml-openvino/utils.cpp

Lines changed: 25 additions & 89 deletions
@@ -27,6 +27,7 @@
 #include <openvino/runtime/intel_npu/properties.hpp>
 #include <openvino/runtime/properties.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <string>
 #include <unordered_map>
 #include <vector>

@@ -35,7 +36,8 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
     if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
-        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
+        input_shape = {1, (size_t) ggml_decoder->get_past_kv_len(), (size_t) ggml_decoder->get_num_heads_kv(),
+                       (size_t) ggml_decoder->get_head_size()};
     } else if (ggml_tensor->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor->view_src).to_shape();
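
Worked example (numbers mine): with get_past_kv_len() = 7, get_num_heads_kv() = 8 and get_head_size() = 64, a cache_k/cache_v input now gets the shape {1, 7, 8, 64}, i.e. only the rows that are actually filled, rather than the full pre-allocated context returned by get_graph_input_shape(). The byte count read from ggml's buffer is then just the product of those dimensions times the element size:

    #include <cstddef>
    #include <openvino/core/shape.hpp>

    // Hedged sketch of the size math behind the trimmed cache input shape.
    size_t past_kv_bytes(size_t past_kv_len, size_t n_heads_kv, size_t head_size, size_t elem_size) {
        ov::Shape shape{1, past_kv_len, n_heads_kv, head_size};  // e.g. {1, 7, 8, 64}
        return ov::shape_size(shape) * elem_size;                // 7 * 8 * 64 * 2 = 7168 bytes for f16
    }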
@@ -46,15 +48,15 @@
     return input_tensor;
 }
 
-std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
-    std::map<std::string, void *> output_tensors;
+std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder) {
+    std::map<std::string, ggml_tensor *> output_tensors;
 
     auto output_names = ggml_decoder->get_model_output_names();
     for (size_t inp = 0; inp < output_names.size(); ++inp) {
         auto name = output_names[inp];
-        const auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
-        auto * output_data = tensor->view_src ? tensor->view_src->data : tensor->data;
-        output_tensors[name] = output_data;
+        auto * tensor = ggml_decoder->get_output_ggml_tensor(name);
+        tensor = tensor->view_src ? tensor->view_src : tensor;
+        output_tensors[name] = tensor;
     }
     return output_tensors;
 }
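
Why the map now carries ggml_tensor * instead of a raw void * (my reading of the change): the output loop further down needs both the destination address and the element type to compute byte offsets into the cache, and a single ggml_tensor pointer provides both. A small self-contained sketch of that usage:

    #include <cstddef>
    #include <cstring>
    #include <map>
    #include <string>
    #include "ggml.h"

    // Hedged sketch: with the tensor at hand, the caller can derive both the
    // destination pointer and the per-element size for any copy it performs.
    void copy_whole_output(const std::map<std::string, ggml_tensor *> & outputs,
                           const std::string & name, const void * src, size_t n_elements) {
        ggml_tensor * t = outputs.at(name);
        std::memcpy(t->data, src, n_elements * ggml_type_size(t->type));
    }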
@@ -110,7 +112,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     // For NPU
     static std::unordered_map<ggml_cgraph *, std::shared_ptr<ov::InferRequest>> decode_infer_request_cache;
 
-    auto kv_tensors = get_kv_tensors(cgraph);
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
     std::shared_ptr<ov::InferRequest> infer_request;

@@ -208,45 +209,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
         }
     }
 
-    if (!is_static) {
-        auto states = infer_request->query_state();
-        int32_t kv_len = *(int32_t *) inp_pos->data;
-        int32_t kv_len_in_state = states[0].get_state().get_shape()[1];
-
-        // outdated if:
-        // 1. kv_len != kv_len_in_state
-        // 2. last row has different values
-        bool state_outdated = kv_len != kv_len_in_state;
-        if (!state_outdated && kv_len > 0) {
-            auto state_tensor = states[0].get_state();
-            auto state_name = states[0].get_name();
-            state_name = state_name.substr(0, state_name.size() / 2);
-            auto state_shape = state_tensor.get_shape();
-            auto * ggml_tensor = kv_tensors[state_name];
-            auto offset = (kv_len - 1) * state_shape[2] * state_shape[3] * ggml_type_size(ggml_tensor->type);
-            auto size = state_shape[2] * state_shape[3] * ggml_type_size(ggml_tensor->type);
-            state_outdated =
-                std::memcmp((char *) ggml_tensor->data + offset, (char *) state_tensor.data() + offset, size) != 0;
-        }
-
-        if (state_outdated) {
-            GGML_LOG_DEBUG(
-                "GGML OpenVINO Backend: updating kv cache states from ggml tensors (kv_len: %d, kv_len_in_state: %d)\n",
-                kv_len, kv_len_in_state);
-            for (auto & state : states) {
-                auto state_name = state.get_name();
-                state_name = state_name.substr(0, state_name.size() / 2);
-                auto * ggml_tensor = kv_tensors[state_name];
-                auto state_shape = state.get_state().get_shape();
-                state_shape[1] = kv_len;
-                ov::Tensor state_tensor(state.get_state().get_element_type(), state_shape, ggml_tensor->data);
-                state.set_state(state_tensor);
-            }
-        }
-    }
-
-    auto state_update_end_time = ggml_time_us();
-
     auto ov_input_names = ov_input_names_cache[cgraph];
     auto ov_output_names = ov_output_names_cache[cgraph];
     for (size_t i = 0; i < ov_input_names.size(); i++) {
@@ -264,39 +226,35 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
 
     auto infer_end_time = ggml_time_us();
 
-    auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
+    auto ggml_tensors = get_ggml_graph_output_dst(ggml_decoder);
+    auto kv_size = ggml_decoder->get_input_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
+    auto kv_offset = ggml_decoder->get_past_kv_len() * ggml_decoder->get_num_heads_kv() * ggml_decoder->get_head_size();
     for (size_t i = 0; i < ov_output_names.size(); i++) {
-        auto & result_name = ov_output_names[i];
         const auto output_tensor = infer_request->get_output_tensor(i);
+        auto & result_name = ov_output_names[i];
+        auto * ggml_tensor = ggml_tensors[result_name];
 
-        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
+        if (result_name.find("cache") == std::string::npos) {
+            std::memcpy(ggml_tensors[result_name]->data, output_tensor.data(), output_tensor.get_byte_size());
+        } else {
+            auto offset = kv_offset * ggml_type_size(ggml_tensor->type);
+            auto size = kv_size * ggml_type_size(ggml_tensor->type);
+            std::memcpy((char *) ggml_tensor->data + offset, (char *) output_tensor.data() + offset, size);
+        }
 
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs);
+            print_output_tensor_info(result_name, output_tensor, ggml_tensor->data);
         }
     }
 
-    for (auto & state : infer_request->query_state()) {
-        auto state_name = state.get_name();
-        state_name = state_name.substr(0, state_name.size() / 2);
-        auto state_tensor = state.get_state();
-        auto state_shape = state_tensor.get_shape();
-        auto * ggml_tensor = kv_tensors[state_name];
-        auto size = state_shape[2] * state_shape[3] * inp_pos->ne[0] * ggml_type_size(ggml_tensor->type);
-        auto offset =
-            state_shape[2] * state_shape[3] * (*(int32_t *) inp_pos->data) * ggml_type_size(ggml_tensor->type);
-        std::memcpy((char *) ggml_tensor->data + offset, (char *) state_tensor.data() + offset, size);
-    }
-
     auto end_time = ggml_time_us();
 
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
         GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
         GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph State Update Time: %ld ms \n", (state_update_end_time - compile_end_time) / 1000);
-        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - state_update_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
         GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
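
Worked example (numbers mine) for the partial cache copy above: for a decode step with past_kv_len = 7, input_len = 1, num_heads_kv = 8, head_size = 64 and an f16 cache, kv_offset = 7 * 8 * 64 = 3584 elements and kv_size = 1 * 8 * 64 = 512 elements, so only the 1024 bytes starting at byte offset 7168 are written back; the earlier rows already live in ggml's buffer and are never touched.

    #include <cstddef>
    #include <cstdio>

    // Hedged sketch of the offset/size arithmetic used for the cache_* outputs.
    int main() {
        size_t past_kv_len = 7, input_len = 1, n_heads_kv = 8, head_size = 64;
        size_t elem_size = 2;  // f16
        size_t kv_offset = past_kv_len * n_heads_kv * head_size;  // 3584 elements
        size_t kv_size   = input_len * n_heads_kv * head_size;    // 512 elements
        std::printf("copy %zu bytes at byte offset %zu\n",
                    kv_size * elem_size, kv_offset * elem_size);   // 1024 bytes at 7168
        return 0;
    }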
@@ -487,11 +445,9 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor
     }
 }
 
-void print_output_tensor_info(const std::string & name,
-                              const ov::Tensor & tensor,
-                              std::map<std::string, void *> & output_dst) {
-    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
-              << ", Address: " << output_dst[name] << std::endl;
+void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst) {
+    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape() << ", Address: " << output_dst
+              << std::endl;
 
     auto print_float_stats = [](const std::string & type_name, size_t size, auto get_value) {
         if (size == 0) {
@@ -567,23 +523,3 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
 bool get_is_first_token(const ggml_tensor * inp_pos) {
     return *(int32_t *) inp_pos->data == 0;
 }
-
-std::unordered_map<std::string, ggml_tensor *> get_kv_tensors(struct ggml_cgraph * cgraph) {
-    static std::unordered_map<struct ggml_cgraph *, std::unordered_map<std::string, ggml_tensor *>> kv_tensors_cache;
-
-    auto it = kv_tensors_cache.find(cgraph);
-    if (it != kv_tensors_cache.end()) {
-        return it->second;
-    }
-
-    std::unordered_map<std::string, ggml_tensor *> kv_tensors;
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        auto * op = cgraph->nodes[i];
-        if (op->op == GGML_OP_SET_ROWS) {
-            assert(std::string(op->src[2]->name).find("cache_") == 0);
-            kv_tensors[std::string(op->src[2]->name)] = op->src[2];
-        }
-    }
-    kv_tensors_cache[cgraph] = kv_tensors;
-    return kv_tensors;
-}

ggml/src/ggml-openvino/utils.h

Lines changed: 2 additions & 6 deletions
@@ -11,15 +11,13 @@ std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph * cgraph, boo
 
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name);
 
-std::map<std::string, void *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
+std::map<std::string, ggml_tensor *> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
 
 size_t checksum(const void * data, size_t size);
 
 void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
 
-void print_output_tensor_info(const std::string & name,
-                              const ov::Tensor & tensor,
-                              std::map<std::string, void *> & output_dst);
+void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst);
 
 template <typename T>
 std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
@@ -42,8 +40,6 @@ const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
 
 bool get_is_first_token(const ggml_tensor * inp_pos);
 
-std::unordered_map<std::string, ggml_tensor *> get_kv_tensors(struct ggml_cgraph * cgraph);
-
 ov::AnyMap get_npu_prefill_config();
 ov::AnyMap get_npu_generate_config();
4945
