Skip to content

Commit 0df8b23

Browse files
committed
Remove CPY
1 parent 8083bc7 commit 0df8b23

File tree

6 files changed

+25
-200
lines changed

6 files changed

+25
-200
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 10 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
9090
// 3. constructing a decoder for the whole graph naively (op test case)
9191
void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
9292
std::string node_name;
93-
if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) {
94-
// CPY updates the input tensor in place. For later ov op that uses the
95-
// input tensor of CPY, we need to make sure they get the updated tensor
96-
// by putting the src tensor name in the tensor_map in
93+
if (node->op == GGML_OP_SET_ROWS) {
94+
// SET_ROWS updates the tensor in place. For later ov ops that use the
95+
// view_src of SET_ROWS, we need to make sure they get the updated tensor
96+
// by putting the view_src name in the tensor_map in
9797
// <openvino>/src/frontends/ggml/src/translate_session.cpp
9898
node_name = std::string(node->view_src->name);
9999
} else {
@@ -183,16 +183,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
183183
}
184184
break;
185185
}
186-
case GGML_OP_CPY: {
187-
if (std::string(node->src[1]->name).find("cache_k") == 0) {
188-
// Write K to cache_k
189-
m_op_case = 1;
190-
} else {
191-
// Write V to cache_v
192-
m_op_case = 2;
193-
}
194-
break;
195-
}
196186
case GGML_OP_SET_ROWS: {
197187
if (std::string(node->name).find("cache_k") == 0) {
198188
m_op_case = 1;
@@ -305,62 +295,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
305295

306296
void GgmlOvDecoder::add_extra_inputs() {
307297
// Extra inputs:
308-
// 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
309-
// llama-perplexity.
310-
// Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See:
311-
// https://github.com/ggml-org/llama.cpp/pull/14285
312-
// 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
298+
// 1. `attention_size`, used in matmuls in the attention block. The shapes of those matmuls are 32-aligned,
313299
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
314300
// Not used for NPU
315-
int64_t past_token_len = -1;
316301
int64_t attention_size = -1;
317-
318-
int64_t token_len = -1;
319-
int64_t past_token_len_from_inp_pos = -1;
320302
for (const auto& node : m_nodes) {
321-
if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
322-
if (node->src[1]->type != GGML_TYPE_I32) {
323-
throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
303+
if (node->op == GGML_OP_SOFT_MAX) {
304+
auto* mask = node->src[1];
305+
if (std::string(mask->name).find("KQ_mask") != 0) {
306+
throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name));
324307
}
325-
token_len = node->src[1]->ne[0];
326-
past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
327-
}
328-
if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
329-
assert(std::string(node->view_src->name).find("cache_k") == 0);
330-
past_token_len =
331-
(int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv);
308+
attention_size = mask->ne[0];
332309
break;
333310
}
334-
if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) {
335-
assert(node->src[1]->type == GGML_TYPE_I64);
336-
past_token_len = *(int64_t*) (node->src[1]->data);
337-
break;
338-
}
339-
}
340-
341-
if (past_token_len == -1) {
342-
throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
343-
}
344-
if (past_token_len != past_token_len_from_inp_pos) {
345-
GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
346-
past_token_len,
347-
past_token_len_from_inp_pos);
348311
}
349312

350313
{
351-
std::string name = "past_token_len";
352-
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
353-
param_node->set_friendly_name(name);
354-
param_node->output(0).get_tensor().set_names({name});
355-
m_model_extra_inputs[name] = param_node;
356-
357-
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
358-
*tensor->data<int64_t>() = past_token_len;
359-
m_model_extra_input_values[name] = tensor;
360-
}
361-
{
362-
int64_t total_token_len = token_len + past_token_len;
363-
attention_size = GGML_PAD(total_token_len, 32);
364314
std::string name = "attention_size";
365315
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
366316
param_node->set_friendly_name(name);
@@ -663,7 +613,6 @@ const std::string& GgmlOvDecoder::get_op_type() const {
663613
{GGML_OP_ADD, "GGML_OP_ADD" },
664614
{GGML_OP_ADD1, "GGML_OP_ADD1" },
665615
{GGML_OP_CONT, "GGML_OP_CONT" },
666-
{GGML_OP_CPY, "GGML_OP_CPY" },
667616
{GGML_OP_DIV, "GGML_OP_DIV" },
668617
{GGML_OP_DUP, "GGML_OP_DUP" },
669618
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -328,10 +328,21 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
328328
static const std::set<ggml_type> supported_types{
329329
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};
330330

331-
static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT,
332-
GGML_OP_VIEW, GGML_OP_CONT, GGML_OP_CPY, GGML_OP_RESHAPE,
333-
GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE,
334-
GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_SOFT_MAX, GGML_OP_SET_ROWS};
331+
static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
332+
GGML_OP_ADD,
333+
GGML_OP_MUL,
334+
GGML_OP_MUL_MAT,
335+
GGML_OP_VIEW,
336+
GGML_OP_CONT,
337+
GGML_OP_RESHAPE,
338+
GGML_OP_PERMUTE,
339+
GGML_OP_TRANSPOSE,
340+
GGML_OP_GET_ROWS,
341+
GGML_OP_ROPE,
342+
GGML_OP_RMS_NORM,
343+
GGML_OP_SCALE,
344+
GGML_OP_SOFT_MAX,
345+
GGML_OP_SET_ROWS};
335346
static const std::set<ggml_unary_op> supported_unary_ops{
336347
GGML_UNARY_OP_SILU,
337348
};

ggml/src/ggml-openvino/openvino/op/cpy.cpp

Lines changed: 0 additions & 73 deletions
This file was deleted.

ggml/src/ggml-openvino/openvino/op_table.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
1919
{"GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add> },
2020
{"GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add> },
2121
{"GGML_OP_CONT", op::translate_cont },
22-
{"GGML_OP_CPY", op::translate_cpy },
2322
{"GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide> },
2423
{"GGML_OP_GET_ROWS", op::translate_get_rows },
2524
{"GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply>},

ggml/src/ggml-openvino/openvino/op_table.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ namespace op {
1212

1313
GGML_OP_CONVERTER(translate_add);
1414
GGML_OP_CONVERTER(translate_cont);
15-
GGML_OP_CONVERTER(translate_cpy);
1615
GGML_OP_CONVERTER(translate_get_rows);
1716
GGML_OP_CONVERTER(translate_mul);
1817
GGML_OP_CONVERTER(translate_mulmat);

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -76,65 +76,6 @@ void add_token_len(TensorMap& tensor_map) {
7676
tensor_map.insert({"token_len", token_len->output(0)});
7777
}
7878

79-
void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
80-
// cache_k layout: [S, N, H] (seq, num_heads, head_size)
81-
// cache_v layout: [N, H, S] (num_heads, head_size, seq)
82-
// When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
83-
auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
84-
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
85-
86-
Output<Node> update_indices_k;
87-
Output<Node> update_indices_v;
88-
89-
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
90-
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
91-
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
92-
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
93-
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
94-
95-
auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
96-
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
97-
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
98-
99-
Output<Node> update_indices = std::make_shared<ov::op::v4::Range>(
100-
past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64);
101-
if (ggml_model_decoder.is_static()) {
102-
update_indices = past_token_len;
103-
}
104-
105-
update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices, one);
106-
update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k");
107-
tensor_map.insert({"update_indices_k", update_indices_k});
108-
109-
auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
110-
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
111-
auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
112-
113-
// 1D tensor of shape [total_head_size], values starting from 0
114-
auto range_row =
115-
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
116-
auto range_row_reshaped =
117-
std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
118-
auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
119-
range_row_reshaped,
120-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
121-
122-
// 1D tensor of shape [token_len], values starting from past_token_len
123-
auto range_col = update_indices;
124-
auto range_col_reshaped =
125-
std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
126-
auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
127-
range_col_reshaped,
128-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
129-
130-
// Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
131-
update_indices_v = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
132-
update_indices_v = std::make_shared<ov::op::v1::Reshape>(
133-
update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
134-
update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v");
135-
tensor_map.insert({"update_indices_v", update_indices_v});
136-
}
137-
13879
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
13980
int32_t* rope_params = ggml_model_decoder.get_rope_params();
14081
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
@@ -156,7 +97,6 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
15697
// Create common patterns
15798
void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
15899
add_token_len(tensor_map);
159-
add_kv_update_indices(tensor_map, ggml_model_decoder);
160100
add_rope_sin_cos(tensor_map, ggml_model_decoder);
161101
}
162102

0 commit comments

Comments
 (0)