@@ -90,7 +90,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
9090// 3. constructing a decoder for the whole graph naively (op test case)
9191void GgmlOvDecoder::set_input_output (ggml_tensor* node, bool naive) {
9292 std::string node_name;
93- if (node->op == GGML_OP_CPY) {
93+ if (node->op == GGML_OP_CPY || node-> op == GGML_OP_SET_ROWS ) {
9494 // CPY updates the input tensor in place. For later ov op that uses the
9595 // input tensor of CPY, we need to make sure they get the updated tensor
9696 // by putting the src tensor name in the tensor_map in
@@ -151,9 +151,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
151151 if (node->buffer ->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
152152 assert (name.find (" cache_k" ) == 0 || name.find (" cache_v" ) == 0 );
153153 }
154- auto it = std::find (m_model_output_names.begin (), m_model_output_names.end (), name);
155- if ( it == m_model_output_names.end ()) {
154+ if ( auto it = std::find (m_model_output_names.begin (), m_model_output_names.end (), name);
155+ it == m_model_output_names.end ()) {
156156 m_model_output_names.push_back (name);
157+ }
158+ if (auto it = std::find (m_kv_names.begin (), m_kv_names.end (), name); it == m_kv_names.end ()) {
157159 m_kv_names.push_back (name);
158160 }
159161 }
@@ -166,6 +168,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
166168 m_op_case = 1 ;
167169 } else if (node->src [0 ]->ne [0 ] * node->src [0 ]->ne [1 ] == node->ne [0 ]) {
168170 m_op_case = 2 ;
171+ } else if (node->src [0 ]->ne [0 ] * node->src [0 ]->ne [1 ] == node->ne [1 ]) {
172+ m_op_case = 3 ;
169173 }
170174 break ;
171175 }
@@ -270,6 +274,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
270274 input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
271275 } else if (name.find (" cache_v" ) == 0 ) {
272276 input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
277+ } else if (get_tensor_used_op (src)->op == GGML_OP_SET_ROWS) {
278+ input_shape = ov::PartialShape{1 , 1 , -1 };
273279 } else if (src->op == GGML_OP_VIEW) {
274280 // This case is added to make test-backend-ops work
275281 input_shape = ov::PartialShape{get_shape (src->view_src )};
@@ -283,6 +289,8 @@ void GgmlOvDecoder::add_extra_inputs() {
283289 // Extra inputs:
284290 // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
285291 // llama-perplexity.
292+ // Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See:
293+ // https://github.com/ggml-org/llama.cpp/pull/14285
286294 // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
287295 // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
288296 // Not used for NPU
@@ -305,6 +313,10 @@ void GgmlOvDecoder::add_extra_inputs() {
305313 (int64_t ) (node->src [1 ]->op_params [0 ] / node->src [1 ]->nb [0 ] / m_head_size / m_num_heads_kv);
306314 break ;
307315 }
316+ if (node->op == GGML_OP_SET_ROWS && std::string (node->name ).find (" cache_k" ) == 0 ) {
317+ assert (node->src [1 ]->type == GGML_TYPE_I64);
318+ past_token_len = *(int64_t *) (node->src [1 ]->data );
319+ }
308320 }
309321
310322 if (past_token_len == -1 ) {
@@ -342,6 +354,18 @@ void GgmlOvDecoder::add_extra_inputs() {
342354 }
343355}
344356
357+ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op (const ggml_tensor* tensor) const {
358+ for (int i = 0 ; i < m_cgraph->n_nodes ; i++) {
359+ const auto * node = m_cgraph->nodes [i];
360+ for (int j = 0 ; j < GGML_MAX_SRC; j++) {
361+ if (node->src [j] == tensor) {
362+ return node;
363+ }
364+ }
365+ }
366+ throw std::runtime_error (" Tensor not found in cgraph" );
367+ }
368+
345369std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names () const {
346370 std::map<std::string, std::string> kv_param_res_names;
347371 for (const auto & name : m_kv_names) {
@@ -618,7 +642,8 @@ const std::string& GgmlOvDecoder::get_op_type() const {
618642 {GGML_OP_SOFT_MAX, " GGML_OP_SOFT_MAX" },
619643 {GGML_OP_SUB, " GGML_OP_SUB" },
620644 {GGML_OP_TRANSPOSE, " GGML_OP_TRANSPOSE" },
621- {GGML_OP_VIEW, " GGML_OP_VIEW" }
645+ {GGML_OP_VIEW, " GGML_OP_VIEW" },
646+ {GGML_OP_SET_ROWS, " GGML_OP_SET_ROWS" },
622647 };
623648 static const std::map<ggml_unary_op, std::string> unary_ops = {
624649 {GGML_UNARY_OP_ABS, " GGML_UNARY_OP_ABS" },
0 commit comments