@@ -90,10 +90,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph) {
 // 3. constructing a decoder for the whole graph naively (op test case)
 void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
     std::string node_name;
-    if (node->op == GGML_OP_CPY || node->op == GGML_OP_SET_ROWS) {
-        // CPY updates the input tensor in place. For later ov op that uses the
-        // input tensor of CPY, we need to make sure they get the updated tensor
-        // by putting the src tensor name in the tensor_map in
+    if (node->op == GGML_OP_SET_ROWS) {
+        // SET_ROWS updates the tensor in place. For later ov ops that use the
+        // view_src of SET_ROWS, we need to make sure they get the updated tensor
+        // by putting the view_src name in the tensor_map in
         // <openvino>/src/frontends/ggml/src/translate_session.cpp
         node_name = std::string(node->view_src->name);
     } else {
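Why the aliasing above matters: SET_ROWS writes through a view into the KV-cache tensor, so any later op that reads `cache_k`/`cache_v` must resolve that name to the SET_ROWS output rather than to the stale graph input. A minimal sketch of the idea, using a hypothetical `register_set_rows_output` helper (the actual mapping lives in `translate_session.cpp`):

```cpp
#include <map>
#include <string>
#include <openvino/openvino.hpp>  // ov::Output, ov::Node
#include "ggml.h"                 // ggml_tensor

using TensorMap = std::map<std::string, ov::Output<ov::Node>>;

// Sketch only: publish a SET_ROWS result under both its own name and its
// view_src name, so later reads of the cache tensor see the updated value.
void register_set_rows_output(TensorMap& tensor_map, const ggml_tensor* node,
                              const ov::Output<ov::Node>& result) {
    tensor_map[node->name] = result;
    if (node->view_src != nullptr) {
        // Alias: a later read of e.g. "cache_k_l0" (name assumed for
        // illustration) now resolves to the updated tensor.
        tensor_map[node->view_src->name] = result;
    }
}
```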
@@ -183,16 +183,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
             }
             break;
         }
-        case GGML_OP_CPY: {
-            if (std::string(node->src[1]->name).find("cache_k") == 0) {
-                // Write K to cache_k
-                m_op_case = 1;
-            } else {
-                // Write V to cache_v
-                m_op_case = 2;
-            }
-            break;
-        }
         case GGML_OP_SET_ROWS: {
             if (std::string(node->name).find("cache_k") == 0) {
                 m_op_case = 1;
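A note on the dispatch above: `m_op_case` distinguishes K-cache writes (case 1) from V-cache writes (case 2), and `std::string(node->name).find("cache_k") == 0` is simply a prefix test on the tensor name. An equivalent standalone check, assuming the usual `cache_k_l<N>` / `cache_v_l<N>` naming:

```cpp
#include <cassert>
#include <string>

// Same result as s.find(prefix) == 0, but rfind(prefix, 0) never scans past
// position 0; C++20 callers could use s.starts_with(prefix) instead.
static bool has_prefix(const std::string& s, const std::string& prefix) {
    return s.rfind(prefix, 0) == 0;
}

int main() {
    assert(has_prefix("cache_k_l0", "cache_k"));   // K-cache write -> m_op_case = 1
    assert(!has_prefix("cache_v_l0", "cache_k"));  // V-cache write -> m_op_case = 2
    return 0;
}
```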
@@ -305,62 +295,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
 
 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
-    //    llama-perplexity.
-    //    Update: SET_ROWS replaces CPY for updating kv cache. The indices creation is not needed anymore. See:
-    //    https://github.com/ggml-org/llama.cpp/pull/14285
-    // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    // 1. `attention_size`, used in the matmuls in the attention block. The shapes of those matmuls are 32-aligned,
     //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
     //    Not used for NPU
-    int64_t past_token_len = -1;
     int64_t attention_size = -1;
-
-    int64_t token_len = -1;
-    int64_t past_token_len_from_inp_pos = -1;
     for (const auto& node : m_nodes) {
-        if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
-            if (node->src[1]->type != GGML_TYPE_I32) {
-                throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
+        if (node->op == GGML_OP_SOFT_MAX) {
+            auto* mask = node->src[1];
+            if (std::string(mask->name).find("KQ_mask") != 0) {
+                throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name));
             }
-            token_len = node->src[1]->ne[0];
-            past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
-        }
-        if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
-            assert(std::string(node->view_src->name).find("cache_k") == 0);
-            past_token_len =
-                (int64_t) (node->src[1]->op_params[0] / node->src[1]->nb[0] / m_head_size / m_num_heads_kv);
+            attention_size = mask->ne[0];
             break;
         }
-        if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) {
-            assert(node->src[1]->type == GGML_TYPE_I64);
-            past_token_len = *(int64_t*) (node->src[1]->data);
-            break;
-        }
-    }
-
-    if (past_token_len == -1) {
-        throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
-    }
-    if (past_token_len != past_token_len_from_inp_pos) {
-        GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
-                       past_token_len,
-                       past_token_len_from_inp_pos);
     }
 
     {
-        std::string name = "past_token_len";
-        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-        param_node->set_friendly_name(name);
-        param_node->output(0).get_tensor().set_names({name});
-        m_model_extra_inputs[name] = param_node;
-
-        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-        *tensor->data<int64_t>() = past_token_len;
-        m_model_extra_input_values[name] = tensor;
-    }
-    {
-        int64_t total_token_len = token_len + past_token_len;
-        attention_size = GGML_PAD(total_token_len, 32);
         std::string name = "attention_size";
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
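With this change `attention_size` is read directly off the `KQ_mask` input: the mask's first dimension already carries the KV length padded per `llama_kv_cache_unified::get_padding`, which is why the removed `GGML_PAD(total_token_len, 32)` computation is no longer needed. For reference, that padding is plain round-up-to-a-multiple arithmetic; a self-contained sketch of what `GGML_PAD(x, 32)` evaluates to:

```cpp
#include <cstdint>

// GGML_PAD(x, n) in ggml.h expands to ((x + n - 1) & ~(n - 1));
// n must be a power of two for the mask trick to work.
constexpr int64_t pad_to(int64_t x, int64_t n) {
    return (x + n - 1) & ~(n - 1);
}

static_assert(pad_to(1, 32) == 32);
static_assert(pad_to(32, 32) == 32);
static_assert(pad_to(33, 32) == 64);
```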
@@ -663,7 +613,6 @@ const std::string& GgmlOvDecoder::get_op_type() const {
         {GGML_OP_ADD, "GGML_OP_ADD"},
         {GGML_OP_ADD1, "GGML_OP_ADD1"},
         {GGML_OP_CONT, "GGML_OP_CONT"},
-        {GGML_OP_CPY, "GGML_OP_CPY"},
         {GGML_OP_DIV, "GGML_OP_DIV"},
         {GGML_OP_DUP, "GGML_OP_DUP"},
         {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
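Dropping the `GGML_OP_CPY` entry means a CPY node that still reaches the decoder no longer maps to a known op-type string. A sketch of how such a table is typically consumed (inferred from the `get_op_type` signature in the hunk header, not the verbatim implementation):

```cpp
#include <map>
#include <string>
#include "ggml.h"  // ggml_op, GGML_OP_* enum values

static const std::string& op_name(ggml_op op) {
    static const std::map<ggml_op, std::string> names = {
        {GGML_OP_ADD, "GGML_OP_ADD"},
        {GGML_OP_ADD1, "GGML_OP_ADD1"},
        {GGML_OP_CONT, "GGML_OP_CONT"},
        // GGML_OP_CPY intentionally absent after this change
    };
    static const std::string unknown;  // empty name signals "unsupported op"
    auto it = names.find(op);
    return it != names.end() ? it->second : unknown;
}
```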