Fix llama-cli (needs to be run with --no-warmup)

wine99 · wine99 · commit d84268ebdfea · 2025-10-11T10:10:21.000+08:00
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -42,15 +42,15 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
         mask_sliced = context.get_input(mask_name);
     } else {
         auto token_len = get_dimensions(q, {2});
+        auto kv_len = get_dimensions(k.get_node_shared_ptr(), {2});
+
         auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
         auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
         auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
         auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-        auto inp_pos = context.get_input("inp_pos");
-        auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
+
+        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
         mask_sliced =
             std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
         mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -132,7 +132,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    add_sliced_mask(tensor_map, ggml_model_decoder);
+    // add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
 

Original file line number	Diff line number	Diff line change
`@@ -132,7 +132,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {`
`132`	`132`	`// Create common patterns`
`133`	`133`	`void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {`
`134`	`134`	`add_token_len(tensor_map);`
`135`		`- add_sliced_mask(tensor_map, ggml_model_decoder);`
	`135`	`+ // add_sliced_mask(tensor_map, ggml_model_decoder);`
`136`	`136`	`add_rope_sin_cos(tensor_map, ggml_model_decoder);`
`137`	`137`	`}`
`138`	`138`