Commit b650ca3

Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size (iSWA models not working)
Parent: f61a50f

6 files changed, +38 −57 lines


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 4 additions & 3 deletions
@@ -73,7 +73,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
         set_input_output(cur_node);
     }

-    add_extra_inputs();
+    // add_extra_inputs();
 }

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
@@ -336,9 +336,10 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co

 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    // 1. `attention_size`, used in FLASH_ATTN, where the shapes of the matmuls are 256-aligned;
     //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    // Not used for NPU
+    // Not used for NPU.
+    // Update: no longer used after the optimization that makes the kv cache dynamic (but that breaks iSWA models).
     int64_t attention_size = -1;
     int64_t attention_size_swa = -1;
     for (const auto& node : m_nodes) {
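The 256-alignment the updated comment refers to comes from llama.cpp's unified kv cache, which pads the usable kv span so kernel shapes change rarely (llama_kv_cache_unified::get_padding yields 256 with flash attention, hence the 32 → 256 correction above). A minimal sketch of that rounding rule, using a hypothetical standalone helper rather than the real implementation:

    #include <cstdint>

    // Round n up to the next multiple of `padding`, mirroring how
    // llama_kv_cache_unified::get_n_kv pads the attention span.
    // Hypothetical helper for illustration only.
    static int64_t padded_attention_size(int64_t n, int64_t padding = 256) {
        return ((n + padding - 1) / padding) * padding;
    }

    // padded_attention_size(1)   == 256
    // padded_attention_size(300) == 512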

ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scaled_dot_product_attention.hpp>
 #include <openvino/op/transpose.hpp>

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 5 additions & 15 deletions
@@ -59,23 +59,13 @@ OutputVector translate_mulmat(const NodeContext& context) {

     auto Z_last_two_dims = get_dimensions(Z.get_node_shared_ptr(), {1, 2});

+    auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
+    auto Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
+
     Output<Node> batch_small = A_batch_larger ? B_batch_node : A_batch_node;
     Output<Node> batch_large = A_batch_larger ? A_batch_node : B_batch_node;
-
-    ov::Output<Node> broadcast_shape;
-    ov::Output<Node> Z_unsqueezed;
-    if (context.is_static()) {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {1});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
-    } else {
-        auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
-        Z_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(Z, unsqueeze_axes);
-        auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        broadcast_shape =
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one_1d, batch_small, factor_node, Z_last_two_dims}, 0);
-    }
+    auto broadcast_shape =
+        std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_small, factor_node, Z_last_two_dims}, 0);
     auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape);

     auto new_Z_shape = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{batch_large, Z_last_two_dims}, 0);
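The revert collapses the previous static/dynamic split back into a single broadcast path: the operand with the smaller batch is unsqueezed at axis 1 and broadcast by `factor_node` so both matmul inputs share the larger batch. A shape-only sketch of that Unsqueeze → Broadcast → Reshape sequence (plain C++, hypothetical helper):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // z_shape is {batch_small, rows, cols}; factor == batch_large / batch_small.
    // Unsqueeze axis 1:  {b, r, c}         -> {b, 1, r, c}
    // Broadcast axis 1:  {b, 1, r, c}      -> {b, factor, r, c}
    // Reshape (merge):   {b, factor, r, c} -> {b * factor, r, c}, i.e. new_Z_shape.
    std::vector<int64_t> broadcast_small_batch(const std::vector<int64_t>& z_shape, int64_t factor) {
        assert(z_shape.size() == 3);
        return {z_shape[0] * factor, z_shape[1], z_shape[2]};
    }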

ggml/src/ggml-openvino/openvino/op/permute.cpp

Lines changed: 2 additions & 12 deletions
@@ -40,15 +40,6 @@ OutputVector translate_permute(const NodeContext& context) {
         }
     } else {
         auto src = context.get_input(0);
-        Output<Node> attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
-            attention_size = context.get_input("attention_size");
-        } else {
-            attention_size = context.get_input("attention_size_swa");
-        }
-
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});

         if (context.is_static()) {
@@ -58,9 +49,8 @@ OutputVector translate_permute(const NodeContext& context) {
                 src,
                 ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
                 false);
-            auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, zero);
-            res = std::make_shared<ov::op::v1::Transpose>(src_slice,
-                                                          ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
+            res = std::make_shared<ov::op::v1::Transpose>(
+                src_reshaped, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
         } else {
             if (src.get_partial_shape().rank() == 3) {
                 src = std::make_shared<ov::op::v0::Unsqueeze>(src, zero);

ggml/src/ggml-openvino/openvino/op/softmax.cpp

Lines changed: 3 additions & 16 deletions
@@ -7,10 +7,8 @@
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/matmul.hpp>
 #include <openvino/op/multiply.hpp>
-#include <openvino/op/unsqueeze.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/softmax.hpp>
 #include <vector>
@@ -59,20 +57,9 @@ OutputVector translate_soft_max(const NodeContext& context) {
     } else {
         auto token_len = get_dimensions(input_node, {1});
         auto mask_node = context.get_input(1);
-        auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-        auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
-        auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-        auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-        auto inp_pos = context.get_input("inp_pos");
-        auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-        auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-        auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
-        mask_node_sliced =
-            std::make_shared<ov::op::v8::Slice>(mask_node, zero_2d, stop, one_2d, axes);
-        if (!(context.is_static())) {
-            mask_node_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_node_sliced, zero_1d);
-        }
+        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+        auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
    }

     if (mask_node_sliced.get_element_type() != context.get_output_type(0)) {
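After this revert, translate_soft_max trims only one mask axis to token_len and presumably relies on the mask that add_sliced_mask in translate_session.cpp prepares (see below) to bound the kv extent. A shape-only sketch of the single remaining Slice (hypothetical helper; the axis meaning is an assumption):

    #include <cstdint>
    #include <vector>

    // Model of Slice(mask, start=0, stop=token_len, step=1, axes={1}):
    // keep the first token_len entries along axis 1, assumed here to be
    // the per-token row axis of the attention mask.
    std::vector<int64_t> slice_mask_rows(std::vector<int64_t> mask_shape, int64_t token_len) {
        mask_shape.at(1) = token_len;
        return mask_shape;
    }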

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 24 additions & 10 deletions
@@ -11,14 +11,15 @@
 #include <openvino/op/convert.hpp>
 #include <openvino/op/cos.hpp>
 #include <openvino/op/divide.hpp>
-#include <openvino/op/gather.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/result.hpp>
 #include <openvino/op/sin.hpp>
+#include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/strided_slice.hpp>
 #include <openvino/op/transpose.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/pass/constant_folding.hpp>
@@ -88,15 +89,27 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         if (is_static) {
             mask_sliced = mask;
         } else {
-            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
-            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
+            auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 0});
+            auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 1});
+            auto one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
             auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
             auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,2});
-            auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-            auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-            auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, gather_inp_pos}, 0);
+            auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2});
+
+            std::shared_ptr<ov::Node> kv_len;
+            {
+                auto start = ov::op::v0::Constant::create(element::i64, Shape{3}, {0, 0, -1});
+                auto stride = ov::op::v0::Constant::create(element::i64, Shape{3}, {1, 1, 1});
+                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+                kv_len = std::make_shared<ov::op::v1::StridedSlice>(
+                    inp_pos, start, start, stride, std::vector<int64_t>{0, 0, 0}, std::vector<int64_t>{1, 1, 1});
+            }
+            kv_len = std::make_shared<ov::op::v0::Squeeze>(
+                kv_len, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+            kv_len = std::make_shared<ov::op::v0::Convert>(kv_len, ov::element::i64);
+            kv_len = std::make_shared<ov::op::v1::Add>(kv_len, one_1d);
+            auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len, kv_len}, 0);
+
             mask_sliced =
                 std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
             mask_sliced = std::make_shared<ov::op::v0::Unsqueeze>(mask_sliced, zero_1d);
@@ -108,7 +121,8 @@ void add_sliced_mask(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
    };

    create_sliced_mask("KQ_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
-   create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
+   // SWA is not working because the derived `kv_len` is not correct for it.
+   // create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
 }

 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
@@ -132,7 +146,7 @@ void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 // Create common patterns
 void preprocess(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     add_token_len(tensor_map);
-    // add_sliced_mask(tensor_map, ggml_model_decoder);
+    add_sliced_mask(tensor_map, ggml_model_decoder);
     add_rope_sin_cos(tensor_map, ggml_model_decoder);
 }
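The new dynamic path derives kv_len from the last element of inp_pos: a StridedSlice with begin {0, 0, -1} (end ignored via end_mask) takes the final position, which is then squeezed to a one-element tensor and incremented. That equals the number of live kv cells only while cache positions grow contiguously from zero; with a sliding-window (iSWA) cache the window start moves, so last-position-plus-one is wrong there, which is presumably why the KQ_mask_swa slice stays commented out. A scalar sketch of the derivation (plain C++, hypothetical names):

    #include <cstdint>
    #include <vector>

    // Model of the StridedSlice -> Squeeze -> Convert -> Add(1) chain:
    // treat (last token position + 1) as the kv length used to slice the mask.
    int64_t derive_kv_len(const std::vector<int64_t>& inp_pos) {
        return inp_pos.back() + 1;  // valid only for a cache filled from position 0
    }

    // Decoding one token at position 41 -> kv_len == 42, so the mask is sliced
    // to {token_len, 42} instead of the 256-padded attention_size.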
