draft NPU support version 2: prefill + kvcache

wine99 · wine99 · commit 8934f73fc290 · 2025-06-04T15:28:44.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
                 ov::PartialShape input_shape;
                 if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
                     if (m_is_static) {
-                        input_shape = ov::PartialShape(get_shape(src));
-                        // if (m_is_first_token) {
-                        //     input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                        // } else {
-                        //     input_shape = ov::PartialShape{1, 1, 1};
-                        // }
+                        if (m_is_first_token) {
+                            input_shape = ov::PartialShape{1, 1, m_max_token_len};
+                        } else {
+                            input_shape = ov::PartialShape{1, 1, 1};
+                        }
                     } else {
                         input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
                     }
-                } else if (std::string(src->name).find("KQ_mask") == 0) {
+                } else if (std::string(src->name) == "KQ_mask") {
                     if (m_is_static) {
-                        input_shape = ov::PartialShape(get_shape(src));
+                        if (m_is_first_token) {
+                            input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
+                        } else {
+                            input_shape = ov::PartialShape{1, 1, m_max_token_len};
+                        }
                     } else {
-                        auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+                        auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
                         input_shape =
-                            ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                            ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
                     }
                 } else {
                     input_shape = ov::Shape{get_shape(src)};
@@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() {
 
 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len;
+    // attention_size not used for NPU
     int64_t attention_size;
 
     for (const auto& node : m_nodes) {
@@ -231,8 +235,7 @@ void GgmlOvDecoder::add_extra_inputs() {
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = (total_token_len + 31) / 32 * 32;
-
+            attention_size = GGML_PAD(total_token_len, 32);
             std::string name = "attention_size";
             auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
             param_node->set_friendly_name(name);
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -92,9 +92,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     virtual bool is_static() const override {
         return m_is_static;
     }
-    virtual bool is_first_token() const {
+    virtual bool is_first_token() const override {
         return m_is_first_token;
     }
+    virtual int get_max_token_len() const override {
+        return m_max_token_len;
+    }
 
 private:
     void set_input_output(ggml_tensor* node);
@@ -106,7 +109,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
 
     void set_max_token_len();
-    int64_t m_max_token_len;
+    int m_max_token_len;
 
     void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
 
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <map>
 #include <openvino/core/node.hpp>
 #include <openvino/frontend/decoder.hpp>
@@ -57,6 +58,8 @@ class GgmlDecoder : public DecoderBase {
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
     virtual bool is_static() const = 0;
+    virtual bool is_first_token() const = 0;
+    virtual int get_max_token_len() const = 0;
 };
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <cstdint>
 #include <openvino/frontend/node_context.hpp>
 
 #include "decoder.hpp"
@@ -87,6 +88,12 @@ class NodeContext : public frontend::NodeContext {
     bool is_static() const {
         return m_decoder->is_static();
     }
+    bool is_first_token() const {
+        return m_decoder->is_first_token();
+    }
+    int get_max_token_len() const {
+        return m_decoder->get_max_token_len();
+    }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -8,7 +8,7 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/constant.hpp>
-#include <openvino/op/convert_like.hpp>
+#include <openvino/op/convert.hpp>
 #include <openvino/op/range.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/scatter_nd_update.hpp>
@@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
-    auto past_token_len = context.get_input("past_token_len");
+    auto past_token_len_scalar = context.get_input("past_token_len");
+
+    src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
     ov::Output<Node> res;
 
+    if (context.is_static() && context.is_first_token()) {
+        res = src0;
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto output_shape = context.get_output_shape(0).to_shape();
 
     std::vector<size_t> input0_strides = context.get_input_stride(0);
     std::vector<size_t> output_strides = context.get_output_stride(0);
 
-    auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
 
-    src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
     if (op_case == 1) {
         // Write K to cache_k
         int64_t head_size = src0_shape[2];
@@ -56,69 +64,36 @@ OutputVector translate_cpy(const NodeContext& context) {
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
-        token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
-                                                          ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
-                                                          false);
+        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
+        std::shared_ptr<ov::Node> indices;
         if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+            indices = past_token_len_scalar.get_node_shared_ptr();
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(
+                indices,
+                ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{0, 1}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                          total_token_len_scalar,
+                                                          one_scalar,
+                                                          ov::element::i64);
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
         }
 
-        auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
-        std::shared_ptr<ov::Node> indices =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
-        indices = std::make_shared<ov::op::v0::Unsqueeze>(
-            indices,
-            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{1}));
-
         res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
         auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-
         auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
-        auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
 
         int64_t total_head_size = src0_shape[1];
         auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
         auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
         auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
-        if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
-        }
-        auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
-
-        // auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
-        //     src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
-        //     false);
-
-        // auto src1_left = std::make_shared<ov::op::v8::Slice>(
-        //     reshaped_src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
-        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto src1_right = std::make_shared<ov::op::v8::Slice>(
-        //     reshaped_src1,
-        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
-        //     src0,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
-        //     false);
-
-        // auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
 
         // 1D tensor of shape [total_head_size], values starting from 0
         auto range_row =
@@ -131,8 +106,19 @@ OutputVector translate_cpy(const NodeContext& context) {
             std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
         // 1D tensor of shape [token_len], values starting from past_token_len
-        auto range_col =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
+        std::shared_ptr<ov::Node> range_col;
+        if (context.is_static()) {
+            range_col = past_token_len_scalar.get_node_shared_ptr();
+            range_col = std::make_shared<ov::op::v0::Unsqueeze>(
+                range_col,
+                ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{0}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                            total_token_len_scalar,
+                                                            one_scalar,
+                                                            ov::element::i64);
+        }
         auto range_col_reshaped =
             std::make_shared<ov::op::v0::Unsqueeze>(range_col,
                                                     ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h