Skip to content

Commit 9f99529

Browse files
committed
Add initial NPU support
1 parent 811c702 commit 9f99529

File tree

10 files changed

+203
-101
lines changed

10 files changed

+203
-101
lines changed

docs/build.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ To read documentation for how to build on Android, [click here](./android.md)
570570

571571
## OpenVINO
572572

573-
[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
573+
[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
574574

575575
Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
576576

@@ -582,7 +582,7 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
582582
```bash
583583
source /opt/intel/openvino_2025.1.0/setupvars.sh
584584
```
585-
- Verify OpenVINO is initialized properly
585+
- Verify OpenVINO is initialized properly
586586
```bash
587587
echo $OpenVINO_DIR
588588
```

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <memory>
1515
#include <openvino/core/dimension.hpp>
1616
#include <openvino/core/node.hpp>
17+
#include <openvino/core/partial_shape.hpp>
1718
#include <openvino/core/type/float16.hpp>
1819
#include <openvino/op/constant.hpp>
1920
#include <openvino/op/parameter.hpp>
@@ -25,14 +26,16 @@
2526
#include "ggml-backend-impl.h"
2627
#include "ggml-backend.h"
2728

28-
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
29+
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
2930
: m_cgraph(cgraph),
3031
m_node(node),
31-
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
32+
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
33+
m_is_static(is_static),
34+
m_is_first_token(is_first_token) {
3235
static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
3336

3437
if (m_node) {
35-
set_input_output(m_node, model_weights);
38+
set_input_output(m_node);
3639
} else {
3740
static bool printed = false;
3841
if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
@@ -47,15 +50,15 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
4750
set_max_token_len();
4851

4952
static bool weight_created = false;
50-
if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) {
53+
if (!weight_created) {
5154
add_weight_const_parallel(model_weights);
5255
weight_created = true;
5356
}
5457

5558
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
5659
auto* cur_node = m_cgraph->nodes[node_n];
5760
m_nodes.push_back(cur_node);
58-
set_input_output(cur_node, model_weights);
61+
set_input_output(cur_node);
5962
}
6063
m_model_weights = model_weights;
6164

@@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
6568

6669
// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
6770
// 2. constructing a decoder for a node.
68-
void GgmlOvDecoder::set_input_output(ggml_tensor* node,
69-
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
71+
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
7072
std::string node_name;
7173
if (node->op == GGML_OP_CPY) {
7274
// CPY updates the input tensor in place. For later ov op that uses the
@@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
9597
if (!m_node && !src->view_src) {
9698
ggml_backend_buffer* buffer = src->buffer;
9799

98-
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
99-
bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT");
100-
auto& weights_map = weight_as_input ? m_model_inputs : model_weights;
101-
if (weights_map.find(src_name) != weights_map.end()) {
102-
continue;
103-
}
104-
105-
std::shared_ptr<ov::Node> weight_node =
106-
weight_as_input
107-
? std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)})
108-
: create_weight_node(src);
109-
weight_node->set_friendly_name(src_name);
110-
weights_map[src_name] = weight_node;
111-
112-
} else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
100+
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
113101
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
114102
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
115103
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
@@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
119107
}
120108
ov::PartialShape input_shape;
121109
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
122-
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
110+
if (m_is_static) {
111+
input_shape = ov::PartialShape(get_shape(src));
112+
// if (m_is_first_token) {
113+
// input_shape = ov::PartialShape{1, 1, m_max_token_len};
114+
// } else {
115+
// input_shape = ov::PartialShape{1, 1, 1};
116+
// }
117+
} else {
118+
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
119+
}
123120
} else if (std::string(src->name).find("KQ_mask") == 0) {
124-
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
125-
input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
121+
if (m_is_static) {
122+
input_shape = ov::PartialShape(get_shape(src));
123+
} else {
124+
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
125+
input_shape =
126+
ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
127+
}
126128
} else {
127129
input_shape = ov::Shape{get_shape(src)};
128130
}
@@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
510512

511513
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
512514
for (const auto& node : m_nodes) {
513-
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
515+
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token);
514516
node_visitor(decoder);
515517
}
516518
}

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
1212
public:
1313
using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
1414

15-
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph);
15+
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
1616

1717
virtual ov::Any get_attribute(const std::string& name) const override {
1818
return nullptr;
@@ -89,8 +89,15 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
8989
return m_model_output_names;
9090
}
9191

92+
virtual bool is_static() const override {
93+
return m_is_static;
94+
}
95+
virtual bool is_first_token() const {
96+
return m_is_first_token;
97+
}
98+
9299
private:
93-
void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
100+
void set_input_output(ggml_tensor* node);
94101
void add_extra_inputs();
95102
static void dump_cgraph(const struct ggml_cgraph* cgraph);
96103
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
@@ -119,6 +126,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
119126
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
120127
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
121128
std::vector<std::string> m_model_output_names;
129+
bool m_is_static;
130+
bool m_is_first_token;
122131
};
123132

124133
void print_tensor_address_map(const struct ggml_cgraph* cgraph);

ggml/src/ggml-openvino/openvino/decoder.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class GgmlDecoder : public DecoderBase {
5555
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
5656
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
5757
virtual const std::vector<std::string>& get_model_output_names() const = 0;
58+
59+
virtual bool is_static() const = 0;
5860
};
5961

6062
} // namespace ggml

ggml/src/ggml-openvino/openvino/node_context.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ class NodeContext : public frontend::NodeContext {
8484
int get_op_case() const {
8585
return m_decoder->get_op_case();
8686
}
87+
bool is_static() const {
88+
return m_decoder->is_static();
89+
}
8790

8891
private:
8992
std::shared_ptr<GgmlDecoder> m_decoder;

ggml/src/ggml-openvino/openvino/op/cpy.cpp

Lines changed: 82 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
#include <openvino/core/node_output.hpp>
66
#include <openvino/core/node_vector.hpp>
77
#include <openvino/op/add.hpp>
8+
#include <openvino/op/broadcast.hpp>
89
#include <openvino/op/concat.hpp>
910
#include <openvino/op/constant.hpp>
1011
#include <openvino/op/convert_like.hpp>
1112
#include <openvino/op/range.hpp>
1213
#include <openvino/op/reshape.hpp>
1314
#include <openvino/op/scatter_nd_update.hpp>
1415
#include <openvino/op/slice.hpp>
16+
#include <openvino/op/squeeze.hpp>
1517
#include <openvino/op/transpose.hpp>
1618
#include <openvino/op/unsqueeze.hpp>
1719
#include <vector>
@@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) {
5759
token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
5860
ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
5961
false);
62+
63+
if (context.is_static()) {
64+
int32_t* op_params = context.get_input_op_params(1);
65+
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
66+
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
67+
}
68+
6069
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
6170
std::shared_ptr<ov::Node> indices =
6271
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
@@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) {
6776
res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
6877
} else {
6978
// Write V to cache_v
70-
int64_t total_head_size = src0_shape[1];
71-
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
72-
7379
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
7480
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
81+
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
82+
83+
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
84+
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
85+
86+
int64_t total_head_size = src0_shape[1];
87+
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
88+
auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
7589

7690
auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
77-
past_token_len = std::make_shared<ov::op::v0::Unsqueeze>(past_token_len, zero);
78-
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
91+
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
92+
if (context.is_static()) {
93+
int32_t* op_params = context.get_input_op_params(1);
94+
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
95+
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
96+
}
97+
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
98+
99+
// auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
100+
// src1,
101+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
102+
// false);
103+
104+
// auto src1_left = std::make_shared<ov::op::v8::Slice>(
105+
// reshaped_src1,
106+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
107+
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
108+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
109+
110+
// auto src1_right = std::make_shared<ov::op::v8::Slice>(
111+
// reshaped_src1,
112+
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
113+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
114+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
115+
116+
// auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
117+
// src0,
118+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
119+
// false);
120+
121+
// auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
122+
123+
// 1D tensor of shape [total_head_size], values starting from 0
124+
auto range_row =
125+
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
126+
auto range_row_reshaped =
127+
std::make_shared<ov::op::v0::Unsqueeze>(range_row,
128+
ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
129+
auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
130+
range_row_reshaped,
131+
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
132+
133+
// 1D tensor of shape [token_len], values starting from past_token_len
134+
auto range_col =
135+
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
136+
auto range_col_reshaped =
137+
std::make_shared<ov::op::v0::Unsqueeze>(range_col,
138+
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
139+
auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
140+
range_col_reshaped,
141+
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
142+
143+
// Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
144+
auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
145+
auto indices_final = std::make_shared<ov::op::v1::Reshape>(
146+
indices,
147+
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}),
148+
false);
79149

150+
auto flattend_src0 =
151+
std::make_shared<ov::op::v1::Reshape>(src0,
152+
ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
153+
false);
80154
auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
81155
src1,
82-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
83-
false);
84-
85-
auto src1_left = std::make_shared<ov::op::v8::Slice>(
86-
reshaped_src1,
87-
ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
88-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
89-
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
90-
91-
auto src1_right = std::make_shared<ov::op::v8::Slice>(
92-
reshaped_src1,
93-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
94-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
95-
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
96-
97-
auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
98-
src0,
99-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
156+
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{total_head_size, -1}),
100157
false);
101158

102-
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
159+
auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
160+
res = std::make_shared<ov::op::v0::Unsqueeze>(updated, zero);
103161
}
104162

105163
return rename_outputs_with_suffix({res}, context.get_name());

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) {
5555
ov::Output<ov::Node> A;
5656
ov::Output<ov::Node> B;
5757

58-
auto attention_size = context.get_input("attention_size");
59-
6058
auto src0 = context.get_input(0);
6159
auto src0_shape = context.get_input_shape(0).to_shape();
6260
auto src0_stride = context.get_input_stride(0);
6361
auto permuted = is_permuted(src0_stride);
6462
auto token_dim = permuted ? 0 : 2;
6563

64+
auto attention_size = context.get_input("attention_size");
65+
6666
auto src0_perm = argsort_descend(src0_stride);
6767
auto src0_original_shape_ = permute(src0_shape, src0_perm);
6868
std::vector<int64_t> src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
69+
70+
if (context.is_static()) {
71+
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
72+
}
6973
src0_original_shape[token_dim] = -1;
7074

7175
auto src0_slice_shape = src0_original_shape;

0 commit comments

Comments
 (0)