Skip to content

Commit 9f99529

Browse files
committed
Add initial NPU support
1 parent 811c702 commit 9f99529

File tree

10 files changed

+203
-101
lines changed

10 files changed

+203
-101
lines changed

docs/build.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ To read documentation for how to build on Android, [click here](./android.md)
570570

571571
## OpenVINO
572572

573-
[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
573+
[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying performant AI inference, specifically designed for Intel hardware including CPUs, GPUs, and NPUs in the cloud, on-prem, and on the edge alike. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
574574

575575
Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
576576

@@ -582,7 +582,7 @@ Follow the instructions below to install OpenVINO runtime and build llama.cpp wi
582582
```bash
583583
source /opt/intel/openvino_2025.1.0/setupvars.sh
584584
```
585-
- Verify OpenVINO is initialized properly
585+
- Verify OpenVINO is initialized properly
586586
```bash
587587
echo $OpenVINO_DIR
588588
```

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <memory>
1515
#include <openvino/core/dimension.hpp>
1616
#include <openvino/core/node.hpp>
17+
#include <openvino/core/partial_shape.hpp>
1718
#include <openvino/core/type/float16.hpp>
1819
#include <openvino/op/constant.hpp>
1920
#include <openvino/op/parameter.hpp>
@@ -25,14 +26,16 @@
2526
#include "ggml-backend-impl.h"
2627
#include "ggml-backend.h"
2728

28-
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
29+
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
2930
: m_cgraph(cgraph),
3031
m_node(node),
31-
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
32+
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
33+
m_is_static(is_static),
34+
m_is_first_token(is_first_token) {
3235
static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
3336

3437
if (m_node) {
35-
set_input_output(m_node, model_weights);
38+
set_input_output(m_node);
3639
} else {
3740
static bool printed = false;
3841
if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
@@ -47,15 +50,15 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
4750
set_max_token_len();
4851

4952
static bool weight_created = false;
50-
if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) {
53+
if (!weight_created) {
5154
add_weight_const_parallel(model_weights);
5255
weight_created = true;
5356
}
5457

5558
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
5659
auto* cur_node = m_cgraph->nodes[node_n];
5760
m_nodes.push_back(cur_node);
58-
set_input_output(cur_node, model_weights);
61+
set_input_output(cur_node);
5962
}
6063
m_model_weights = model_weights;
6164

@@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
6568

6669
// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
6770
// 2. constructing a decoder for a node.
68-
void GgmlOvDecoder::set_input_output(ggml_tensor* node,
69-
std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
71+
void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
7072
std::string node_name;
7173
if (node->op == GGML_OP_CPY) {
7274
// CPY updates the input tensor in place. For later ov op that uses the
@@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
9597
if (!m_node && !src->view_src) {
9698
ggml_backend_buffer* buffer = src->buffer;
9799

98-
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
99-
bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT");
100-
auto& weights_map = weight_as_input ? m_model_inputs : model_weights;
101-
if (weights_map.find(src_name) != weights_map.end()) {
102-
continue;
103-
}
104-
105-
std::shared_ptr<ov::Node> weight_node =
106-
weight_as_input
107-
? std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)})
108-
: create_weight_node(src);
109-
weight_node->set_friendly_name(src_name);
110-
weights_map[src_name] = weight_node;
111-
112-
} else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
100+
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
113101
// GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
114102
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
115103
assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
@@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
119107
}
120108
ov::PartialShape input_shape;
121109
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
122-
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
110+
if (m_is_static) {
111+
input_shape = ov::PartialShape(get_shape(src));
112+
// if (m_is_first_token) {
113+
// input_shape = ov::PartialShape{1, 1, m_max_token_len};
114+
// } else {
115+
// input_shape = ov::PartialShape{1, 1, 1};
116+
// }
117+
} else {
118+
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
119+
}
123120
} else if (std::string(src->name).find("KQ_mask") == 0) {
124-
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
125-
input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
121+
if (m_is_static) {
122+
input_shape = ov::PartialShape(get_shape(src));
123+
} else {
124+
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
125+
input_shape =
126+
ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
127+
}
126128
} else {
127129
input_shape = ov::Shape{get_shape(src)};
128130
}
@@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
510512

511513
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
512514
for (const auto& node : m_nodes) {
513-
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
515+
auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token);
514516
node_visitor(decoder);
515517
}
516518
}

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
1212
public:
1313
using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
1414

15-
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph);
15+
GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
1616

1717
virtual ov::Any get_attribute(const std::string& name) const override {
1818
return nullptr;
@@ -89,8 +89,15 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
8989
return m_model_output_names;
9090
}
9191

92+
virtual bool is_static() const override {
93+
return m_is_static;
94+
}
95+
virtual bool is_first_token() const {
96+
return m_is_first_token;
97+
}
98+
9299
private:
93-
void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
100+
void set_input_output(ggml_tensor* node);
94101
void add_extra_inputs();
95102
static void dump_cgraph(const struct ggml_cgraph* cgraph);
96103
static std::vector<size_t> get_shape(const ggml_tensor* tensor);
@@ -119,6 +126,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
119126
std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
120127
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
121128
std::vector<std::string> m_model_output_names;
129+
bool m_is_static;
130+
bool m_is_first_token;
122131
};
123132

124133
void print_tensor_address_map(const struct ggml_cgraph* cgraph);

ggml/src/ggml-openvino/openvino/decoder.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class GgmlDecoder : public DecoderBase {
5555
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
5656
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
5757
virtual const std::vector<std::string>& get_model_output_names() const = 0;
58+
59+
virtual bool is_static() const = 0;
5860
};
5961

6062
} // namespace ggml

ggml/src/ggml-openvino/openvino/node_context.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ class NodeContext : public frontend::NodeContext {
8484
int get_op_case() const {
8585
return m_decoder->get_op_case();
8686
}
87+
bool is_static() const {
88+
return m_decoder->is_static();
89+
}
8790

8891
private:
8992
std::shared_ptr<GgmlDecoder> m_decoder;

ggml/src/ggml-openvino/openvino/op/cpy.cpp

Lines changed: 82 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
#include <openvino/core/node_output.hpp>
66
#include <openvino/core/node_vector.hpp>
77
#include <openvino/op/add.hpp>
8+
#include <openvino/op/broadcast.hpp>
89
#include <openvino/op/concat.hpp>
910
#include <openvino/op/constant.hpp>
1011
#include <openvino/op/convert_like.hpp>
1112
#include <openvino/op/range.hpp>
1213
#include <openvino/op/reshape.hpp>
1314
#include <openvino/op/scatter_nd_update.hpp>
1415
#include <openvino/op/slice.hpp>
16+
#include <openvino/op/squeeze.hpp>
1517
#include <openvino/op/transpose.hpp>
1618
#include <openvino/op/unsqueeze.hpp>
1719
#include <vector>
@@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) {
5759
token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
5860
ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
5961
false);
62+
63+
if (context.is_static()) {
64+
int32_t* op_params = context.get_input_op_params(1);
65+
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
66+
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
67+
}
68+
6069
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
6170
std::shared_ptr<ov::Node> indices =
6271
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
@@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) {
6776
res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
6877
} else {
6978
// Write V to cache_v
70-
int64_t total_head_size = src0_shape[1];
71-
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
72-
7379
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
7480
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
81+
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
82+
83+
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
84+
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
85+
86+
int64_t total_head_size = src0_shape[1];
87+
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
88+
auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
7589

7690
auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
77-
past_token_len = std::make_shared<ov::op::v0::Unsqueeze>(past_token_len, zero);
78-
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
91+
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
92+
if (context.is_static()) {
93+
int32_t* op_params = context.get_input_op_params(1);
94+
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
95+
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
96+
}
97+
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
98+
99+
// auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
100+
// src1,
101+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
102+
// false);
103+
104+
// auto src1_left = std::make_shared<ov::op::v8::Slice>(
105+
// reshaped_src1,
106+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
107+
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
108+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
109+
110+
// auto src1_right = std::make_shared<ov::op::v8::Slice>(
111+
// reshaped_src1,
112+
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
113+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
114+
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
115+
116+
// auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
117+
// src0,
118+
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
119+
// false);
120+
121+
// auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
122+
123+
// 1D tensor of shape [total_head_size], values starting from 0
124+
auto range_row =
125+
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
126+
auto range_row_reshaped =
127+
std::make_shared<ov::op::v0::Unsqueeze>(range_row,
128+
ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
129+
auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
130+
range_row_reshaped,
131+
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
132+
133+
// 1D tensor of shape [token_len], values starting from past_token_len
134+
auto range_col =
135+
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
136+
auto range_col_reshaped =
137+
std::make_shared<ov::op::v0::Unsqueeze>(range_col,
138+
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
139+
auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
140+
range_col_reshaped,
141+
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
142+
143+
// Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
144+
auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
145+
auto indices_final = std::make_shared<ov::op::v1::Reshape>(
146+
indices,
147+
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}),
148+
false);
79149

150+
auto flattend_src0 =
151+
std::make_shared<ov::op::v1::Reshape>(src0,
152+
ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
153+
false);
80154
auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
81155
src1,
82-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
83-
false);
84-
85-
auto src1_left = std::make_shared<ov::op::v8::Slice>(
86-
reshaped_src1,
87-
ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
88-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
89-
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
90-
91-
auto src1_right = std::make_shared<ov::op::v8::Slice>(
92-
reshaped_src1,
93-
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
94-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
95-
ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
96-
97-
auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
98-
src0,
99-
ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
156+
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{total_head_size, -1}),
100157
false);
101158

102-
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
159+
auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
160+
res = std::make_shared<ov::op::v0::Unsqueeze>(updated, zero);
103161
}
104162

105163
return rename_outputs_with_suffix({res}, context.get_name());

ggml/src/ggml-openvino/openvino/op/mulmat.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) {
5555
ov::Output<ov::Node> A;
5656
ov::Output<ov::Node> B;
5757

58-
auto attention_size = context.get_input("attention_size");
59-
6058
auto src0 = context.get_input(0);
6159
auto src0_shape = context.get_input_shape(0).to_shape();
6260
auto src0_stride = context.get_input_stride(0);
6361
auto permuted = is_permuted(src0_stride);
6462
auto token_dim = permuted ? 0 : 2;
6563

64+
auto attention_size = context.get_input("attention_size");
65+
6666
auto src0_perm = argsort_descend(src0_stride);
6767
auto src0_original_shape_ = permute(src0_shape, src0_perm);
6868
std::vector<int64_t> src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
69+
70+
if (context.is_static()) {
71+
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
72+
}
6973
src0_original_shape[token_dim] = -1;
7074

7175
auto src0_slice_shape = src0_original_shape;

0 commit comments

Comments
 (0)