Commit f5312f5

Merge pull request #8 from ravi9/quant
Quant
2 parents: b6130a7 + 812590b

29 files changed: 1,763 additions & 198 deletions

IR.xml

Lines changed: 462 additions & 0 deletions
Large diffs are not rendered by default.

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ if [ ! -z ${GG_BUILD_OPENVINO} ]; then
         echo "source /opt/intel/openvino/setupvars.sh"
         exit 1
     fi
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF"
 fi

 ## helpers

docs/build.md

Lines changed: 1 addition & 1 deletion
@@ -648,7 +648,7 @@ git switch dev_backend_openvino

 # Build with OpenVINO support
 source /opt/intel/openvino/setupvars.sh
-cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
 cmake --build build/ReleaseOV --config Release -j $(nproc)
 ```

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 206 additions & 70 deletions
Large diffs are not rendered by default.

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 22 additions & 6 deletions
@@ -4,8 +4,10 @@
 #include <map>
 #include <memory>
 #include <openvino/core/partial_shape.hpp>
+#include <optional>
 #include <vector>

+#include "ggml-quants.hpp"
 #include "ggml.h"
 #include "openvino/decoder.hpp"

@@ -17,10 +19,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
-                  int context_size, int num_heads, int num_heads_kv, int head_size);
+                  int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
+                  const std::vector<int>& swa_layers);

     // Naive graph decoder
-    GgmlOvDecoder(struct ggml_cgraph* cgraph);
+    GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);

     virtual ov::Any get_attribute(const std::string& name) const override {
         return nullptr;
@@ -99,6 +102,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

     virtual int get_context_size() const override { return m_context_size; }

+    virtual int get_context_size_swa() const override { return m_context_size_swa; }
+
+    virtual int is_swa_layer(int layer) const override {
+        return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end();
+    }
+
     virtual int get_num_heads() const override { return m_num_heads; }

     virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
@@ -115,8 +124,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;

-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
+    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
+
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
+                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
+        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});

     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
@@ -126,7 +139,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 private:
     void set_input_output(ggml_tensor* node, bool naive = false);
     void add_extra_inputs();
-    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
     static std::vector<size_t> get_shape(const ggml_tensor* tensor);
     static std::vector<size_t> get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
@@ -151,13 +163,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
     int m_context_size;
+    int m_context_size_swa;
+    std::vector<int> m_swa_layers;
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
     int32_t* m_rope_params;
     std::vector<std::string> m_kv_names;
-    bool m_is_static;
+    bool m_is_static = false;
     bool m_is_first_token;
 };

 void print_tensor_address_map(const struct ggml_cgraph* cgraph);
+
+int extract_layer_from_name(const std::string& name);
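
The header also gains a free function, extract_layer_from_name, alongside the new m_swa_layers / m_context_size_swa members, so the decoder can tell which layers use sliding-window attention. Below is a minimal sketch of such a helper, assuming llama.cpp's usual "blk.<N>." tensor naming (e.g. "blk.12.attn_k.weight" maps to 12); the real definition lives in ggml-decoder.cpp and may differ.

```cpp
// Sketch only (not the commit's implementation): map a per-layer tensor name
// to its layer index, assuming the "blk.<N>." naming convention.
#include <string>

int extract_layer_from_name(const std::string& name) {
    const std::string prefix = "blk.";
    const auto pos = name.find(prefix);
    if (pos == std::string::npos) {
        return -1;  // not a per-layer tensor, e.g. "token_embd.weight"
    }
    const auto start = pos + prefix.size();
    const auto end = name.find('.', start);
    // parse the digits between "blk." and the next '.'
    return std::stoi(name.substr(start, end - start));
}
```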

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 50 additions & 21 deletions
@@ -8,6 +8,7 @@
 #include <vector>

 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-openvino/utils.h"
 #include "ggml.h"
@@ -248,17 +249,30 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
         const auto* op_params = op->op_params;
         memcpy(&scale, (const float*) op_params + 0, sizeof(float));
         memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
-        const uint32_t h = op->src[0]->ne[2];
-        const uint32_t n_head = op->src[0]->ne[0];
-        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-        const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
-        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-        const float slope =
-            (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
+        if (max_bias > 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
+            return true;
+        }
+    }

-        if (slope != 1.0f) {
-            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n");
+    if (op->op == GGML_OP_FLASH_ATTN_EXT) {
+        if (op->src[4] != nullptr) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
+            return true;
+        }
+        float scale = 1.0f;
+        float max_bias = 0.0f;
+        float logit_softcap = 0.0f;
+        const auto* op_params = op->op_params;
+        memcpy(&scale, (const float*) op_params + 0, sizeof(float));
+        memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
+        memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float));
+        if (max_bias > 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
+            return true;
+        }
+        if (logit_softcap != 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n");
             return true;
         }
     }
@@ -305,12 +319,8 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
             return true;
         }
         float freq_scale;
-        memcpy(&freq_scale, op_params + 6, sizeof(float));
-        if (freq_scale != 0.0f && freq_scale != 1.0f) {
-            GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale);
-            return true;
-        }
         float ext_factor;
+        memcpy(&freq_scale, op_params + 6, sizeof(float));
         memcpy(&ext_factor, op_params + 7, sizeof(float));
         if (ext_factor != 0.0f) {
             GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
@@ -332,8 +342,17 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
 static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     GGML_ASSERT(dev->reg != nullptr);

-    static const std::set<ggml_type> supported_types{
-        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};
+    static std::set<ggml_type> supported_types{GGML_TYPE_F32,
+                                               GGML_TYPE_F16,
+                                               GGML_TYPE_BF16,
+                                               GGML_TYPE_I64,
+                                               GGML_TYPE_I32,
+                                               GGML_TYPE_Q4_0,
+                                               GGML_TYPE_Q4_1,
+                                               GGML_TYPE_Q4_K,
+                                               GGML_TYPE_Q5_K,
+                                               GGML_TYPE_Q8_0,
+                                               GGML_TYPE_Q6_K};

     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
@@ -348,7 +367,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                  GGML_OP_ROPE,
                                                  GGML_OP_RMS_NORM,
                                                  GGML_OP_SCALE,
-                                                 GGML_OP_SOFT_MAX,
+                                                 // softmax is not updated due to replaced by flash_attn_ext
+                                                 // GGML_OP_SOFT_MAX,
                                                  GGML_OP_SET_ROWS,
                                                  GGML_OP_FLASH_ATTN_EXT,
                                                  GGML_OP_CPY};
@@ -357,6 +377,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
     };
     static const std::set<ggml_glu_op> supported_glu_ops{
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU,
     };

     switch (op->op) {
@@ -394,14 +415,22 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
         return false;
     }
     for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (supported_types.find(op->type) == supported_types.end()) {
-            GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
+        auto* src = op->src[i];
+        if (src == nullptr) {
+            break;
+        }
+        if (supported_types.find(src->type) == supported_types.end()) {
+            GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type));
             return false;
         }
-        if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) {
+        if (src->ne[3] != 1) {
             GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
             return false;
         }
+        if (ggml_is_quantized(src->type) && src->ne[2] != 1) {
+            GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n");
+            return false;
+        }
     }

     if (is_op_unsupported_case(op)) {
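
ggml_backend_openvino_device_supports_op is the hook ggml's scheduler consults before offloading a node to this backend, and the loop above now screens every source tensor rather than only the op's own type. Below is a hedged sketch of how a host application could reach this check through ggml's public registry API; the device name "OpenVINO" is an assumption, not something this commit defines.

```cpp
// Sketch only: query op support via ggml's backend registry. The device name
// passed to ggml_backend_dev_by_name is an assumption.
#include "ggml-backend.h"

static bool can_offload_to_openvino(const struct ggml_tensor * op) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_name("OpenVINO");
    if (dev == NULL) {
        return false;  // OpenVINO backend not built or not registered
    }
    // Dispatches to ggml_backend_openvino_device_supports_op, which now also
    // rejects unsupported quantized src types, ne[3] != 1, and 3D quantized tensors.
    return ggml_backend_dev_supports_op(dev, op);
}
```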
