Merged
30 commits
08de982  Update supports_buft and supports_op for quantized models (wine99, Aug 5, 2025)
7cabe37  Add quant weight conversion functions from genai gguf reader (wine99, Aug 5, 2025)
976c85b  Quant models run with accuracy issue (wine99, Aug 6, 2025)
1f8e007  Fix accuracy: disable cpu_repack (wine99, Aug 7, 2025)
c77cefe  Fix CI; Disable test-backend-ops (wine99, Aug 7, 2025)
3023981  Fix Q4_1 (wine99, Aug 8, 2025)
7b8189a  Fix test-thread-safety (wine99, Aug 8, 2025)
0376a7a  Fix test-backend-ops: Treat quantized tensors as weights (wine99, Aug 12, 2025)
e1a5f7e  Add NPU Q4_0 support (wine99, Aug 19, 2025)
489d453  NPU perf: eliminate zp (wine99, Aug 22, 2025)
c319ce5  NPU perf: Faster compilation (wine99, Aug 26, 2025)
5b6418d  Dequantize q4_1 q4_k q6_k for NPU (wine99, Aug 29, 2025)
623f863  Add custom quant type: q8_1_c, q4_0_128 (wine99, Sep 2, 2025)
ef4de4d  Set m_is_static=false as default in decoder (wine99, Sep 2, 2025)
1b8323f  Simpilfy translation of get_rows (wine99, Sep 2, 2025)
6da5a22  Fix after rebasing (wine99, Sep 8, 2025)
e37cdd4  Improve debug util; Eliminate nop ReshapeReshape (wine99, Sep 10, 2025)
b453d68  STYLE: make get_types_to_requant a function (wine99, Sep 10, 2025)
4ed2510  Support BF16 model (wine99, Sep 11, 2025)
3eeb567  Fix NPU compile (wine99, Sep 12, 2025)
3346a33  WA for npu 1st token acc issue (wine99, Sep 12, 2025)
fa237a1  Apply EliminateZP only for npu (wine99, Sep 12, 2025)
9a85e53  Add GeGLU (wine99, Sep 15, 2025)
3d31fa6  Fix Hunyuan (wine99, Sep 15, 2025)
500aead  Support iSWA (wine99, Sep 16, 2025)
a079242  Fix NPU accuracy (wine99, Sep 17, 2025)
b6c84af  Fix ROPE accuracy when freq_scale != 1 (wine99, Sep 17, 2025)
b3eb6fb  Minor: not add attention_size_swa for non-swa model (wine99, Sep 17, 2025)
bb33530  Minor refactor (wine99, Sep 19, 2025)
812590b  Add Q5_K to support phi-3-q4_k_m (wine99, Sep 23, 2025)
462 changes: 462 additions & 0 deletions IR.xml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ci/run.sh
@@ -103,7 +103,7 @@ if [ ! -z ${GG_BUILD_OPENVINO} ]; then
         echo "source /opt/intel/openvino/setupvars.sh"
         exit 1
     fi
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF"
 fi
 ## helpers
2 changes: 1 addition & 1 deletion docs/build.md
@@ -648,7 +648,7 @@ git switch dev_backend_openvino
 
 # Build with OpenVINO support
 source /opt/intel/openvino/setupvars.sh
-cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
+cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DGGML_CPU_REPACK=OFF
 cmake --build build/ReleaseOV --config Release -j $(nproc)
 ```
276 changes: 206 additions & 70 deletions ggml/src/ggml-openvino/ggml-decoder.cpp

Large diffs are not rendered by default.
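The decoder diff, which per the commit list carries the quant weight conversion functions, is not rendered here. For orientation, the following is a minimal sketch of what unpacking ggml's standard Q4_0 block format involves; it is illustrative background under the publicly documented ggml layout, not the PR's code, and `fp16_to_fp32` stands in for `ggml_fp16_to_fp32`:

```cpp
#include <cstddef>
#include <cstdint>

// Standard ggml Q4_0 block: 32 values, one fp16 scale, packed 4-bit nibbles.
struct block_q4_0 {
    uint16_t d;       // scale, stored as IEEE fp16 (ggml_half)
    uint8_t  qs[16];  // 32 x 4-bit quants, two per byte
};

// fp16 -> fp32 conversion assumed available (e.g. ggml_fp16_to_fp32).
extern float fp16_to_fp32(uint16_t h);

void dequantize_q4_0(const block_q4_0 * blocks, float * out, size_t n_blocks) {
    for (size_t i = 0; i < n_blocks; ++i) {
        const float d = fp16_to_fp32(blocks[i].d);
        for (int j = 0; j < 16; ++j) {
            // Low nibble holds value j, high nibble holds value j + 16;
            // both are biased by 8, so Q4_0 is symmetric with no zero point.
            out[i * 32 + j]      = d * (float) ((blocks[i].qs[j] & 0x0F) - 8);
            out[i * 32 + j + 16] = d * (float) ((blocks[i].qs[j] >> 4)   - 8);
        }
    }
}
```

The fixed bias of 8 in place of a stored zero point is what makes optimizations like the PR's "NPU perf: eliminate zp" commit natural for this format.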

28 changes: 22 additions & 6 deletions ggml/src/ggml-openvino/ggml-decoder.h
@@ -4,8 +4,10 @@
 #include <map>
 #include <memory>
 #include <openvino/core/partial_shape.hpp>
+#include <optional>
 #include <vector>
 
+#include "ggml-quants.hpp"
 #include "ggml.h"
 #include "openvino/decoder.hpp"
 
@@ -17,10 +19,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
-                  int context_size, int num_heads, int num_heads_kv, int head_size);
+                  int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
+                  const std::vector<int>& swa_layers);
 
     // Naive graph decoder
-    GgmlOvDecoder(struct ggml_cgraph* cgraph);
+    GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
 
     virtual ov::Any get_attribute(const std::string& name) const override {
         return nullptr;
@@ -99,6 +102,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual int get_context_size() const override { return m_context_size; }
 
+    virtual int get_context_size_swa() const override { return m_context_size_swa; }
+
+    virtual int is_swa_layer(int layer) const override {
+        return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end();
+    }
+
     virtual int get_num_heads() const override { return m_num_heads; }
 
     virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
@@ -115,8 +124,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
 
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
+    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
+
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
+                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
+        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
 
     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
@@ -126,7 +139,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 private:
     void set_input_output(ggml_tensor* node, bool naive = false);
     void add_extra_inputs();
-    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
     static std::vector<size_t> get_shape(const ggml_tensor* tensor);
     static std::vector<size_t> get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
@@ -151,13 +163,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
     int m_context_size;
+    int m_context_size_swa;
+    std::vector<int> m_swa_layers;
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
     int32_t* m_rope_params;
     std::vector<std::string> m_kv_names;
-    bool m_is_static;
+    bool m_is_static = false;
     bool m_is_first_token;
 };
 
 void print_tensor_address_map(const struct ggml_cgraph* cgraph);
+
+int extract_layer_from_name(const std::string& name);
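A minimal sketch of how the new weight-creation entry points might be driven. The `ExtraQuantType` enumerator names here are assumptions inferred from commit 623f863 ("Add custom quant type: q8_1_c, q4_0_128"); only `GgmlOvDecoder::create_weight_nodes` and its `types_to_requantize` parameter are confirmed by the header above:

```cpp
#include <map>
#include <memory>

#include "ggml-decoder.h"  // GgmlOvDecoder; ExtraQuantType comes in via ggml-quants.hpp

// Hypothetical driver: requantize zero-point-carrying formats into the
// custom types before translation, so passes like EliminateZP (applied
// only for NPU, per commit fa237a1) can do their work.
std::map<std::string, std::shared_ptr<ov::Node>> make_npu_weights(struct ggml_cgraph* cgraph) {
    std::map<ggml_type, ExtraQuantType> requant = {
        {GGML_TYPE_Q4_1, ExtraQuantType::Q8_1_C},    // assumed enumerator name
        {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},  // assumed enumerator name
    };
    return GgmlOvDecoder::create_weight_nodes(cgraph, requant);
}
```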
71 changes: 50 additions & 21 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-openvino/utils.h"
 #include "ggml.h"
@@ -248,17 +249,30 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
         const auto* op_params = op->op_params;
         memcpy(&scale, (const float*) op_params + 0, sizeof(float));
         memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
-        const uint32_t h = op->src[0]->ne[2];
-        const uint32_t n_head = op->src[0]->ne[0];
-        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
-
-        const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
-        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
-        const float slope =
-            (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
+        if (max_bias > 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
+            return true;
+        }
+    }
 
-        if (slope != 1.0f) {
-            GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with slope != 1.0f\n");
+    if (op->op == GGML_OP_FLASH_ATTN_EXT) {
+        if (op->src[4] != nullptr) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
             return true;
         }
+        float scale = 1.0f;
+        float max_bias = 0.0f;
+        float logit_softcap = 0.0f;
+        const auto* op_params = op->op_params;
+        memcpy(&scale, (const float*) op_params + 0, sizeof(float));
+        memcpy(&max_bias, (const float*) op_params + 1, sizeof(float));
+        memcpy(&logit_softcap, (const float*) op_params + 2, sizeof(float));
+        if (max_bias > 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
+            return true;
+        }
+        if (logit_softcap != 0) {
+            GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with logit_softcap != 0\n");
+            return true;
+        }
     }
@@ -305,12 +319,8 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
             return true;
         }
         float freq_scale;
-        memcpy(&freq_scale, op_params + 6, sizeof(float));
-        if (freq_scale != 0.0f && freq_scale != 1.0f) {
-            GGML_LOG_WARN("OpenVINO backend does not support ROPE with freq_scale %f != 1.0f\n", freq_scale);
-            return true;
-        }
         float ext_factor;
+        memcpy(&freq_scale, op_params + 6, sizeof(float));
         memcpy(&ext_factor, op_params + 7, sizeof(float));
         if (ext_factor != 0.0f) {
             GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
@@ -332,8 +342,17 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
 static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     GGML_ASSERT(dev->reg != nullptr);
 
-    static const std::set<ggml_type> supported_types{
-        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32};
+    static std::set<ggml_type> supported_types{GGML_TYPE_F32,
+                                               GGML_TYPE_F16,
+                                               GGML_TYPE_BF16,
+                                               GGML_TYPE_I64,
+                                               GGML_TYPE_I32,
+                                               GGML_TYPE_Q4_0,
+                                               GGML_TYPE_Q4_1,
+                                               GGML_TYPE_Q4_K,
+                                               GGML_TYPE_Q5_K,
+                                               GGML_TYPE_Q8_0,
+                                               GGML_TYPE_Q6_K};
 
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
@@ -348,7 +367,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
                                                  GGML_OP_ROPE,
                                                  GGML_OP_RMS_NORM,
                                                  GGML_OP_SCALE,
-                                                 GGML_OP_SOFT_MAX,
+                                                 // SOFT_MAX is not kept up to date, since it is replaced by FLASH_ATTN_EXT
+                                                 // GGML_OP_SOFT_MAX,
                                                  GGML_OP_SET_ROWS,
                                                  GGML_OP_FLASH_ATTN_EXT,
                                                  GGML_OP_CPY};
@@ -357,6 +377,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     };
     static const std::set<ggml_glu_op> supported_glu_ops{
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU,
     };
 
     switch (op->op) {
@@ -394,14 +415,22 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
                 return false;
             }
             for (int i = 0; i < GGML_MAX_SRC; i++) {
-                if (supported_types.find(op->type) == supported_types.end()) {
-                    GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(op->type));
+                auto* src = op->src[i];
+                if (src == nullptr) {
+                    break;
+                }
+                if (supported_types.find(src->type) == supported_types.end()) {
+                    GGML_LOG_WARN("OpenVINO backend does not support tensor type %s\n", ggml_type_name(src->type));
                     return false;
                 }
-                if (op->src[i] != nullptr && op->src[i]->ne[3] != 1) {
+                if (src->ne[3] != 1) {
                     GGML_LOG_WARN("OpenVINO backend does not support tensors with ne[3] != 1\n");
                     return false;
                 }
+                if (ggml_is_quantized(src->type) && src->ne[2] != 1) {
+                    GGML_LOG_WARN("OpenVINO backend does not support 3D quantized tensors\n");
+                    return false;
+                }
             }
 
             if (is_op_unsupported_case(op)) {
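As an aside, the parameter checks above all follow the same ggml convention: per-op parameters are packed into the tensor's int32 `op_params` array, and float values are recovered with `memcpy`. A condensed, hypothetical helper equivalent to the FLASH_ATTN_EXT checks in the hunk above (the helper name and its consolidation into one predicate are this sketch's, not the PR's):

```cpp
#include <cstring>

#include "ggml.h"

// Hypothetical condensation of the FLASH_ATTN_EXT check above: unpack the
// float parameters from op_params and accept only the plain SDPA form.
static bool flash_attn_ext_supported(const struct ggml_tensor * op) {
    float max_bias      = 0.0f;
    float logit_softcap = 0.0f;
    memcpy(&max_bias,      (const float *) op->op_params + 1, sizeof(float));
    memcpy(&logit_softcap, (const float *) op->op_params + 2, sizeof(float));
    // Sinks (src[4]), ALiBi bias, and logit softcap are not translated.
    return op->src[4] == nullptr && max_bias == 0.0f && logit_softcap == 0.0f;
}
```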