
Commit 5b6418d

Dequantize q4_1 q4_k q6_k for NPU
1 parent c319ce5 commit 5b6418d
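Instead of excluding Q4_1 and Q4_K from the NPU's supported types (the NPU has poor support for asymmetric quantization), the backend now dequantizes Q4_1, Q4_K, and Q6_K weights to F16 while building weight nodes for the static (NPU) pipeline, so these types remain supported on every device.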

4 files changed: 26 additions, 18 deletions

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 18 additions & 7 deletions
@@ -370,7 +370,8 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
     return kv_param_res_names;
 }
 
-std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
+    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
     auto* nodes = cgraph->nodes;
@@ -395,7 +396,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
             }
         }
         if (should_create) {
-            auto weight_node = create_weight_node(src);
+            auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
            weight_node->set_friendly_name(src_name);
            {
                std::lock_guard<std::mutex> lock(weights_mutex);
@@ -409,7 +410,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
     std::set<ggml_type> weight_types = {
         GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -422,15 +423,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
     auto ne_total = ggml_nelements(tensor);
 
     OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
+    node_shape.erase(node_shape.begin());
 
     // F16 and F32 case
     if (node_type != ov::element::dynamic) {
         ov::Tensor weights(node_type, node_shape);
         memcpy(weights.data(), tensor->data, ne_total * node_type.size());
         std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        if (node_type == ov::element::f16) {
-            weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
-        }
+        // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU
+        // if (node_type == ov::element::f16) {
+        //     weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
+        // }
         weight_node->set_friendly_name(tensor->name);
         return weight_node;
     }
@@ -440,7 +443,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
         tensor->extra == nullptr,
         "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
 
-    node_shape.erase(node_shape.begin());
+    if (to_dequantize) {
+        std::vector<float> weights_f32(ne_total);
+        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
 
     uint64_t weights_per_byte;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 3 additions & 2 deletions
@@ -117,8 +117,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
 
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
+        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
 
     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 0 additions & 8 deletions
@@ -344,14 +344,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                   GGML_TYPE_Q8_0,
                                                   GGML_TYPE_Q6_K};
 
-    std::string device = std::string(getenv("GGML_OPENVINO_DEVICE"));
-    bool is_npu = device == "NPU";
-    if (is_npu) {
-        // NPU has poor support for asymmetric quantization
-        supported_types.erase(GGML_TYPE_Q4_1);
-        supported_types.erase(GGML_TYPE_Q4_K);
-    }
-
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
                                                  GGML_OP_MUL,
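With dequantization handled at weight-node creation, the environment-variable check is no longer needed here: Q4_1 and Q4_K stay in supported_types for every device, and the NPU's weak asymmetric-quantization support is worked around by converting those weights to F16 instead of rejecting the ops.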

ggml/src/ggml-openvino/utils.cpp

Lines changed: 5 additions & 1 deletion
@@ -130,7 +130,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         compile_end_time = conversion_end_time;
     } else {
         std::shared_ptr<ov::Model> model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+        std::set<ggml_type> types_to_dequantize;
+        if (is_static) {
+            types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
+        }
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
 
         if (is_static) {
             ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
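As the header's default argument implies, callers that do not need dequantization keep the old one-argument form; only the static (NPU) path passes a type set. A hypothetical pair of call sites:

// CPU/GPU path: weights stay quantized (types_to_dequantize defaults to {}).
auto weights = GgmlOvDecoder::create_weight_nodes(cgraph);

// NPU (static) path: dequantize the types the NPU handles poorly.
auto weights_npu = GgmlOvDecoder::create_weight_nodes(
    cgraph, {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K});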
