Commit 623f863

Add custom quant type: q8_1_c, q4_0_128
1 parent 5b6418d commit 623f863

5 files changed: +203 -68 lines changed

5 files changed

+203
-68
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 16 additions & 28 deletions
@@ -25,6 +25,7 @@
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <optional>
 #include <ostream>
 #include <set>
 #include <stdexcept>
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }

 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
-    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
+    struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
     auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
             }
         }
         if (should_create) {
-            auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
+            auto requant_type = types_to_requantize.count(src->type) ?
+                                    std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
+                                    std::nullopt;
+            auto weight_node = create_weight_node(src, requant_type);
             weight_node->set_friendly_name(src_name);
             {
                 std::lock_guard<std::mutex> lock(weights_mutex);
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }

-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
+                                                            std::optional<ExtraQuantType> requant_type) {
     std::set<ggml_type> weight_types = {
         GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         tensor->extra == nullptr,
         "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");

-    if (to_dequantize) {
-        std::vector<float> weights_f32(ne_total);
-        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
-        ov::Tensor weights(ov::element::f16, node_shape);
-        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
-        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        weight_node->set_friendly_name(tensor->name);
-        return weight_node;
+    if (requant_type.has_value()) {
+        return requantize(tensor, requant_type.value());
     }

-    uint64_t weights_per_byte;
+    ov::element::Type weight_type;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
-        weights_per_byte = 2;
+        weight_type = ov::element::u4;
     } else {  // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
-        weights_per_byte = 1;
+        weight_type = ov::element::u8;
     }

     uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         " has incompatible last dim shape: ",
         node_shape.back());

-    auto weights_shape = node_shape;
-    weights_shape.back() /= (weights_per_byte * 4);  // means u32 type can store 8 q4 or 4 q8
-
-    ov::Tensor weights(ov::element::u32, weights_shape);
-    // For scales and bias
+    ov::Tensor weights(weight_type, node_shape);
+    // For scales and biases
     node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
-
     ov::Tensor scales(ov::element::f16, node_shape);
     ov::Tensor biases(ov::element::f16, node_shape);
+
     ov::Output<ov::Node> weight_node;
     if (tensor->type == GGML_TYPE_Q4_0) {
         extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         extract_q8_0_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q6_K) {
-        // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
         extract_q6_k_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
     }

     OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
-    // weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
-    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));

     weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
-    // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
-    //                tensor->name,
-    //                ggml_type_name(tensor->type),
-    //                weight_node.get_element_type().get_type_name().c_str(),
-    //                weight_node.get_partial_shape().to_string().c_str());
     return weight_node.get_node_shared_ptr();
 }
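
Note on the API change above: create_weight_nodes now takes a map from ggml tensor type to a target ExtraQuantType instead of a set of types to dequantize, so the caller picks a requantization format per source type. A minimal call-site sketch (the map contents and the cgraph variable are illustrative only, not part of this commit):

    // Hypothetical caller: requantize Q4_K weights to 128-element-group Q4_0 and Q6_K weights
    // to per-channel Q8_1; other supported types keep the existing extract_*_data path.
    std::map<ggml_type, ExtraQuantType> types_to_requantize = {
        {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
        {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
    };
    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);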

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 5 additions & 2 deletions
@@ -4,8 +4,10 @@
 #include <map>
 #include <memory>
 #include <openvino/core/partial_shape.hpp>
+#include <optional>
 #include <vector>

+#include "ggml-quants.hpp"
 #include "ggml.h"
 #include "openvino/decoder.hpp"

@@ -117,9 +119,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

     static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);

-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
+                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
     static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
-        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
+        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});

     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
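
The header now pulls in ggml-quants.hpp for the ExtraQuantType used in the new signatures. Its declaration is not part of the hunks shown here; judging from the values referenced in requantize() below, it is roughly an enum along these lines (an assumed sketch, not the actual definition):

    // Assumed shape of the declaration in ggml-quants.hpp; the enumerator names are taken
    // from their uses in requantize(), everything else is a guess.
    enum class ExtraQuantType {
        F16,       // dequantize and store as an f16 constant
        Q4_0_C,    // 4-bit, one scale/bias per output channel (whole-row block)
        Q8_1_C,    // 8-bit asymmetric, one scale/bias per output channel
        Q4_0_128,  // 4-bit, one scale/bias per group of 128 elements
    };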

ggml/src/ggml-openvino/ggml-quants.cpp

Lines changed: 159 additions & 35 deletions
@@ -1,15 +1,20 @@
 #include "ggml-quants.hpp"

 #include <cstdint>
+#include <limits>
+#include <memory>
 #include <openvino/core/parallel.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
+#include <openvino/core/type/float16.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/subtract.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <string>

+#include "ggml-impl.h"
 #include "ggml.h"

 void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
 // TODO Reorder for make_intX_weights

 ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Reshape weight to (num_heads, -1, group_size)
     ov::Shape orig_shape = weight.get_shape();
-    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
-    size_t num_groups = orig_shape[1] / group_size;

     // Expand dimensions for scales and biases
     auto scale_shape = scales.get_shape();
-    scale_shape.push_back(1);
-    scales.set_shape(scale_shape);
-    biases.set_shape(scale_shape);
+
+    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
+
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_shape.push_back(1);
+        scales.set_shape(scale_shape);
+        biases.set_shape(scale_shape);
+    }

     // Create graph nodes
-    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(
+        ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
     ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
     );
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
-    );
-
-    // Reshape back to original dimensions
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
-    );
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false
-    );
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }

-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
 }

 ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Convert weight to uint8 view and adjust shape
     ov::Shape orig_weight_shape = weight.get_shape();
-    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation

     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
-    scale_bias_shape.push_back(1);  // Add new axis at the end
-    scales.set_shape(scale_bias_shape);
-    biases.set_shape(scale_bias_shape);

     // Create INT4 weight tensor
     ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         group_size
     };

+    // Requantized channel-wise case
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_bias_shape.push_back(1);
+        scales.set_shape(scale_bias_shape);
+        biases.set_shape(scale_bias_shape);
+    }
+
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);

     // Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);

-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape = std::make_shared<ov::op::v0::Constant>(
+            ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
+
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+}

-    // Reshape back to original shape
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
+    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
+    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));

-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false);
+    std::shared_ptr<ov::Node> weight_node;
+    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
+
+    if (requant_type == ExtraQuantType::F16) {
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }

-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    int64_t block_size = node_shape[1];
+    if (requant_type == ExtraQuantType::Q4_0_128) {
+        block_size = 128;
+    }
+    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
+
+    ov::Tensor weights;
+    ov::Tensor scales(ov::element::f16, scales_shape);
+    ov::Tensor bias(ov::element::f16, scales_shape);
+
+    if (requant_type == ExtraQuantType::Q4_0_C) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_1_C) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q4_0_128) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    }
+
+    weight_node->set_friendly_name(tensor->name);
+    return weight_node;
+}
+
+void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+        float max = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-8.f * d);
+
+        for (int j = 0; j < qk / 2; ++j) {
+            const float x0 = x[i * qk + 2 * j] * id;
+            const float x1 = x[i * qk + 2 * j + 1] * id;
+            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+        }
+    }
+}
+
+void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float min = std::numeric_limits<float>::max();
+        float max = std::numeric_limits<float>::lowest();
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (v < min) {
+                min = v;
+            }
+            if (v > max) {
+                max = v;
+            }
+        }
+
+        const float d = (max - min) / ((1 << 8) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(min);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = (x[i * qk + j] - min) * id;
+            const uint8_t xi0 = roundf(x0);
+            weights[i * qk + j] = xi0;
+        }
+    }
+}
 }
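
For reference, the block conventions used by the new helpers: quantize_q4_0 stores d = max / -8 and bias = -8 * d, so a 4-bit code q reconstructs as q * d + bias; quantize_q8_1 stores d = (max - min) / 255 and bias = min, so an 8-bit code q reconstructs as q * d + bias. A standalone round-trip sketch of that arithmetic (illustrative only; plain C++ without OpenVINO tensors, toy block values made up):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> block = {-0.50f, -0.25f, 0.0f, 0.75f};

        // Q4_0-style: scale from the signed value with the largest magnitude, implicit zero-point of 8.
        float maxv = *std::max_element(block.begin(), block.end(),
                                       [](float a, float b) { return std::fabs(a) < std::fabs(b); });
        float d = maxv / -8.0f, bias = -8.0f * d, id = d ? 1.0f / d : 0.0f;
        for (float v : block) {
            int q = std::min(15, (int) (v * id + 8.5f));  // same rounding as quantize_q4_0
            std::printf("q4_0: %+.3f -> %2d -> %+.3f\n", v, q, q * d + bias);
        }

        // Q8_1-style: asymmetric min/max range, 8-bit codes, bias stores the block minimum.
        auto mm = std::minmax_element(block.begin(), block.end());
        float d8 = (*mm.second - *mm.first) / 255.0f, id8 = d8 ? 1.0f / d8 : 0.0f;
        for (float v : block) {
            int q = (int) std::round((v - *mm.first) * id8);
            std::printf("q8_1: %+.3f -> %3d -> %+.3f\n", v, q, q * d8 + *mm.first);
        }
        return 0;
    }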
