Skip to content

Commit b5bfc0a

Browse files
committed
Add Q5_K to support phi-3-q4_k_m
1 parent 327e156 commit b5bfc0a

File tree

5 files changed

+124
-34
lines changed

5 files changed

+124
-34
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
448448
GGML_TYPE_Q4_0,
449449
GGML_TYPE_Q4_1,
450450
GGML_TYPE_Q4_K,
451+
GGML_TYPE_Q5_K,
451452
GGML_TYPE_Q6_K};
452453
if (weight_types.find(tensor->type) == weight_types.end()) {
453454
throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
@@ -486,12 +487,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
486487
ov::element::Type weight_type;
487488
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
488489
weight_type = ov::element::u4;
489-
} else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
490+
} else { // tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q6_K || tensor->type == GGML_TYPE_Q5_K
490491
weight_type = ov::element::u8;
491492
}
492493

493494
uint64_t weights_per_block;
494-
// here we only consider sub block, q6k:16 q4k:32
495+
// Here we only consider the sub-block size: Q6_K uses 16; Q4_K and Q5_K use 32.
495496
if (tensor->type == GGML_TYPE_Q6_K) {
496497
weights_per_block = 16;
497498
} else {
@@ -526,6 +527,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
526527
} else if (tensor->type == GGML_TYPE_Q4_K) {
527528
extract_q4_k_data(tensor, weights, scales, biases);
528529
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
530+
} else if (tensor->type == GGML_TYPE_Q5_K) {
531+
extract_q5_k_data(tensor, weights, scales, biases);
532+
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
529533
}
530534

531535
OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");

ggml/src/ggml-openvino/ggml-openvino.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
350350
GGML_TYPE_Q4_0,
351351
GGML_TYPE_Q4_1,
352352
GGML_TYPE_Q4_K,
353+
GGML_TYPE_Q5_K,
353354
GGML_TYPE_Q8_0,
354355
GGML_TYPE_Q6_K};
355356

ggml/src/ggml-openvino/ggml-quants.cpp

Lines changed: 111 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,30 @@
11
#include "ggml-quants.hpp"
22

3+
#include <algorithm>
4+
#include <cassert>
5+
#include <cmath>
6+
#include <cstddef>
37
#include <cstdint>
48
#include <limits>
59
#include <memory>
10+
#include <openvino/core/node.hpp>
11+
#include <openvino/core/node_output.hpp>
612
#include <openvino/core/parallel.hpp>
13+
#include <openvino/core/shape.hpp>
14+
#include <openvino/core/type/element_type.hpp>
715
#include <openvino/core/type/element_type_traits.hpp>
816
#include <openvino/core/type/float16.hpp>
917
#include <openvino/op/constant.hpp>
1018
#include <openvino/op/convert.hpp>
1119
#include <openvino/op/multiply.hpp>
1220
#include <openvino/op/reshape.hpp>
1321
#include <openvino/op/subtract.hpp>
22+
#include <openvino/op/util/attr_types.hpp>
1423
#include <openvino/runtime/tensor.hpp>
1524
#include <string>
25+
#include <vector>
1626

27+
#include "ggml-common.h"
1728
#include "ggml-impl.h"
1829
#include "ggml.h"
1930

@@ -38,10 +49,10 @@ void extract_q4_0_data(const ggml_tensor* tensor,
3849
ov::Tensor& scales_arr,
3950
ov::Tensor& biases_arr) {
4051
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
41-
auto data = static_cast<uint8_t*>(tensor->data);
42-
auto weights = static_cast<uint8_t*>(weights_arr.data());
43-
auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
44-
auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
52+
auto* data = static_cast<uint8_t*>(tensor->data);
53+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
54+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
55+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
4556

4657
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
4758
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
@@ -57,10 +68,10 @@ void extract_q4_1_data(const ggml_tensor* tensor,
5768
ov::Tensor& scales_arr,
5869
ov::Tensor& biases_arr) {
5970
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
60-
auto data = static_cast<uint8_t*>(tensor->data);
61-
auto weights = static_cast<uint8_t*>(weights_arr.data());
62-
auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
63-
auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
71+
auto* data = static_cast<uint8_t*>(tensor->data);
72+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
73+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
74+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
6475
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
6576
scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
6677
biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block + 2)));
@@ -76,22 +87,22 @@ void extract_q8_0_data(const ggml_tensor* tensor,
7687
ov::Tensor& biases_arr) {
7788
const uint64_t weights_per_block = 32;
7889
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
79-
auto data = static_cast<uint8_t*>(tensor->data);
80-
auto weights = static_cast<uint8_t*>(weights_arr.data());
81-
auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
82-
auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
83-
for (size_t i = 0; i < scales_arr.get_size(); i++) {
90+
auto* data = static_cast<uint8_t*>(tensor->data);
91+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
92+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
93+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
94+
95+
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
8496
uint8_t* block_data = data + i * bytes_per_block;
85-
scales[i] = ov::float16::from_bits(*(uint16_t*)block_data);
97+
scales[i] = ov::float16::from_bits(*(uint16_t*) block_data);
8698
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
8799
for (size_t j = 0; j < weights_per_block; ++j) {
88100
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
89-
// Original data is in int8_t, so we add a bias of -128 and invert the
90-
// first bit.
101+
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
91102
x ^= 1 << 7;
92103
weights[i * weights_per_block + j] = x;
93104
}
94-
}
105+
});
95106
}
96107

97108
void unpack_256_4(const uint8_t* data, uint8_t* dst) {
@@ -117,12 +128,11 @@ void extract_q4_k_data(const ggml_tensor* tensor,
117128
ov::Tensor& scales_arr,
118129
ov::Tensor& biases_arr) {
119130
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
120-
// TODO tensor->nb[3]
121131
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
122-
auto data = static_cast<uint8_t*>(tensor->data);
123-
auto weights = static_cast<uint8_t*>(weights_arr.data());
124-
auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
125-
auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
132+
auto* data = static_cast<uint8_t*>(tensor->data);
133+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
134+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
135+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
126136

127137
ov::parallel_for(n_super_block, [&](size_t i) {
128138
uint8_t* block_data = data + i * bytes_per_block;
@@ -170,28 +180,26 @@ void extract_q6_k_data(const ggml_tensor* tensor,
170180
ov::Tensor& biases_arr) {
171181
const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
172182
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
173-
auto data = static_cast<uint8_t*>(tensor->data);
174-
auto weights = static_cast<uint8_t*>(weights_arr.data());
175-
auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
176-
auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
177-
// std::string name(tensor.name, tensor.namelen);
178-
for (size_t i = 0; i < n_super_block; i++) {
183+
auto* data = static_cast<uint8_t*>(tensor->data);
184+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
185+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
186+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
187+
188+
ov::parallel_for(n_super_block, [&](size_t i) {
179189
uint8_t* block_data = data + i * bytes_per_block;
180190

181191
float scale_factor =
182-
static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data + 104))); // (128+64+16)/2
192+
static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 104))); // (128+64+16)/2
183193

184194
for (size_t j = 0; j < 16; j++) {
185195
scales[j + i * 16] =
186-
ov::float16(scale_factor * static_cast<float>(*((int8_t*)(block_data + 128 + 64 + j))));
196+
ov::float16(scale_factor * static_cast<float>(*((int8_t*) (block_data + 128 + 64 + j))));
187197
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
188198
}
189199

190-
// Extract ql and qh
191200
uint8_t* ql = block_data;
192201
uint8_t* qh = block_data + 128;
193202

194-
// Extract weights
195203
for (int64_t j = 0; j < 32; ++j) {
196204
weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
197205
weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
@@ -202,9 +210,80 @@ void extract_q6_k_data(const ggml_tensor* tensor,
202210
weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
203211
weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
204212
}
213+
});
214+
}
215+
216+
// Decode the 6-bit scale (*d) and 6-bit min (*m) for sub-block j (0..7) from
// the 12-byte packed scales array shared by the Q4_K/Q5_K super-block formats.
// Sub-blocks 0-3 store their values directly in the low 6 bits of bytes 0-7;
// sub-blocks 4-7 reassemble them from the nibbles of bytes 8-11 plus the top
// 2 bits of bytes 0-7.
static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) {
    if (j > 3) {
        *d = static_cast<uint8_t>((q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4));
        *m = static_cast<uint8_t>((q[j + 4] >> 4) | ((q[j] >> 6) << 4));
    } else {
        *d = static_cast<uint8_t>(q[j] & 63);
        *m = static_cast<uint8_t>(q[j + 4] & 63);
    }
}
207225

226+
void extract_q5_k_data(const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr,
227+
ov::Tensor& biases_arr) {
228+
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
229+
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
230+
auto* data = static_cast<uint8_t*>(tensor->data);
231+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
232+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
233+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
234+
235+
ov::parallel_for(n_super_block, [&](size_t i) {
236+
uint8_t* block_data = data + i * bytes_per_block;
237+
238+
const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data)));
239+
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t*) block_data + 1)));
240+
241+
const uint8_t* scales_data = block_data + 4; // 12 bytes of scales
242+
const uint8_t* qh = block_data + 4 + 12; // 32 bytes of high bits
243+
const uint8_t* ql = block_data + 4 + 12 + 32; // 128 bytes of low bits
244+
245+
int is = 0;
246+
uint8_t u1 = 1;
247+
uint8_t u2 = 2;
248+
249+
// Process 2 blocks in one iteration
250+
for (int j = 0; j < 256; j += 64) { // 256 = QK_K, so 4 iterations of 64
251+
uint8_t sc;
252+
uint8_t m;
253+
254+
// Get scale and min for first 32 elements
255+
get_scale_min_k4(is + 0, scales_data, &sc, &m);
256+
const float d1 = d * sc;
257+
const float m1 = min * m;
258+
259+
// Get scale and min for second 32 elements
260+
get_scale_min_k4(is + 1, scales_data, &sc, &m);
261+
const float d2 = d * sc;
262+
const float m2 = min * m;
263+
264+
scales[i * 8 + is] = ov::float16(d1);
265+
biases[i * 8 + is] = ov::float16(-m1);
266+
scales[i * 8 + is + 1] = ov::float16(d2);
267+
biases[i * 8 + is + 1] = ov::float16(-m2);
268+
269+
// Extract weights for first 32 elements (matching deq formula exactly)
270+
for (int l = 0; l < 32; ++l) {
271+
weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0);
272+
}
273+
274+
// Extract weights for second 32 elements
275+
for (int l = 0; l < 32; ++l) {
276+
weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 16 : 0);
277+
}
278+
279+
ql += 32;
280+
is += 2;
281+
u1 <<= 2;
282+
u2 <<= 2;
283+
}
284+
});
285+
}
286+
208287
// TODO Reorder for make_intX_weights
209288

210289
ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {

ggml/src/ggml-openvino/ggml-quants.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ void extract_q4_k_data(const ggml_tensor* tensor,
2929
ov::Tensor& scales_arr,
3030
ov::Tensor& biases_arr);
3131

32+
void extract_q5_k_data(const ggml_tensor* tensor,
33+
ov::Tensor& weights_arr,
34+
ov::Tensor& scales_arr,
35+
ov::Tensor& biases_arr);
36+
3237
void extract_q6_k_data(const ggml_tensor* tensor,
3338
ov::Tensor& weights_arr,
3439
ov::Tensor& scales_arr,

ggml/src/ggml-openvino/utils.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
283283
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
284284
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
285285
{GGML_TYPE_Q6_K, ExtraQuantType::F16 },
286+
{GGML_TYPE_Q5_K, ExtraQuantType::F16 },
286287
};
287288
}
288289
if (device == "GPU") {

0 commit comments

Comments
 (0)