11#include " ggml-quants.hpp"
22
3+ #include < algorithm>
4+ #include < cassert>
5+ #include < cmath>
6+ #include < cstddef>
37#include < cstdint>
48#include < limits>
59#include < memory>
10+ #include < openvino/core/node.hpp>
11+ #include < openvino/core/node_output.hpp>
612#include < openvino/core/parallel.hpp>
13+ #include < openvino/core/shape.hpp>
14+ #include < openvino/core/type/element_type.hpp>
715#include < openvino/core/type/element_type_traits.hpp>
816#include < openvino/core/type/float16.hpp>
917#include < openvino/op/constant.hpp>
1018#include < openvino/op/convert.hpp>
1119#include < openvino/op/multiply.hpp>
1220#include < openvino/op/reshape.hpp>
1321#include < openvino/op/subtract.hpp>
22+ #include < openvino/op/util/attr_types.hpp>
1423#include < openvino/runtime/tensor.hpp>
1524#include < string>
25+ #include < vector>
1626
27+ #include " ggml-common.h"
1728#include " ggml-impl.h"
1829#include " ggml.h"
1930
@@ -38,10 +49,10 @@ void extract_q4_0_data(const ggml_tensor* tensor,
3849 ov::Tensor& scales_arr,
3950 ov::Tensor& biases_arr) {
4051 const uint64_t bytes_per_block = 18 ; // 2 bytes scale, 32x0.5 byte weights
41- auto data = static_cast <uint8_t *>(tensor->data );
42- auto weights = static_cast <uint8_t *>(weights_arr.data ());
43- auto scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
44- auto biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
52+ auto * data = static_cast <uint8_t *>(tensor->data );
53+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
54+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
55+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
4556
4657 ov::parallel_for (scales_arr.get_size (), [&](size_t i) {
4758 scales[i] = ov::float16::from_bits (*((uint16_t *)(data + i * bytes_per_block)));
@@ -57,10 +68,10 @@ void extract_q4_1_data(const ggml_tensor* tensor,
5768 ov::Tensor& scales_arr,
5869 ov::Tensor& biases_arr) {
5970 const uint64_t bytes_per_block = 20 ; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
60- auto data = static_cast <uint8_t *>(tensor->data );
61- auto weights = static_cast <uint8_t *>(weights_arr.data ());
62- auto scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
63- auto biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
71+ auto * data = static_cast <uint8_t *>(tensor->data );
72+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
73+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
74+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
6475 ov::parallel_for (scales_arr.get_size (), [&](size_t i) {
6576 scales[i] = ov::float16::from_bits (*((uint16_t *)(data + i * bytes_per_block)));
6677 biases[i] = ov::float16::from_bits (*((uint16_t *)(data + i * bytes_per_block + 2 )));
@@ -76,22 +87,22 @@ void extract_q8_0_data(const ggml_tensor* tensor,
7687 ov::Tensor& biases_arr) {
7788 const uint64_t weights_per_block = 32 ;
7889 const uint64_t bytes_per_block = 34 ; // 2 bytes scale, 32x1 byte weights
79- auto data = static_cast <uint8_t *>(tensor->data );
80- auto weights = static_cast <uint8_t *>(weights_arr.data ());
81- auto scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
82- auto biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
83- for (size_t i = 0 ; i < scales_arr.get_size (); i++) {
90+ auto * data = static_cast <uint8_t *>(tensor->data );
91+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
92+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
93+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
94+
95+ ov::parallel_for (scales_arr.get_size (), [&](size_t i) {
8496 uint8_t * block_data = data + i * bytes_per_block;
85- scales[i] = ov::float16::from_bits (*(uint16_t *)block_data);
97+ scales[i] = ov::float16::from_bits (*(uint16_t *) block_data);
8698 biases[i] = ov::float16 (-128 .f * static_cast <float >(scales[i]));
8799 for (size_t j = 0 ; j < weights_per_block; ++j) {
88100 uint8_t x = block_data[j + 2 ]; // j+2 to skip the scale bytes.
89- // Original data is in int8_t, so we add a bias of -128 and invert the
90- // first bit.
101+ // Original data is in int8_t, so we add a bias of -128 and invert the first bit.
91102 x ^= 1 << 7 ;
92103 weights[i * weights_per_block + j] = x;
93104 }
94- }
105+ });
95106}
96107
97108void unpack_256_4 (const uint8_t * data, uint8_t * dst) {
@@ -117,12 +128,11 @@ void extract_q4_k_data(const ggml_tensor* tensor,
117128 ov::Tensor& scales_arr,
118129 ov::Tensor& biases_arr) {
119130 const uint64_t bytes_per_block = 2 + 2 + 12 + 128 ;
120- // TODO tensor->nb[3]
121131 const uint64_t n_super_block = tensor->nb [3 ] / bytes_per_block;
122- auto data = static_cast <uint8_t *>(tensor->data );
123- auto weights = static_cast <uint8_t *>(weights_arr.data ());
124- auto scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
125- auto biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
132+ auto * data = static_cast <uint8_t *>(tensor->data );
133+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
134+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
135+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
126136
127137 ov::parallel_for (n_super_block, [&](size_t i) {
128138 uint8_t * block_data = data + i * bytes_per_block;
@@ -170,28 +180,26 @@ void extract_q6_k_data(const ggml_tensor* tensor,
170180 ov::Tensor& biases_arr) {
171181 const uint64_t bytes_per_block = 128 + 64 + 16 + 2 ;
172182 const uint64_t n_super_block = tensor->nb [3 ] / bytes_per_block;
173- auto data = static_cast <uint8_t *>(tensor->data );
174- auto weights = static_cast <uint8_t *>(weights_arr.data ());
175- auto scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
176- auto biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
177- // std::string name(tensor.name, tensor.namelen);
178- for (size_t i = 0 ; i < n_super_block; i++ ) {
183+ auto * data = static_cast <uint8_t *>(tensor->data );
184+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
185+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
186+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
187+
188+ ov::parallel_for (n_super_block, [&] (size_t i) {
179189 uint8_t * block_data = data + i * bytes_per_block;
180190
181191 float scale_factor =
182- static_cast <float >(ov::float16::from_bits (*((uint16_t *)block_data + 104 ))); // (128+64+16)/2
192+ static_cast <float >(ov::float16::from_bits (*((uint16_t *) block_data + 104 ))); // (128+64+16)/2
183193
184194 for (size_t j = 0 ; j < 16 ; j++) {
185195 scales[j + i * 16 ] =
186- ov::float16 (scale_factor * static_cast <float >(*((int8_t *)(block_data + 128 + 64 + j))));
196+ ov::float16 (scale_factor * static_cast <float >(*((int8_t *) (block_data + 128 + 64 + j))));
187197 biases[j + i * 16 ] = ov::float16 (-32 .f * static_cast <float >(scales[j + i * 16 ]));
188198 }
189199
190- // Extract ql and qh
191200 uint8_t * ql = block_data;
192201 uint8_t * qh = block_data + 128 ;
193202
194- // Extract weights
195203 for (int64_t j = 0 ; j < 32 ; ++j) {
196204 weights[i * 256 + j] = (ql[j] & 0xF ) | (((qh[j] >> 0 ) & 3 ) << 4 );
197205 weights[i * 256 + j + 32 ] = (ql[32 + j] & 0xF ) | (((qh[j] >> 2 ) & 3 ) << 4 );
@@ -202,9 +210,80 @@ void extract_q6_k_data(const ggml_tensor* tensor,
202210 weights[i * 256 + j + 192 ] = (ql[64 + j] >> 4 ) | (((qh[32 + j] >> 4 ) & 3 ) << 4 );
203211 weights[i * 256 + j + 224 ] = (ql[96 + j] >> 4 ) | (((qh[32 + j] >> 6 ) & 3 ) << 4 );
204212 }
213+ });
214+ }
215+
// Decode the 6-bit scale (*d) and min (*m) for sub-block j (0..7) from the
// packed 12-byte K-quant scale array q. The first four pairs are stored as
// plain 6-bit fields; the last four are reassembled from the low nibbles of
// q[8..11] and the spare top two bits of the first eight bytes.
static inline void get_scale_min_k4(int j, const uint8_t* q, uint8_t* d, uint8_t* m) {
    if (j >= 4) {
        const uint8_t packed = q[j + 4];
        *d = static_cast<uint8_t>((packed & 0x0F) | ((q[j - 4] >> 6) << 4));
        *m = static_cast<uint8_t>((packed >> 4) | ((q[j] >> 6) << 4));
    } else {
        *d = static_cast<uint8_t>(q[j] & 0x3F);
        *m = static_cast<uint8_t>(q[j + 4] & 0x3F);
    }
}
207225
226+ void extract_q5_k_data (const ggml_tensor* tensor, ov::Tensor& weights_arr, ov::Tensor& scales_arr,
227+ ov::Tensor& biases_arr) {
228+ const uint64_t bytes_per_block = 4 + 12 + 32 + 128 ;
229+ const uint64_t n_super_block = tensor->nb [3 ] / bytes_per_block;
230+ auto * data = static_cast <uint8_t *>(tensor->data );
231+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
232+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
233+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
234+
235+ ov::parallel_for (n_super_block, [&](size_t i) {
236+ uint8_t * block_data = data + i * bytes_per_block;
237+
238+ const float d = static_cast <float >(ov::float16::from_bits (*((uint16_t *) block_data)));
239+ const float min = static_cast <float >(ov::float16::from_bits (*((uint16_t *) block_data + 1 )));
240+
241+ const uint8_t * scales_data = block_data + 4 ; // 12 bytes of scales
242+ const uint8_t * qh = block_data + 4 + 12 ; // 32 bytes of high bits
243+ const uint8_t * ql = block_data + 4 + 12 + 32 ; // 128 bytes of low bits
244+
245+ int is = 0 ;
246+ uint8_t u1 = 1 ;
247+ uint8_t u2 = 2 ;
248+
249+ // Process 2 blocks in one iteration
250+ for (int j = 0 ; j < 256 ; j += 64 ) { // 256 = QK_K, so 4 iterations of 64
251+ uint8_t sc;
252+ uint8_t m;
253+
254+ // Get scale and min for first 32 elements
255+ get_scale_min_k4 (is + 0 , scales_data, &sc, &m);
256+ const float d1 = d * sc;
257+ const float m1 = min * m;
258+
259+ // Get scale and min for second 32 elements
260+ get_scale_min_k4 (is + 1 , scales_data, &sc, &m);
261+ const float d2 = d * sc;
262+ const float m2 = min * m;
263+
264+ scales[i * 8 + is] = ov::float16 (d1);
265+ biases[i * 8 + is] = ov::float16 (-m1);
266+ scales[i * 8 + is + 1 ] = ov::float16 (d2);
267+ biases[i * 8 + is + 1 ] = ov::float16 (-m2);
268+
269+ // Extract weights for first 32 elements (matching deq formula exactly)
270+ for (int l = 0 ; l < 32 ; ++l) {
271+ weights[i * 256 + j + l] = (ql[l] & 0xF ) + ((qh[l] & u1) ? 16 : 0 );
272+ }
273+
274+ // Extract weights for second 32 elements
275+ for (int l = 0 ; l < 32 ; ++l) {
276+ weights[i * 256 + j + l + 32 ] = (ql[l] >> 4 ) + ((qh[l] & u2) ? 16 : 0 );
277+ }
278+
279+ ql += 32 ;
280+ is += 2 ;
281+ u1 <<= 2 ;
282+ u2 <<= 2 ;
283+ }
284+ });
285+ }
286+
208287// TODO Reorder for make_intX_weights
209288
210289ov::Output<ov::Node> make_int8_weights (ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
0 commit comments