11#include " ggml-quants.hpp"
22
33#include < cstdint>
4+ #include < limits>
5+ #include < memory>
46#include < openvino/core/parallel.hpp>
57#include < openvino/core/type/element_type_traits.hpp>
8+ #include < openvino/core/type/float16.hpp>
69#include < openvino/op/constant.hpp>
710#include < openvino/op/convert.hpp>
811#include < openvino/op/multiply.hpp>
912#include < openvino/op/reshape.hpp>
1013#include < openvino/op/subtract.hpp>
1114#include < openvino/runtime/tensor.hpp>
15+ #include < string>
1216
17+ #include " ggml-impl.h"
1318#include " ggml.h"
1419
1520void unpack_32_4 (const uint8_t * data, uint8_t * dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
203208// TODO Reorder for make_intX_weights
204209
205210ov::Output<ov::Node> make_int8_weights (ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
206-
207- // Reshape weight to (num_heads, -1, group_size)
208211 ov::Shape orig_shape = weight.get_shape ();
209- orig_shape[1 ] *= sizeof (uint32_t ) / sizeof (uint8_t );
210- size_t num_groups = orig_shape[1 ] / group_size;
211212
212213 // Expand dimensions for scales and biases
213214 auto scale_shape = scales.get_shape ();
214- scale_shape.push_back (1 );
215- scales.set_shape (scale_shape);
216- biases.set_shape (scale_shape);
215+
216+ ov::Shape packed_shape = {orig_shape[0 ], orig_shape[1 ] / group_size, group_size};
217+
218+ if (packed_shape[1 ] == 1 ) {
219+ packed_shape.erase (packed_shape.begin () + 1 );
220+ } else {
221+ scale_shape.push_back (1 );
222+ scales.set_shape (scale_shape);
223+ biases.set_shape (scale_shape);
224+ }
217225
218226 // Create graph nodes
219- auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8 , ov::Shape{orig_shape[0 ], num_groups, group_size}, static_cast <uint8_t *>(weight.data ()), nullptr );
227+ auto weights_node = std::make_shared<ov::op::v0::Constant>(
228+ ov::element::u8 , packed_shape, static_cast <uint8_t *>(weight.data ()), nullptr );
220229 weights_node->get_rt_info ()[" __gguf_tensor_holder" ] = weight;
221230 auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
222231 ov::Tensor biases_u8 (ov::element::u8 , scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
242251 auto w_zp = std::make_shared<ov::op::v1::Subtract>(
243252 weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
244253 );
245- auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
246- w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
247- );
248-
249- // Reshape back to original dimensions
250- auto final_shape = std::make_shared<ov::op::v0::Constant>(
251- ov::element::i64 , ov::Shape{orig_shape.size ()}, orig_shape
252- );
253- auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
254- w_zp_s, final_shape, false
255- );
254+ ov::Output<ov::Node> w_zp_s =
255+ std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
256+
257+ if (packed_shape.size () != 2 ) {
258+ // If not requantized channel-wise case, reshape back to original shape
259+ auto final_shape =
260+ std::make_shared<ov::op::v0::Constant>(ov::element::i64 , ov::Shape{orig_shape.size ()}, orig_shape);
261+ w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false );
262+ }
256263
257- return std::make_shared<ov::op::v0::Convert>(w_zp_s_r , ov::element::f32 );
264+ return std::make_shared<ov::op::v0::Convert>(w_zp_s , ov::element::f32 );
258265}
259266
260267ov::Output<ov::Node> make_int4_weights (ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
261-
262- // Convert weight to uint8 view and adjust shape
263268 ov::Shape orig_weight_shape = weight.get_shape ();
264- orig_weight_shape[1 ] *= sizeof (uint32_t ) / sizeof (uint8_t ) * 2 ; // Double number of columns for 4-bit representation
265269
266270 // Expand dimensions for scales and biases
267271 ov::Shape scale_bias_shape = scales.get_shape ();
268- scale_bias_shape.push_back (1 ); // Add new axis at the end
269- scales.set_shape (scale_bias_shape);
270- biases.set_shape (scale_bias_shape);
271272
272273 // Create INT4 weight tensor
273274 ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
276277 group_size
277278 };
278279
280+ // Requantized channel-wise case
281+ if (packed_shape[1 ] == 1 ) {
282+ packed_shape.erase (packed_shape.begin () + 1 );
283+ } else {
284+ scale_bias_shape.push_back (1 );
285+ scales.set_shape (scale_bias_shape);
286+ biases.set_shape (scale_bias_shape);
287+ }
288+
279289 auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast <uint8_t *>(weight.data ()), nullptr );
280- weights_node->get_rt_info ()[" __gguf_tensor_holde " ] = weight;
290+ weights_node->get_rt_info ()[" __gguf_tensor_holder " ] = weight;
281291 auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16 );
282292
283293 // Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
304314 auto w_zp = std::make_shared<ov::op::v1::Subtract>(
305315 weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
306316
307- auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
308- w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
317+ ov::Output<ov::Node> w_zp_s =
318+ std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
319+
320+ if (packed_shape.size () != 2 ) {
321+ // If not requantized channel-wise case, reshape back to original shape
322+ auto final_shape = std::make_shared<ov::op::v0::Constant>(
323+ ov::element::i64 , ov::Shape{orig_weight_shape.size ()}, orig_weight_shape);
324+
325+ w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false );
326+ }
327+
328+ return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32 );
329+ }
309330
310- // Reshape back to original shape
311- auto final_shape = std::make_shared<ov::op::v0::Constant>(
312- ov::element:: i64 , ov::Shape{orig_weight_shape. size ()}, orig_weight_shape );
331+ std::shared_ptr<ov::Node> requantize ( const ggml_tensor* tensor, ExtraQuantType requant_type) {
332+ std::vector< float > weights_f32 (tensor-> ne [ 0 ] * tensor-> ne [ 1 ]);
333+ ggml_get_type_traits (tensor-> type )-> to_float (tensor-> data , weights_f32. data (), ggml_nelements (tensor) );
313334
314- auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
315- w_zp_s, final_shape, false );
335+ std::shared_ptr<ov::Node> weight_node;
336+ ov::Shape node_shape = {(uint64_t ) (tensor->ne [1 ]), (uint64_t ) (tensor->ne [0 ])};
337+
338+ if (requant_type == ExtraQuantType::F16) {
339+ ov::Tensor weights (ov::element::f16 , node_shape);
340+ ggml_get_type_traits (GGML_TYPE_F16)->from_float_ref (weights_f32.data (), weights.data (), ggml_nelements (tensor));
341+ std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
342+ weight_node->set_friendly_name (tensor->name );
343+ return weight_node;
344+ }
316345
317- return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32 );
346+ int64_t block_size = node_shape[1 ];
347+ if (requant_type == ExtraQuantType::Q4_0_128) {
348+ block_size = 128 ;
349+ }
350+ auto scales_shape = ov::Shape{node_shape[0 ], node_shape[1 ] / block_size};
351+
352+ ov::Tensor weights;
353+ ov::Tensor scales (ov::element::f16 , scales_shape);
354+ ov::Tensor bias (ov::element::f16 , scales_shape);
355+
356+ if (requant_type == ExtraQuantType::Q4_0_C) {
357+ weights = ov::Tensor (ov::element::u4, node_shape);
358+ quantize_q4_0 (weights_f32.data (), weights, scales, bias, weights.get_size (), block_size);
359+ weight_node = make_int4_weights (weights, scales, bias, block_size).get_node_shared_ptr ();
360+ } else if (requant_type == ExtraQuantType::Q8_1_C) {
361+ weights = ov::Tensor (ov::element::u8 , node_shape);
362+ quantize_q8_1 (weights_f32.data (), weights, scales, bias, weights.get_size (), block_size);
363+ weight_node = make_int8_weights (weights, scales, bias, block_size).get_node_shared_ptr ();
364+ } else if (requant_type == ExtraQuantType::Q4_0_128) {
365+ weights = ov::Tensor (ov::element::u4, node_shape);
366+ quantize_q4_0 (weights_f32.data (), weights, scales, bias, weights.get_size (), block_size);
367+ weight_node = make_int4_weights (weights, scales, bias, block_size).get_node_shared_ptr ();
368+ }
369+
370+ weight_node->set_friendly_name (tensor->name );
371+ return weight_node;
372+ }
373+
374+ void quantize_q4_0 (const float * x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
375+ int64_t qk) {
376+ assert (k % qk == 0 );
377+ const int nb = k / qk;
378+
379+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
380+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
381+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
382+ for (int i = 0 ; i < nb; i++) {
383+ float amax = 0 .0f ; // absolute max
384+ float max = 0 .0f ;
385+
386+ for (int j = 0 ; j < qk; j++) {
387+ const float v = x[i * qk + j];
388+ if (amax < fabsf (v)) {
389+ amax = fabsf (v);
390+ max = v;
391+ }
392+ }
393+
394+ const float d = max / -8 ;
395+ const float id = d ? 1 .0f / d : 0 .0f ;
396+ scales[i] = ov::float16 (d);
397+ biases[i] = ov::float16 (-8 .f * d);
398+
399+ for (int j = 0 ; j < qk / 2 ; ++j) {
400+ const float x0 = x[i * qk + 2 * j] * id;
401+ const float x1 = x[i * qk + 2 * j + 1 ] * id;
402+ const uint8_t xi0 = MIN (15 , (int8_t ) (x0 + 8 .5f ));
403+ const uint8_t xi1 = MIN (15 , (int8_t ) (x1 + 8 .5f ));
404+ weights[i * qk / 2 + j] = xi0 | (xi1 << 4 );
405+ }
406+ }
407+ }
408+
409+ void quantize_q8_1 (const float * x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
410+ int64_t qk) {
411+ assert (k % qk == 0 );
412+ const int nb = k / qk;
413+
414+ auto * weights = static_cast <uint8_t *>(weights_arr.data ());
415+ auto * scales = scales_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
416+ auto * biases = biases_arr.data <ov::element_type_traits<ov::element::f16 >::value_type>();
417+ for (int i = 0 ; i < nb; i++) {
418+ float min = std::numeric_limits<float >::max ();
419+ float max = std::numeric_limits<float >::lowest ();
420+
421+ for (int j = 0 ; j < qk; j++) {
422+ const float v = x[i * qk + j];
423+ if (v < min) {
424+ min = v;
425+ }
426+ if (v > max) {
427+ max = v;
428+ }
429+ }
430+
431+ const float d = (max - min) / ((1 << 8 ) - 1 );
432+ const float id = d ? 1 .0f / d : 0 .0f ;
433+ scales[i] = ov::float16 (d);
434+ biases[i] = ov::float16 (min);
435+
436+ for (int j = 0 ; j < qk; ++j) {
437+ const float x0 = (x[i * qk + j] - min) * id;
438+ const uint8_t xi0 = roundf (x0);
439+ weights[i * qk + j] = xi0;
440+ }
441+ }
318442}
0 commit comments