@@ -425,25 +425,27 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     int64_t block_size = node_shape[1];
     if (requant_type == ExtraQuantType::Q4_0_128) {
         block_size = 128;
+    } else if (requant_type == ExtraQuantType::Q8_0_32) {
+        block_size = 32;
     }
     auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
 
     ov::Tensor weights;
     ov::Tensor scales(ov::element::f16, scales_shape);
     ov::Tensor bias(ov::element::f16, scales_shape);
 
-    if (requant_type == ExtraQuantType::Q4_0_C) {
+    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
         weights = ov::Tensor(ov::element::u4, node_shape);
         quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     } else if (requant_type == ExtraQuantType::Q8_1_C) {
         weights = ov::Tensor(ov::element::u8, node_shape);
         quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     }
 
     weight_node->set_friendly_name(tensor->name);
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 }
 
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
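+        // Symmetric mapping stored as unsigned bytes: q = round(x / d) lies in
+        // [-127, 127] and is shifted by +128 into u8 range; the -128 * d bias
+        // cancels the shift on dequantization (w * d + bias == q * d).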
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-128.0f * d);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t)(xi0 + 128);
+        }
+    }
+}
+
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk) {
     assert(k % qk == 0);
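
For reference, a minimal standalone sketch (not part of the commit) of the round trip this mapping implies. It uses plain arrays in place of ov::Tensor, arbitrary sine-wave test data, and the 32-element block size of Q8_0_32; reconstruction w * d + bias should match the input to within half a quantization step:

// q8_0_roundtrip.cpp -- quantize one block with the same math as
// quantize_q8_0 above, then dequantize and check the error bound.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    constexpr int QK = 32;  // block size, as in Q8_0_32
    float x[QK];
    for (int j = 0; j < QK; ++j) {
        x[j] = std::sin(0.3f * j);  // arbitrary test data
    }

    // Quantize: same math as quantize_q8_0 for a single block.
    float amax = 0.0f;
    for (int j = 0; j < QK; ++j) {
        amax = std::max(amax, std::fabs(x[j]));
    }
    const float d = amax / 127.0f;
    const float id = d ? 1.0f / d : 0.0f;
    const float bias = -128.0f * d;
    uint8_t w[QK];
    for (int j = 0; j < QK; ++j) {
        w[j] = (uint8_t)((int8_t)std::roundf(x[j] * id) + 128);
    }

    // Dequantize: w * d + bias == (q + 128) * d - 128 * d == q * d,
    // and |q * d - x| <= d / 2 because q = round(x / d).
    for (int j = 0; j < QK; ++j) {
        const float rec = w[j] * d + bias;
        assert(std::fabs(rec - x[j]) <= 0.5f * d + 1e-6f);
    }
    std::printf("round trip ok, max abs error <= d/2 = %g\n", 0.5f * d);
    return 0;
}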