Commit 623f863

Add custom quant type: q8_1_c, q4_0_128
1 parent 5b6418d commit 623f863

5 files changed: +203 -68 lines changed

5 files changed

+203
-68
lines changed

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 16 additions & 28 deletions
@@ -25,6 +25,7 @@
 #include <openvino/op/parameter.hpp>
 #include <openvino/op/unsqueeze.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <optional>
 #include <ostream>
 #include <set>
 #include <stdexcept>
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }

 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
-    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
+    struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
     auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
             }
         }
         if (should_create) {
-            auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
+            auto requant_type = types_to_requantize.count(src->type) ?
+                                    std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
+                                    std::nullopt;
+            auto weight_node = create_weight_node(src, requant_type);
             weight_node->set_friendly_name(src_name);
             {
                 std::lock_guard<std::mutex> lock(weights_mutex);
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }

-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
+                                                            std::optional<ExtraQuantType> requant_type) {
     std::set<ggml_type> weight_types = {
         GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         tensor->extra == nullptr,
         "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");

-    if (to_dequantize) {
-        std::vector<float> weights_f32(ne_total);
-        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
-        ov::Tensor weights(ov::element::f16, node_shape);
-        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
-        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        weight_node->set_friendly_name(tensor->name);
-        return weight_node;
+    if (requant_type.has_value()) {
+        return requantize(tensor, requant_type.value());
     }

-    uint64_t weights_per_byte;
+    ov::element::Type weight_type;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
-        weights_per_byte = 2;
+        weight_type = ov::element::u4;
     } else {  // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
-        weights_per_byte = 1;
+        weight_type = ov::element::u8;
     }

     uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         " has incompatible last dim shape: ",
         node_shape.back());

-    auto weights_shape = node_shape;
-    weights_shape.back() /= (weights_per_byte * 4);  // means u32 type can store 8 q4 or 4 q8
-
-    ov::Tensor weights(ov::element::u32, weights_shape);
-    // For scales and bias
+    ov::Tensor weights(weight_type, node_shape);
+    // For scales and biases
     node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
-
     ov::Tensor scales(ov::element::f16, node_shape);
     ov::Tensor biases(ov::element::f16, node_shape);
+
     ov::Output<ov::Node> weight_node;
     if (tensor->type == GGML_TYPE_Q4_0) {
         extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         extract_q8_0_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q6_K) {
-        // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
         extract_q6_k_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
     }

     OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
-    // weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
-    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));

     weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
-    // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
-    //                tensor->name,
-    //                ggml_type_name(tensor->type),
-    //                weight_node.get_element_type().get_type_name().c_str(),
-    //                weight_node.get_partial_shape().to_string().c_str());
     return weight_node.get_node_shared_ptr();
 }
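
Note on the API change above: create_weight_nodes now takes a map from ggml tensor type to a target ExtraQuantType instead of a set of types to dequantize, so the caller picks a requantization format per source type. A minimal call-site sketch (the map contents and the cgraph variable are illustrative only, not part of this commit):

    // Hypothetical caller: requantize Q4_K weights to 128-element-group Q4_0 and Q6_K weights
    // to per-channel Q8_1; other supported types keep the existing extract_*_data path.
    std::map<ggml_type, ExtraQuantType> types_to_requantize = {
        {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
        {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
    };
    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);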

ggml/src/ggml-openvino/ggml-decoder.h

Lines changed: 5 additions & 2 deletions
@@ -4,8 +4,10 @@
 #include <map>
 #include <memory>
 #include <openvino/core/partial_shape.hpp>
+#include <optional>
 #include <vector>

+#include "ggml-quants.hpp"
 #include "ggml.h"
 #include "openvino/decoder.hpp"

@@ -117,9 +119,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

     static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);

-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
+                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
     static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
-        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
+        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});

     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
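
The header now pulls in ggml-quants.hpp for the ExtraQuantType used in the new signatures. Its declaration is not part of the hunks shown here; judging from the values referenced in requantize() below, it is roughly an enum along these lines (an assumed sketch, not the actual definition):

    // Assumed shape of the declaration in ggml-quants.hpp; the enumerator names are taken
    // from their uses in requantize(), everything else is a guess.
    enum class ExtraQuantType {
        F16,       // dequantize and store as an f16 constant
        Q4_0_C,    // 4-bit, one scale/bias per output channel (whole-row block)
        Q8_1_C,    // 8-bit asymmetric, one scale/bias per output channel
        Q4_0_128,  // 4-bit, one scale/bias per group of 128 elements
    };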

ggml/src/ggml-openvino/ggml-quants.cpp

Lines changed: 159 additions & 35 deletions
@@ -1,15 +1,20 @@
 #include "ggml-quants.hpp"

 #include <cstdint>
+#include <limits>
+#include <memory>
 #include <openvino/core/parallel.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
+#include <openvino/core/type/float16.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/multiply.hpp>
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/subtract.hpp>
 #include <openvino/runtime/tensor.hpp>
+#include <string>

+#include "ggml-impl.h"
 #include "ggml.h"

 void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
 // TODO Reorder for make_intX_weights

 ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Reshape weight to (num_heads, -1, group_size)
     ov::Shape orig_shape = weight.get_shape();
-    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
-    size_t num_groups = orig_shape[1] / group_size;

     // Expand dimensions for scales and biases
     auto scale_shape = scales.get_shape();
-    scale_shape.push_back(1);
-    scales.set_shape(scale_shape);
-    biases.set_shape(scale_shape);
+
+    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
+
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_shape.push_back(1);
+        scales.set_shape(scale_shape);
+        biases.set_shape(scale_shape);
+    }

     // Create graph nodes
-    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(
+        ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
     ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
     );
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
-    );
-
-    // Reshape back to original dimensions
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
-    );
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false
-    );
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }

-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
 }

 ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Convert weight to uint8 view and adjust shape
     ov::Shape orig_weight_shape = weight.get_shape();
-    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation

     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
-    scale_bias_shape.push_back(1);  // Add new axis at the end
-    scales.set_shape(scale_bias_shape);
-    biases.set_shape(scale_bias_shape);

     // Create INT4 weight tensor
     ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         group_size
     };

+    // Requantized channel-wise case
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_bias_shape.push_back(1);
+        scales.set_shape(scale_bias_shape);
+        biases.set_shape(scale_bias_shape);
+    }
+
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);

     // Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);

-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape = std::make_shared<ov::op::v0::Constant>(
+            ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
+
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+}

-    // Reshape back to original shape
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
+    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
+    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));

-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false);
+    std::shared_ptr<ov::Node> weight_node;
+    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
+
+    if (requant_type == ExtraQuantType::F16) {
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }

-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    int64_t block_size = node_shape[1];
+    if (requant_type == ExtraQuantType::Q4_0_128) {
+        block_size = 128;
+    }
+    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
+
+    ov::Tensor weights;
+    ov::Tensor scales(ov::element::f16, scales_shape);
+    ov::Tensor bias(ov::element::f16, scales_shape);
+
+    if (requant_type == ExtraQuantType::Q4_0_C) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_1_C) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q4_0_128) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    }
+
+    weight_node->set_friendly_name(tensor->name);
+    return weight_node;
+}
+
+void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+        float max = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-8.f * d);
+
+        for (int j = 0; j < qk / 2; ++j) {
+            const float x0 = x[i * qk + 2 * j] * id;
+            const float x1 = x[i * qk + 2 * j + 1] * id;
+            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+        }
+    }
+}
+
+void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float min = std::numeric_limits<float>::max();
+        float max = std::numeric_limits<float>::lowest();
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (v < min) {
+                min = v;
+            }
+            if (v > max) {
+                max = v;
+            }
+        }
+
+        const float d = (max - min) / ((1 << 8) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(min);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = (x[i * qk + j] - min) * id;
+            const uint8_t xi0 = roundf(x0);
+            weights[i * qk + j] = xi0;
+        }
+    }
+}
 }
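
For reference, the block conventions used by the new helpers: quantize_q4_0 stores d = max / -8 and bias = -8 * d, so a 4-bit code q reconstructs as q * d + bias; quantize_q8_1 stores d = (max - min) / 255 and bias = min, so an 8-bit code q reconstructs as q * d + bias. A standalone round-trip sketch of that arithmetic (illustrative only; plain C++ without OpenVINO tensors, toy block values made up):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> block = {-0.50f, -0.25f, 0.0f, 0.75f};

        // Q4_0-style: scale from the signed value with the largest magnitude, implicit zero-point of 8.
        float maxv = *std::max_element(block.begin(), block.end(),
                                       [](float a, float b) { return std::fabs(a) < std::fabs(b); });
        float d = maxv / -8.0f, bias = -8.0f * d, id = d ? 1.0f / d : 0.0f;
        for (float v : block) {
            int q = std::min(15, (int) (v * id + 8.5f));  // same rounding as quantize_q4_0
            std::printf("q4_0: %+.3f -> %2d -> %+.3f\n", v, q, q * d + bias);
        }

        // Q8_1-style: asymmetric min/max range, 8-bit codes, bias stores the block minimum.
        auto mm = std::minmax_element(block.begin(), block.end());
        float d8 = (*mm.second - *mm.first) / 255.0f, id8 = d8 ? 1.0f / d8 : 0.0f;
        for (float v : block) {
            int q = (int) std::round((v - *mm.first) * id8);
            std::printf("q8_1: %+.3f -> %3d -> %+.3f\n", v, q, q * d8 + *mm.first);
        }
        return 0;
    }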
