@@ -370,7 +370,8 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
370370 return kv_param_res_names;
371371}
372372
373- std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes (struct ggml_cgraph * cgraph) {
373+ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes (
374+ struct ggml_cgraph * cgraph, std::set<ggml_type> types_to_dequantize) {
374375 std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
375376 static std::mutex weights_mutex;
376377 auto * nodes = cgraph->nodes ;
@@ -395,7 +396,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
395396 }
396397 }
397398 if (should_create) {
398- auto weight_node = create_weight_node (src);
399+ auto weight_node = create_weight_node (src, types_to_dequantize. count (src-> type ) > 0 );
399400 weight_node->set_friendly_name (src_name);
400401 {
401402 std::lock_guard<std::mutex> lock (weights_mutex);
@@ -409,7 +410,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
409410 return model_weights;
410411}
411412
412- std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node (ggml_tensor* tensor) {
413+ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node (ggml_tensor* tensor, bool to_dequantize ) {
413414 std::set<ggml_type> weight_types = {
414415 GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
415416 if (weight_types.find (tensor->type ) == weight_types.end ()) {
@@ -422,15 +423,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
422423 auto ne_total = ggml_nelements (tensor);
423424
424425 OPENVINO_ASSERT (node_shape[0 ] == 1 , " Got 3D weights, expect all weights to be 2D: " , tensor->name );
426+ node_shape.erase (node_shape.begin ());
425427
426428 // F16 and F32 case
427429 if (node_type != ov::element::dynamic) {
428430 ov::Tensor weights (node_type, node_shape);
429431 memcpy (weights.data (), tensor->data , ne_total * node_type.size ());
430432 std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
431- if (node_type == ov::element::f16 ) {
432- weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32 );
433- }
433+ // Disabled because it triggers a bug in NPUW; no performance impact on CPU or GPU
434+ // if (node_type == ov::element::f16) {
435+ // weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
436+ // }
434437 weight_node->set_friendly_name (tensor->name );
435438 return weight_node;
436439 }
@@ -440,7 +443,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
440443 tensor->extra == nullptr ,
441444 " Unsupported weight tensor: " + std::string (tensor->name ) + " Possibly this is a repacked quantized weights" );
442445
443- node_shape.erase (node_shape.begin ());
446+ if (to_dequantize) {
447+ std::vector<float > weights_f32 (ne_total);
448+ ggml_get_type_traits (tensor->type )->to_float (tensor->data , weights_f32.data (), ggml_nelements (tensor));
449+ ov::Tensor weights (ov::element::f16 , node_shape);
450+ ggml_get_type_traits (GGML_TYPE_F16)->from_float_ref (weights_f32.data (), weights.data (), ggml_nelements (tensor));
451+ std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
452+ weight_node->set_friendly_name (tensor->name );
453+ return weight_node;
454+ }
444455
445456 uint64_t weights_per_byte;
446457 if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
0 commit comments