@@ -414,6 +414,13 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     std::shared_ptr<ov::Node> weight_node;
     ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
 
+    // FIXME: hardcoded workaround for the case where token_embd.weight is q4_0 (instead of q6_k).
+    // (Some q4_0 models use two different weights for token_embd and output; in those, token_embd is q4_0.)
+    std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
+    if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") {
+        requant_type = ExtraQuantType::F16;
+    }
+
     if (requant_type == ExtraQuantType::F16) {
         ov::Tensor weights(ov::element::f16, node_shape);
         ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
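For context, here is a minimal sketch of the override logic in isolation. This is not the committed code: `select_requant_type` is a hypothetical helper name, and the extra `ExtraQuantType` variant is assumed; only `ExtraQuantType::F16`, the `GGML_OPENVINO_DEVICE` environment variable, and the `token_embd.weight` tensor name come from the diff above.

```cpp
#include <cstdlib>
#include <string>

enum class ExtraQuantType { F16, Q4_0 /* other variants assumed */ };

// Hypothetical helper mirroring the workaround: when targeting the NPU,
// force the token embedding table to F16 regardless of the requested type.
ExtraQuantType select_requant_type(const char* tensor_name, ExtraQuantType requested) {
    const char* env = std::getenv("GGML_OPENVINO_DEVICE");
    std::string device = env ? env : "";
    if (device == "NPU" && std::string(tensor_name) == "token_embd.weight") {
        return ExtraQuantType::F16;
    }
    return requested;
}
```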
@@ -473,7 +480,16 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 
     const float d = max / -8;
-    const float id = d ? 1.0f / d : 0.0f;
+
+    if (d == 0) {
+        scales[i] = ov::float16(1.0f);
+        biases[i] = ov::float16(-8.0f);
+        uint8_t zp = 8;
+        memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
+        continue;
+    }
+
+    const float id = 1.0f / d;
    scales[i] = ov::float16(d);
    biases[i] = ov::float16(-8.f * d);