@@ -218,7 +218,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
 
     auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
    for (size_t i = 0; i < ov_output_names.size(); i++) {
-        auto result_name = ov_output_names[i];
+        auto & result_name = ov_output_names[i];
         const auto output_tensor = infer_request.get_output_tensor(i);
 
         std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
@@ -243,20 +243,34 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     GGML_UNUSED(backend);
 }
 
-ov::AnyMap get_npu_prefill_config() {
-    ov::AnyMap config = {
+namespace {
+ov::AnyMap get_npu_base_config() {
+    return {
         {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
         {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"},
         {"NPU_USE_NPUW", "YES"},
         {"NPUW_DEVICES", "NPU"},
         {"NPUW_FOLD", "YES"},
         {"NPUW_WEIGHTS_BANK", "shared"},
-        {"NPUW_FUNCALL_ASYNC", "YES"},
         {"NPUW_FUNCALL_FOR_ALL", "YES"},
         {"NPUW_DQ", "YES"},
         {"NPUW_DQ_FULL", "NO"},
         {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
     };
+}
+} // namespace
+
+ov::AnyMap get_npu_prefill_config() {
+    auto config = get_npu_base_config();
+    config.emplace("NPUW_FUNCALL_ASYNC", "NO");
+    config.emplace("NPUW_ACC_CHECK", "YES");
+    config.emplace("NPUW_ACC_DEVICE", "CPU");
+    return config;
+}
+
+ov::AnyMap get_npu_generate_config() {
+    auto config = get_npu_base_config();
+    config.emplace("NPUW_FUNCALL_ASYNC", "YES");
     return config;
 }
 
@@ -266,7 +280,7 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
             {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
             {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
             {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
-            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
+            {GGML_TYPE_Q6_K, ExtraQuantType::F16},
         };
     }
     if (device == "GPU") {
@@ -278,12 +292,6 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
     return {};
 }
 
-ov::AnyMap get_npu_generate_config() {
-    ov::AnyMap config = get_npu_prefill_config();
-    config.emplace("NPUW_UNFOLD_IREQS", "YES");
-    return config;
-}
-
 bool is_naive(struct ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     return cgraph->n_nodes < naive_graph_size_threshold;
@@ -373,7 +381,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, cons
 
     } else if (const auto * op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name));
                op && op->op == GGML_OP_SET_ROWS && is_static && is_first_token) {
-        input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1});
+        input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, 1, 1});
     } else {
         input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
     }