11#include " utils.h"
22
33#include < algorithm>
4+ #include < cassert>
45#include < cmath>
56#include < cstddef>
67#include < cstdint>
@@ -70,15 +71,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
7071 ov::AnyMap config;
7172 if (device == " NPU" ) {
7273 config = {
73- {" NPU_COMPILATION_MODE_PARAMS" , " compute-layers-with-higher-precision=ReduceMean" },
74- {" NPU_USE_NPUW" , " YES" },
75- {" NPUW_DEVICES" , " NPU" },
76- {" NPUW_FOLD" , " YES" },
77- {" NPUW_DQ" , " YES" },
78- {" NPUW_FUNCALL_ASYNC" , " YES" },
79- {" NPUW_HOST_GATHER" , " YES" },
80- {" NPUW_WEIGHTS_BANK" , " shared" },
81- // {"NPU_COMPILER_TYPE", "MLIR"},
74+ { " NPU_COMPILATION_MODE_PARAMS" , " compute-layers-with-higher-precision=ReduceMean" },
75+ { " NPU_USE_NPUW" , " YES" },
76+ { " NPUW_DEVICES" , " NPU" },
77+ { " NPUW_FOLD" , " YES" },
78+ { " NPUW_HOST_GATHER" , " YES" },
79+ { " NPUW_DQ" , " YES" },
80+ { " NPUW_FUNCALL_ASYNC" , " YES" },
81+ { " NPUW_WEIGHTS_BANK" , " shared" },
82+ // Option 'CACHE_DIR' is not supported with MLIR compiler type
83+ // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""},
84+ { " NPU_COMPILER_TYPE" , " MLIR" },
8285 };
8386 }
8487
@@ -102,15 +105,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
102105 int64_t conversion_end_time;
103106 int64_t compile_end_time;
104107
108+ bool is_first_token = is_prefill (cgraph);
109+
105110 auto it = compiled_cache_prefill.find (cgraph);
106- bool is_first_token = it == compiled_cache_prefill.end ();
107- if (!is_first_token) {
111+ if (it != compiled_cache_prefill.end ()) {
108112 ggml_decoder = get_ggml_decoder (cgraph, is_static, false );
109113 decoder_end_time = ggml_time_us ();
110114
111115 if (is_static) {
112- model = compiled_cache_kvcache[cgraph].first ;
113- compiled_model = compiled_cache_kvcache[cgraph].second ;
116+ if (is_first_token) {
117+ model = compiled_cache_prefill[cgraph].first ;
118+ compiled_model = compiled_cache_prefill[cgraph].second ;
119+ } else {
120+ model = compiled_cache_kvcache[cgraph].first ;
121+ compiled_model = compiled_cache_kvcache[cgraph].second ;
122+ }
114123 } else {
115124 model = it->second .first ;
116125 compiled_model = it->second .second ;
@@ -235,8 +244,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
235244 }
236245 auto end_time = ggml_time_us ();
237246
238- is_first_token = false ;
239-
240247 if (getenv (" GGML_OPENVINO_PROFILING" )) {
241248 GGML_LOG_INFO (" GGML OpenVINO Backend: \n " );
242249 GGML_LOG_INFO (" - Graph decoder Time: %ld ms \n " , (decoder_end_time - start_time) / 1000 );
@@ -305,3 +312,20 @@ void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
305312 matrix[i * dim + i] = 0 .0f ;
306313 }
307314}
315+
316+ bool is_prefill (struct ggml_cgraph * cgraph) {
317+ for (int i = 0 ; i < cgraph->n_nodes ; ++i) {
318+ auto * op = cgraph->nodes [i];
319+ for (int j = 0 ; j < GGML_MAX_SRC; ++j) {
320+ auto * src = op->src [j];
321+ if (src == nullptr ) {
322+ break ;
323+ }
324+ if (std::string (src->name ) == " inp_tokens" ) {
325+ return src->ne [0 ] != 1 ;
326+ }
327+ }
328+ }
329+ GGML_LOG_ERROR (" is_prefill: inp_tokens not found in cgraph" );
330+ throw std::runtime_error (" is_prefill: inp_tokens not found in cgraph" );
331+ }
0 commit comments