WIP: NPU works; CPU and GPU still need to be fixed

wine99 · wine99 · commit 222199544488 · 2025-11-03T13:08:46.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -39,7 +39,6 @@
 GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
                              ggml_cgraph * cgraph,
                              bool is_static,
-                             bool is_first_token,
                              int context_size,
                              int context_size_swa,
                              int num_heads,
@@ -55,25 +54,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
     m_num_heads(num_heads),
     m_num_heads_kv(num_heads_kv),
     m_head_size(head_size),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
+    m_is_static(is_static) {
     set_input_output(node);
 }
 
 GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                              std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
-                             bool is_static,
-                             bool is_first_token) :
+                             bool is_static) :
     m_cgraph(cgraph),
     m_op_name(m_node ? std::string(m_node->name) : ""),
     m_model_weights(model_weights),
-    m_is_static(is_static),
-    m_is_first_token(is_first_token) {
-    if (is_first_token && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
+    m_is_static(is_static) {
+    if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
+        unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
         print_tensor_address_map(cgraph);
     }
 
     set_llm_params();
+    validate_cgraph();
 
     for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
         auto * cur_node = cgraph->nodes[node_n];
@@ -300,41 +298,39 @@ void GgmlOvDecoder::set_llm_params() {
     }
 }
 
+void GgmlOvDecoder::validate_cgraph() const {
+    if (m_is_static && m_input_len != 1) {
+        throw std::runtime_error("Static graph (NPU) must have input_len == 1, but got " + std::to_string(m_input_len) +
+                                 ", try set -ub 1");
+    }
+}
+
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * src) const {
     auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (name == "inp_tokens" || name == "inp_pos") {
-        if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, 1};
-            }
-        } else {
-            input_shape = ov::PartialShape{1, 1, -1};
-        }
-    } else if (name == "inp_out_ids" && !m_is_static) {
-        input_shape = ov::PartialShape{1, 1, -1};
+
+    if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
+        input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {
-            if (m_is_first_token) {
-                input_shape = ov::PartialShape{1, m_context_size, m_context_size};
-            } else {
-                input_shape = ov::PartialShape{1, 1, m_context_size};
-            }
+            input_shape = ov::PartialShape{1, 1, m_context_size};
         } else {
             input_shape = ov::PartialShape{1, -1, -1};
         }
+
     } else if (name.find("cache_") == 0) {
+        auto past_token_len = -1;
         if (m_is_static) {
             int layer = extract_layer_from_name(name);
             bool is_swa = is_swa_layer(layer);
-            input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
-        } else {
-            input_shape = ov::PartialShape{1, -1, m_num_heads_kv, m_head_size};
+            past_token_len = is_swa ? m_context_size_swa : m_context_size;
         }
+        input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
+
     } else if (const auto * op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
         input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
+
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
         input_shape = ov::PartialShape{get_shape(src->view_src)};
@@ -748,9 +744,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
 
 void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
     for (const auto & node : m_nodes) {
-        auto decoder =
-            std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
-                                            m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
+        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
+                                                       m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
         node_visitor(decoder);
     }
 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -16,14 +16,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     // Graph decoder
     GgmlOvDecoder(ggml_cgraph * cgraph,
                   std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
-                  bool is_static,
-                  bool is_first_token);
+                  bool is_static);
 
     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(ggml_tensor * node,
                   ggml_cgraph * cgraph,
                   bool is_static,
-                  bool is_first_token,
                   int context_size,
                   int context_size_swa,
                   int num_heads,
@@ -129,8 +127,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual bool is_static() const override { return m_is_static; }
 
-    virtual bool is_first_token() const override { return m_is_first_token; }
-
     ov::PartialShape get_graph_input_shape(const ggml_tensor * src) const;
 
     static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -157,6 +153,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     // set context_size, num_heads, etc
     void set_llm_params();
+    void validate_cgraph() const;
 
     ggml_cgraph * m_cgraph = nullptr;
     ggml_tensor * m_node = nullptr;
@@ -185,7 +182,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
     int32_t * m_rope_params;
     std::vector<std::string> m_kv_names;
     bool m_is_static = false;
-    bool m_is_first_token;
 };
 
 void print_tensor_address_map(const ggml_cgraph * cgraph);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -65,7 +65,6 @@ class GgmlDecoder : public DecoderBase {
     virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
 
     virtual bool is_static() const = 0;
-    virtual bool is_first_token() const = 0;
     virtual int get_context_size() const = 0;
     virtual int get_context_size_swa() const = 0;
     virtual int is_swa_layer(int layer) const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -97,12 +97,7 @@ class NodeContext : public frontend::NodeContext {
     int get_op_case() const {
         return m_decoder->get_op_case();
     }
-    bool is_static() const {
-        return m_decoder->is_static();
-    }
-    bool is_first_token() const {
-        return m_decoder->is_first_token();
-    }
+    bool is_static() const { return m_decoder->is_static(); }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -33,10 +33,6 @@ OutputVector translate_set_rows(const NodeContext & context) {
     auto dst_shape = context.get_output_shape(0).to_shape();
     FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS");
 
-    if (context.is_static() && context.is_first_token()) {
-        return rename_outputs_with_suffix({data}, context.get_name());
-    }
-
     auto indices = context.get_input(1);
     auto dst = context.get_input(context.get_output_name());
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h