2727#include < openvino/op/constant.hpp>
2828#include < openvino/op/convert.hpp>
2929#include < openvino/op/parameter.hpp>
30- #include < openvino/op/unsqueeze.hpp>
3130#include < openvino/runtime/tensor.hpp>
3231#include < optional>
3332#include < ostream>
3938GgmlOvDecoder::GgmlOvDecoder (ggml_tensor * node,
4039 ggml_cgraph * cgraph,
4140 bool is_static,
42- bool is_first_token,
4341 int context_size,
4442 int context_size_swa,
4543 int num_heads,
@@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
5553 m_num_heads(num_heads),
5654 m_num_heads_kv(num_heads_kv),
5755 m_head_size(head_size),
58- m_is_static(is_static),
59- m_is_first_token(is_first_token) {
56+ m_is_static(is_static) {
6057 set_input_output (node);
6158}
6259
6360GgmlOvDecoder::GgmlOvDecoder (ggml_cgraph * cgraph,
6461 std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
65- bool is_static,
66- bool is_first_token) :
62+ bool is_static) :
6763 m_cgraph(cgraph),
6864 m_op_name(m_node ? std::string(m_node->name) : ""),
6965 m_model_weights(model_weights),
70- m_is_static(is_static),
71- m_is_first_token(is_first_token ) {
72- if (is_first_token && getenv (" GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS" )) {
66+ m_is_static(is_static) {
67+ if ( auto * env = getenv ( " GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS " ); env && std::string (env) != " 0 " ) {
68+ unsetenv (" GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS" );
7369 print_tensor_address_map (cgraph);
7470 }
7571
7672 set_llm_params ();
73+ validate_cgraph ();
7774
7875 for (int node_n = 0 ; node_n < cgraph->n_nodes ; node_n++) {
7976 auto * cur_node = cgraph->nodes [node_n];
@@ -160,8 +157,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
160157 // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
161158 static std::set<std::string> debug_output_names = {};
162159 // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
163- if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || node_name. find ( " result " ) == 0 ||
164- debug_output_names.count (node_name)) {
160+ if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
161+ node_name. find ( " output " ) != std::string::npos || debug_output_names.count (node_name)) {
165162 if (node->op == GGML_OP_SET_ROWS) {
166163 assert (node_name.find (" cache_k" ) == 0 || node_name.find (" cache_v" ) == 0 );
167164 if (auto it = std::find (m_kv_names.begin (), m_kv_names.end (), node_name); it == m_kv_names.end ()) {
@@ -285,53 +282,54 @@ void GgmlOvDecoder::set_llm_params() {
285282 } else {
286283 m_context_size = cache_k->ne [1 ];
287284 }
288- } else if (node->op == GGML_OP_ROPE &&
289- (name.find (" Qcur-0" ) == 0 || std::string (node->src [0 ]->name ).find (" Qcur-0" ) == 0 )) {
290- m_head_size = node->ne [0 ];
291- m_num_heads = node->ne [1 ];
292- m_rope_params = node->op_params ;
293- } else if (node->op == GGML_OP_ROPE &&
294- (name.find (" Kcur-0" ) == 0 || std::string (node->src [0 ]->name ).find (" Kcur-0" ) == 0 )) {
295- m_num_heads_kv = node->ne [1 ];
285+ } else if (node->op == GGML_OP_ROPE) {
286+ if (name.find (" Qcur-0" ) == 0 || std::string (node->src [0 ]->name ).find (" Qcur-0" ) == 0 ) {
287+ m_head_size = node->ne [0 ];
288+ m_num_heads = node->ne [1 ];
289+ m_rope_params = node->op_params ;
290+ auto * inp_pos = node->src [1 ];
291+ m_input_len = inp_pos->ne [0 ];
292+ m_past_kv_len = *(int32_t *) inp_pos->data ;
293+ } else if (name.find (" Kcur-0" ) == 0 || std::string (node->src [0 ]->name ).find (" Kcur-0" ) == 0 ) {
294+ m_num_heads_kv = node->ne [1 ];
295+ }
296296 }
297297 }
298298}
299299
// Sanity-check the parameters gathered from the cgraph before it is translated.
// Throws std::runtime_error when the decoder is in static mode (m_is_static) and
// m_input_len is anything other than 1; the message reports the offending length.
// NOTE(review): in the visible part of set_llm_params, m_input_len is assigned only
// when a GGML_OP_ROPE node named "Qcur-0" (or fed by one) is found. Confirm the
// member has a well-defined initial value for graphs without such a node, otherwise
// this check reads an unset value.
300+ void GgmlOvDecoder::validate_cgraph () const {
301+ if (m_is_static && m_input_len != 1) {
// The message text associates static graphs with NPU and suggests the -ub 1 option.
302+ throw std::runtime_error (" Static graph (NPU) must have input_len == 1, but got " + std::to_string (m_input_len) +
303+ " , try set -ub 1" );
304+ }
305+ }
306+
300307ov::PartialShape GgmlOvDecoder::get_graph_input_shape (const ggml_tensor * src) const {
301308 auto name = std::string (src->name );
302309 ov::PartialShape input_shape;
303- if (name == " inp_tokens" || name == " inp_pos" ) {
304- if (m_is_static) {
305- if (m_is_first_token) {
306- input_shape = ov::PartialShape{1 , 1 , m_context_size};
307- } else {
308- input_shape = ov::PartialShape{1 , 1 , 1 };
309- }
310- } else {
311- input_shape = ov::PartialShape{1 , 1 , -1 };
312- }
313- } else if (name == " inp_out_ids" && !m_is_static) {
314- input_shape = ov::PartialShape{1 , 1 , -1 };
310+
311+ if (name == " inp_tokens" || name == " inp_pos" || name == " inp_out_ids" ) {
312+ input_shape = ov::PartialShape{1 , 1 , m_is_static ? 1 : -1 };
313+
315314 } else if (name.find (" KQ_mask" ) == 0 ) {
316315 if (m_is_static) {
317- if (m_is_first_token) {
318- input_shape = ov::PartialShape{1 , m_context_size, m_context_size};
319- } else {
320- input_shape = ov::PartialShape{1 , 1 , m_context_size};
321- }
316+ input_shape = ov::PartialShape{1 , 1 , m_context_size};
322317 } else {
323318 input_shape = ov::PartialShape{1 , -1 , -1 };
324319 }
320+
325321 } else if (name.find (" cache_" ) == 0 ) {
322+ auto past_token_len = -1 ;
326323 if (m_is_static) {
327324 int layer = extract_layer_from_name (name);
328325 bool is_swa = is_swa_layer (layer);
329- input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
330- } else {
331- input_shape = ov::PartialShape{1 , -1 , m_num_heads_kv, m_head_size};
326+ past_token_len = is_swa ? m_context_size_swa : m_context_size;
332327 }
328+ input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
329+
333330 } else if (const auto * op = get_tensor_used_op (src); op && op->op == GGML_OP_SET_ROWS) {
334331 input_shape = ov::PartialShape{1 , 1 , m_is_static ? 1 : -1 };
332+
335333 } else if (src->op == GGML_OP_VIEW) {
336334 // This case is added to make test-backend-ops work
337335 input_shape = ov::PartialShape{get_shape (src->view_src )};
@@ -745,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
745743
// Invoke node_visitor once per entry of m_nodes, in iteration order.
// For each node a fresh single-node GgmlOvDecoder is built with std::make_shared,
// forwarding the cgraph pointer, the static flag, and the context/head/SWA members
// of this decoder, and the resulting shared_ptr is handed to the callback.
746744void GgmlOvDecoder::visit_subgraph (std::function<void (std::shared_ptr<GgmlDecoder>)> node_visitor) const {
747745 for (const auto & node : m_nodes) {
// Diff markers: the three "-" lines are the previous construction call, which also
// passed m_is_first_token; the two "+" lines are its replacement without that argument.
748- auto decoder =
749- std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
750- m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
746+ auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
747+ m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
751748 node_visitor (decoder);
752749 }
753750}
0 commit comments