2727#include < openvino/op/constant.hpp>
2828#include < openvino/op/convert.hpp>
2929#include < openvino/op/parameter.hpp>
30- #include < openvino/op/unsqueeze.hpp>
3130#include < openvino/runtime/tensor.hpp>
3231#include < optional>
3332#include < ostream>
3938GgmlOvDecoder::GgmlOvDecoder (ggml_tensor * node,
4039 ggml_cgraph * cgraph,
4140 bool is_static,
42- bool is_first_token,
4341 int context_size,
4442 int context_size_swa,
4543 int num_heads,
@@ -55,25 +53,24 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_tensor * node,
5553 m_num_heads(num_heads),
5654 m_num_heads_kv(num_heads_kv),
5755 m_head_size(head_size),
58- m_is_static(is_static),
59- m_is_first_token(is_first_token) {
56+ m_is_static(is_static) {
6057 set_input_output (node);
6158}
6259
6360GgmlOvDecoder::GgmlOvDecoder (ggml_cgraph * cgraph,
6461 std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
65- bool is_static,
66- bool is_first_token) :
62+ bool is_static) :
6763 m_cgraph(cgraph),
6864 m_op_name(m_node ? std::string(m_node->name) : ""),
6965 m_model_weights(model_weights),
70- m_is_static(is_static),
71- m_is_first_token(is_first_token ) {
72- if (is_first_token && getenv (" GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS" )) {
66+ m_is_static(is_static) {
67+ if ( auto * env = getenv ( " GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS " ); env && std::string (env) != " 0 " ) {
68+ unsetenv (" GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS" );
7369 print_tensor_address_map (cgraph);
7470 }
7571
7672 set_llm_params ();
73+ validate_cgraph ();
7774
7875 for (int node_n = 0 ; node_n < cgraph->n_nodes ; node_n++) {
7976 auto * cur_node = cgraph->nodes [node_n];
@@ -300,41 +297,39 @@ void GgmlOvDecoder::set_llm_params() {
300297 }
301298}
302299
// Sanity-checks the decoder state for the static-shape path.
//
// Throws std::runtime_error when the decoder is static (m_is_static) and
// m_input_len is anything other than 1. The message reports the offending
// input length and suggests running with "-ub 1".
// Does nothing for non-static decoders or when m_input_len == 1.
//
// NOTE(review): m_input_len is presumably populated by set_llm_params(), which
// the cgraph constructor calls immediately before this check - confirm.
300+ void GgmlOvDecoder::validate_cgraph () const {
// The error text labels the static graph as "(NPU)"; the condition itself only
// looks at m_is_static and m_input_len.
301+ if (m_is_static && m_input_len != 1 ) {
302+ throw std::runtime_error (" Static graph (NPU) must have input_len == 1, but got " + std::to_string (m_input_len) +
303+ " , try set -ub 1" );
304+ }
305+ }
306+
303307ov::PartialShape GgmlOvDecoder::get_graph_input_shape (const ggml_tensor * src) const {
304308 auto name = std::string (src->name );
305309 ov::PartialShape input_shape;
306- if (name == " inp_tokens" || name == " inp_pos" ) {
307- if (m_is_static) {
308- if (m_is_first_token) {
309- input_shape = ov::PartialShape{1 , 1 , m_context_size};
310- } else {
311- input_shape = ov::PartialShape{1 , 1 , 1 };
312- }
313- } else {
314- input_shape = ov::PartialShape{1 , 1 , -1 };
315- }
316- } else if (name == " inp_out_ids" && !m_is_static) {
317- input_shape = ov::PartialShape{1 , 1 , -1 };
310+
311+ if (name == " inp_tokens" || name == " inp_pos" || name == " inp_out_ids" ) {
312+ input_shape = ov::PartialShape{1 , 1 , m_is_static ? 1 : -1 };
313+
318314 } else if (name.find (" KQ_mask" ) == 0 ) {
319315 if (m_is_static) {
320- if (m_is_first_token) {
321- input_shape = ov::PartialShape{1 , m_context_size, m_context_size};
322- } else {
323- input_shape = ov::PartialShape{1 , 1 , m_context_size};
324- }
316+ input_shape = ov::PartialShape{1 , 1 , m_context_size};
325317 } else {
326318 input_shape = ov::PartialShape{1 , -1 , -1 };
327319 }
320+
328321 } else if (name.find (" cache_" ) == 0 ) {
322+ auto past_token_len = -1 ;
329323 if (m_is_static) {
330324 int layer = extract_layer_from_name (name);
331325 bool is_swa = is_swa_layer (layer);
332- input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
333- } else {
334- input_shape = ov::PartialShape{1 , -1 , m_num_heads_kv, m_head_size};
326+ past_token_len = is_swa ? m_context_size_swa : m_context_size;
335327 }
328+ input_shape = ov::PartialShape{past_token_len, m_num_heads_kv, m_head_size};
329+
336330 } else if (const auto * op = get_tensor_used_op (src); op && op->op == GGML_OP_SET_ROWS) {
337331 input_shape = ov::PartialShape{1 , 1 , m_is_static ? 1 : -1 };
332+
338333 } else if (src->op == GGML_OP_VIEW) {
339334 // This case is added to make test-backend-ops work
340335 input_shape = ov::PartialShape{get_shape (src->view_src )};
@@ -748,9 +743,8 @@ int32_t * GgmlOvDecoder::get_output_op_params(const std::string & name) const {
748743
// Walks every entry of m_nodes and hands each one to `node_visitor` wrapped in
// its own per-node GgmlOvDecoder.
//
// @param node_visitor  Callback invoked once per node, in m_nodes iteration
//                      order, with a newly created shared_ptr to that node's
//                      decoder.
//
// Each per-node decoder is built from the node plus this decoder's m_cgraph,
// m_is_static, context sizes, head counts, head size and m_swa_layers.
// In this diff the removed ('-') form of the construction also forwarded
// m_is_first_token; the added ('+') form no longer passes it.
749744void GgmlOvDecoder::visit_subgraph (std::function<void (std::shared_ptr<GgmlDecoder>)> node_visitor) const {
750745 for (const auto & node : m_nodes) {
751- auto decoder =
752- std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token, m_context_size,
753- m_context_size_swa, m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
746+ auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_context_size, m_context_size_swa,
747+ m_num_heads, m_num_heads_kv, m_head_size, m_swa_layers);
// NOTE(review): the shared_ptr<GgmlOvDecoder> is passed where a
// shared_ptr<GgmlDecoder> is expected, so GgmlOvDecoder presumably derives
// from GgmlDecoder - confirm against the class declaration.
754748 node_visitor (decoder);
755749 }
756750}
0 commit comments