Commit a8fa0e5

wine99 authored and ravi9 committed
Fix llama-perplexity
1 parent edcebc0 commit a8fa0e5

3 files changed: +71 −58 lines changed


ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 42 additions & 25 deletions
@@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() {
 }
 
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+    if (name == "inp_tokens" || name == "inp_pos") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, 1, m_context_size};
@@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
         } else {
             input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
         }
-    } else if (std::string(src->name) == "KQ_mask") {
+    } else if (name == "inp_out_ids" && !m_is_static) {
+        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+    } else if (name == "KQ_mask") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, m_context_size, m_context_size};
@@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
             auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
             input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
         }
-    } else if (std::string(src->name).find("cache_k") == 0) {
+    } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
-    } else if (std::string(src->name).find("cache_v") == 0) {
+    } else if (name.find("cache_v") == 0) {
         input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
@@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
 
 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
-    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //    Not used for NPU
+    // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
+    //    llama-perplexity.
+    // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
+    //    Not used for NPU
+    int64_t past_token_len = -1;
     int64_t attention_size = -1;
 
-    int64_t past_token_len = -1;
+    int64_t token_len = -1;
     int64_t past_token_len_from_inp_pos = -1;
     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
             if (node->src[1]->type != GGML_TYPE_I32) {
                 throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
             }
+            token_len = node->src[1]->ne[0];
             past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
         }
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() {
             break;
         }
     }
+
     if (past_token_len == -1) {
         throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
     }
     if (past_token_len != past_token_len_from_inp_pos) {
-        throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " +
-                                 std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos));
+        GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
+                       past_token_len,
+                       past_token_len_from_inp_pos);
     }
 
-    for (const auto& node : m_nodes) {
-        if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
-            int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = GGML_PAD(total_token_len, 32);
-            std::string name = "attention_size";
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-            param_node->set_friendly_name(name);
-            param_node->output(0).get_tensor().set_names({name});
-            m_model_extra_inputs[name] = param_node;
-
-            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-            *tensor->data<int64_t>() = attention_size;
-            m_model_extra_input_values[name] = tensor;
-            break;
-        }
+    {
+        std::string name = "past_token_len";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
+
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = past_token_len;
+        m_model_extra_input_values[name] = tensor;
+    }
+    {
+        int64_t total_token_len = token_len + past_token_len;
+        attention_size = GGML_PAD(total_token_len, 32);
+        std::string name = "attention_size";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
+
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = attention_size;
+        m_model_extra_input_values[name] = tensor;
     }
 }

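Note on this file: `past_token_len` is now passed to the model as its own extra input, which decouples the kv-cache write position from inp_pos[0] (the case llama-perplexity exercises), and `attention_size` is derived from it as the 32-aligned total sequence length. A minimal sketch of that padding arithmetic, assuming GGML_PAD keeps its usual round-up-to-a-multiple definition from ggml.h; the token counts below are hypothetical:

    #include <cassert>
    #include <cstdint>

    // Round x up to the next multiple of n (n must be a power of two), as in ggml.h.
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main() {
        int64_t past_token_len = 512;  // tokens already in the kv cache (hypothetical)
        int64_t token_len      = 500;  // tokens in the current batch (hypothetical)
        int64_t attention_size = GGML_PAD(past_token_len + token_len, 32);
        assert(attention_size == 1024);  // 1012 rounded up to the next multiple of 32
        return 0;
    }
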
ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 23 additions & 30 deletions
@@ -5,6 +5,7 @@
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
+#include <openvino/op/add.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
@@ -78,67 +79,59 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
     // cache_k layout: [S, N, H] (seq, num_heads, head_size)
     // cache_v layout: [N, H, S] (num_heads, head_size, seq)
     // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
-    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
-    std::shared_ptr<ov::Node> update_indices_k;
-    std::shared_ptr<ov::Node> update_indices_v;
+    Output<Node> update_indices_k;
+    Output<Node> update_indices_v;
 
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
     auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
     auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
     auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
     auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
 
-    update_indices_k =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
-    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
-    update_indices_k->set_friendly_name("update_indices_k");
-    tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
+    auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
+    auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
+    auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+
+    Output<Node> update_indices = std::make_shared<ov::op::v4::Range>(
+        past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64);
+    if (ggml_model_decoder.is_static()) {
+        update_indices = past_token_len;
+    }
+
+    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices, one);
+    update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k");
+    tensor_map.insert({"update_indices_k", update_indices_k});
 
     auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
     auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
     auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
 
     // 1D tensor of shape [total_head_size], values starting from 0
     auto range_row =
-        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32);
+        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
     auto range_row_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
     auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
         range_row_reshaped,
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // 1D tensor of shape [token_len], values starting from past_token_len
-    auto range_col =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+    auto range_col = update_indices;
     auto range_col_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
     auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
         range_col_reshaped,
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
-    auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
+    update_indices_v = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
     update_indices_v = std::make_shared<ov::op::v1::Reshape>(
-        indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
-    update_indices_v->set_friendly_name("update_indices_v");
-    tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
-}
-
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-#ifndef M_PI
-#    define M_PI 3.14159265358979323846
-#endif
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = std::max(0.0f, start);
-    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+        update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
+    update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v");
+    tensor_map.insert({"update_indices_v", update_indices_v});
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {

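For readers who do not think in OpenVINO ops: the Range/Unsqueeze branch above produces the cache_k row indices [past_token_len, past_token_len + token_len), and the Broadcast/Concat/Reshape chain pairs every flattened head row with those columns for cache_v. A rough plain-C++ restatement of what the subgraph computes at runtime (illustrative only; the helper names are invented here):

    #include <cstdint>
    #include <utility>
    #include <vector>

    // cache_k update rows: one sequence index per new token.
    std::vector<int64_t> update_indices_k(int64_t past_token_len, int64_t token_len) {
        std::vector<int64_t> idx;
        for (int64_t i = 0; i < token_len; ++i) {
            idx.push_back(past_token_len + i);
        }
        return idx;
    }

    // cache_v is viewed as [N*H, S]; every (row, col) pair for the new tokens is
    // listed explicitly, matching the final [-1, 2] reshape in the diff.
    std::vector<std::pair<int64_t, int64_t>> update_indices_v(int64_t total_head_size,
                                                              int64_t past_token_len,
                                                              int64_t token_len) {
        std::vector<std::pair<int64_t, int64_t>> idx;
        for (int64_t row = 0; row < total_head_size; ++row) {
            for (int64_t col = past_token_len; col < past_token_len + token_len; ++col) {
                idx.emplace_back(row, col);
            }
        }
        return idx;
    }

On the static path the subgraph is bypassed and the raw `past_token_len` input is used as the index directly, as the `is_static()` branch in the diff shows.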
ggml/src/ggml-openvino/utils.cpp

Lines changed: 6 additions & 3 deletions
@@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor)
             std::cout << *(tensor.data<float>()) << std::endl;
             break;
         case ov::element::f16:
-            std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+            std::cout << *(tensor.data<ov::float16>()) << std::endl;
             break;
         case ov::element::i32:
-            std::cout << *(tensor.data<int32_t>()) << std::endl;
+            for (size_t i = 0; i < tensor.get_size(); ++i) {
+                std::cout << tensor.data<int32_t>()[i] << " ";
+            }
+            std::cout << std::endl;
             break;
         case ov::element::i64:
             std::cout << *(tensor.data<int64_t>()) << std::endl;
@@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
             std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
             break;
         case ov::element::f16:
-            std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+            std::cout << *(tensor.data<ov::float16>()) << std::endl;
             std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
             break;
         default:

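The utils.cpp change reads f16 values through `tensor.data<ov::float16>()` directly and prints every element of an i32 input tensor instead of only the first, which makes multi-token inputs such as inp_pos readable when debugging. The i32 branch as a self-contained helper (a sketch: the wrapper function and its name are invented, only the loop comes from the commit):

    #include <cstdint>
    #include <iostream>
    #include <openvino/runtime/tensor.hpp>

    // Print all int32 elements of a tensor on one line, as the new branch does.
    void print_i32_tensor(const ov::Tensor& tensor) {
        for (size_t i = 0; i < tensor.get_size(); ++i) {
            std::cout << tensor.data<int32_t>()[i] << " ";
        }
        std::cout << std::endl;
    }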