Commit 1f149a6

[llm_bench] add infer latency metrics for genai (#1391)

1 parent c244054 · commit 1f149a6
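This change replaces the `None` placeholder previously passed for per-token inference timings with the actual `token_infer_durations` values reported by GenAI's `perf_metrics.raw_metrics`, converted to seconds, in both the speech-to-text and text-generation benchmark tasks.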

File tree

2 files changed: +3 additions, −2 deletions

tools/llm_bench/task/speech_to_text_generation.py

Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
                 - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
             ).tolist()
             tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
-            tm_infer_list = None
+            tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
             result_text = result_text.texts[0]
         else:
             start = time.perf_counter()
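The double division in the new line is a plain unit conversion. A minimal sketch of the same arithmetic (the sample values, and the assumption that `token_infer_durations` is reported in microseconds, are illustrative rather than taken from the GenAI sources):

import numpy as np

# Illustrative raw values; assumes token_infer_durations is reported in
# microseconds, which the /1000/1000 in the patch would turn into seconds.
token_infer_durations_us = np.array([15200.0, 9800.0, 10100.0])

# Same conversion as the patched line: microseconds -> milliseconds -> seconds.
tm_infer_list = (token_infer_durations_us / 1000 / 1000).tolist()
print(tm_infer_list)  # [0.0152, 0.0098, 0.0101]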

tools/llm_bench/task/text_generation.py

Lines changed: 2 additions & 1 deletion

@@ -294,6 +294,7 @@ def token_printer():
             np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000,
             np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000
         )
+        inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
         iter_data = gen_output_data.gen_iterate_data(
             iter_idx=num,
             in_size=num_input_tokens * args['batch_size'],
@@ -313,7 +314,7 @@ def token_printer():
             num,
             iter_data,
             tm_list.tolist(),
-            None,
+            inference_durations.tolist(),
             warm_up=(num == 0),
             max_rss_mem=max_rss_mem_consumption,
             max_shared_mem=max_shared_mem_consumption,
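Passing the full list of per-token durations through, rather than a single pre-computed average, lets downstream reporting derive whatever statistic it needs. A sketch of that idea, assuming durations already converted to seconds as above (the helper below is hypothetical and not part of llm_bench):

import numpy as np

def summarize_infer_latency(inference_durations):
    # Hypothetical helper: reduces a list of per-token inference durations
    # (in seconds) to a few summary statistics.
    durations = np.asarray(inference_durations)
    return {
        'mean_s': float(np.mean(durations)),
        'median_s': float(np.median(durations)),
        'p95_s': float(np.percentile(durations, 95)),
    }

print(summarize_infer_latency([0.0152, 0.0098, 0.0101]))
# e.g. {'mean_s': 0.0117, 'median_s': 0.0101, 'p95_s': 0.0147}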
