diff --git a/tools/llm_bench/llm_bench_utils/metrics_print.py b/tools/llm_bench/llm_bench_utils/metrics_print.py
index 905decf72b..de9d0126f8 100644
--- a/tools/llm_bench/llm_bench_utils/metrics_print.py
+++ b/tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -149,7 +149,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
             avg_input_size = int(avg_input_size / index_num)
             if avg_2nd_tokens_latency > 0:
                 avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000
-            latency_unit = 'token' if is_text_gen is True else 'step'
+            tput_unit = latency_unit = 'token' if is_text_gen is True else 'step'
             if batch_size > 1:
                 if is_text_gen is True:
                     latency_unit = '{}tokens'.format(batch_size)
@@ -157,7 +157,7 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch
                     latency_unit = '{}steps'.format(batch_size)
             avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}'
             avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}'
-            avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s'
+            avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {tput_unit}s/s'
             prefix = f'[ INFO ] [Average] P[{p_idx}]L[{loop_idx}]' if loop_idx != -1 else f'[ INFO ] [Average] P[{p_idx}]'
             if is_text_gen is True:
                 output_info = ''
diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py
index ad49109bab..be9c9ab295 100644
--- a/tools/llm_bench/task/speech_to_text_generation.py
+++ b/tools/llm_bench/task/speech_to_text_generation.py
@@ -51,10 +51,10 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
         )
         end = time.perf_counter()
         perf_metrics = result_text.perf_metrics
-        first_token_time = perf_metrics.get_ttft().mean / args["batch_size"]
+        first_token_time = perf_metrics.get_ttft().mean
         second_tokens_durations = (
             np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
-            - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+            - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
         tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
         tm_infer_list = []
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index d936721344..4c715848c6 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -228,10 +228,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
         per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
     else:
         log.warning("No generated tokens")
-    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) / args["batch_size"]
+    first_token_time = (perf_metrics.get_ttft().mean - perf_metrics.raw_metrics.tokenization_durations[-1] / 1000) * args["batch_size"]
     second_tokens_durations = (
         np.array(perf_metrics.raw_metrics.m_new_token_times[1:])
-        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1]) / args["batch_size"]
+        - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
     ).tolist()
     tm_list = np.array([first_token_time] + second_tokens_durations) / 1000
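
For reference, below is a minimal runnable sketch of what the hunks above change. All numeric values are illustrative placeholders, not benchmark output; only the formatting and delta logic mirrors the patch. With batch_size > 1, latency_unit is rewritten to e.g. '2tokens', so reusing it for throughput used to print a garbled '2tokenss/s'; the separate tput_unit keeps throughput in plain tokens/s. The inter-token durations in the generation tasks likewise become plain differences of the raw new-token timestamps (the old trailing '/ args["batch_size"]' bound only to the second np.array due to operator precedence, so it never scaled the deltas as presumably intended).

import numpy as np

batch_size = 2
is_text_gen = True
avg_2nd_tokens_latency = 25.0  # illustrative: mean second-token latency, ms

# Throughput derived from second-token latency, as in metrics_print.py.
avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000

# The fix: keep a singular unit for throughput even when batch_size > 1
# rewrites the latency unit to '2tokens' / '2steps'.
tput_unit = latency_unit = 'token' if is_text_gen else 'step'
if batch_size > 1:
    latency_unit = '{}tokens'.format(batch_size) if is_text_gen else '{}steps'.format(batch_size)

print(f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}')  # 25.00 ms/2tokens
print(f'{avg_2nd_token_tput:.2f} {tput_unit}s/s')         # 80.00 tokens/s (was '2tokenss/s')

# Inter-token deltas as the generation-task hunks now compute them:
# plain differences of raw new-token timestamps, no per-batch division.
m_new_token_times = np.array([100.0, 125.0, 150.0, 176.0])  # illustrative, ms
second_tokens_durations = (m_new_token_times[1:] - m_new_token_times[:-1]).tolist()
print(second_tokens_durations)  # [25.0, 25.0, 26.0]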