diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/input_examples/llama3/128_tokens b/input_examples/llama3/128_tokens new file mode 100644 index 0000000..4ac9f08 --- /dev/null +++ b/input_examples/llama3/128_tokens @@ -0,0 +1 @@ +Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. Mr. Dursley made drills. He was a big, beefy man with hardly any neck, although he did have a very large moustache. Mrs. Dursley was thin and blonde and had twice the usual amount of neck, which came in very useful as she spent so much of her time spying on the neighbours. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they diff --git a/input_examples/llama3/2048_tokens b/input_examples/llama3/2048_tokens new file mode 100644 index 0000000..dc64a4e --- /dev/null +++ b/input_examples/llama3/2048_tokens @@ -0,0 +1 @@ +Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn't want Dudley mixing with a child like that. When Mr. and Mrs. Dursley woke up on the dull, gray Tuesday our story starts, there was nothing about the cloudy sky outside to suggest that strange and mysterious things would soon be happening all over the country. Mr. Dursley hummed as he picked out his most boring tie for work, and Mrs. Dursley gossiped away happily as she wrestled a screaming Dudley into his high chair. None of them noticed a large, tawny owl flutter past the window. At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. Little tyke,' chortled Mr. Dursley as he left the house. He got into his car and backed out of number four's drive. It was on the corner of the street that he noticed the first sign of something peculiar a cat reading a map. For a second, Mr. Dursley didn't realize what he had seen then he jerked his head around to look again. 
There was a tabby cat standing on the corner of Privet Drive, but there wasn't a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat. It stared back. As Mr. Dursley drove around the corner and up the road, he watched the cat in his mirror. It was now reading the sign that said Privet Drive no, looking at the sign; cats couldn't read maps or signs. Mr. Dursley gave himself a little shake and put the cat out of his mind. As he drove toward town he thought of nothing except a large order of drills he was hoping to get that day. But on the edge of town, drills were driven out of his mind by something else. As he sat in the usual morning traffic jam, he couldn't help noticing that there seemed to be a lot of strangely dressed people about. People in cloaks. Mr. Dursley couldn't bear people who dressed in funny clothes the getups you saw on young people! He supposed this was some stupid new fashion. He drummed his fingers on the steering wheel and his eyes fell on a huddle of these weirdos standing quite close by. They were whispering excitedly together. Mr. Dursley was enraged to see that a couple of them weren't young at all; why, that man had to be older than he was, and wearing an emerald-green cloak! The nerve of him! But then it struck Mr. Dursley that this was probably some silly stunt these people were obviously collecting for something . . . yes, that would be it. The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills. Mr. Dursley always sat with his back to the window in his office on the ninth floor. If he hadn't, he might have found it harder to concentrate on drills that morning. He didn't see the owls swooping past in broad daylight, though people down in the street did; they pointed and gazed open-mouthed as owl after owl sped overhead. Most of them had never seen an owl even at nighttime. Mr. Dursley, however, had a perfectly normal, owl-free morning. He yelled at five different people. He made several important telephone calls and shouted a bit more. He was in a very good mood until lunchtime, when he thought he'd stretch his legs and walk across the road to buy himself a bun from the bakery. He'd forgotten all about the people in cloaks until he passed a group of them next to the baker's. He eyed them angrily as he passed. He didn't know why, but they made him uneasy. This bunch were whispering excitedly, too, and he couldn't see a single collecting tin. It was on his way back past them, clutching a large doughnut in a bag, that he caught a few words of what they were saying. 'The Potters, that's right, that's what I heard ' ' yes, their son, Harry ' Mr. Dursley stopped dead. Fear flooded him. He looked back at the whisperers as if he wanted to say something to them, but thought better of it. He dashed back across the road, hurried up to his office, snapped at his secretary not to disturb him, seized his telephone, and had almost finished dialing his home number when he changed his mind. He put the receiver back down and stroked his mustache, thinking . . . no, he was being stupid. Potter wasn't such an unusual name. He was sure there were lots of people called Potter who had a son called Harry. Come to think of it, he wasn't even sure his nephew was called Harry. He'd never even seen the boy. It might have been Harvey. Or Harold. There was no point in worrying Mrs. Dursley; she always got so upset at any mention of her sister. 
He didn't blame her if he'd had a sister like that . . . but all the same, those people in cloaks . . . He found it a lot harder to concentrate on drills that afternoon and when he left the building at five o'clock, he was still so worried that he walked straight into someone just outside the door. 'Sorry,' he grunted, as the tiny old man stumbled and almost fell. It was a few seconds before Mr. Dursley realized that the man was wearing a violet cloak. He didn't seem at all upset at being almost knocked to the ground. On the contrary, his face split into a wide smile and he said in a squeaky voice that made passersby stare, 'Don't be sorry, my dear sir, for nothing could upset me today! Rejoice, for You-Know-Who has gone at last! Even Muggles like yourself should be celebrating, this happy, happy day!' And the old man hugged Mr. Dursley around the middle and walked off. Mr. Dursley stood rooted to the spot. He had been hugged by a complete stranger. He also thought he had been called a Muggle, whatever that was. He was rattled. He hurried to his car and set off for home, hoping he was imagining things, which he had never hoped before, because he didn't approve of imagination. As he pulled into the driveway of number four, the first thing he saw and it didn't improve his mood was the tabby cat he'd spotted that morning. It was now sitting on his garden wall. He was sure it was the same one; it had the same markings around its eyes. 'Shoo!' said Mr. Dursley loudly. The cat didn't move. It just gave him a stern look. Was this normal cat behavior? Mr. Dursley wondered. Trying to pull himself together, he let himself into the house. He was still determined not to mention anything to his wife. Mrs. Dursley had had a nice, normal day. She told him over dinner all about Mrs. Next Door's problems with her daughter and how Dudley had learnt a new word (‘Shan’t!’). Mr Dursley tried to act normally. When Dudley had been put to bed, he went into the living-room in time to catch the last report on the evening news: ‘And finally, bird-watchers everywhere have reported that the nation’s owls have been behaving very unusually today. Although owls normally hunt at night and are hardly ever seen in daylight, there have been hundreds of sightings of these birds flying in every direction since sunrise. Experts are unable to explain why the owls have suddenly changed their sleeping pattern.’ The news reader allowed himself a grin. ‘Most mysterious. And now, over to Jim McGuffin with the weather. 
Going to be any more showers of owls tonight, Jim?’ ‘Well, Ted,’ said the weatherman, ‘I don’t know about that, but it’s diff --git a/input_examples/token_counter.py b/input_examples/token_counter.py new file mode 100644 index 0000000..9ba97c8 --- /dev/null +++ b/input_examples/token_counter.py @@ -0,0 +1,22 @@ +import argparse +from transformers import AutoTokenizer + +def count_tokens(tokenizer_name, text): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokens = tokenizer.tokenize(text) + return len(tokens) + +def main(): + parser = argparse.ArgumentParser(description="Count the number of tokens in a given text using a specified tokenizer.") + parser.add_argument('--tokenizer', type=str, required=True, help="The name of the tokenizer to use.") + parser.add_argument('--text', type=str, required=True, help="The text to tokenize.") + + args = parser.parse_args() + + num_tokens = count_tokens(args.tokenizer, args.text) + + print(f"Number of tokens: {num_tokens}") + +if __name__ == "__main__": + main() + diff --git a/llmperf.py b/llmperf.py index 28205a4..da28fa3 100644 --- a/llmperf.py +++ b/llmperf.py @@ -1,8 +1,5 @@ import argparse import openai_perf -import tgi_perf -import vllm_perf -import triton_perf import asyncio import math import json @@ -60,59 +57,19 @@ async def send_sampled_request_periodically(request, samples, qps, t, total): def run_ttft(args): prompt = read_prompt_from_file(args.prompt_file) measurer = None - if args.engine == "vllm": - measurer = vllm_perf.ttft_measurer(prompt, args) - elif args.engine == "openai": - measurer = openai_perf.ttft_measurer(prompt, args) - elif args.engine == "tgi": - measurer = tgi_perf.ttft_measurer(prompt, args) - elif args.engine == "triton": - measurer = triton_perf.ttft_measurer(prompt, args) - else: - print(f"TTFT test not implemented for {args.engine}") - return + measurer = openai_perf.ttft_measurer(prompt, args) run_test_n_times(measurer, args.iterations) def run_tpot(args): prompt = read_prompt_from_file(args.prompt_file) measurer = None - if args.engine == "vllm": - measurer = vllm_perf.tpot_measurer(prompt, args) - elif args.engine == "openai": - measurer = openai_perf.tpot_measurer(prompt, args) - elif args.engine == "tgi": - measurer = tgi_perf.tpot_measurer(prompt, args) - elif args.engine == "triton": - measurer = triton_perf.tpot_measurer(prompt, args) - else: - print(f"TPOT test not implemented for {args.engine}") - return + measurer = openai_perf.tpot_measurer(prompt, args) asyncio.run(async_run_test_n_times(measurer, args.iterations)) -def run_static_batch(args): - prompt = read_prompt_from_file(args.prompt_file) - measurer = None - if args.engine == "vllm": - measurer = vllm_perf.static_batch_measurer(prompt, args) - else: - print(f"Static batch test not implemented for {args.engine}") - return - run_test_n_times(measurer, args.iterations) - def run_rate_throughput(args): prompt = read_prompt_from_file(args.prompt_file) measurer = None - if args.engine == "vllm": - measurer = vllm_perf.rate_throughput_measurer(prompt, args) - elif args.engine == "openai": - measurer = openai_perf.rate_throughput_measurer(prompt, args) - elif args.engine == "tgi": - measurer = tgi_perf.rate_throughput_measurer(prompt, args) - elif args.engine == "triton": - measurer = triton_perf.rate_throughput_measurer(prompt, args) - else: - print(f"Rate throughput test not implemented for {args.engine}") - return + measurer = openai_perf.rate_throughput_measurer(prompt, args) async def wrapper(): return await 
send_request_periodically(measurer, args.qps, args.t, args.total_requests) @@ -122,17 +79,7 @@ def run_rate_sampled_throughput(args): with open(args.dataset, 'r') as file: samples = json.load(file) measurer = None - if args.engine == "vllm": - measurer = vllm_perf.sample_rate_throughput_measurer(args) - elif args.engine == "openai": - measurer = openai_perf.sample_rate_throughput_measurer(args) - elif args.engine == "tgi": - measurer = tgi_perf.sample_rate_throughput_measurer(args) - elif args.engine == "triton": - measurer = triton_perf.sample_rate_throughput_measurer(args) - else: - print(f"Rate sampled throughput test not implemented for {args.engine}") - return + measurer = openai_perf.sample_rate_throughput_measurer(args) async def wrapper(): return await send_sampled_request_periodically(measurer, samples, args.qps, args.t, args.total_requests) @@ -142,43 +89,28 @@ def run_rate_sampled_output_throughput(args): with open(args.dataset, 'r') as file: samples = json.load(file) measurer = None - if args.engine == "vllm": - measurer = vllm_perf.sample_output_rate_throughput_measurer(args) - elif args.engine == "tgi": - measurer = tgi_perf.sample_output_rate_throughput_measurer(args) - elif args.engine == "openai": - measurer = openai_perf.sample_output_rate_throughput_measurer(args) - elif args.engine == "triton": - measurer = triton_perf.sample_output_rate_throughput_measurer(args) - else: - print(f"Rate sampled throughput test not implemented for {args.engine}") - return + measurer = openai_perf.sample_output_rate_throughput_measurer(args) async def wrapper(): return await send_sampled_request_periodically(measurer, samples, args.qps, args.t, args.total_requests) asyncio.run(async_run_test_n_times(wrapper, args.iterations)) -def add_engines_parser(base_parser, vllm_batch_size = False): +def add_parser(base_parser, vllm_batch_size = False): engine_parser = base_parser.add_subparsers(title="Engine", dest="engine", required=True) vllm_parser = engine_parser.add_parser("vllm", help="vLLM Engine") vllm_parser.add_argument("--model", type=str, default="", help="The model.") - vllm_parser.add_argument("--dtype", type=str, default="float16", help="The dtype.") - vllm_parser.add_argument("--gpu_memory_utilization", type=float, default=0.9, help="GPU Memory fraction") - if vllm_batch_size: - vllm_parser.add_argument("--batch_size", type=int, default=128, help="The batch size.") - - openai_parser = engine_parser.add_parser("openai", help="OpenAI Engine") - openai_parser.add_argument("--api_key", type=str, default="API_KEY", help="The OpenAI API Key") - openai_parser.add_argument("--api_base", type=str, default="http://localhost:8000/v1", help="The OpenAI Server URL") - - triton_parser = engine_parser.add_parser("triton", help="Triton Engine") - triton_parser.add_argument("--model", type=str, default="ensemble", help="The model.") - triton_parser.add_argument("--http_server", type=str, default="http://localhost:8000", help="The Triton Server URL") - triton_parser.add_argument("--grpc_server", type=str, default="localhost:8001", help="The Triton gRPC Server URL") - - tgi_parser = engine_parser.add_parser("tgi", help="Text-generation-inference Engine") - tgi_parser.add_argument("--server", type=str, default="http://127.0.0.1:80/", help="The TGI Server URL") + vllm_parser.add_argument("--api_key", type=str, default="API_KEY", help="The OpenAI API Key") + vllm_parser.add_argument("--api_base", type=str, default="http://localhost:8000/v1", help="The OpenAI Server URL") + nim_parser = 
engine_parser.add_parser("nim", help="NVIDIA NIM (TRT-LLM engine with Triton)") + nim_parser.add_argument("--model", type=str, default="", help="The model.") + nim_parser.add_argument("--api_key", type=str, default="API_KEY", help="The OpenAI API Key") + nim_parser.add_argument("--api_base", type=str, default="http://localhost:8000/v1", help="The OpenAI Server URL") + + tgi_parser = engine_parser.add_parser("tgi", help="Text-generation-inference Engine by HuggingFace") + tgi_parser.add_argument("--model", type=str, default="", help="The model.") + tgi_parser.add_argument("--api_key", type=str, default="API_KEY", help="The OpenAI API Key") + tgi_parser.add_argument("--api_base", type=str, default="http://localhost:8080/v1", help="The OpenAI Server URL") if __name__ == "__main__": parser = argparse.ArgumentParser(description="LLMPerf tools to measure LLM performance") @@ -188,13 +120,13 @@ def add_engines_parser(base_parser, vllm_batch_size = False): ttft_parser = test_parser.add_parser("ttft", help="Measure Time To First Token (TTFT)") ttft_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.") ttft_parser.add_argument("--iterations", type=int, default=10, help="The iterations parameter.") - add_engines_parser(ttft_parser) + add_parser(ttft_parser) tpot_parser = test_parser.add_parser("tpot", help="Measure Time Per Output Token (TPOT)") tpot_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.") tpot_parser.add_argument("--iterations", type=int, default=10, help="The iterations parameter.") tpot_parser.add_argument("--output_tokens", type=int, default=128, help="Number of tokens to retrieve") - add_engines_parser(tpot_parser) + add_parser(tpot_parser) stb_parser = test_parser.add_parser("static_batch_throughput", help="Measure throughput for static batch") stb_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.") @@ -213,7 +145,7 @@ def add_engines_parser(base_parser, vllm_batch_size = False): rth_parser.add_argument("--qps", type=int, default=4, help="Number of queries to send per second") rth_parser.add_argument("--t", type=int, default=1, help="Time frame to send the QPS amount requests") rth_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total") - add_engines_parser(rth_parser, True) + add_parser(rth_parser, True) rst_parser = test_parser.add_parser("rate_sampled_throughput", help="Measure throughput with sending requests at constant rate") rst_parser.add_argument("--dataset", type=str, help="Path to a file containing the dataset.") @@ -221,7 +153,7 @@ def add_engines_parser(base_parser, vllm_batch_size = False): rst_parser.add_argument("--qps", type=int, default=4, help="Number of queries to send per second (Per t)") rst_parser.add_argument("--t", type=int, default=1, help="Time frame to send the QPS amount requests") rst_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total") - add_engines_parser(rst_parser, True) + add_parser(rst_parser, True) rsot_parser = test_parser.add_parser("rate_sampled_output_throughput", help="Measure throughput with sending requests at constant rate") rsot_parser.add_argument("--dataset", type=str, help="Path to a file containing the dataset.") @@ -231,7 +163,7 @@ def add_engines_parser(base_parser, vllm_batch_size = False): rsot_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total") 
rsot_parser.add_argument("--temperature", type=float, default=1, help="Temperature in sampling phase") rsot_parser.add_argument("--top_k", type=int, default=15, help="Tok K in sampling phase") - add_engines_parser(rsot_parser, True) + add_parser(rsot_parser, True) args = parser.parse_args() @@ -247,4 +179,4 @@ def add_engines_parser(base_parser, vllm_batch_size = False): elif args.test == "rate_sampled_throughput": run_rate_sampled_throughput(args) elif args.test == "rate_sampled_output_throughput": - run_rate_sampled_output_throughput(args) \ No newline at end of file + run_rate_sampled_output_throughput(args) diff --git a/openai_perf.py b/openai_perf.py index 5d10ab3..c762283 100644 --- a/openai_perf.py +++ b/openai_perf.py @@ -2,10 +2,10 @@ from timeit import default_timer as timer def ttft_measurer(prompt, args): - model = get_model(args) + client, model = get_client_model(args) def single_request(): start = timer() - completion = openai.Completion.create( + completion = client.completions.create( model=model, echo=False, prompt=prompt, @@ -20,15 +20,15 @@ def single_request(): return single_request def tpot_measurer(prompt, args): - model = get_model(args) + client, model = get_client_model(args) async def single_request(): start = timer() - completion = openai.Completion.create( + completion = client.completions.create( model=model, echo=False, prompt=prompt, max_tokens=args.output_tokens, - temperature=0, + temperature=0.01, n=1, stream=True, ) @@ -41,9 +41,9 @@ async def single_request(): return single_request def rate_throughput_measurer(prompt, args): - model = get_model(args) + client, model = get_client_model(args, async_client = True) async def single_request(): - completion = await openai.Completion.acreate( + completion = await client.completions.create( model=model, echo=False, prompt=prompt, @@ -58,9 +58,9 @@ async def single_request(): return single_request def sample_rate_throughput_measurer(args): - model = get_model(args) + client, model = get_client_model(args, async_client = True) async def single_request(sample): - completion = await openai.Completion.acreate( + completion = await client.completions.create( model=model, echo=False, prompt=sample["prompt"], @@ -75,23 +75,26 @@ async def single_request(sample): return single_request def sample_output_rate_throughput_measurer(args): - model = get_model(args) + client, model = get_client_model(args, async_client = True) async def single_request(sample): - completion = await openai.Completion.acreate( + completion = await client.completions.create( model=model, echo=False, prompt=sample["prompt"], temperature=1, max_tokens=2048, - top_k=15, + #top_k=15, n=1, stream=False, ) return completion.usage.completion_tokens return single_request -def get_model(args): - openai.api_key = args.api_key - openai.api_base = args.api_base - models = openai.Model.list() - return models["data"][0]["id"] +def get_client_model(args, async_client=False): + client = (openai.Client if not async_client else openai.AsyncClient) ( + api_key = args.api_key, + base_url = args.api_base + ) + + model = args.model + return client, model diff --git a/requirements.txt b/requirements.txt index 93cd049..ebf483b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ -openai==0.28.1 -vllm==0.2.1 -text-generation==0.6.1 -tritonclient==2.39.0 \ No newline at end of file +openai==1.34.0 +transformers==4.41.2 diff --git a/tgi_perf.py b/tgi_perf.py deleted file mode 100644 index cf22c9d..0000000 --- a/tgi_perf.py +++ /dev/null @@ -1,44 +0,0 @@ 
-from text_generation import Client, AsyncClient -from timeit import default_timer as timer - -TIMEOUT_24_HOURS = 1440 - -def ttft_measurer(prompt, args): - client = Client(args.server) - def single_request(): - start = timer() - _ = client.generate(prompt, max_new_tokens=1) - return timer() - start - return single_request - -def tpot_measurer(prompt, args): - client = Client(args.server) - async def single_request(): - i = 0 - for _ in client.generate_stream(prompt, max_new_tokens=args.output_tokens): - if i == 0: - start = timer() - i += 1 - return (timer() - start) / (i - 1) - return single_request - -def rate_throughput_measurer(prompt, args): - client = AsyncClient(args.server, timeout=TIMEOUT_24_HOURS) - async def single_request(): - _ = await client.generate(prompt, max_new_tokens=args.output_tokens) - return args.output_tokens - return single_request - -def sample_rate_throughput_measurer(args): - client = AsyncClient(args.server, timeout=TIMEOUT_24_HOURS) - async def single_request(sample): - _ = await client.generate(sample["prompt"], max_new_tokens=sample["output_len"]) - return sample["output_len"] - return single_request - -def sample_output_rate_throughput_measurer(args): - client = AsyncClient(args.server, timeout=TIMEOUT_24_HOURS) - async def single_request(sample): - response = await client.generate(sample["prompt"], max_new_tokens=2048, temperature=args.temperature, top_k=args.top_k) - return response.details.generated_tokens - return single_request diff --git a/triton_perf.py b/triton_perf.py deleted file mode 100644 index ae1ce9b..0000000 --- a/triton_perf.py +++ /dev/null @@ -1,184 +0,0 @@ -import requests -import aiohttp -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype -from timeit import default_timer as timer -import numpy as np -from functools import partial -import queue - -class UserData: - def __init__(self): - self._completed_requests = queue.Queue() - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - -def ttft_measurer(prompt, args): - server = args.http_server - model = args.model - def single_request(): - req = { - "text_input": prompt, - "max_tokens": 1, - "bad_words": "", - "stop_words": "" - } - start = timer() - res = requests.post(f"{server}/v2/models/{model}/generate", json=req) - return timer() - start - return single_request - -def tpot_measurer(prompt, args): - client = grpcclient.InferenceServerClient(url=args.grpc_server) - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * args.output_tokens - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - streaming = [[True]] - streaming_data = np.array(streaming, dtype=bool) - beam_width = [[1]] - beam_width_data = np.array(beam_width, dtype=np.uint32) - inputs = [ - prepare_tensor("text_input", input0_data), - prepare_tensor("max_tokens", output0_len), - prepare_tensor("bad_words", bad_words_list), - prepare_tensor("stop_words", stop_words_list), - prepare_tensor("stream", streaming_data), - prepare_tensor("beam_width", beam_width_data), - ] - - async def single_request(): - user_data = UserData() - i = 0 - start = timer() - def callback(user_data, result, error): - nonlocal start - nonlocal i - if error: - user_data._completed_requests.put(error) - else: - i += 1 - if i == 1: - start = timer() - 
user_data._completed_requests.put(result) - client.start_stream(callback=partial(callback, user_data)) - client.async_stream_infer(args.model, inputs, request_id=str(1)) - client.stop_stream() - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - result.as_numpy('text_output') - return (timer() - start) / (i - 1) - return single_request - -def rate_throughput_measurer(prompt, args): - server = args.http_server - model = args.model - async def single_request(): - conn = aiohttp.TCPConnector(limit=None, ttl_dns_cache=300) - session = aiohttp.ClientSession(connector=conn) - req = { - "text_input": prompt, - "max_tokens": args.output_tokens, - "bad_words": "", - "stop_words": "" - } - async with session.post(f"{server}/v2/models/{model}/generate", json=req) as response: - _ = await response.text() - await session.close() - await conn.close() - return args.output_tokens - return single_request - -def sample_rate_throughput_measurer(args): - server = args.http_server - model = args.model - async def single_request(sample): - conn = aiohttp.TCPConnector(limit=None, ttl_dns_cache=300) - session = aiohttp.ClientSession(connector=conn) - req = { - "text_input": sample["prompt"], - "max_tokens": sample["output_len"], - "bad_words": "", - "stop_words": "" - } - async with session.post(f"{server}/v2/models/{model}/generate", json=req) as response: - _ = await response.text() - await session.close() - await conn.close() - return sample["output_len"] - return single_request - -def sample_output_rate_throughput_measurer(args): - client = grpcclient.InferenceServerClient(url=args.grpc_server) - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - streaming = [[True]] - streaming_data = np.array(streaming, dtype=bool) - beam_width = [[1]] - beam_width_data = np.array(beam_width, dtype=np.uint32) - temperature = [[args.temperature]] - temperature_data = np.array(temperature, dtype=np.float32) - top_k = [[args.top_k]] - top_k_data = np.array(top_k, dtype=np.uint32) - eos = [[2]] - eos_data = np.array(eos, dtype=np.uint32) - inputs = [ - prepare_tensor("bad_words", bad_words_list), - prepare_tensor("stop_words", stop_words_list), - prepare_tensor("stream", streaming_data), - prepare_tensor("beam_width", beam_width_data), - prepare_tensor("temperature", temperature_data), - prepare_tensor("top_k", top_k_data), - prepare_tensor("end_id", eos_data), - ] - global_id = 0 - async def single_request(sample): - nonlocal global_id - user_data = UserData() - - n_inputs = inputs.copy() - input0 = [[sample["prompt"]]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * 2048 - n_inputs.append(prepare_tensor("text_input", input0_data)) - n_inputs.append(prepare_tensor("max_tokens", output0_len)) - - i = 0 - def callback(user_data, result, error): - nonlocal i - if error: - user_data._completed_requests.put(error) - else: - i += 1 - user_data._completed_requests.put(result) - client.start_stream(callback=partial(callback, user_data)) - client.async_stream_infer(args.model, n_inputs, request_id=str(global_id)) - global_id += 1 - client.stop_stream() - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) 
- else: - result.as_numpy('text_output') - print(i) - return i - return single_request diff --git a/vllm_perf.py b/vllm_perf.py deleted file mode 100644 index 1601c4d..0000000 --- a/vllm_perf.py +++ /dev/null @@ -1,137 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import random_uuid -from timeit import default_timer as timer - -def ttft_measurer(prompt, args): - llm = LLM( - model=args.model, - trust_remote_code=True, - dtype=args.dtype, - ) - tokenizer = llm.get_tokenizer() - def single_request(): - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=1, - ) - prompt_token_ids = tokenizer.encode(prompt) - llm._add_request( - prompt=None, - prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params, - ) - start = timer() - llm._run_engine(use_tqdm=False) - return timer() - start - return single_request - -def tpot_measurer(prompt, args): - engineArgs = AsyncEngineArgs(args.model) - engineArgs.trust_remote_code = True - engineArgs.dtype = args.dtype - engineArgs.disable_log_stats = True - engineArgs.disable_log_requests = True - llm = AsyncLLMEngine.from_engine_args(engineArgs) - - async def single_request(): - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_tokens, - ) - request_id = random_uuid() - results_generator = llm.generate(prompt, sampling_params, request_id) - i = 0 - async for _ in results_generator: - if i == 0: - start = timer() - i += 1 - return (timer() - start) / (i - 1) - return single_request - -def static_batch_measurer(prompt, args): - llm = LLM( - model=args.model, - trust_remote_code=True, - dtype=args.dtype, - ) - tokenizer = llm.get_tokenizer() - def single_request(): - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_tokens, - ) - prompt_token_ids = tokenizer.encode(prompt) - for _ in range(args.batch_size): - llm._add_request( - prompt=None, - prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params, - ) - start = timer() - llm._run_engine(use_tqdm=True) - total_time = timer() - start - tokens_count = args.batch_size * args.output_tokens - return tokens_count / total_time - return single_request - -def rate_throughput_measurer(prompt, args): - llm = init_async_llm(args) - - async def single_request(): - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_tokens, - ) - request_id = random_uuid() - results_generator = llm.generate(prompt, sampling_params, request_id) - async for _ in results_generator: - pass - return args.output_tokens - return single_request - -def sample_rate_throughput_measurer(args): - llm = init_async_llm(args) - async def single_request(sample): - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=sample["output_len"], - ) - request_id = random_uuid() - results_generator = llm.generate(sample["prompt"], sampling_params, request_id) - async for _ in results_generator: - pass - return sample["output_len"] - return single_request - -def sample_output_rate_throughput_measurer(args): - llm = init_async_llm(args) - async def single_request(sample): - sampling_params = SamplingParams( - top_k=args.top_k, - temperature=args.temperature, - max_tokens=4096, - ) - request_id = random_uuid() - results_generator = llm.generate(sample["prompt"], sampling_params, request_id) - i = 0 - async for _ in 
results_generator: - i += 1 - return i - return single_request - -def init_async_llm(args): - engineArgs = AsyncEngineArgs(args.model) - engineArgs.trust_remote_code = True - engineArgs.dtype = args.dtype - engineArgs.max_num_seqs = args.batch_size - engineArgs.gpu_memory_utilization = args.gpu_memory_utilization - engineArgs.disable_log_stats = True - engineArgs.disable_log_requests = True - return AsyncLLMEngine.from_engine_args(engineArgs)
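
Below is a minimal sketch, outside the patch itself, of the openai>=1.0 client pattern that openai_perf.get_client_model now wraps: an explicit Client/AsyncClient bound to an OpenAI-compatible base URL, replacing the old module-level openai.api_key/api_base globals and openai.Completion.acreate. BASE_URL, API_KEY and MODEL are placeholders for whichever server (vLLM, NIM, TGI) is being benchmarked, and the extra_body forwarding of top_k is an assumption about that backend, not something the OpenAI API itself guarantees.

import asyncio
import openai
from timeit import default_timer as timer

BASE_URL = "http://localhost:8000/v1"  # placeholder: OpenAI-compatible endpoint
API_KEY = "API_KEY"                    # placeholder: most local servers ignore the key
MODEL = "my-model"                     # placeholder: model id served by the endpoint

def measure_ttft_once(prompt: str) -> float:
    # Sync client; with max_tokens=1 the full round trip approximates
    # the time to first token.
    client = openai.Client(api_key=API_KEY, base_url=BASE_URL)
    start = timer()
    client.completions.create(
        model=MODEL,
        prompt=prompt,
        max_tokens=1,
        temperature=0.01,
    )
    return timer() - start

async def one_throughput_request(prompt: str) -> int:
    # Async client, as used by the throughput measurers; in the 1.x SDK the
    # create() call itself is awaited (there is no Completion.acreate any more).
    client = openai.AsyncClient(api_key=API_KEY, base_url=BASE_URL)
    completion = await client.completions.create(
        model=MODEL,
        prompt=prompt,
        max_tokens=128,
        temperature=1,
        n=1,
        stream=False,
        # top_k is not a named argument of the 1.x SDK (hence the commented-out
        # line in sample_output_rate_throughput_measurer); servers such as vLLM
        # accept it as an extra request-body field -- an assumption about the
        # backend, not part of the OpenAI spec.
        extra_body={"top_k": 15},
    )
    return completion.usage.completion_tokens

if __name__ == "__main__":
    print("TTFT (s):", measure_ttft_once("Hello"))
    print("completion tokens:", asyncio.run(one_throughput_request("Hello")))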
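
And a quick, hedged way to sanity-check the new input_examples fixtures with token_counter.count_tokens. The tokenizer id "meta-llama/Meta-Llama-3-8B" is an assumption for the llama3 examples (and requires Hugging Face access to that repo); any other tokenizer name can be substituted. Note that AutoTokenizer.tokenize counts tokens without special tokens such as BOS, so the result may differ slightly from the file name.

from input_examples.token_counter import count_tokens  # run from the repo root

TOKENIZER = "meta-llama/Meta-Llama-3-8B"  # assumption: tokenizer used to size the fixtures

with open("input_examples/llama3/128_tokens") as f:
    text = f.read()

print(count_tokens(TOKENIZER, text))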