
Commit 754aeb8

benchmark: Update benchmarks to use prefill chunking.
1 parent: a1d796e

File tree

4 files changed (+26 −31 lines)

- README.md
- extra/slurm/benchmark.py
- extra/slurm/tgi.slurm
- extra/slurm/vllm.slurm

README.md

Lines changed: 0 additions & 9 deletions
```diff
@@ -43,7 +43,6 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
 * [Visualize the results](#visualize-the-results)
 * [Development](#development)
 * [Frequently Asked Questions](#frequently-asked-questions)
-* [TODO](#todo)
 <!-- TOC -->
 
 ## Get started
@@ -265,11 +264,3 @@ $ make build
 There is currently no way to guarantee a fixed number of tokens generated without modifying the inference server.
 So you may have `(successful requests) * max_tokens < generated tokens`.
 
-## TODO
-
-- [X] Customizable token count and variance
-- [X] Check results
-- [X] Allow for system prompts for prefix caching
-- [ ] Allow for multi-turn prompts
-- [X] Script to generate plots from results
-- [X] Add support for multiple tokens in stream chunks (when speculation is active)
```

extra/slurm/benchmark.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -8,12 +8,11 @@
 def main():
     models = [
         ('meta-llama/Llama-3.1-8B-Instruct', 1),
-        # ('meta-llama/Llama-3.1-70B-Instruct', 4),
-        # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
-        # ('neuralmagic/Meta-Llama-3-70B-Instruct-FP8', 2),
-        # ('CohereForAI/c4ai-command-r-plus-08-2024', 4),
+        ('meta-llama/Llama-3.1-70B-Instruct', 4),
+        ('meta-llama/Llama-3.1-70B-Instruct', 2),
+        ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
     ]
-    num_passes = 2
+    num_passes = 1
     engines = ['tgi', 'vllm']
     for i in range(num_passes):
         for model in models:
```
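
The updated sweep keeps only uncommented model entries and runs a single pass over each (model, tensor-parallel size) pair for both engines. As a rough illustration of how such a driver loop could submit the Slurm jobs, here is a minimal sketch; the `sbatch` invocation and its arguments are assumptions, since the body of `main()` beyond the loop headers is not part of this diff:

```python
import subprocess


def main():
    # Model list and pass count as updated in this commit.
    models = [
        ('meta-llama/Llama-3.1-8B-Instruct', 1),
        ('meta-llama/Llama-3.1-70B-Instruct', 4),
        ('meta-llama/Llama-3.1-70B-Instruct', 2),
        ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
    ]
    num_passes = 1
    engines = ['tgi', 'vllm']
    for _ in range(num_passes):
        for model, tp in models:
            for engine in engines:
                # Hypothetical submission call: the real script's sbatch
                # arguments are not shown in this diff.
                subprocess.run(
                    ['sbatch', f'--export=ALL,MODEL={model},TP={tp}',
                     f'extra/slurm/{engine}.slurm'],
                    check=True,
                )


if __name__ == '__main__':
    main()
```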

extra/slurm/tgi.slurm

Lines changed: 12 additions & 10 deletions
```diff
@@ -20,11 +20,12 @@ fi
 
 echo "Starting TGI benchmark for $MODEL"
 export RUST_BACKTRACE=full
-export RUST_LOG=text_generation_inference_benchmark=info
+export RUST_LOG=inference_benchmarker=info
 
 # set a random available port to avoid conflicts
 PORT=$(shuf -i 8000-9999 -n 1)
 export PORT
+export PREFILL_CHUNKING=1
 
 echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}"
 echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}"
@@ -40,9 +41,9 @@ srun --het-group=0 \
     --no-container-mount-home \
     /usr/local/bin/text-generation-launcher \
         --model-id $MODEL \
-        --max-concurrent-requests 512 \
-        --max-waiting-tokens 5 \
-        --cuda-graphs="1,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128"&
+        --max-concurrent-requests 1024 \
+        --max-waiting-tokens 0 \
+        --max-batch-prefill-tokens 512&
 
 # wait until /health is available, die after 5 minutes
 timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
@@ -58,20 +59,21 @@ if [[ $exit_code != 124 ]]; then
     srun --het-group=1 \
         -u \
         -n 1 \
-        --container-image="ghcr.io#huggingface/text-generation-inference-benchmark:latest" \
-        --container-mounts="${RESULTS_DIR}:/opt/text-generation-inference-benchmark/results" \
+        --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \
+        --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \
         --no-container-mount-home \
-        text-generation-inference-benchmark \
+        inference-benchmarker \
         --tokenizer-name "$MODEL" \
         --max-vus 800 \
         --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
         --duration 120s \
         --warmup 30s \
         --benchmark-kind rate \
-        --rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
+        --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
+        --extra-meta "version=$VERSION,engine=TGI,tp=$TP,max_batch_prefill_tokens=512" \
         --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-        --decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-        --extra-meta "version=$VERSION,engine=TGI,tp=$TP" \
+        --decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
+        --dataset-file share_gpt_cleaned.json \
         --no-console
 fi
 
```
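
Both Slurm scripts now share a thinned-out request-rate sweep: 0.8 req/s stepping by 1.6 up to 23.2, plus a final point at 24.0 (16 rates instead of the previous 30). A small, purely illustrative Python helper that reproduces the new `--rates` argument list:

```python
def rate_flags(start=0.8, step=1.6, stop=23.2, tail=24.0):
    """Rebuild the --rates arguments used in tgi.slurm and vllm.slurm."""
    rates, r = [], start
    while r <= stop + 1e-9:          # 0.8, 2.4, ..., 23.2
        rates.append(round(r, 1))
        r += step
    rates.append(tail)               # final 24.0 sits off the 1.6 grid
    flags = []
    for rate in rates:
        flags += ['--rates', str(rate)]
    return flags


if __name__ == '__main__':
    print(' '.join(rate_flags()))
    # --rates 0.8 --rates 2.4 ... --rates 23.2 --rates 24.0
```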

extra/slurm/vllm.slurm

Lines changed: 10 additions & 7 deletions
```diff
@@ -21,7 +21,7 @@ fi
 
 echo "Starting vLLM benchmark for $MODEL"
 export RUST_BACKTRACE=full
-export RUST_LOG=text_generation_inference_benchmark=info
+export RUST_LOG=inference_benchmarker=info
 # set a random available port to avoid conflicts
 PORT=$(shuf -i 8000-9999 -n 1)
 export PORT
@@ -41,6 +41,8 @@ srun --het-group=0 \
     python3 -m vllm.entrypoints.openai.api_server \
         --model "${MODEL}" \
         --port "${PORT}" \
+        --enable-chunked-prefill \
+        --max-num-batched-tokens 512 \
         --tensor-parallel-size "${SLURM_GPUS_ON_NODE}"&
 
 # wait until /health is available, die after 5 minutes
@@ -57,20 +59,21 @@ if [[ $exit_code != 124 ]]; then
     srun --het-group=1 \
         -u \
         -n 1 \
-        --container-image="ghcr.io#huggingface/text-generation-inference-benchmark:latest" \
-        --container-mounts="${RESULTS_DIR}:/opt/text-generation-inference-benchmark/results" \
+        --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \
+        --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \
         --no-container-mount-home \
-        text-generation-inference-benchmark \
+        inference-benchmarker \
        --tokenizer-name "$MODEL" \
         --max-vus 800 \
         --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
         --duration 120s \
         --warmup 30s \
         --benchmark-kind rate \
-        --rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
+        --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
+        --extra-meta "version=$VERSION,engine=vLLM,tp=$TP,max_num_batched_tokens=512" \
         --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-        --decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-        --extra-meta "version=$VERSION,engine=vLLM,tp=$TP" \
+        --decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
+        --dataset-file share_gpt_cleaned.json \
        --no-console
 fi
 
```
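
The `--extra-meta` tag now records the prefill-chunking budget alongside version, engine, and tensor-parallel size (`max_batch_prefill_tokens=512` for TGI, `max_num_batched_tokens=512` for vLLM). When analysing results, that comma-separated `key=value` string can be split back into a mapping; the snippet below is a generic sketch, not code from this repository, and the sample values are only illustrative:

```python
def parse_extra_meta(meta: str) -> dict:
    """Turn 'version=...,engine=vLLM,tp=2,max_num_batched_tokens=512'
    into a plain dict of string keys and values."""
    out = {}
    for item in meta.split(','):
        if not item:
            continue
        key, _, value = item.partition('=')
        out[key.strip()] = value.strip()
    return out


# Illustrative values; $VERSION and $TP are substituted by the Slurm scripts.
print(parse_extra_meta('version=v1.0,engine=vLLM,tp=2,max_num_batched_tokens=512'))
# {'version': 'v1.0', 'engine': 'vLLM', 'tp': '2', 'max_num_batched_tokens': '512'}
```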
