@@ -20,11 +20,12 @@
 
 echo "Starting TGI benchmark for $MODEL"
 export RUST_BACKTRACE=full
-export RUST_LOG=text_generation_inference_benchmark=info
+export RUST_LOG=inference_benchmarker=info
 
 # set a random available port to avoid conflicts
 PORT=$(shuf -i 8000-9999 -n 1)
 export PORT
+export PREFILL_CHUNKING=1
 
 echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}"
 echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}"
@@ -40,9 +41,9 @@ srun --het-group=0 \
     --no-container-mount-home \
     /usr/local/bin/text-generation-launcher \
     --model-id $MODEL \
-    --max-concurrent-requests 512 \
-    --max-waiting-tokens 5 \
-    --cuda-graphs="1,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128" &
+    --max-concurrent-requests 1024 \
+    --max-waiting-tokens 0 \
+    --max-batch-prefill-tokens 512 &
 
 # wait until /health is available, give up after 10 minutes
 timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1
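
The escaped one-liner above is dense; for reference, an equivalent multi-line form (a sketch only, relying on the exported PORT and assuming GNU timeout and curl are available):

# Poll the server's /health endpoint once per second; fail the job
# if it has not returned HTTP 200 within 600 seconds.
timeout 600 bash -c '
    until [[ "$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${PORT}/health")" == "200" ]]; do
        echo "Waiting for TGI to start..."
        sleep 1
    done
' || exit 1
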
@@ -58,20 +59,21 @@ if [[ $exit_code != 124 ]]; then
 srun --het-group=1 \
     -u \
     -n 1 \
-    --container-image="ghcr.io#huggingface/text-generation-inference-benchmark:latest" \
-    --container-mounts="${RESULTS_DIR}:/opt/text-generation-inference-benchmark/results" \
+    --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \
+    --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \
     --no-container-mount-home \
-    text-generation-inference-benchmark \
+    inference-benchmarker \
     --tokenizer-name "$MODEL" \
     --max-vus 800 \
     --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \
     --duration 120s \
     --warmup 30s \
     --benchmark-kind rate \
-    --rates 0.8 --rates 1.6 --rates 2.4 --rates 3.2 --rates 4.0 --rates 4.8 --rates 5.6 --rates 6.4 --rates 7.2 --rates 8.0 --rates 8.8 --rates 9.6 --rates 10.4 --rates 11.2 --rates 12.0 --rates 12.8 --rates 13.6 --rates 14.4 --rates 15.2 --rates 16.0 --rates 16.8 --rates 17.6 --rates 18.4 --rates 19.2 --rates 20.0 --rates 20.8 --rates 21.6 --rates 22.4 --rates 23.2 --rates 24.0 \
+    --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \
+    --extra-meta "version=$VERSION,engine=TGI,tp=$TP,max_batch_prefill_tokens=512" \
     --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-    --decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
-    --extra-meta "version=$VERSION,engine=TGI,tp=$TP" \
+    --decode-options "num_tokens=800,max_tokens=800,min_tokens=800,variance=0" \
+    --dataset-file share_gpt_cleaned.json \
     --no-console
 fi
 
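The --rates sweep is spelled out literally in the commit; if it ever needs regenerating, a hypothetical helper like the following (assuming GNU seq; the RATES variable is a name introduced here, not part of the script) produces the same arguments:

# Build "--rates 0.8 --rates 2.4 ... --rates 24.0" for the 1.6-spaced sweep.
RATES=""
for r in $(seq 0.8 1.6 23.2); do
    RATES+=" --rates $r"
done
RATES+=" --rates 24.0"   # the sweep ends at 24.0 rather than the next step (24.8)
echo $RATES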