Skip to content

Commit 2f398a0

Browse files
Merge vllm-gaudi 1.20 into rhoai_2.19
2 parents fbdb4b6 + 28e6b4d commit 2f398a0

File tree

1,103 files changed

+83070
-33948
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,103 files changed

+83070
-33948
lines changed

.buildkite/generate_index.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import argparse
2+
import os
3+
4+
template = """<!DOCTYPE html>
5+
<html>
6+
<body>
7+
<h1>Links for vLLM</h1>
8+
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
9+
</body>
10+
</html>
11+
"""
12+
13+
parser = argparse.ArgumentParser()
14+
parser.add_argument("--wheel", help="The wheel path.", required=True)
15+
args = parser.parse_args()
16+
17+
filename = os.path.basename(args.wheel)
18+
19+
with open("index.html", "w") as f:
20+
print(f"Generated index.html for {args.wheel}")
21+
# cloudfront requires escaping the '+' character
22+
f.write(
23+
template.format(wheel=filename,
24+
wheel_html_escaped=filename.replace("+", "%2B")))
Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
steps:
22
- label: "Wait for container to be ready"
3+
key: wait-for-container-image
34
agents:
45
queue: A100
56
plugins:
@@ -9,16 +10,18 @@ steps:
910
- image: badouralix/curl-jq
1011
command:
1112
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
12-
- wait
13+
1314
- label: "A100"
15+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
1416
agents:
1517
queue: A100
18+
depends_on: wait-for-container-image
1619
plugins:
1720
- kubernetes:
1821
podSpec:
1922
priorityClassName: perf-benchmark
2023
containers:
21-
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
24+
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
2225
command:
2326
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
2427
resources:
@@ -41,20 +44,49 @@ steps:
4144
- name: devshm
4245
emptyDir:
4346
medium: Memory
44-
# - label: "H100"
45-
# agents:
46-
# queue: H100
47-
# plugins:
48-
# - docker#v5.11.0:
49-
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
50-
# command:
51-
# - bash
52-
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
53-
# mount-buildkite-agent: true
54-
# propagate-environment: true
55-
# ipc: host
56-
# gpus: all
57-
# environment:
58-
# - VLLM_USAGE_SOURCE
59-
# - HF_TOKEN
6047

48+
- label: "H200"
49+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
50+
agents:
51+
queue: H200
52+
depends_on: wait-for-container-image
53+
plugins:
54+
- docker#v5.12.0:
55+
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
56+
command:
57+
- bash
58+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
59+
mount-buildkite-agent: true
60+
propagate-environment: true
61+
ipc: host
62+
gpus: 4,5,6,7
63+
volumes:
64+
- /data/benchmark-hf-cache:/root/.cache/huggingface
65+
environment:
66+
- VLLM_USAGE_SOURCE
67+
- HF_TOKEN
68+
69+
#- block: "Run H100 Benchmark"
70+
#key: block-h100
71+
#depends_on: ~
72+
73+
- label: "H100"
74+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
75+
agents:
76+
queue: H100
77+
depends_on: wait-for-container-image
78+
plugins:
79+
- docker#v5.12.0:
80+
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
81+
command:
82+
- bash
83+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
84+
mount-buildkite-agent: true
85+
propagate-environment: true
86+
ipc: host
87+
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
88+
volumes:
89+
- /data/benchmark-hf-cache:/root/.cache/huggingface
90+
environment:
91+
- VLLM_USAGE_SOURCE
92+
- HF_TOKEN

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
157157
throughput_results,
158158
serving_results)
159159

160+
for df in [latency_results, serving_results, throughput_results]:
161+
if df.empty:
162+
continue
163+
164+
# Sort all dataframes by their respective "Test name" columns
165+
df.sort_values(by="Test name", inplace=True)
166+
167+
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
168+
# we want to turn it into "8xGPUTYPE"
169+
df["GPU"] = df["GPU"].apply(
170+
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
171+
160172
# get markdown tables
161173
latency_md_table = tabulate(latency_results,
162174
headers='keys',

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,104 @@ run_serving_tests() {
301301
kill_gpu_processes
302302
}
303303

304+
run_genai_perf_tests() {
305+
# run genai-perf tests
306+
307+
# $1: a json file specifying genai-perf test cases
308+
local genai_perf_test_file
309+
genai_perf_test_file=$1
310+
311+
# Iterate over genai-perf tests
312+
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
313+
# get the test name, and append the GPU type back to it.
314+
test_name=$(echo "$params" | jq -r '.test_name')
315+
316+
# if TEST_SELECTOR is set, only run the test cases that match the selector
317+
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
318+
echo "Skip test case $test_name."
319+
continue
320+
fi
321+
322+
# prepend the current serving engine to the test name
323+
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
324+
325+
# get common parameters
326+
common_params=$(echo "$params" | jq -r '.common_parameters')
327+
model=$(echo "$common_params" | jq -r '.model')
328+
tp=$(echo "$common_params" | jq -r '.tp')
329+
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
330+
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
331+
port=$(echo "$common_params" | jq -r '.port')
332+
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
333+
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
334+
335+
# get client and server arguments
336+
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
337+
qps_list=$(echo "$params" | jq -r '.qps_list')
338+
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
339+
echo "Running over qps list $qps_list"
340+
341+
# check if there is enough GPU to run the test
342+
if [[ $gpu_count -lt $tp ]]; then
343+
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
344+
continue
345+
fi
346+
347+
if [[ $reuse_server == "true" ]]; then
348+
echo "Reuse previous server for test case $test_name"
349+
else
350+
kill_gpu_processes
351+
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
352+
"$server_params" "$common_params"
353+
fi
354+
355+
if wait_for_server; then
356+
echo ""
357+
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
358+
else
359+
echo ""
360+
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
361+
break
362+
fi
363+
364+
# iterate over different QPS
365+
for qps in $qps_list; do
366+
# remove the surrounding single quote from qps
367+
if [[ "$qps" == *"inf"* ]]; then
368+
echo "qps was $qps"
369+
qps=$num_prompts
370+
echo "now qps is $qps"
371+
fi
372+
373+
new_test_name=$test_name"_qps_"$qps
374+
backend=$CURRENT_LLM_SERVING_ENGINE
375+
376+
if [[ "$backend" == *"vllm"* ]]; then
377+
backend="vllm"
378+
fi
379+
#TODO: add output dir.
380+
client_command="genai-perf profile \
381+
-m $model \
382+
--service-kind openai \
383+
--backend vllm \
384+
--endpoint-type chat \
385+
--streaming \
386+
--url localhost:$port \
387+
--request-rate $qps \
388+
--num-prompts $num_prompts \
389+
"
390+
391+
echo "Client command: $client_command"
392+
393+
eval "$client_command"
394+
395+
#TODO: process/record outputs
396+
done
397+
done
398+
399+
kill_gpu_processes
400+
401+
}
304402

305403
prepare_dataset() {
306404

@@ -328,12 +426,17 @@ main() {
328426

329427
pip install -U transformers
330428

429+
pip install -r requirements-dev.txt
430+
which genai-perf
431+
331432
# check storage
332433
df -h
333434

334435
ensure_installed wget
335436
ensure_installed curl
336437
ensure_installed jq
438+
# genai-perf dependency
439+
ensure_installed libb64-0d
337440

338441
prepare_dataset
339442

@@ -345,6 +448,10 @@ main() {
345448
# run the test
346449
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
347450

451+
# run genai-perf tests
452+
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
453+
mv artifacts/ $RESULTS_FOLDER/
454+
348455
# upload benchmark results to buildkite
349456
python3 -m pip install tabulate pandas
350457
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
88
# and we still want to see other benchmarking results even when mixtral crashes.
9+
set -x
910
set -o pipefail
1011

1112
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
8586

8687
ps -aux
8788
lsof -t -i:8000 | xargs -r kill -9
88-
pkill -f pt_main_thread
89-
# this line doesn't work now
90-
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
91-
pkill -f python3
92-
pkill -f /usr/bin/python3
89+
pgrep python3 | xargs -r kill -9
9390

9491

9592
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
289286
# run the server
290287
echo "Running test case $test_name"
291288
echo "Server command: $server_command"
292-
eval "$server_command" &
289+
bash -c "$server_command" &
293290
server_pid=$!
294291

295292
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
322319
echo "Running test case $test_name with qps $qps"
323320
echo "Client command: $client_command"
324321

325-
eval "$client_command"
322+
bash -c "$client_command"
326323

327324
# record the benchmarking commands
328325
jq_output=$(jq -n \

.buildkite/nightly-benchmarks/scripts/wait-for-image.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
2-
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
3-
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
2+
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
3+
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
44

55
TIMEOUT_SECONDS=10
66

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[
2+
{
3+
"test_name": "llama8B_tp1_genai_perf",
4+
"qps_list": [4,8,16,32],
5+
"common_parameters": {
6+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
7+
"tp": 1,
8+
"port": 8000,
9+
"num_prompts": 500,
10+
"reuse_server": false
11+
},
12+
"vllm_server_parameters": {
13+
"disable_log_stats": "",
14+
"disable_log_requests": "",
15+
"gpu_memory_utilization": 0.9,
16+
"num_scheduler_steps": 10,
17+
"max_num_seqs": 512,
18+
"dtype": "bfloat16"
19+
},
20+
"genai_perf_input_parameters": {
21+
}
22+
}
23+
]

0 commit comments

Comments
 (0)