Skip to content

Commit 43896af

Browse files
authored
[None][chore] benchmark refactor (#9207)
Signed-off-by: Zero Zeng <[email protected]>
1 parent 96cfdd8 commit 43896af

File tree

10 files changed: +138 additions, -182 deletions

examples/disaggregated/slurm/benchmark/accuracy_eval.sh

Lines changed: 1 addition & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -11,24 +11,9 @@ model_args_extra=${5}
1111
echo "Starting accuracy evaluation..."
1212
echo "Log directory: ${full_logdir}"
1313

14-
# Parse hostname and port from server_config.yaml
14+
# Parse hostname and port from server_config.yaml (server is already healthy)
1515
config_file="${full_logdir}/server_config.yaml"
1616

17-
# Wait for server_config.yaml to be created
18-
max_wait=1800
19-
wait_count=0
20-
while [ ! -f "${config_file}" ] && [ ${wait_count} -lt ${max_wait} ]; do
21-
echo "Waiting for server_config.yaml to be created..."
22-
sleep 1
23-
wait_count=$((wait_count + 1))
24-
done
25-
26-
if [ ${wait_count} -ge ${max_wait} ]; then
27-
echo "Error: server_config.yaml not found after ${max_wait} seconds"
28-
exit 1
29-
fi
30-
31-
# grep the host and port from the config file
3217
hostname=$(grep -i "hostname:" ${config_file} | awk '{print $2}')
3318
port=$(grep -i "port:" ${config_file} | awk '{print $2}')
3419

@@ -41,22 +26,6 @@ echo "Hostname: ${hostname}, Port: ${port}"
4126
base_url="http://${hostname}:${port}/v1/completions"
4227
echo "Using base_url: ${base_url}"
4328

44-
# check server is health by curl every 10 seconds timeout 1800 seconds
45-
timeout=1800
46-
start_time=$(date +%s)
47-
while ! curl -s -o /dev/null -w "%{http_code}" http://${hostname}:${port}/health; do
48-
current_time=$(date +%s)
49-
elapsed=$((current_time - start_time))
50-
if [ $elapsed -ge $timeout ]; then
51-
echo "Error: Server is not healthy after ${timeout} seconds"
52-
exit 1
53-
fi
54-
if [ $((elapsed % 30)) -eq 0 ]; then
55-
echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
56-
fi
57-
sleep 10
58-
done
59-
6029
# Install lm_eval and run evaluation
6130
echo "Installing lm_eval[api] and running evaluation..."
6231
pip install lm_eval[api]==0.4.8

examples/disaggregated/slurm/benchmark/config.yaml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,16 @@ benchmark:
1515
benchmark_ratio: 0.8 # Benchmark ratio
1616
streaming: true # Enable streaming mode
1717
concurrency_list: "16"
18+
input_length: 1024 # Input sequence length
19+
output_length: 1024 # Output sequence length
20+
dataset_file: "<dataset_file>"
1821

1922
# Hardware Configuration
2023
hardware:
2124
gpus_per_node: 4 # Modify this with your hardware configuration
2225
num_ctx_servers: 1 # Number of context servers
2326
num_gen_servers: 1 # Number of generation servers
2427

25-
# Sequence Configuration
26-
sequence:
27-
input_length: 1024 # Input sequence length
28-
output_length: 1024 # Output sequence length
29-
3028
# Environment Configuration
3129
environment:
3230
container_mount: "<container_mount>" # Format: path1:path1,path2:path2
@@ -35,8 +33,9 @@ environment:
3533
trtllm_repo: "<trtllm_repo>"
3634
build_wheel: false # Don't build the wheel when launching multiple jobs
3735
trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
38-
dataset_file: "<dataset_file>"
3936
work_dir: "<full_path_to_work_dir>"
37+
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"
38+
server_env_var: ""
4039

4140
# Profiling Configuration
4241
profiling:

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,12 @@ accuracy_model=${31}
4747
accuracy_tasks=${32}
4848
model_args_extra=${33}
4949

50+
# Worker environment variables
51+
worker_env_var=${34}
52+
53+
# Server environment variables
54+
server_env_var=${35}
55+
5056
# Print all parsed arguments
5157
echo "Parsed arguments:"
5258
echo "Hardware Configuration:"
@@ -90,6 +96,12 @@ echo " enable_accuracy_test: ${enable_accuracy_test}"
9096
echo " accuracy_model: ${accuracy_model}"
9197
echo " accuracy_tasks: ${accuracy_tasks}"
9298
echo " model_args_extra: ${model_args_extra}"
99+
echo
100+
echo "Worker Environment Variables:"
101+
echo " worker_env_var: ${worker_env_var}"
102+
echo
103+
echo "Server Environment Variables:"
104+
echo " server_env_var: ${server_env_var}"
93105

94106
container_name="disaggr-test"
95107

@@ -199,7 +211,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
199211
--container-mounts=${container_mount} \
200212
--mpi=pmix \
201213
bash ${work_dir}/start_worker.sh \
202-
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" \
214+
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
203215
&> ${full_logdir}/output_gen_${i}.log &
204216
done
205217

@@ -214,19 +226,30 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
214226
--container-mounts=${container_mount} \
215227
--mpi=pmix \
216228
bash ${work_dir}/start_worker.sh \
217-
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" \
229+
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
218230
&> ${full_logdir}/output_ctx_${i}.log &
219231
done
220232

221-
# start the server
233+
# start the server (in background)
222234
echo "Starting server..."
223235
srun -l --container-name=${container_name} \
224236
--container-image=${container_image} \
225237
--container-mounts=${container_mount} \
226238
--mpi=pmix --overlap -N 1 -n 1 \
227-
bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} \
239+
bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} "${server_env_var}" \
228240
&> ${full_logdir}/output_server.log &
229241

242+
# Wait for server to be ready (runs synchronously)
243+
echo "Waiting for server to be ready..."
244+
if ! srun -l --container-name=${container_name} \
245+
--container-mounts=${container_mount} \
246+
--mpi=pmix --overlap -N 1 -n 1 \
247+
bash ${work_dir}/wait_server.sh ${full_logdir} \
248+
&> ${full_logdir}/wait_server.log; then
249+
cleanup_on_failure "Server failed to become ready. Check ${full_logdir}/wait_server.log for details"
250+
fi
251+
echo "Server is ready!"
252+
230253
# Start benchmarking
231254
echo "Starting benchmark..."
232255
if [ "${use_nv_sa_benchmark}" = "true" ]; then

examples/disaggregated/slurm/benchmark/run_benchmark.sh

Lines changed: 1 addition & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -28,23 +28,7 @@ fi
2828

2929
config_file=${log_path}/server_config.yaml
3030

31-
# check if the config file exists every 10 seconds timeout 1800 seconds
32-
timeout=1800
33-
start_time=$(date +%s)
34-
while [ ! -f ${config_file} ]; do
35-
current_time=$(date +%s)
36-
elapsed=$((current_time - start_time))
37-
if [ $elapsed -ge $timeout ]; then
38-
echo "Error: Config file ${config_file} not found within ${timeout} seconds"
39-
exit 1
40-
fi
41-
if [ $((elapsed % 30)) -eq 0 ]; then
42-
echo "Waiting for config file... (${elapsed}s elapsed)"
43-
fi
44-
sleep 10
45-
done
46-
47-
# grep the host and port from the config file
31+
# Extract hostname and port from config file (server is already healthy)
4832
hostname=$(grep -i "hostname:" ${config_file} | awk '{print $2}')
4933
port=$(grep -i "port:" ${config_file} | awk '{print $2}')
5034
if [ -z "$hostname" ] || [ -z "$port" ]; then
@@ -53,51 +37,6 @@ if [ -z "$hostname" ] || [ -z "$port" ]; then
5337
fi
5438
echo "Hostname: ${hostname}, Port: ${port}"
5539

56-
# check server is health by curl every 10 seconds timeout 1800 seconds
57-
timeout=1800
58-
start_time=$(date +%s)
59-
while ! curl -s -o /dev/null -w "%{http_code}" http://${hostname}:${port}/health; do
60-
current_time=$(date +%s)
61-
elapsed=$((current_time - start_time))
62-
if [ $elapsed -ge $timeout ]; then
63-
echo "Error: Server is not healthy after ${timeout} seconds"
64-
exit 1
65-
fi
66-
if [ $((elapsed % 30)) -eq 0 ]; then
67-
echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
68-
fi
69-
sleep 10
70-
done
71-
72-
# try client
73-
74-
do_get_logs(){
75-
worker_log_path=$1
76-
output_folder=$2
77-
78-
# Check if log file exists
79-
if [ ! -f "${worker_log_path}" ]; then
80-
echo "Warning: Worker log file ${worker_log_path} not found"
81-
touch "${output_folder}/gen_only.txt"
82-
touch "${output_folder}/ctx_only.txt"
83-
return 0
84-
fi
85-
86-
# Create output folder if it doesn't exist
87-
mkdir -p "${output_folder}"
88-
89-
# Extract metrics with better error handling
90-
if ! grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" "${worker_log_path}" > "${output_folder}/gen_only.txt" 2>/dev/null; then
91-
echo "Note: No generation-only metrics found in ${worker_log_path}"
92-
touch "${output_folder}/gen_only.txt"
93-
fi
94-
95-
if ! grep -a "'num_generation_tokens': 0" "${worker_log_path}" > "${output_folder}/ctx_only.txt" 2>/dev/null; then
96-
echo "Note: No context-only metrics found in ${worker_log_path}"
97-
touch "${output_folder}/ctx_only.txt"
98-
fi
99-
}
100-
10140
echo "Starting benchmark..."
10241
for concurrency in ${concurrency_list}; do
10342
concurrency=$((concurrency * num_gen_servers))

examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh

Lines changed: 5 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,6 @@ set -euo pipefail
55
trap 'echo "Error occurred at line $LINENO"; exit 1' ERR
66

77
# Constants
8-
readonly TIMEOUT=1800 # 30 minutes
9-
readonly HEALTH_CHECK_INTERVAL=10
10-
readonly STATUS_UPDATE_INTERVAL=30
118
readonly BENCH_SERVING_REPO="https://github.com/kedarpotdar-nv/bench_serving.git"
129
readonly BENCH_SERVING_DIR="/tmp/bench_serving"
1310
readonly BENCH_SCRIPT="${BENCH_SERVING_DIR}/benchmark_serving.py"
@@ -47,61 +44,11 @@ log_path=$9
4744

4845
config_file="${log_path}/server_config.yaml"
4946

50-
wait_for_file() {
51-
local file=$1
52-
local start_time=$(date +%s)
53-
54-
while [ ! -f "${file}" ]; do
55-
local elapsed=$(($(date +%s) - start_time))
56-
[[ $elapsed -ge $TIMEOUT ]] && { echo "Error: File ${file} not found within ${TIMEOUT} seconds"; exit 1; }
57-
[[ $((elapsed % STATUS_UPDATE_INTERVAL)) -eq 0 ]] && echo "Waiting for file... (${elapsed}s elapsed)"
58-
sleep $HEALTH_CHECK_INTERVAL
59-
done
60-
}
61-
62-
wait_for_server() {
63-
local host=$1
64-
local port=$2
65-
local start_time=$(date +%s)
66-
67-
while ! curl -s -o /dev/null -w "%{http_code}" "http://${host}:${port}/health"; do
68-
local elapsed=$(($(date +%s) - start_time))
69-
[[ $elapsed -ge $TIMEOUT ]] && { echo "Error: Server not healthy after ${TIMEOUT} seconds"; exit 1; }
70-
[[ $((elapsed % STATUS_UPDATE_INTERVAL)) -eq 0 ]] && echo "Waiting for server... (${elapsed}s elapsed)"
71-
sleep $HEALTH_CHECK_INTERVAL
72-
done
73-
}
74-
75-
extract_server_info() {
76-
local config=$1
77-
hostname=$(grep -i "hostname:" "${config}" | awk '{print $2}')
78-
port=$(grep -i "port:" "${config}" | awk '{print $2}')
79-
[[ -z "$hostname" || -z "$port" ]] && { echo "Error: Failed to extract hostname or port from config file"; exit 1; }
80-
echo "Hostname: ${hostname}, Port: ${port}"
81-
}
82-
83-
do_get_logs() {
84-
local worker_log_path=$1
85-
local output_folder=$2
86-
grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" "${worker_log_path}" > "${output_folder}/gen_only.txt" || true
87-
grep -a "'num_generation_tokens': 0" "${worker_log_path}" > "${output_folder}/ctx_only.txt" || true
88-
}
89-
90-
cleanup_processes() {
91-
echo "Cleaning up processes..."
92-
pkill -f 'start_server.sh|start_worker_e2e.sh|trtllm-serve' || true
93-
sleep 20 # Allow time for cleanup
94-
95-
if pgrep -f "trtllm-serve"; then
96-
echo "Warning: Some processes may still be running"
97-
else
98-
echo "All processes successfully terminated"
99-
fi
100-
}
101-
102-
# Main execution flow
103-
wait_for_file "${config_file}"
104-
extract_server_info "${config_file}"
47+
# Extract hostname and port from config file (server is already healthy)
48+
hostname=$(grep -i "hostname:" "${config_file}" | awk '{print $2}')
49+
port=$(grep -i "port:" "${config_file}" | awk '{print $2}')
50+
[[ -z "$hostname" || -z "$port" ]] && { echo "Error: Failed to extract hostname or port from config file"; exit 1; }
51+
echo "Hostname: ${hostname}, Port: ${port}"
10552

10653
# Clean up and clone benchmark repository
10754
if [ -d "${BENCH_SERVING_DIR}" ]; then
@@ -111,9 +58,6 @@ fi
11158
echo "Cloning benchmark repository..."
11259
git clone "${BENCH_SERVING_REPO}" "${BENCH_SERVING_DIR}"
11360

114-
# Wait for server to be healthy
115-
wait_for_server "${hostname}" "${port}"
116-
11761
# Run benchmarks
11862
echo "Starting benchmark..."
11963
for concurrency in ${concurrency_list}; do
@@ -149,6 +93,3 @@ done
14993
if [ -n "${SLURM_JOB_ID:-}" ]; then
15094
echo "${SLURM_JOB_NODELIST}" > "${log_path}/job_${SLURM_JOB_ID}.txt"
15195
fi
152-
153-
# Cleanup
154-
cleanup_processes

examples/disaggregated/slurm/benchmark/start_server.sh

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,18 @@ num_ctx_servers=$1
77
num_gen_servers=$2
88
work_dir=$3
99
script_dir=$4
10+
server_env_var=$5
1011

1112
python3 ${script_dir}/gen_server_config.py \
1213
--num_ctx_servers ${num_ctx_servers} \
1314
--num_gen_servers ${num_gen_servers} \
1415
--work_dir ${work_dir}
15-
echo "server config generated to ${work_dir}/server_config.yaml"
16+
echo "Server config generated to ${work_dir}/server_config.yaml"
17+
18+
# Export server environment variables from config
19+
for env_var in ${server_env_var}; do
20+
export "${env_var}"
21+
echo "Exported: ${env_var}"
22+
done
1623

1724
trtllm-serve disaggregated -c ${work_dir}/server_config.yaml -t 7200 -r 7200

examples/disaggregated/slurm/benchmark/start_worker.sh

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,17 @@ numa_bind=${8}
1414
log_dir=${9}
1515
enable_nsys=${10}
1616
config_file=${11}
17+
worker_env_var=${12}
1718

1819
unset UCX_TLS
1920
echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}"
2021
echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"
2122

22-
export TLLM_LOG_LEVEL=INFO
23-
export TRTLLM_SERVER_DISABLE_GC=1
24-
export TRTLLM_WORKER_DISABLE_GC=1
23+
# Export worker environment variables from config
24+
for env_var in ${worker_env_var}; do
25+
export "${env_var}"
26+
echo "Exported: ${env_var}"
27+
done
2528

2629
if [ "${enable_pdl}" = "true" ]; then
2730
export TRTLLM_ENABLE_PDL=1

0 commit comments

Comments
 (0)