[None][chore] benchmark refactor (#9207)

zerollzeng · web-flow · commit 43896af1b11b · 2025-11-17T23:29:28.000-08:00
Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
diff --git a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh b/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
@@ -11,24 +11,9 @@ model_args_extra=${5}
 echo "Starting accuracy evaluation..."
 echo "Log directory: ${full_logdir}"
 
-# Parse hostname and port from server_config.yaml
+# Parse hostname and port from server_config.yaml (server is already healthy)
 config_file="${full_logdir}/server_config.yaml"
 
-# Wait for server_config.yaml to be created
-max_wait=1800
-wait_count=0
-while [ ! -f "${config_file}" ] && [ ${wait_count} -lt ${max_wait} ]; do
-    echo "Waiting for server_config.yaml to be created..."
-    sleep 1
-    wait_count=$((wait_count + 1))
-done
-
-if [ ${wait_count} -ge ${max_wait} ]; then
-    echo "Error: server_config.yaml not found after ${max_wait} seconds"
-    exit 1
-fi
-
-# grep the host and port from the config file
 hostname=$(grep -i "hostname:" ${config_file} | awk '{print $2}')
 port=$(grep -i "port:" ${config_file} | awk '{print $2}')
 
@@ -41,22 +26,6 @@ echo "Hostname: ${hostname}, Port: ${port}"
 base_url="http://${hostname}:${port}/v1/completions"
 echo "Using base_url: ${base_url}"
 
-# check server is health by curl every 10 seconds timeout 1800 seconds
-timeout=1800
-start_time=$(date +%s)
-while ! curl -s -o /dev/null -w "%{http_code}" http://${hostname}:${port}/health; do
-    current_time=$(date +%s)
-    elapsed=$((current_time - start_time))
-    if [ $elapsed -ge $timeout ]; then
-        echo "Error: Server is not healthy after ${timeout} seconds"
-        exit 1
-    fi
-    if [ $((elapsed % 30)) -eq 0 ]; then
-        echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
-    fi
-    sleep 10
-done
-
 # Install lm_eval and run evaluation
 echo "Installing lm_eval[api] and running evaluation..."
 pip install lm_eval[api]==0.4.8
diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -15,18 +15,16 @@ benchmark:
   benchmark_ratio: 0.8  # Benchmark ratio
   streaming: true  # Enable streaming mode
   concurrency_list: "16"
+  input_length: 1024  # Input sequence length
+  output_length: 1024  # Output sequence length
+  dataset_file: "<dataset_file>"
 
 # Hardware Configuration
 hardware:
   gpus_per_node: 4  # Modify this with your hardware configuration
   num_ctx_servers: 1  # Number of context servers
   num_gen_servers: 1  # Number of generation servers
 
-# Sequence Configuration
-sequence:
-  input_length: 1024  # Input sequence length
-  output_length: 1024  # Output sequence length
-
 # Environment Configuration
 environment:
   container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
@@ -35,8 +33,9 @@ environment:
   trtllm_repo: "<trtllm_repo>"
   build_wheel: false  # Don't build the wheel when launching multiple jobs
   trtllm_wheel_path: ""  # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
-  dataset_file: "<dataset_file>"
   work_dir: "<full_path_to_work_dir>"
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"
+  server_env_var: ""
 
 # Profiling Configuration
 profiling:
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -47,6 +47,12 @@ accuracy_model=${31}
 accuracy_tasks=${32}
 model_args_extra=${33}
 
+# Worker environment variables
+worker_env_var=${34}
+
+# Server environment variables
+server_env_var=${35}
+
 # Print all parsed arguments
 echo "Parsed arguments:"
 echo "Hardware Configuration:"
@@ -90,6 +96,12 @@ echo "  enable_accuracy_test: ${enable_accuracy_test}"
 echo "  accuracy_model: ${accuracy_model}"
 echo "  accuracy_tasks: ${accuracy_tasks}"
 echo "  model_args_extra: ${model_args_extra}"
+echo
+echo "Worker Environment Variables:"
+echo "  worker_env_var: ${worker_env_var}"
+echo
+echo "Server Environment Variables:"
+echo "  server_env_var: ${server_env_var}"
 
 container_name="disaggr-test"
 
@@ -199,7 +211,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" \
+        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_gen_${i}.log &
 done
 
@@ -214,19 +226,30 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" \
+        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_ctx_${i}.log &
 done
 
-# start the server
+# start the server (in background)
 echo "Starting server..."
 srun -l --container-name=${container_name} \
     --container-image=${container_image} \
     --container-mounts=${container_mount} \
     --mpi=pmix --overlap -N 1 -n 1 \
-    bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} \
+    bash ${work_dir}/start_server.sh ${num_ctx_servers} ${num_gen_servers} ${full_logdir} ${work_dir} "${server_env_var}" \
     &> ${full_logdir}/output_server.log &
 
+# Wait for server to be ready (runs synchronously)
+echo "Waiting for server to be ready..."
+if ! srun -l --container-name=${container_name} \
+    --container-mounts=${container_mount} \
+    --mpi=pmix --overlap -N 1 -n 1 \
+    bash ${work_dir}/wait_server.sh ${full_logdir} \
+    &> ${full_logdir}/wait_server.log; then
+    cleanup_on_failure "Server failed to become ready. Check ${full_logdir}/wait_server.log for details"
+fi
+echo "Server is ready!"
+
 # Start benchmarking
 echo "Starting benchmark..."
 if [ "${use_nv_sa_benchmark}" = "true" ]; then
diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark.sh b/examples/disaggregated/slurm/benchmark/run_benchmark.sh
@@ -28,23 +28,7 @@ fi
 
 config_file=${log_path}/server_config.yaml
 
-# check if the config file exists every 10 seconds timeout 1800 seconds
-timeout=1800
-start_time=$(date +%s)
-while [ ! -f ${config_file} ]; do
-    current_time=$(date +%s)
-    elapsed=$((current_time - start_time))
-    if [ $elapsed -ge $timeout ]; then
-        echo "Error: Config file ${config_file} not found within ${timeout} seconds"
-        exit 1
-    fi
-    if [ $((elapsed % 30)) -eq 0 ]; then
-        echo "Waiting for config file... (${elapsed}s elapsed)"
-    fi
-    sleep 10
-done
-
-# grep the host and port from the config file
+# Extract hostname and port from config file (server is already healthy)
 hostname=$(grep -i "hostname:" ${config_file} | awk '{print $2}')
 port=$(grep -i "port:" ${config_file} | awk '{print $2}')
 if [ -z "$hostname" ] || [ -z "$port" ]; then
@@ -53,51 +37,6 @@ if [ -z "$hostname" ] || [ -z "$port" ]; then
 fi
 echo "Hostname: ${hostname}, Port: ${port}"
 
-# check server is health by curl every 10 seconds timeout 1800 seconds
-timeout=1800
-start_time=$(date +%s)
-while ! curl -s -o /dev/null -w "%{http_code}" http://${hostname}:${port}/health; do
-    current_time=$(date +%s)
-    elapsed=$((current_time - start_time))
-    if [ $elapsed -ge $timeout ]; then
-        echo "Error: Server is not healthy after ${timeout} seconds"
-        exit 1
-    fi
-    if [ $((elapsed % 30)) -eq 0 ]; then
-        echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
-    fi
-    sleep 10
-done
-
-# try client
-
-do_get_logs(){
-    worker_log_path=$1
-    output_folder=$2
-
-    # Check if log file exists
-    if [ ! -f "${worker_log_path}" ]; then
-        echo "Warning: Worker log file ${worker_log_path} not found"
-        touch "${output_folder}/gen_only.txt"
-        touch "${output_folder}/ctx_only.txt"
-        return 0
-    fi
-
-    # Create output folder if it doesn't exist
-    mkdir -p "${output_folder}"
-
-    # Extract metrics with better error handling
-    if ! grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" "${worker_log_path}" > "${output_folder}/gen_only.txt" 2>/dev/null; then
-        echo "Note: No generation-only metrics found in ${worker_log_path}"
-        touch "${output_folder}/gen_only.txt"
-    fi
-
-    if ! grep -a "'num_generation_tokens': 0" "${worker_log_path}" > "${output_folder}/ctx_only.txt" 2>/dev/null; then
-        echo "Note: No context-only metrics found in ${worker_log_path}"
-        touch "${output_folder}/ctx_only.txt"
-    fi
-}
-
 echo "Starting benchmark..."
 for concurrency in ${concurrency_list}; do
     concurrency=$((concurrency * num_gen_servers))
diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh
@@ -5,9 +5,6 @@ set -euo pipefail
 trap 'echo "Error occurred at line $LINENO"; exit 1' ERR
 
 # Constants
-readonly TIMEOUT=1800  # 30 minutes
-readonly HEALTH_CHECK_INTERVAL=10
-readonly STATUS_UPDATE_INTERVAL=30
 readonly BENCH_SERVING_REPO="https://github.com/kedarpotdar-nv/bench_serving.git"
 readonly BENCH_SERVING_DIR="/tmp/bench_serving"
 readonly BENCH_SCRIPT="${BENCH_SERVING_DIR}/benchmark_serving.py"
@@ -47,61 +44,11 @@ log_path=$9
 
 config_file="${log_path}/server_config.yaml"
 
-wait_for_file() {
-    local file=$1
-    local start_time=$(date +%s)
-
-    while [ ! -f "${file}" ]; do
-        local elapsed=$(($(date +%s) - start_time))
-        [[ $elapsed -ge $TIMEOUT ]] && { echo "Error: File ${file} not found within ${TIMEOUT} seconds"; exit 1; }
-        [[ $((elapsed % STATUS_UPDATE_INTERVAL)) -eq 0 ]] && echo "Waiting for file... (${elapsed}s elapsed)"
-        sleep $HEALTH_CHECK_INTERVAL
-    done
-}
-
-wait_for_server() {
-    local host=$1
-    local port=$2
-    local start_time=$(date +%s)
-
-    while ! curl -s -o /dev/null -w "%{http_code}" "http://${host}:${port}/health"; do
-        local elapsed=$(($(date +%s) - start_time))
-        [[ $elapsed -ge $TIMEOUT ]] && { echo "Error: Server not healthy after ${TIMEOUT} seconds"; exit 1; }
-        [[ $((elapsed % STATUS_UPDATE_INTERVAL)) -eq 0 ]] && echo "Waiting for server... (${elapsed}s elapsed)"
-        sleep $HEALTH_CHECK_INTERVAL
-    done
-}
-
-extract_server_info() {
-    local config=$1
-    hostname=$(grep -i "hostname:" "${config}" | awk '{print $2}')
-    port=$(grep -i "port:" "${config}" | awk '{print $2}')
-    [[ -z "$hostname" || -z "$port" ]] && { echo "Error: Failed to extract hostname or port from config file"; exit 1; }
-    echo "Hostname: ${hostname}, Port: ${port}"
-}
-
-do_get_logs() {
-    local worker_log_path=$1
-    local output_folder=$2
-    grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" "${worker_log_path}" > "${output_folder}/gen_only.txt" || true
-    grep -a "'num_generation_tokens': 0" "${worker_log_path}" > "${output_folder}/ctx_only.txt" || true
-}
-
-cleanup_processes() {
-    echo "Cleaning up processes..."
-    pkill -f 'start_server.sh|start_worker_e2e.sh|trtllm-serve' || true
-    sleep 20  # Allow time for cleanup
-
-    if pgrep -f "trtllm-serve"; then
-        echo "Warning: Some processes may still be running"
-    else
-        echo "All processes successfully terminated"
-    fi
-}
-
-# Main execution flow
-wait_for_file "${config_file}"
-extract_server_info "${config_file}"
+# Extract hostname and port from config file (server is already healthy)
+hostname=$(grep -i "hostname:" "${config_file}" | awk '{print $2}')
+port=$(grep -i "port:" "${config_file}" | awk '{print $2}')
+[[ -z "$hostname" || -z "$port" ]] && { echo "Error: Failed to extract hostname or port from config file"; exit 1; }
+echo "Hostname: ${hostname}, Port: ${port}"
 
 # Clean up and clone benchmark repository
 if [ -d "${BENCH_SERVING_DIR}" ]; then
@@ -111,9 +58,6 @@ fi
 echo "Cloning benchmark repository..."
 git clone "${BENCH_SERVING_REPO}" "${BENCH_SERVING_DIR}"
 
-# Wait for server to be healthy
-wait_for_server "${hostname}" "${port}"
-
 # Run benchmarks
 echo "Starting benchmark..."
 for concurrency in ${concurrency_list}; do
@@ -149,6 +93,3 @@ done
 if [ -n "${SLURM_JOB_ID:-}" ]; then
     echo "${SLURM_JOB_NODELIST}" > "${log_path}/job_${SLURM_JOB_ID}.txt"
 fi
-
-# Cleanup
-cleanup_processes
diff --git a/examples/disaggregated/slurm/benchmark/start_server.sh b/examples/disaggregated/slurm/benchmark/start_server.sh
@@ -7,11 +7,18 @@ num_ctx_servers=$1
 num_gen_servers=$2
 work_dir=$3
 script_dir=$4
+server_env_var=$5
 
 python3 ${script_dir}/gen_server_config.py \
     --num_ctx_servers ${num_ctx_servers} \
     --num_gen_servers ${num_gen_servers} \
     --work_dir ${work_dir}
-echo "server config generated to ${work_dir}/server_config.yaml"
+echo "Server config generated to ${work_dir}/server_config.yaml"
+
+# Export server environment variables from config
+for env_var in ${server_env_var}; do
+    export "${env_var}"
+    echo "Exported: ${env_var}"
+done
 
 trtllm-serve disaggregated -c ${work_dir}/server_config.yaml -t 7200 -r 7200
diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh
@@ -14,14 +14,17 @@ numa_bind=${8}
 log_dir=${9}
 enable_nsys=${10}
 config_file=${11}
+worker_env_var=${12}
 
 unset UCX_TLS
 echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}"
 echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"
 
-export TLLM_LOG_LEVEL=INFO
-export TRTLLM_SERVER_DISABLE_GC=1
-export TRTLLM_WORKER_DISABLE_GC=1
+# Export worker environment variables from config
+for env_var in ${worker_env_var}; do
+    export "${env_var}"
+    echo "Exported: ${env_var}"
+done
 
 if [ "${enable_pdl}" = "true" ]; then
     export TRTLLM_ENABLE_PDL=1
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
diff --git a/examples/disaggregated/slurm/benchmark/wait_server.sh b/examples/disaggregated/slurm/benchmark/wait_server.sh
diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml