[TRTLLM-9053][feat] Support accuracy test and install from wheel (#9038)

zerollzeng · web-flow · commit c6cce398f5ca · 2025-11-13T23:34:47.000-08:00
Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
diff --git a/examples/disaggregated/slurm/benchmark/accuracy_eval.sh b/examples/disaggregated/slurm/benchmark/accuracy_eval.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+set -euo pipefail
+
+# Parse arguments
+full_logdir=${1}
+accuracy_model=${2}
+accuracy_tasks=${3}
+model_path=${4}
+model_args_extra=${5}
+
+echo "Starting accuracy evaluation..."
+echo "Log directory: ${full_logdir}"
+
+# Parse hostname and port from server_config.yaml
+config_file="${full_logdir}/server_config.yaml"
+
+# Wait for server_config.yaml to be created
+max_wait=1800
+wait_count=0
+while [ ! -f "${config_file}" ] && [ ${wait_count} -lt ${max_wait} ]; do
+    echo "Waiting for server_config.yaml to be created..."
+    sleep 1
+    wait_count=$((wait_count + 1))
+done
+
+if [ ${wait_count} -ge ${max_wait} ]; then
+    echo "Error: server_config.yaml not found after ${max_wait} seconds"
+    exit 1
+fi
+
+# grep the host and port from the config file
+hostname=$(grep -i "hostname:" ${config_file} | awk '{print $2}')
+port=$(grep -i "port:" ${config_file} | awk '{print $2}')
+
+if [ -z "$hostname" ] || [ -z "$port" ]; then
+    echo "Error: Failed to extract hostname or port from config file"
+    exit 1
+fi
+
+echo "Hostname: ${hostname}, Port: ${port}"
+base_url="http://${hostname}:${port}/v1/completions"
+echo "Using base_url: ${base_url}"
+
+# check server is health by curl every 10 seconds timeout 1800 seconds
+timeout=1800
+start_time=$(date +%s)
+while ! curl -s -o /dev/null -w "%{http_code}" http://${hostname}:${port}/health; do
+    current_time=$(date +%s)
+    elapsed=$((current_time - start_time))
+    if [ $elapsed -ge $timeout ]; then
+        echo "Error: Server is not healthy after ${timeout} seconds"
+        exit 1
+    fi
+    if [ $((elapsed % 30)) -eq 0 ]; then
+        echo "Waiting for server to be healthy... (${elapsed}s elapsed)"
+    fi
+    sleep 10
+done
+
+# Install lm_eval and run evaluation
+echo "Installing lm_eval[api] and running evaluation..."
+pip install lm_eval[api]==0.4.8
+
+echo "Running lm_eval with tasks: ${accuracy_tasks}..."
+lm_eval --model ${accuracy_model} \
+    --tasks ${accuracy_tasks} \
+    --model_args model=${model_path},base_url=${base_url},${model_args_extra} \
+    --trust_remote_code
+
+echo "Accuracy evaluation completed successfully"
diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -34,13 +34,21 @@ environment:
   model_path: "<model_path>"
   trtllm_repo: "<trtllm_repo>"
   build_wheel: false  # Don't build the wheel when launching multiple jobs
+  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel. If set, install from this wheel instead of building from trtllm_repo
   dataset_file: "<dataset_file>"
   work_dir: "<full_path_to_work_dir>"
 
 # Profiling Configuration
 profiling:
   nsys_on: false  # Set to true to enable profiling
 
+# Accuracy Configuration
+accuracy:
+  enable_accuracy_test: false  # Set to true to enable accuracy evaluation
+  model: "local-completions"  # Model type for lm_eval
+  tasks: "gsm8k"  # Evaluation tasks (comma-separated)
+  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096"  # Extra model arguments for lm_eval
+
 worker_config:
   gen:
     tensor_parallel_size: 8
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -36,9 +36,16 @@ full_logdir=${24}
 container_mount=${25}
 container_image=${26}
 build_wheel=${27}
+trtllm_wheel_path=${28}
 
 # Profiling
-nsys_on=${28}
+nsys_on=${29}
+
+# Accuracy evaluation
+enable_accuracy_test=${30}
+accuracy_model=${31}
+accuracy_tasks=${32}
+model_args_extra=${33}
 
 # Print all parsed arguments
 echo "Parsed arguments:"
@@ -74,12 +81,18 @@ echo "  container_image: ${container_image}"
 echo "  model_path: ${model_path}"
 echo "  trtllm_repo: ${trtllm_repo}"
 echo "  build_wheel: ${build_wheel}"
+echo "  trtllm_wheel_path: ${trtllm_wheel_path}"
 echo "  work_dir: ${work_dir}"
 echo "  nsys_on: ${nsys_on}"
+echo
+echo "Accuracy Configuration:"
+echo "  enable_accuracy_test: ${enable_accuracy_test}"
+echo "  accuracy_model: ${accuracy_model}"
+echo "  accuracy_tasks: ${accuracy_tasks}"
+echo "  model_args_extra: ${model_args_extra}"
 
 container_name="disaggr-test"
 
-# Log directory is now passed directly
 echo "Log directory: ${full_logdir}"
 
 # Function to cleanup on failure
@@ -102,8 +115,20 @@ if ! srun -l --container-image=${container_image} \
     cleanup_on_failure "Failed to start container. Check ${full_logdir}/container_launch.log"
 fi
 
-# Build TensorRT-LLM if needed
-if [ -d "${trtllm_repo}" ]; then
+# Install TensorRT-LLM
+if [ -n "${trtllm_wheel_path}" ]; then
+    # Install from pre-built wheel if path is provided
+    echo "Installing TensorRT-LLM from wheel: ${trtllm_wheel_path}..."
+    if ! srun --container-name=${container_name} \
+        --container-mounts=${container_mount} --no-container-mount-home \
+        --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
+        bash -c "pip install ${trtllm_wheel_path}" \
+        &> ${full_logdir}/install.log; then
+        cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/install.log for details"
+    fi
+    echo "TensorRT-LLM wheel installation completed successfully"
+elif [ -d "${trtllm_repo}" ]; then
+    # Build and install from repository if no wheel path provided
     echo "Installing TensorRT-LLM from ${trtllm_repo}..."
     TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
     echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
@@ -226,6 +251,22 @@ else
     fi
 fi
 echo "Benchmark completed successfully"
+
+# Run accuracy evaluation if enabled: a single task inside the existing
+# container runs accuracy_eval.sh; its output goes to accuracy_eval.log.
+if [ "${enable_accuracy_test}" = "true" ]; then
+    echo "Starting accuracy evaluation..."
+    if ! srun -l --container-name="${container_name}" \
+        --container-mounts="${container_mount}" \
+        --mpi=pmix --overlap -N 1 -n 1 \
+        bash "${work_dir}/accuracy_eval.sh" \
+        "${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" "${model_args_extra}" \
+        &> "${full_logdir}/accuracy_eval.log"; then
+        cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/accuracy_eval.log for details"
+    fi
+    echo "Accuracy evaluation completed successfully"
+fi
+
 echo "Total runtime: $SECONDS seconds"
 
 # try to kill the server and workers
diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark.sh b/examples/disaggregated/slurm/benchmark/run_benchmark.sh
@@ -127,16 +127,3 @@ job_id=${SLURM_JOB_ID}
 if [ -n "${job_id}" ]; then
     echo "${SLURM_JOB_NODELIST}" > ${log_path}/job_${job_id}.txt
 fi
-
-echo "Benchmark done, gracefully shutting down server and workers..."
-kill -9 $(ps aux | grep '[s]tart_server.sh' | awk '{print $2}') >/dev/null 2>&1 || true
-kill -9 $(ps aux | grep '[s]tart_worker.sh' | awk '{print $2}') >/dev/null 2>&1 || true
-kill -9 $(ps aux | grep '[t]rtllm-serve' | awk '{print $2}') >/dev/null 2>&1 || true
-sleep 20  # Give processes some time to clean up
-
-# Check if there are any remaining processes
-if pgrep -f "trtllm-serve"; then
-    echo "Warning: Some processes may still be running"
-else
-    echo "All processes successfully terminated"
-fi
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
@@ -3,6 +3,7 @@
 import argparse
 import glob
 import os
+import shutil
 import subprocess
 import sys
 
@@ -50,6 +51,24 @@ def submit_job(config):
     hw_config = config['hardware']
     env_config = config['environment']
 
+    # Set default accuracy configuration for backward compatibility
+    if 'accuracy' not in config:
+        config['accuracy'] = {
+            'enable_accuracy_test':
+            False,
+            'model':
+            'local-completions',
+            'tasks':
+            'gsm8k',
+            'model_args_extra':
+            'num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096'
+        }
+
+    # Set default environment configuration for backward compatibility
+    env_config.setdefault('trtllm_repo', '')
+    env_config.setdefault('build_wheel', False)
+    env_config.setdefault('trtllm_wheel_path', '')
+
     # Get number of servers from config
     ctx_num = hw_config['num_ctx_servers']
     gen_num = hw_config['num_gen_servers']
@@ -94,7 +113,10 @@ def submit_job(config):
 
     # Create full log directory path
     log_dir = os.path.join(log_base, dir_suffix)
-    os.makedirs(log_dir, exist_ok=True)
+    # Remove existing directory if it exists
+    if os.path.exists(log_dir):
+        shutil.rmtree(log_dir)
+    os.makedirs(log_dir)
 
     # Setup config file paths and save worker configs
     ctx_config_path = os.path.join(log_dir, 'ctx_config.yaml')
@@ -150,9 +172,16 @@ def submit_job(config):
         env_config['container_mount'],
         env_config['container_image'],
         str(env_config['build_wheel']).lower(),
+        env_config['trtllm_wheel_path'],
 
         # Profiling
-        str(config['profiling']['nsys_on']).lower()
+        str(config['profiling']['nsys_on']).lower(),
+
+        # Accuracy evaluation
+        str(config['accuracy']['enable_accuracy_test']).lower(),
+        config['accuracy']['model'],
+        config['accuracy']['tasks'],
+        config['accuracy']['model_args_extra']
     ]
 
     # Submit the job
diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml
@@ -34,13 +34,21 @@ environment:
   model_path: "<model_path>"
   trtllm_repo: "<trtllm_repo>"
   build_wheel: false  # Don't build the wheel when launching multiple jobs
+  trtllm_wheel_path: ""  # Path to a pre-built TensorRT-LLM wheel. If set, install from this wheel instead of building from trtllm_repo
   dataset_file: "<dataset_file>"
   work_dir: "<full_path_to_work_dir>"
 
 # Profiling Configuration
 profiling:
   nsys_on: false  # Set to true to enable profiling
 
+# Accuracy Configuration
+accuracy:
+  enable_accuracy_test: false  # Set to true to enable accuracy evaluation
+  model: "local-completions"  # Model type for lm_eval
+  tasks: "gsm8k"  # Evaluation tasks (comma-separated)
+  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=False,timeout=1200,max_gen_toks=256,max_length=512"  # Extra model arguments for lm_eval
+
 # Worker Configuration
 worker_config:
   gen: