
Commit 4782304

[None][ci] Move more test stages to use OCI machines
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 336593c commit 4782304

File tree

3 files changed: +60 −57 lines changed

jenkins/L0_Test.groovy
jenkins/scripts/slurm_run.sh
tests/integration/defs/accuracy/test_disaggregated_serving.py

jenkins/L0_Test.groovy

Lines changed: 47 additions & 51 deletions
@@ -114,7 +114,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 def remote = [
 ip : randomLoginNode,
 host : randomLoginNode,
-port : cluster.sshPort,
+port : cluster.sshPort?:22,
 user : "${pipeline.USERNAME}",
 passwd : "${pipeline.PASSWORD}",
 allowAnyHosts: true,
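The `cluster.sshPort?:22` change uses Groovy's Elvis operator so clusters that do not define an SSH port fall back to 22. A minimal shell sketch of the same defaulting idea, using illustrative variable names rather than anything defined by this pipeline:

    # CLUSTER_SSH_PORT stands in for cluster.sshPort; empty means "not configured".
    CLUSTER_SSH_PORT=""
    port="${CLUSTER_SSH_PORT:-22}"        # fall back to 22, like cluster.sshPort?:22
    echo "connecting on port ${port}"     # prints: connecting on port 22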
@@ -127,8 +127,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 pipeline.stage('Submit Test Results') {
 sh "mkdir -p ${stageName}"
 def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
-def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
 if (downloadSucceed) {
 sh "ls ${stageName}"
 echo "Upload test results."
@@ -419,7 +418,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
 // The slurm_run.sh will add the slurm job id in that file.
 script: Utils.sshUserCmd(
 remote,
-"\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
+"\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
 ),
 returnStdout: true
 ).trim()
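Dropping the `test -f` guard works because `|| true` already absorbs the failure when `slurm_job_id.txt` has not been written yet: both forms produce empty stdout and exit 0, the simpler one just emits cat's error on stderr. A quick check with a placeholder path:

    f=/tmp/does-not-exist/slurm_job_id.txt
    test -f "$f" && cat "$f" || true   # old form: silent, empty stdout, exit 0
    cat "$f" || true                   # new form: same stdout and exit code, error only on stderr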
@@ -440,19 +439,23 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){

 Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")

+def cleanupCommands = [
+"rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
+"rm -rf ${jobWorkspace} || true",
+].join(" && ")
 Utils.exec(
 pipeline,
 script: Utils.sshUserCmd(
 remote,
-"\"rm -rf ${jobWorkspace} || true\""
+"\"${cleanupCommands}\""
 )
 )

 Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
 }
 }

-// Methods to run slurm job with Jenkins Agent
+// Methods to run Slurm job with Jenkins Agent
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
 withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
 def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
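`cleanupCommands` is joined with `" && "` and sent as a single SSH command, and each entry ends in `|| true`, so a missing container image or workspace neither stops the chain nor fails the cleanup step. A shell sketch of the joined command, with placeholder paths standing in for `${slurmJobID}`, `${jobWorkspace}`, and the real Lustre location:

    slurm_job_id=12345
    job_workspace=/tmp/example-workspace
    # One command line, as produced by the " && " join; each "|| true" keeps the chain going.
    rm -rf "/tmp/containers/container-${slurm_job_id}.sqsh" || true && rm -rf "${job_workspace}" || true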
@@ -537,12 +540,10 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

 def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)

-Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-${entrypoint}", numRetries: 3)
-
 Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")

+Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true)
+
 Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

 // Specific for OCI machines
@@ -606,7 +607,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 // Wait 10 minutes to check status of the node again
 sleep(time: 10, unit: 'MINUTES')
 // Avoid the node being stuck in the held state.
-Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+if (counter % 3 == 0) {
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+}
 counter++
 }
 }
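With the new guard, `scontrol release` is attempted only when the counter is a multiple of 3 instead of on every 10-minute pass, which cuts SSH traffic to the login node while still un-holding a stuck job. An illustrative polling loop; the job-ID variable is a placeholder, not one defined by this pipeline:

    counter=0
    while squeue -h -j "$WATCHED_SLURM_JOB_ID" | grep -q .; do
        sleep 600                                              # wait 10 minutes between checks
        if [ $((counter % 3)) -eq 0 ]; then
            scontrol release "$WATCHED_SLURM_JOB_ID" || true   # avoid a job stuck in the held state
        fi
        counter=$((counter + 1))
    done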
@@ -684,7 +687,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 }
 }
 }
-// End of Methods to run slurm job with Jenkins Agent

 def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
 {
@@ -716,6 +718,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 })
 }
 }
+// End of Methods to run Slurm job with Jenkins Agent

 def getNodeArgs(int nodeCount, int gpuCount) {
 int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
@@ -802,8 +805,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

 Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

-def slurmOutputFile = null
-
 try {
 // Run ssh command to start node in desired cluster via SLURM
 withCredentials([
@@ -830,16 +831,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 def resourcePathNode = "/tmp"
 def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
 def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
-def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
+def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
+def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
 def testListPathNode = "${jobWorkspace}/${testList}.txt"
 def waivesListPathNode = "${jobWorkspace}/waives.txt"
 def outputPath = "${jobWorkspace}/job-output.log"
 def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
+def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
 def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
+def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
 def isAarch64 = config.contains("aarch64")
 def coverageConfigFile = "${jobWorkspace}/.coveragerc"

@@ -851,15 +851,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
 sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

-// Upload slurm_run_sh to Frontend node
-def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-
-Utils.exec(pipeline, script: "echo \"Script to trigger slurm job: \" && cat ${scriptRunLocalPath}")
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
 scriptRunLocalPath,
-scriptRunNode,
+scriptRunPathNode,
 true
 )

@@ -995,22 +992,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 export pytestCommand="$pytestCommand"
 export coverageConfigFile="$coverageConfigFile"
 export NVIDIA_IMEX_CHANNELS=0
-export NVIDIA_IMEX_CHANNELS=0
-export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+[ -z "\${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))

 ${srunPrologue}

-chmod +x $scriptRunNode
-srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
+srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
 """.replaceAll("(?m)^\\s*", "")
 pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
 scriptLaunchPathLocal,
 scriptLaunchPathNode,
 true
 )
+
 def scriptExec = """
 touch ${outputPath}
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
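The new `[ -z ... ]` guard only computes `NVIDIA_VISIBLE_DEVICES` from `nvidia-smi` when the variable is not already set, so a value injected by Slurm or the container runtime is preserved; the change also drops the duplicated `NVIDIA_IMEX_CHANNELS` export. A standalone sketch of the device-list construction, with the GPU count hard-coded so it runs without a GPU:

    gpu_count=4                                        # stand-in for: nvidia-smi --query-gpu=count -i 0 --format=noheader
    default_devices=$(seq -s, 0 $((gpu_count - 1)))    # "0,1,2,3"
    # Keep an externally provided value instead of always overwriting it.
    [ -z "${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES="$default_devices"
    echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}"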
@@ -1035,6 +1032,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 fi
 """.replaceAll("(?m)^\\s*", "").trim()
 pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
@@ -1050,7 +1048,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 script: Utils.sshUserCmd(
 remote,
 scriptExecPathNode
-)
+),
+numRetries: 3
 )
 }

@@ -2568,8 +2567,8 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 docker.image(image).pull()
 }
 // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
-// The timeout here is to avoid the Slurm job being stuck.
-timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+// Minus 10 minutes to avoid the Slurm job being stopped earlier.
+timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
 docker.image(image).inside(dockerArgs) {
 runner()
 }
@@ -2589,7 +2588,9 @@ def runInEnrootOnNode(label)
 {
 return {
 runner -> node(label) {
-timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT, unit: 'MINUTES') {
+// We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+// Minus 10 minutes to avoid the Slurm job being stopped earlier.
+timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
 runner()
 }
 }
@@ -2628,8 +2629,6 @@ def launchTestJobs(pipeline, testFilter)
 // may break the mapping functionality.

 x86TestConfigs = [
-"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
 "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
 "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
 "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2738,9 @@ def launchTestJobs(pipeline, testFilter)
 fullSet = parallelJobs.keySet()

 x86SlurmTestConfigs = [
+"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
 "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2778,23 @@ def launchTestJobs(pipeline, testFilter)
 fullSet += SBSATestConfigs.keySet()

 SBSASlurmTestConfigs = [
+"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
 // Disable GB300 stages due to nodes will be offline temporarily.
 // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
-"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
-// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
+// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
 ]
 fullSet += SBSASlurmTestConfigs.keySet()

-// multiNodesSBSAConfigs = [
-// Each stage test 1 testcase with 8 GPUs and 2 nodes.
-// Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
-// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-// ]
-multiNodesSBSAConfigs = [:]
-def numMultiNodeTests = 3
-multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
-}
+multiNodesSBSAConfigs = [
+// Each testcase uses 8 GPUs and 2 nodes.
+// https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
+"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+]
 fullSet += multiNodesSBSAConfigs.keySet()

 if (env.targetArch == AARCH64_TRIPLE) {

jenkins/scripts/slurm_run.sh

Lines changed: 11 additions & 5 deletions
@@ -29,10 +29,13 @@ set_value_in_command() {
 echo "$result"
 }

-if [ $SLURM_LOCALID -eq 0 ]; then
-# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
+# Only the first process will save the job ID
+if [ $SLURM_PROCID -eq 0 ]; then
+# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
 echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+fi

+if [ $SLURM_LOCALID -eq 0 ]; then
 wget -nv $llmTarfile
 tar -zxf $tarName
 which python3
@@ -55,7 +58,6 @@ else
 done
 fi

-
 llmapiLaunchScript="$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
 chmod +x $llmapiLaunchScript
 cd $llmSrcNode/tests/integration/defs
@@ -64,10 +66,14 @@ cd $llmSrcNode/tests/integration/defs
 trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
 trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
 echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
-if [ $SLURM_LOCALID -eq 0 ]; then
+pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+
+# Only the first process will save the coverage config file
+if [ $SLURM_PROCID -eq 0 ]; then
 sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
 fi
-pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+# Sleep 10 seconds to wait for the coverage config file to be saved
+sleep 10

 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
 containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
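The switch from `SLURM_LOCALID` to `SLURM_PROCID` for the job-ID and coverage-config writes matters on multi-node jobs: `SLURM_LOCALID` is 0 for the first task on every node, while `SLURM_PROCID` is 0 for exactly one task in the whole job, so the shared files are written once instead of once per node (the per-node tarball download still keys off `SLURM_LOCALID`). A standalone sketch of the two guards; the defaults are only there so it runs outside Slurm:

    # SLURM_PROCID: global task rank across all nodes (0..ntasks-1).
    # SLURM_LOCALID: task rank within one node (0 on every node).
    SLURM_PROCID="${SLURM_PROCID:-0}"
    SLURM_LOCALID="${SLURM_LOCALID:-0}"

    if [ "$SLURM_PROCID" -eq 0 ]; then
        echo "global rank 0: write job-wide files (job ID, coverage config) exactly once"
    fi
    if [ "$SLURM_LOCALID" -eq 0 ]; then
        echo "local rank 0: per-node setup (download and extract the tarball)"
    fi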

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@ def result(self):

 DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async'])

-DEFAULT_TEST_TIMEOUT = 1800
+# TODO: Change back to 1800 when the disaggregated serving test slowdown issue is resolved.
+DEFAULT_TEST_TIMEOUT = 3600
 DEFAULT_SERVER_WAITING_TIMEOUT = 3600