Commit 8fbc169

[None][ci] Move more test stages to use OCI machines
Signed-off-by: Yanchao Lu <[email protected]>
1 parent f95edb5 · commit 8fbc169

File tree: 3 files changed, +55 −56 lines


jenkins/L0_Test.groovy

Lines changed: 42 additions & 50 deletions
@@ -114,7 +114,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 def remote = [
 ip : randomLoginNode,
 host : randomLoginNode,
-port : cluster.sshPort,
+port : cluster.sshPort?:22,
 user : "${pipeline.USERNAME}",
 passwd : "${pipeline.PASSWORD}",
 allowAnyHosts: true,
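
Note: the Elvis operator above makes port 22 the default whenever a cluster entry does not define sshPort. A minimal shell sketch of the same defaulting idiom, using an illustrative SSH_PORT variable rather than anything from the pipeline:

# Fall back to 22 when SSH_PORT is unset or empty.
SSH_PORT="${SSH_PORT:-22}"
echo "connecting via ssh on port ${SSH_PORT}"
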
@@ -127,8 +127,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 pipeline.stage('Submit Test Results') {
 sh "mkdir -p ${stageName}"
 def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
-def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
 if (downloadSucceed) {
 sh "ls ${stageName}"
 echo "Upload test results."
@@ -419,7 +418,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
 // The slurm_run.sh will add the slurm job id in that file.
 script: Utils.sshUserCmd(
 remote,
-"\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
+"\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
 ),
 returnStdout: true
 ).trim()
@@ -452,7 +451,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
 }
 }
 
-// Methods to run slurm job with Jenkins Agent
+// Methods to run Slurm job with Jenkins Agent
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
 withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
 def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
@@ -537,12 +536,10 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
 def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
 
-Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-${entrypoint}", numRetries: 3)
-
 Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
 
+Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true)
+
 Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
 
 // Specific for OCI machines
@@ -606,7 +603,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 // Wait 10 minutes to check status of the node again
 sleep(time: 10, unit: 'MINUTES')
 // Avoid the node being stuck in the held state.
-Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+if (counter % 3 == 0) {
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+}
 counter++
 }
 }
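
Note: the polling loop sleeps 10 minutes per iteration, and the change above issues scontrol release only on every third iteration (roughly every 30 minutes) instead of on every pass. A shell sketch of that throttled-release pattern; job_is_pending is a placeholder for the real status check:

counter=0
while job_is_pending "${SLURM_JOB_ID}"; do    # placeholder status check
    sleep 600                                  # wait 10 minutes before checking again
    if [ $((counter % 3)) -eq 0 ]; then
        # Nudge the job out of a held state; ignore errors if it is already running.
        scontrol release "${SLURM_JOB_ID}" || true
    fi
    counter=$((counter + 1))
done
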
@@ -684,7 +683,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 }
 }
 }
-// End of Methods to run slurm job with Jenkins Agent
 
 def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
 {
@@ -716,6 +714,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 })
 }
 }
+// End of Methods to run Slurm job with Jenkins Agent
 
 def getNodeArgs(int nodeCount, int gpuCount) {
 int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
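
Note: getNodeArgs splits the requested GPU count evenly across nodes and rounds up, so 8 GPUs on 2 nodes gives 4 per node and 4 GPUs on 3 nodes gives 2 per node. The same ceiling division written as shell arithmetic, purely for illustration:

node_count=2
gpu_count=8
# Integer ceiling division: (a + b - 1) / b
gpus_per_node=$(( (gpu_count + node_count - 1) / node_count ))
echo "${gpus_per_node} GPUs per node"    # prints "4 GPUs per node"
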
@@ -802,8 +801,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
 Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
 
-def slurmOutputFile = null
-
 try {
 // Run ssh command to start node in desired cluster via SLURM
 withCredentials([
@@ -830,16 +827,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 def resourcePathNode = "/tmp"
 def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
 def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
-def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
+def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
+def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
 def testListPathNode = "${jobWorkspace}/${testList}.txt"
 def waivesListPathNode = "${jobWorkspace}/waives.txt"
 def outputPath = "${jobWorkspace}/job-output.log"
 def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
+def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
 def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
+def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
 def isAarch64 = config.contains("aarch64")
 def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
@@ -851,15 +847,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
 sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
 
-// Upload slurm_run_sh to Frontend node
-def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-
-Utils.exec(pipeline, script: "echo \"Script to trigger slurm job: \" && cat ${scriptRunLocalPath}")
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
 scriptRunLocalPath,
-scriptRunNode,
+scriptRunPathNode,
 true
 )
 
@@ -995,22 +988,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 export pytestCommand="$pytestCommand"
 export coverageConfigFile="$coverageConfigFile"
 export NVIDIA_IMEX_CHANNELS=0
-export NVIDIA_IMEX_CHANNELS=0
-export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+[ -z "\$NVIDIA_VISIBLE_DEVICES" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
 
 ${srunPrologue}
 
-chmod +x $scriptRunNode
-srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
+srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
 """.replaceAll("(?m)^\\s*", "")
 pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
 scriptLaunchPathLocal,
 scriptLaunchPathNode,
 true
 )
+
 def scriptExec = """
 touch ${outputPath}
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
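
Note: the generated launch script now leaves NVIDIA_VISIBLE_DEVICES alone when the scheduler already exported it, and otherwise builds a 0..N-1 index list from the GPU count that nvidia-smi reports. A standalone sketch of that guard (requires nvidia-smi; the csv,noheader output format is used here for portability):

if [ -z "${NVIDIA_VISIBLE_DEVICES:-}" ]; then
    # Expose every GPU on the node as a comma-separated index list, e.g. 0,1,2,3.
    gpu_count=$(nvidia-smi --query-gpu=count -i 0 --format=csv,noheader)
    export NVIDIA_VISIBLE_DEVICES=$(seq -s, 0 $((gpu_count - 1)))
fi
echo "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES}"
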
@@ -1035,6 +1028,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 fi
 """.replaceAll("(?m)^\\s*", "").trim()
 pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
+Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
 Utils.copyFileToRemoteHost(
 pipeline,
 remote,
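
Note: slurm_exec.sh submits the launch script with sbatch, takes the job ID from the fourth field of sbatch's "Submitted batch job <id>" message, and then follows the job output file. A condensed sketch of that submit-and-wait flow; launch.sh and output.log are placeholders and the real script adds more bookkeeping:

touch output.log
job_id=$(sbatch launch.sh | awk '{print $4}')   # sbatch prints: Submitted batch job <id>
echo "submitted Slurm job ${job_id}"
# Poll until the job leaves the queue, then dump whatever the job wrote.
while squeue -h -j "${job_id}" 2>/dev/null | grep -q .; do
    sleep 60
done
cat output.log
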
@@ -1050,7 +1044,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 script: Utils.sshUserCmd(
 remote,
 scriptExecPathNode
-)
+),
+numRetries: 3
 )
 }
 
@@ -2568,8 +2563,8 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 docker.image(image).pull()
 }
 // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
-// The timeout here is to avoid the Slurm job being stuck.
-timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+// Minus 10 minutes to avoid the Slurm job being stopped earlier.
+timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
 docker.image(image).inside(dockerArgs) {
 runner()
 }
@@ -2589,7 +2584,9 @@ def runInEnrootOnNode(label)
 {
 return {
 runner -> node(label) {
-timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT, unit: 'MINUTES') {
+// We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+// Minus 10 minutes to avoid the Slurm job being stopped earlier.
+timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
 runner()
 }
 }
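
Note: both timeout changes shave 10 minutes off the Jenkins-side wrapper so that it fires before the underlying Slurm allocation (SlurmConfig.DEFAULT_TIMEOUT, 300 minutes, or DEFAULT_TIMEOUT_SHORT, 240 minutes) expires. The same guard expressed with GNU timeout; the numbers mirror the shorter constant and run_tests.sh is a placeholder:

slurm_allocation_minutes=240
# Let the wrapper give up 10 minutes before the Slurm allocation would be killed.
wrapper_timeout_minutes=$((slurm_allocation_minutes - 10))
timeout "${wrapper_timeout_minutes}m" ./run_tests.sh
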
@@ -2628,8 +2625,6 @@ def launchTestJobs(pipeline, testFilter)
 // may break the mapping functionality.
 
 x86TestConfigs = [
-"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
 "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
 "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
 "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2734,9 @@ def launchTestJobs(pipeline, testFilter)
 fullSet = parallelJobs.keySet()
 
 x86SlurmTestConfigs = [
+"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
 "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
 "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2774,23 @@ def launchTestJobs(pipeline, testFilter)
 fullSet += SBSATestConfigs.keySet()
 
 SBSASlurmTestConfigs = [
+"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
 // Disable GB300 stages due to nodes will be offline temporarily.
 // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
-"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
-// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
+// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
 ]
 fullSet += SBSASlurmTestConfigs.keySet()
 
-// multiNodesSBSAConfigs = [
-// Each stage test 1 testcase with 8 GPUs and 2 nodes.
-// Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
-// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-// "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-// ]
-multiNodesSBSAConfigs = [:]
-def numMultiNodeTests = 3
-multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
-}
+multiNodesSBSAConfigs = [
+// Each testcase uses 8 GPUs and 2 nodes.
+// https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
+"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
+"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+]
 fullSet += multiNodesSBSAConfigs.keySet()
 
 if (env.targetArch == AARCH64_TRIPLE) {

jenkins/scripts/slurm_run.sh

Lines changed: 11 additions & 5 deletions
@@ -29,10 +29,13 @@ set_value_in_command() {
 echo "$result"
 }
 
-if [ $SLURM_LOCALID -eq 0 ]; then
-# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
+# Only the first process will save the job ID
+if [ $SLURM_PROCID -eq 0 ]; then
+# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
 echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+fi
 
+if [ $SLURM_LOCALID -eq 0 ]; then
 wget -nv $llmTarfile
 tar -zxf $tarName
 which python3
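
Note: the split above relies on two different Slurm rank variables: SLURM_PROCID is the global rank across the whole job (exactly one task has rank 0), while SLURM_LOCALID is the rank within a single node (one task per node has local rank 0). Writing slurm_job_id.txt is now keyed to the global rank so multi-node jobs record it exactly once, while per-node setup such as downloading and unpacking the tarball stays keyed to the local rank. A toy illustration of the two guards, with echo standing in for the real work:

# Runs exactly once per job: global rank 0 only.
if [ "${SLURM_PROCID}" -eq 0 ]; then
    echo "writing slurm_job_id.txt once for the whole job"
fi

# Runs once on every node: local rank 0 only.
if [ "${SLURM_LOCALID}" -eq 0 ]; then
    echo "node-local setup (download and unpack) happens here"
fi
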
@@ -55,7 +58,6 @@ else
 done
 fi
 
-
 llmapiLaunchScript="$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
 chmod +x $llmapiLaunchScript
 cd $llmSrcNode/tests/integration/defs
@@ -64,10 +66,14 @@ cd $llmSrcNode/tests/integration/defs
 trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
 trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
 echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
-if [ $SLURM_LOCALID -eq 0 ]; then
+pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+
+# Only the first process will save the coverage config file
+if [ $SLURM_PROCID -eq 0 ]; then
 sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
 fi
-pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
+# Sleep 10 seconds to wait for the coverage config file to be saved
+sleep 10
 
 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
 containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
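
Note: after global rank 0 patches the wheel path into the coverage config, every other rank just sleeps 10 seconds before moving on. If that fixed delay ever turns out to be too short, one hedged alternative (not what this commit does) is to poll for the patched file with a bounded wait:

# Wait up to 60 seconds for rank 0 to finish patching the coverage config.
for _ in $(seq 1 60); do
    if grep -qF -- "$trtllmWhlPath" "$coverageConfigFile" 2>/dev/null; then
        break
    fi
    sleep 1
done
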
tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@ def result(self):
 
 DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async'])
 
-DEFAULT_TEST_TIMEOUT = 1800
+# TODO: Change back to 1800 when the disaggregated serving test slowdown issue is resolved.
+DEFAULT_TEST_TIMEOUT = 3600
 DEFAULT_SERVER_WAITING_TIMEOUT = 3600
 
 