
Commit 138c740

[None][ci] Move more test stages to use OCI machines
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 6e5384d commit 138c740


jenkins/L0_Test.groovy

Lines changed: 18 additions & 22 deletions
@@ -419,7 +419,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
         // The slurm_run.sh will add the slurm job id in that file.
         script: Utils.sshUserCmd(
             remote,
-            "\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
+            "\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
         ),
         returnStdout: true
     ).trim()
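
With `|| true`, a missing slurm_job_id.txt now yields an empty string rather than a failed SSH command, so the explicit `test -f` guard is redundant. A minimal sketch of how a caller might handle the empty result (hypothetical guard, not part of this commit; it only mirrors the `Utils.exec`/`Utils.sshUserCmd` calls already shown in the diff):

    // Hypothetical caller-side guard: an empty result means slurm_run.sh
    // never wrote the job id, so there is nothing to clean up.
    def slurmJobID = Utils.exec(
        pipeline,
        script: Utils.sshUserCmd(remote, "\"cat ${jobWorkspace}/slurm_job_id.txt || true\""),
        returnStdout: true
    ).trim()
    if (!slurmJobID) {
        pipeline.echo "No Slurm job ID recorded; skipping cleanup."
    }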
@@ -606,7 +606,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
         // Wait 10 minutes to check status of the node again
         sleep(time: 10, unit: 'MINUTES')
         // Avoid the node being stuck in the held state.
-        Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+        if (counter % 3 == 0) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+        }
         counter++
     }
 }
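
The new modulo guard throttles `scontrol release` to every third iteration, i.e. roughly once per 30 minutes given the 10-minute sleep, instead of on every poll. A minimal sketch of the assumed loop shape (the `jobIsPending` helper is hypothetical; only the body matches the diff):

    // Assumed polling loop: release the held job only on every third pass
    // (counter 0, 3, 6, ...) to reduce scontrol traffic on the login node.
    int counter = 0
    while (jobIsPending(slurmJobID)) {
        sleep(time: 10, unit: 'MINUTES')
        if (counter % 3 == 0) {
            Utils.exec(pipeline,
                script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""),
                numRetries: 3)
        }
        counter++
    }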
@@ -995,7 +997,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     export pytestCommand="$pytestCommand"
     export coverageConfigFile="$coverageConfigFile"
     export NVIDIA_IMEX_CHANNELS=0
-    export NVIDIA_IMEX_CHANNELS=0
     export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))

     ${srunPrologue}
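
The deleted line was simply a duplicated `export NVIDIA_IMEX_CHANNELS=0`. The surviving `NVIDIA_VISIBLE_DEVICES` line queries the node's GPU count via nvidia-smi and has `seq` build a comma-separated index list; on a 4-GPU node, for example, it expands to `NVIDIA_VISIBLE_DEVICES=0,1,2,3`.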
@@ -2628,8 +2629,6 @@ def launchTestJobs(pipeline, testFilter)
     // may break the mapping functionality.

     x86TestConfigs = [
-        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
         "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2738,9 @@ def launchTestJobs(pipeline, testFilter)
     fullSet = parallelJobs.keySet()

     x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2778,23 @@ def launchTestJobs(pipeline, testFilter)
     fullSet += SBSATestConfigs.keySet()

     SBSASlurmTestConfigs = [
+        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
-        "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
-        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
+        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

-    // multiNodesSBSAConfigs = [
-    //     Each stage test 1 testcase with 8 GPUs and 2 nodes.
-    //     Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-    // ]
-    multiNodesSBSAConfigs = [:]
-    def numMultiNodeTests = 3
-    multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
-    }
+    multiNodesSBSAConfigs = [
+        // Each testcase uses 8 GPUs and 2 nodes.
+        // https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
+        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+    ]
     fullSet += multiNodesSBSAConfigs.keySet()

     if (env.targetArch == AARCH64_TRIPLE) {
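
For reference, the removed `collectEntries` generator produced exactly the three post-merge entries that the new explicit map spells out; a minimal standalone Groovy sketch of that equivalence:

    // The removed generator, reproduced standalone: it yields the same three
    // post-merge entries as the new explicit map (the 6th element being the
    // node count, per the inferred tuple layout noted earlier).
    def n = 3
    def generated = (1..n).collectEntries { i ->
        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(),
         ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, n, 8, 2]]
    }
    assert generated.size() == 3

What the explicit map adds beyond the generator is the two pre-merge shards (`GB200-8_GPUs-2_Nodes-PyTorch-1/-2`), re-enabling multi-node testing in L0 pre-merge on the OCI cluster.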
