@@ -419,7 +419,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
419419 // The slurm_run.sh will add the slurm job id in that file.
420420 script : Utils . sshUserCmd(
421421 remote,
422- " \" test -f ${ jobWorkspace } /slurm_job_id.txt && cat ${ jobWorkspace} /slurm_job_id.txt || true\" "
422+ " \" cat ${ jobWorkspace} /slurm_job_id.txt || true\" "
423423 ),
424424 returnStdout : true
425425 ). trim()
@@ -606,7 +606,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
606606 // Wait 10 minutes to check status of the node again
607607 sleep(time : 10 , unit : ' MINUTES' )
608608 // Avoid the node being stuck in the held state.
609- Utils . exec(pipeline, script : Utils . sshUserCmd(remote, " \" scontrol release ${ slurmJobID} || true\" " ), numRetries : 3 )
609+ if (counter % 3 == 0 ) {
610+ Utils . exec(pipeline, script : Utils . sshUserCmd(remote, " \" scontrol release ${ slurmJobID} || true\" " ), numRetries : 3 )
611+ }
610612 counter++
611613 }
612614 }
@@ -995,7 +997,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
995997 export pytestCommand="$pytestCommand "
996998 export coverageConfigFile="$coverageConfigFile "
997999 export NVIDIA_IMEX_CHANNELS=0
998- export NVIDIA_IMEX_CHANNELS=0
9991000 export NVIDIA_VISIBLE_DEVICES=\$ (seq -s, 0 \$ ((\$ (nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
10001001
10011002 ${ srunPrologue}
@@ -2628,8 +2629,6 @@ def launchTestJobs(pipeline, testFilter)
26282629 // may break the mapping functionality.
26292630
26302631 x86TestConfigs = [
2631- " DGX_H100-4_GPUs-PyTorch-DeepSeek-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 1 , 4 ],
2632- " DGX_H100-2_GPUs-PyTorch-Others-1" : [" dgx-h100-x2" , " l0_dgx_h100" , 1 , 1 , 2 ],
26332632 " DGX_H100-4_GPUs-CPP-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 1 , 4 ],
26342633 " A10-PyTorch-1" : [" a10" , " l0_a10" , 1 , 2 ],
26352634 " A10-PyTorch-2" : [" a10" , " l0_a10" , 2 , 2 ],
@@ -2739,7 +2738,9 @@ def launchTestJobs(pipeline, testFilter)
27392738 fullSet = parallelJobs. keySet()
27402739
27412740 x86SlurmTestConfigs = [
2741+ " DGX_H100-2_GPUs-PyTorch-Others-1" : [" dgx-h100-x2-oci" , " l0_dgx_h100" , 1 , 1 , 2 ],
27422742 " DGX_H100-2_GPUs-PyTorch-Ray-1" : [" dgx-h100-x2-oci" , " l0_dgx_h100" , 1 , 1 , 2 ],
2743+ " DGX_H100-4_GPUs-PyTorch-DeepSeek-1" : [" dgx-h100-x4-oci" , " l0_dgx_h100" , 1 , 1 , 4 ],
27432744 " DGX_H100-4_GPUs-PyTorch-GptOss-1" : [" dgx-h100-x4-oci" , " l0_dgx_h100" , 1 , 1 , 4 ],
27442745 " DGX_H100-4_GPUs-PyTorch-Others-1" : [" dgx-h100-x4-oci" , " l0_dgx_h100" , 1 , 1 , 4 ],
27452746 " B300-PyTorch-1" : [" b300-single" , " l0_b300" , 1 , 1 ],
@@ -2777,28 +2778,23 @@ def launchTestJobs(pipeline, testFilter)
27772778 fullSet + = SBSATestConfigs . keySet()
27782779
27792780 SBSASlurmTestConfigs = [
2781+ " GB200-4_GPUs-PyTorch-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
2782+ " GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
27802783 // Disable GB300 stages due to nodes will be offline temporarily.
27812784 // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
2782- " GB200-4_GPUs-PyTorch-1" : [" gb200-trtllm" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
2783- " GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4-oci" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
2784- // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
2785+ // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
27852786 ]
27862787 fullSet + = SBSASlurmTestConfigs . keySet()
27872788
2788- // multiNodesSBSAConfigs = [
2789- // Each stage test 1 testcase with 8 GPUs and 2 nodes.
2790- // Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
2791- // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
2792- // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
2793- // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
2794- // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
2795- // "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
2796- // ]
2797- multiNodesSBSAConfigs = [:]
2798- def numMultiNodeTests = 3
2799- multiNodesSBSAConfigs + = (1 .. numMultiNodeTests). collectEntries { i ->
2800- [" GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${ i} " . toString(), [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , i, numMultiNodeTests, 8 , 2 ]]
2801- }
2789+ multiNodesSBSAConfigs = [
2790+ // Each testcase uses 8 GPUs and 2 nodes.
2791+ // https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
2792+ " GB200-8_GPUs-2_Nodes-PyTorch-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 1 , 2 , 8 , 2 ],
2793+ " GB200-8_GPUs-2_Nodes-PyTorch-2" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 2 , 2 , 8 , 2 ],
2794+ " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 1 , 3 , 8 , 2 ],
2795+ " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 2 , 3 , 8 , 2 ],
2796+ " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3" : [" gb200-oci-trtllm" , " l0_gb200_multi_nodes" , 3 , 3 , 8 , 2 ],
2797+ ]
28022798 fullSet + = multiNodesSBSAConfigs. keySet()
28032799
28042800 if (env. targetArch == AARCH64_TRIPLE ) {