@@ -114,7 +114,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     def remote = [
         ip : randomLoginNode,
         host : randomLoginNode,
-        port : cluster.sshPort,
+        port : cluster.sshPort ?: 22,
         user : "${pipeline.USERNAME}",
         passwd : "${pipeline.PASSWORD}",
         allowAnyHosts : true,
@@ -127,8 +127,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     pipeline.stage('Submit Test Results') {
         sh "mkdir -p ${stageName}"
         def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
-        def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-        downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+        downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
         if (downloadSucceed) {
             sh "ls ${stageName}"
             echo "Upload test results."
@@ -419,7 +418,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
         // The slurm_run.sh will add the slurm job id in that file.
         script: Utils.sshUserCmd(
             remote,
-            "\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
+            "\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
         ),
         returnStdout: true
     ).trim()
@@ -452,7 +451,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
     }
 }
 
-// Methods to run slurm job with Jenkins Agent
+// Methods to run Slurm job with Jenkins Agent
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
@@ -537,12 +536,10 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
     def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
 
-    Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-    Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-${entrypoint}", numRetries: 3)
-
     Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
 
+    Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true)
+
     Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
 
     // Specific for OCI machines
@@ -606,7 +603,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
             // Wait 10 minutes to check status of the node again
             sleep(time: 10, unit: 'MINUTES')
             // Avoid the node being stuck in the held state.
-            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+            if (counter % 3 == 0) {
+                Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+            }
             counter++
         }
     }
@@ -684,7 +683,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
         }
     }
 }
-// End of Methods to run slurm job with Jenkins Agent
 
 def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
 {
@@ -716,6 +714,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         })
     }
 }
+// End of Methods to run Slurm job with Jenkins Agent
 
 def getNodeArgs(int nodeCount, int gpuCount) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
@@ -802,8 +801,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
 
-    def slurmOutputFile = null
-
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([
@@ -830,16 +827,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
-            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-            slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
+            def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
+            def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def outputPath = "${jobWorkspace}/job-output.log"
             def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-            def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
+            def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
             def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-            def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
+            def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
             def isAarch64 = config.contains("aarch64")
             def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
@@ -851,15 +847,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
             sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
 
-            // Upload slurm_run_sh to Frontend node
-            def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-
-            Utils.exec(pipeline, script: "echo \"Script to trigger slurm job: \" && cat ${scriptRunLocalPath}")
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
                 scriptRunLocalPath,
-                scriptRunNode,
+                scriptRunPathNode,
                 true
             )
 
@@ -995,22 +988,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 export pytestCommand="$pytestCommand"
                 export coverageConfigFile="$coverageConfigFile"
                 export NVIDIA_IMEX_CHANNELS=0
-                export NVIDIA_IMEX_CHANNELS=0
-                export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+                [ -z "\$NVIDIA_VISIBLE_DEVICES" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
 
                 ${srunPrologue}
 
-                chmod +x $scriptRunNode
-                srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
+                srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
             """.replaceAll("(?m)^\\s*", "")
             pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
                 scriptLaunchPathLocal,
                 scriptLaunchPathNode,
                 true
             )
+
             def scriptExec = """
                 touch ${outputPath}
                 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
@@ -1035,6 +1028,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             fi
             """.replaceAll("(?m)^\\s*", "").trim()
             pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
@@ -1050,7 +1044,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 script: Utils.sshUserCmd(
                     remote,
                     scriptExecPathNode
-                )
+                ),
+                numRetries: 3
             )
         }
 
@@ -2568,8 +2563,8 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
             docker.image(image).pull()
         }
         // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
-        // The timeout here is to avoid the Slurm job being stuck.
-        timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+        // Minus 10 minutes to avoid the Slurm job being stopped earlier.
+        timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
             docker.image(image).inside(dockerArgs) {
                 runner()
             }
@@ -2589,7 +2584,9 @@ def runInEnrootOnNode(label)
 {
     return {
         runner -> node(label) {
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT, unit: 'MINUTES') {
+            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+            // Minus 10 minutes to avoid the Slurm job being stopped earlier.
+            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
                 runner()
             }
         }
@@ -2628,8 +2625,6 @@ def launchTestJobs(pipeline, testFilter)
     // may break the mapping functionality.
 
     x86TestConfigs = [
-        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
         "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2734,9 @@ def launchTestJobs(pipeline, testFilter)
     fullSet = parallelJobs.keySet()
 
     x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2774,23 @@ def launchTestJobs(pipeline, testFilter)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
+        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
-        "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
-        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
+        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
 
-    // multiNodesSBSAConfigs = [
-    //     Each stage test 1 testcase with 8 GPUs and 2 nodes.
-    //     Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-    //     "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-    // ]
-    multiNodesSBSAConfigs = [:]
-    def numMultiNodeTests = 3
-    multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
-    }
+    multiNodesSBSAConfigs = [
+        // Each testcase uses 8 GPUs and 2 nodes.
+        // https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
+        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+    ]
     fullSet += multiNodesSBSAConfigs.keySet()
 
     if (env.targetArch == AARCH64_TRIPLE) {