@@ -114,7 +114,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     def remote = [
         ip : randomLoginNode,
         host : randomLoginNode,
-        port : cluster.sshPort,
+        port : cluster.sshPort ?: 22,
         user : "${pipeline.USERNAME}",
         passwd : "${pipeline.PASSWORD}",
         allowAnyHosts : true,
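The only functional change in this hunk is the Elvis-operator fallback on the SSH port. A minimal plain-Groovy sketch of that behavior (the literal 22 default comes from the hunk above; the null value stands for a hypothetical cluster entry with no sshPort configured):

```groovy
// Elvis operator: use cluster.sshPort when it is set, otherwise fall back to 22.
def sshPort = null            // hypothetical cluster entry without an sshPort
def port = sshPort ?: 22
assert port == 22
assert (2222 ?: 22) == 2222   // a configured port is passed through unchanged
```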
@@ -127,8 +127,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     pipeline.stage('Submit Test Results') {
         sh "mkdir -p ${stageName}"
         def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
-        def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-        downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+        downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
         if (downloadSucceed) {
             sh "ls ${stageName}"
             echo "Upload test results."
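The scp download is now routed through Utils.exec with returnStatus and numRetries instead of a bare sh step. Utils.exec is this repo's shared-library helper, so its exact semantics are assumed here; the plain-Groovy sketch below mirrors the intended retry-until-zero-exit-status pattern, with a closure standing in for the sshpass/scp call:

```groovy
// Hypothetical retry helper: run the command up to numRetries times and
// return the last exit status (0 means success).
int runWithRetries(int numRetries, Closure<Integer> command) {
    int status = 1
    for (int attempt = 1; attempt <= numRetries; attempt++) {
        status = command(attempt)
        if (status == 0) {
            break
        }
    }
    return status
}

// Fails on the first two attempts, succeeds on the third, so the overall
// download is treated as successful.
def downloadSucceed = runWithRetries(3) { int attempt -> attempt < 3 ? 1 : 0 } == 0
assert downloadSucceed
```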
@@ -419,7 +418,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
         // The slurm_run.sh will add the slurm job id in that file.
         script: Utils.sshUserCmd(
             remote,
-            "\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
+            "\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
         ),
         returnStdout: true
     ).trim()
@@ -440,19 +439,23 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
 
         Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
 
+        def cleanupCommands = [
+            "rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
+            "rm -rf ${jobWorkspace} || true",
+        ].join(" && ")
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
                 remote,
-                "\"rm -rf ${jobWorkspace} || true\""
+                "\"${cleanupCommands}\""
             )
         )
 
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
-// Methods to run slurm job with Jenkins Agent
+// Methods to run Slurm job with Jenkins Agent
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
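The cleanup now removes the per-job enroot container image (.sqsh) in addition to the job workspace, with the two rm commands joined into a single remote invocation. A plain-Groovy sketch of how that joined string is assembled (the job ID, workspace, and container path below are placeholders, not the real values):

```groovy
def slurmJobID = "123456"                                  // placeholder job ID
def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/job"  // placeholder workspace
def cleanupCommands = [
    "rm -rf /path/to/containers/container-${slurmJobID}.sqsh || true",
    "rm -rf ${jobWorkspace} || true",
].join(" && ")
// Each command ends with "|| true" so a missing file does not fail the chain.
println cleanupCommands
```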
@@ -537,12 +540,10 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 
     def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
 
-    Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-    Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-${entrypoint}", numRetries: 3)
-
     Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
 
+    Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true)
+
     Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
 
     // Specific for OCI machines
@@ -606,7 +607,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
                 // Wait 10 minutes to check status of the node again
                 sleep(time: 10, unit: 'MINUTES')
                 // Avoid the node being stuck in the held state.
-                Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+                if (counter % 3 == 0) {
+                    Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
+                }
                 counter++
             }
         }
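With the guard above, scontrol release is issued on every third pass of the polling loop rather than every pass; given the 10-minute sleep per iteration, that is roughly once every 30 minutes. A plain-Groovy sketch of which iterations trigger the release:

```groovy
// Iterations 0..8 of the polling loop; only multiples of 3 issue the release.
def releaseIterations = (0..<9).findAll { counter -> counter % 3 == 0 }
assert releaseIterations == [0, 3, 6]   // every third poll, starting with the first
```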
@@ -684,7 +687,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
         }
     }
 }
-// End of Methods to run slurm job with Jenkins Agent
 
 def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
 {
@@ -716,6 +718,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         })
     }
 }
+// End of Methods to run Slurm job with Jenkins Agent
 
 def getNodeArgs(int nodeCount, int gpuCount) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
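getNodeArgs derives a per-node GPU count by ceiling division, as in the context line above. A plain-Groovy example of that rounding with hypothetical counts:

```groovy
int nodeCount = 3
int gpuCount = 8
// 8 / 3 = 2.67, rounded up so the per-node request always covers the total.
int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
assert gpusPerNode == 3
```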
@@ -802,8 +805,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 
     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
 
-    def slurmOutputFile = null
-
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([
@@ -830,16 +831,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
-            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-            slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
+            def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
+            def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def outputPath = "${jobWorkspace}/job-output.log"
             def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-            def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
+            def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
             def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
-            def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
+            def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
             def isAarch64 = config.contains("aarch64")
             def coverageConfigFile = "${jobWorkspace}/.coveragerc"
 
@@ -851,15 +851,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
             sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
 
-            // Upload slurm_run_sh to Frontend node
-            def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-
-            Utils.exec(pipeline, script: "echo \"Script to trigger slurm job: \" && cat ${scriptRunLocalPath}")
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
                 scriptRunLocalPath,
-                scriptRunNode,
+                scriptRunPathNode,
                 true
             )
 
@@ -995,22 +992,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 export pytestCommand="$pytestCommand"
                 export coverageConfigFile="$coverageConfigFile"
                 export NVIDIA_IMEX_CHANNELS=0
-                export NVIDIA_IMEX_CHANNELS=0
-                export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+                [ -z "\${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
 
                 ${srunPrologue}
 
-                chmod +x $scriptRunNode
-                srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
+                srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
             """.replaceAll("(?m)^\\s*", "")
             pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
                 scriptLaunchPathLocal,
                 scriptLaunchPathNode,
                 true
             )
+
             def scriptExec = """
                 touch ${outputPath}
                 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
@@ -1035,6 +1032,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 fi
             """.replaceAll("(?m)^\\s*", "").trim()
             pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
+            Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
             Utils.copyFileToRemoteHost(
                 pipeline,
                 remote,
@@ -1050,7 +1048,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
                 script: Utils.sshUserCmd(
                     remote,
                     scriptExecPathNode
-                )
+                ),
+                numRetries: 3
             )
         }
 
@@ -2568,8 +2567,8 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
             docker.image(image).pull()
         }
         // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
-        // The timeout here is to avoid the Slurm job being stuck.
-        timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+        // Minus 10 minutes to avoid the Slurm job being stopped earlier.
+        timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
             docker.image(image).inside(dockerArgs) {
                 runner()
             }
@@ -2589,7 +2588,9 @@ def runInEnrootOnNode(label)
 {
     return {
         runner -> node(label) {
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT, unit: 'MINUTES') {
+            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+            // Minus 10 minutes to avoid the Slurm job being stopped earlier.
+            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
                 runner()
             }
         }
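Both timeout changes above subtract 10 minutes from the Slurm-side limit so the Jenkins-side timeout fires first. A minimal sketch, assuming the values stated in the comments (300 and 240 minutes; the actual constants live in SlurmConfig):

```groovy
int DEFAULT_TIMEOUT = 300        // assumed from the comment above
int DEFAULT_TIMEOUT_SHORT = 240  // assumed from the comment above
assert DEFAULT_TIMEOUT - 10 == 290        // Jenkins gives up before the 300-minute Slurm job is reclaimed
assert DEFAULT_TIMEOUT_SHORT - 10 == 230  // same headroom for the short timeout
```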
@@ -2628,8 +2629,6 @@ def launchTestJobs(pipeline, testFilter)
     // may break the mapping functionality.
 
     x86TestConfigs = [
-        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
         "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2738,9 @@ def launchTestJobs(pipeline, testFilter)
     fullSet = parallelJobs.keySet()
 
     x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2778,23 @@ def launchTestJobs(pipeline, testFilter)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
+        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
-        "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
-        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
+        // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
 
-    // multiNodesSBSAConfigs = [
-    // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-    // Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
-    // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-    // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-    // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-    // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-    // "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-    // ]
-    multiNodesSBSAConfigs = [:]
-    def numMultiNodeTests = 3
-    multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
-    }
+    multiNodesSBSAConfigs = [
+        // Each testcase uses 8 GPUs and 2 nodes.
+        // https://nvbugs/5598863 (uncorrectable NVLink error detected during the execution) may not exist in OCI machines.
+        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+    ]
     fullSet += multiNodesSBSAConfigs.keySet()
 
     if (env.targetArch == AARCH64_TRIPLE) {
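A plain-Groovy sketch of how one multi-node entry above decodes, assuming the positional layout [platform, testList, splitId, splits, gpuCount, nodeCount] implied by the removed collectEntries call; the ordering is an inference from this diff, not confirmed elsewhere in the excerpt:

```groovy
// Hypothetical decode of one post-merge entry from multiNodesSBSAConfigs.
def entry = ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2]
def (platform, testList, splitId, splits, gpuCount, nodeCount) = entry
assert platform == "gb200-oci-trtllm"
assert splits == 3 && gpuCount == 8 && nodeCount == 2
```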