@@ -113,7 +113,7 @@ CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
 MODEL_CACHE_DIR = "/scratch.trt_llm_data/llm-models"
 
 // GPU types that require open driver
-REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000"]
+REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]
 
 // GPU types that don't support dynamic driver flashing
 REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
@@ -1386,6 +1386,21 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
             path: /vol/scratch1/scratch.svc_tensorrt_blossom
         """
     }
+    def llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.117.145.14
+                    path: /vol/scratch1/scratch.michaeln_blossom
+    """
+    if (type.contains("6000d")) {
+        // rtx-pro-6000d nodes are located in the Austin DC, so we use the FlexCache to speed up data access.
+        llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.20.162.212
+                    path: /vol/scratch26/scratch.trt_llm_data
+    """
+    }
 
     def podConfig = [
         cloud : targetCould,
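The hunk above hard-codes the default NFS export and swaps in the Austin FlexCache export whenever the node type string contains "6000d". A minimal standalone sketch of that selection pattern (`llmModelVolumeFor` is a hypothetical name, not the pipeline's actual helper; the server and path values are copied from the diff):

```groovy
// Minimal sketch of the volume-selection pattern introduced above:
// pick the FlexCache NFS export for rtx-pro-6000d node types,
// otherwise fall back to the default export.
def llmModelVolumeFor(String type) {
    def server = "10.117.145.14"                           // default export
    def path   = "/vol/scratch1/scratch.michaeln_blossom"
    if (type.contains("6000d")) {
        // rtx-pro-6000d nodes sit in the Austin DC; the FlexCache mirror is closer.
        server = "10.20.162.212"
        path   = "/vol/scratch26/scratch.trt_llm_data"
    }
    return """\
    - name: scratch-trt-llm-data
      nfs:
        server: ${server}
        path: ${path}
    """.stripIndent()
}

println llmModelVolumeFor("rtx-pro-6000d-x4")
```

Because the check is a substring match, multi-GPU variants such as rtx-pro-6000d-x4 pick up the FlexCache export as well.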
@@ -1432,10 +1447,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                 - name: dshm
                   emptyDir:
                     medium: Memory
-                - name: scratch-trt-llm-data
-                  nfs:
-                    server: 10.117.145.14
-                    path: /vol/scratch1/scratch.michaeln_blossom
+                ${llmModelVolume}
                 ${pvcVolume}
         """.stripIndent(),
     ]
@@ -2578,9 +2590,6 @@ def launchTestJobs(pipeline, testFilter)
     x86TestConfigs = [
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
-        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
         "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2664,10 +2673,14 @@ def launchTestJobs(pipeline, testFilter)
         // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
         // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
         // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
-        // Disable RTXPro6000 stages due to nodes will be offline temporarily
+        // Disable RTXPro6000 stages because the nodes will be offline temporarily.
+        // [TODO] Split tests between RTXPro6000 and RTXPro6000D and move a reasonable amount of tests to pre-merge.
         // "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
         // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
         // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
+        "RTXPro6000D-PyTorch-Post-Merge-1": ["rtx-pro-6000d", "l0_rtx_pro_6000", 1, 1],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 1, 2, 4],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]
 
     parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
@@ -2689,6 +2702,9 @@ def launchTestJobs(pipeline, testFilter)
     fullSet = parallelJobs.keySet()
 
     x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
         "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 2, 4],
         "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
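For readers scanning these tables: judging from the collectEntries call above, where values[0] feeds the pod's node type and values[4] ?: 1 the GPU count, each config tuple appears to be [nodeType, testListYaml, shardId, shardCount, gpuCount?]. A hedged sketch under that assumption (describeStage is a hypothetical helper, not part of the pipeline; the shard naming is inferred from the "1, 2" / "2, 2" pattern):

```groovy
// Hypothetical helper illustrating the assumed tuple layout; only the use of
// values[0] and values[4] is confirmed by the diff above.
def describeStage(String stage, List values) {
    def node       = values[0]          // Kubernetes/Slurm node type
    def testList   = values[1]          // test-db list, e.g. l0_rtx_pro_6000
    def shardId    = values[2]          // 1-based shard index (assumption)
    def shardCount = values[3]          // total shard count (assumption)
    def gpuCount   = values[4] ?: 1     // optional fifth entry, defaults to 1
    return "${stage}: ${testList} shard ${shardId}/${shardCount} on ${node} (${gpuCount} GPU(s))"
}

println describeStage("RTXPro6000D-4_GPUs-PyTorch-Post-Merge-1",
                      ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 1, 2, 4])
```

Note that the three DGX_H100 stages removed from x86TestConfigs earlier in this commit reappear here with "-oci" node types, i.e. they move from the Kubernetes pool to the Slurm-managed OCI pool.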