File tree Expand file tree Collapse file tree 4 files changed +24
-0
lines changed Expand file tree Collapse file tree 4 files changed +24
-0
lines changed Original file line number Diff line number Diff line change @@ -304,6 +304,12 @@ _ensure_nvlink5_prerequisites() (
304304# the correct set of parameters are passed to 'modprobe'.
305305_get_module_params () {
306306 local base_path=" /drivers"
307+
308+ # Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
309+ # This prevents the GPU memory of coherent systems (GH200, GB200, etc.) from being exposed as a NUMA node,
310+ # thereby preventing over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use cases.
311+ NVIDIA_MODULE_PARAMS+=(" NVreg_CoherentGPUMemoryMode=driver" )
312+
307313 # nvidia
308314 if [ -f " ${base_path} /nvidia.conf" ]; then
309315 while IFS=" " read -r param || [ -n " $param " ]; do
Original file line number Diff line number Diff line change @@ -318,6 +318,12 @@ _ensure_nvlink5_prerequisites() (
318318# the correct set of parameters are passed to 'modprobe'.
319319_get_module_params () {
320320 local base_path=" /drivers"
321+
322+ # Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
323+ # This prevents the GPU memory of coherent systems (GH200, GB200, etc.) from being exposed as a NUMA node,
324+ # thereby preventing over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use cases.
325+ NVIDIA_MODULE_PARAMS+=(" NVreg_CoherentGPUMemoryMode=driver" )
326+
321327 # nvidia
322328 if [ -f " ${base_path} /nvidia.conf" ]; then
323329 while IFS=" " read -r param || [ -n " $param " ]; do
Original file line number Diff line number Diff line change @@ -255,6 +255,12 @@ _gpu_direct_rdma_enabled() {
255255# the correct set of parameters are passed to 'modprobe'.
256256_get_module_params () {
257257 local base_path=" /drivers"
258+
259+ # Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
260+ # This prevents the GPU memory of coherent systems (GH200, GB200, etc.) from being exposed as a NUMA node,
261+ # thereby preventing over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use cases.
262+ NVIDIA_MODULE_PARAMS+=(" NVreg_CoherentGPUMemoryMode=driver" )
263+
258264 # nvidia
259265 if [ -f " ${base_path} /nvidia.conf" ]; then
260266 while IFS=" " read -r param || [ -n " $param " ]; do
Original file line number Diff line number Diff line change @@ -186,6 +186,12 @@ _gpu_direct_rdma_enabled() {
186186# the correct set of parameters are passed to 'modprobe'.
187187_get_module_params () {
188188 local base_path=" /drivers"
189+
190+ # Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
191+ # This prevents the GPU memory of coherent systems (GH200, GB200, etc.) from being exposed as a NUMA node,
192+ # thereby preventing over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use cases.
193+ NVIDIA_MODULE_PARAMS+=(" NVreg_CoherentGPUMemoryMode=driver" )
194+
189195 # nvidia
190196 if [ -f " ${base_path} /nvidia.conf" ]; then
191197 while IFS=" " read -r param || [ -n " $param " ]; do
You can’t perform that action at this time.
0 commit comments