Skip to content

Commit b8f5eaa

Browse files
committed
enable CDMM by default for drivers R580 & greater
Signed-off-by: Tariq Ibrahim <[email protected]>
1 parent 78fa4ac commit b8f5eaa

File tree

4 files changed

+24
-0
lines changed

4 files changed

+24
-0
lines changed

rhel8/nvidia-driver

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,12 @@ _ensure_nvlink5_prerequisites() (
304304
# the correct set of parameters are passed to 'modprobe'.
305305
_get_module_params() {
306306
local base_path="/drivers"
307+
308+
# Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
309+
# This prevents the GPU memory for coherent systems (GH200, GB200 etc) from being exposed as a NUMA node
310+
# and thereby prevents over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use-cases.
311+
NVIDIA_MODULE_PARAMS+=("NVreg_CoherentGPUMemoryMode=driver")
312+
307313
# nvidia
308314
if [ -f "${base_path}/nvidia.conf" ]; then
309315
while IFS="" read -r param || [ -n "$param" ]; do

rhel9/nvidia-driver

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,12 @@ _ensure_nvlink5_prerequisites() (
318318
# the correct set of parameters are passed to 'modprobe'.
319319
_get_module_params() {
320320
local base_path="/drivers"
321+
322+
# Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
323+
# This prevents the GPU memory for coherent systems (GH200, GB200 etc) from being exposed as a NUMA node
324+
# and thereby prevents over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use-cases.
325+
NVIDIA_MODULE_PARAMS+=("NVreg_CoherentGPUMemoryMode=driver")
326+
321327
# nvidia
322328
if [ -f "${base_path}/nvidia.conf" ]; then
323329
while IFS="" read -r param || [ -n "$param" ]; do

ubuntu22.04/nvidia-driver

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,12 @@ _gpu_direct_rdma_enabled() {
255255
# the correct set of parameters are passed to 'modprobe'.
256256
_get_module_params() {
257257
local base_path="/drivers"
258+
259+
# Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
260+
# This prevents the GPU memory for coherent systems (GH200, GB200 etc) from being exposed as a NUMA node
261+
# and thereby prevents over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use-cases.
262+
NVIDIA_MODULE_PARAMS+=("NVreg_CoherentGPUMemoryMode=driver")
263+
258264
# nvidia
259265
if [ -f "${base_path}/nvidia.conf" ]; then
260266
while IFS="" read -r param || [ -n "$param" ]; do

ubuntu24.04/nvidia-driver

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,12 @@ _gpu_direct_rdma_enabled() {
186186
# the correct set of parameters are passed to 'modprobe'.
187187
_get_module_params() {
188188
local base_path="/drivers"
189+
190+
# Starting from R580, we need to enable the CDMM (Coherent Driver Memory Management) module parameter.
191+
# This prevents the GPU memory for coherent systems (GH200, GB200 etc) from being exposed as a NUMA node
192+
# and thereby prevents over-reporting of a Kubernetes node's memory. This is needed for Kubernetes use-cases.
193+
NVIDIA_MODULE_PARAMS+=("NVreg_CoherentGPUMemoryMode=driver")
194+
189195
# nvidia
190196
if [ -f "${base_path}/nvidia.conf" ]; then
191197
while IFS="" read -r param || [ -n "$param" ]; do

0 commit comments

Comments
 (0)