Skip to content

Commit 9ebf2a4

Browse files
committed
Add NUMA node ID attribute for topology aware allocation between CPU, GPU and NICs
Signed-off-by: Shiva Krishna, Merla <[email protected]>
1 parent cfe35ff commit 9ebf2a4

File tree

2 files changed

+25
-0
lines changed

2 files changed

+25
-0
lines changed

cmd/gpu-kubelet-plugin/deviceinfo.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ import (
2828
"k8s.io/utils/ptr"
2929
)
3030

31+
const (
32+
// SharedDeviceAttributePrefix is the prefix used for shared attributes among NVIDIA devices {NIC, GPU}.
33+
SharedDeviceAttributePrefix = "resource.nvidia.com/"
34+
// SharedDeviceAttributeNumaNodeID is a device attribute name which describes the NUMA node ID of the device.
35+
// This attribute can be used to identify devices that share the same NUMA node.
36+
SharedDeviceAttributeNumaNodeID resourceapi.QualifiedName = SharedDeviceAttributePrefix + "numaNodeID"
37+
)
38+
3139
type GpuInfo struct {
3240
UUID string `json:"uuid"`
3341
minor int
@@ -42,6 +50,7 @@ type GpuInfo struct {
4250
pcieBusID string
4351
pcieRootAttr *deviceattribute.DeviceAttribute
4452
migProfiles []*MigProfileInfo
53+
numaNodeID *int
4554
}
4655

4756
type MigDeviceInfo struct {
@@ -55,6 +64,7 @@ type MigDeviceInfo struct {
5564
ciInfo *nvml.ComputeInstanceInfo
5665
pcieBusID string
5766
pcieRootAttr *deviceattribute.DeviceAttribute
67+
numaNodeID *int
5868
}
5969

6070
type MigProfileInfo struct {
@@ -119,6 +129,9 @@ func (d *GpuInfo) GetDevice() resourceapi.Device {
119129
if d.pcieRootAttr != nil {
120130
device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
121131
}
132+
if d.numaNodeID != nil {
133+
device.Attributes[SharedDeviceAttributeNumaNodeID] = resourceapi.DeviceAttribute{IntValue: ptr.To(int64(*d.numaNodeID))}
134+
}
122135
return device
123136
}
124137

@@ -181,5 +194,8 @@ func (d *MigDeviceInfo) GetDevice() resourceapi.Device {
181194
if d.pcieRootAttr != nil {
182195
device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value
183196
}
197+
if d.numaNodeID != nil {
198+
device.Attributes[SharedDeviceAttributeNumaNodeID] = resourceapi.DeviceAttribute{IntValue: ptr.To(int64(*d.numaNodeID))}
199+
}
184200
return device
185201
}

cmd/gpu-kubelet-plugin/nvlib.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,13 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
208208
klog.Warningf("error getting PCIe root for device %d, continuing without attribute: %v", index, err)
209209
}
210210

211+
var numaNodeID *int
212+
if id, ret := device.GetNumaNodeId(); ret == nvml.SUCCESS {
213+
numaNodeID = &id
214+
} else {
215+
klog.Warningf("error getting NUMA node ID for device %d, continuing without attribute: %v", index, ret)
216+
}
217+
211218
var migProfiles []*MigProfileInfo
212219
for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
213220
giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i)
@@ -275,6 +282,7 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
275282
pcieBusID: pcieBusID,
276283
pcieRootAttr: pcieRootAttr,
277284
migProfiles: migProfiles,
285+
numaNodeID: numaNodeID,
278286
}
279287

280288
return gpuInfo, nil
@@ -368,6 +376,7 @@ func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, e
368376
ciInfo: &ciInfo,
369377
pcieBusID: gpuInfo.pcieBusID,
370378
pcieRootAttr: gpuInfo.pcieRootAttr,
379+
numaNodeID: gpuInfo.numaNodeID,
371380
}
372381
return nil
373382
})

0 commit comments

Comments
 (0)