diff --git a/cmd/gpu-kubelet-plugin/deviceinfo.go b/cmd/gpu-kubelet-plugin/deviceinfo.go index c40842f04..2573d1947 100644 --- a/cmd/gpu-kubelet-plugin/deviceinfo.go +++ b/cmd/gpu-kubelet-plugin/deviceinfo.go @@ -28,6 +28,14 @@ import ( "k8s.io/utils/ptr" ) +const ( + // SharedDeviceAttributePrefix is the prefix used for shared attributes among NVIDIA devices {NIC, GPU}. + SharedDeviceAttributePrefix = "resource.nvidia.com/" + // SharedDeviceAttributeNumaNodeID is the device attribute name that describes the NUMA node ID of the device. + // This attribute can be used to identify devices that share the same NUMA node. + SharedDeviceAttributeNumaNodeID resourceapi.QualifiedName = SharedDeviceAttributePrefix + "numaNodeID" +) + type GpuInfo struct { UUID string `json:"uuid"` minor int @@ -42,6 +50,7 @@ type GpuInfo struct { pcieBusID string pcieRootAttr *deviceattribute.DeviceAttribute migProfiles []*MigProfileInfo + numaNodeID *int } type MigDeviceInfo struct { @@ -55,6 +64,7 @@ type MigDeviceInfo struct { ciInfo *nvml.ComputeInstanceInfo pcieBusID string pcieRootAttr *deviceattribute.DeviceAttribute + numaNodeID *int } type MigProfileInfo struct { @@ -119,6 +129,9 @@ func (d *GpuInfo) GetDevice() resourceapi.Device { if d.pcieRootAttr != nil { device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value } + if d.numaNodeID != nil { + device.Attributes[SharedDeviceAttributeNumaNodeID] = resourceapi.DeviceAttribute{IntValue: ptr.To(int64(*d.numaNodeID))} + } return device } @@ -181,5 +194,8 @@ func (d *MigDeviceInfo) GetDevice() resourceapi.Device { if d.pcieRootAttr != nil { device.Attributes[d.pcieRootAttr.Name] = d.pcieRootAttr.Value } + if d.numaNodeID != nil { + device.Attributes[SharedDeviceAttributeNumaNodeID] = resourceapi.DeviceAttribute{IntValue: ptr.To(int64(*d.numaNodeID))} + } return device } diff --git a/cmd/gpu-kubelet-plugin/nvlib.go b/cmd/gpu-kubelet-plugin/nvlib.go index 2d33a2a0f..94dd34626 100644 --- a/cmd/gpu-kubelet-plugin/nvlib.go +++ 
b/cmd/gpu-kubelet-plugin/nvlib.go @@ -208,6 +208,13 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) klog.Warningf("error getting PCIe root for device %d, continuing without attribute: %v", index, err) } + var numaNodeID *int + if id, ret := device.GetNumaNodeId(); ret == nvml.SUCCESS { + numaNodeID = &id + } else { + klog.Warningf("error getting NUMA node ID for device %d, continuing without attribute: %v", index, ret) + } + var migProfiles []*MigProfileInfo for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i) @@ -275,6 +282,7 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) pcieBusID: pcieBusID, pcieRootAttr: pcieRootAttr, migProfiles: migProfiles, + numaNodeID: numaNodeID, } return gpuInfo, nil @@ -368,6 +376,7 @@ func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, e ciInfo: &ciInfo, pcieBusID: gpuInfo.pcieBusID, pcieRootAttr: gpuInfo.pcieRootAttr, + numaNodeID: gpuInfo.numaNodeID, } return nil })