Skip to content

Commit 2ad1041

Browse files
committed
Expose device UUIDs to node label
Signed-off-by: Zubiao Xiong <[email protected]>
1 parent f7dc5f1 commit 2ad1041

File tree

6 files changed

+81
-0
lines changed

6 files changed

+81
-0
lines changed

internal/lm/nvml.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
8585
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
8686
}
8787

88+
uuidLabler, err := newGPUUUIDLabeler(devices)
89+
if err != nil {
90+
return nil, fmt.Errorf("error creating UUID labeler: %v", err)
91+
}
92+
8893
l := Merge(
8994
machineTypeLabeler,
9095
versionLabeler,
@@ -93,6 +98,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
9398
resourceLabeler,
9499
gpuModeLabeler,
95100
imexLabeler,
101+
uuidLabler,
96102
)
97103

98104
return l, nil
@@ -261,3 +267,16 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
261267
}
262268
return classes, nil
263269
}
270+
271+
// newGPUUUIDLabeler creates a new labeler that reports the UUIDs of GPUs on the node.
272+
func newGPUUUIDLabeler(devices []resource.Device) (Labeler, error) {
273+
labels := make(Labels, len(devices))
274+
for idx, d := range devices {
275+
uuid, err := d.GetUUID()
276+
if err != nil {
277+
return nil, err
278+
}
279+
labels[fmt.Sprintf("nvidia.com/gpu-%d.uuid", idx)] = uuid
280+
}
281+
return labels, nil
282+
}

internal/resource/device_mock.go

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/resource/nvml-device.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ func (d nvmlDevice) GetName() (string, error) {
8181
return name, nil
8282
}
8383

84+
// GetUUID returns the device UUID.
85+
func (d nvmlDevice) GetUUID() (string, error) {
86+
uuid, ret := d.Device.GetUUID()
87+
if ret != nvml.SUCCESS {
88+
return "", ret
89+
}
90+
return uuid, nil
91+
}
92+
8493
// GetTotalMemoryMB returns the total memory on a device in MB
8594
func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
8695
info, ret := d.Device.GetMemoryInfo()

internal/resource/nvml-mig-device.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@ func (d nvmlMigDevice) GetName() (string, error) {
104104
return resourceName, nil
105105
}
106106

107+
// GetUUID returns the UUID of the nvmlMigDevice.
108+
func (d nvmlMigDevice) GetUUID() (string, error) {
109+
uuid, ret := d.MigDevice.GetUUID()
110+
if ret != nvml.SUCCESS {
111+
return "", ret
112+
}
113+
return uuid, nil
114+
}
115+
107116
// GetTotalMemoryMB returns the total memory on a device in MB
108117
func (d nvmlMigDevice) GetTotalMemoryMB() (uint64, error) {
109118
attr, err := d.GetAttributes()

internal/resource/sysfs-device.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ func (d vfioDevice) GetName() (string, error) {
5151
return d.nvidiaPCIDevice.DeviceName, nil
5252
}
5353

54+
// GetUUID is unsupported for vfio devices
55+
func (d vfioDevice) GetUUID() (string, error) {
56+
return "", fmt.Errorf("GetUUID is not supported for vfio devices")
57+
}
58+
5459
// GetTotalMemoryMB returns the total memory on a device in MB
5560
func (d vfioDevice) GetTotalMemoryMB() (uint64, error) {
5661
_, val := d.nvidiaPCIDevice.Resources.GetTotalAddressableMemory(true)
@@ -72,6 +77,7 @@ func (d vfioDevice) GetPCIClass() (uint32, error) {
7277
// IsFabricAttached always reports false with a nil error for vfio devices.
func (d vfioDevice) IsFabricAttached() (bool, error) {
	var attached bool // zero value: false
	return attached, nil
}
80+
7581
// GetFabricIDs always fails for vfio devices: it returns two empty strings
// together with an error stating that the operation is not supported.
func (d vfioDevice) GetFabricIDs() (string, string, error) {
	err := fmt.Errorf("GetFabricIDs is not supported for vfio devices")
	return "", "", err
}

internal/resource/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type Device interface {
3737
GetMigDevices() ([]Device, error)
3838
GetAttributes() (map[string]interface{}, error)
3939
GetName() (string, error)
40+
GetUUID() (string, error)
4041
GetTotalMemoryMB() (uint64, error)
4142
GetDeviceHandleFromMigDeviceHandle() (Device, error)
4243
GetCudaComputeCapability() (int, int, error)

0 commit comments

Comments
 (0)