Skip to content

Commit 89cd885

Browse files
committed
Expose device UUIDs to node label
Signed-off-by: Zubiao Xiong <[email protected]>
1 parent f7dc5f1 commit 89cd885

File tree

11 files changed

+86
-0
lines changed

11 files changed

+86
-0
lines changed

internal/lm/nvml.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
8585
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
8686
}
8787

88+
uuidLabler, err := newGPUUUIDLabeler(devices)
89+
if err != nil {
90+
return nil, fmt.Errorf("error creating UUID labeler: %v", err)
91+
}
92+
8893
l := Merge(
8994
machineTypeLabeler,
9095
versionLabeler,
@@ -93,6 +98,7 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
9398
resourceLabeler,
9499
gpuModeLabeler,
95100
imexLabeler,
101+
uuidLabler,
96102
)
97103

98104
return l, nil
@@ -261,3 +267,16 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
261267
}
262268
return classes, nil
263269
}
270+
271+
// newGPUUUIDLabeler creates a new labeler that reports the UUIDs of GPUs on the node.
272+
func newGPUUUIDLabeler(devices []resource.Device) (Labeler, error) {
273+
labels := make(Labels, len(devices))
274+
for idx, d := range devices {
275+
uuid, err := d.GetUUID()
276+
if err != nil {
277+
return nil, err
278+
}
279+
labels[fmt.Sprintf("nvidia.com/gpu-%d.uuid", idx)] = uuid
280+
}
281+
return labels, nil
282+
}

internal/resource/device_mock.go

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/resource/nvml-device.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,15 @@ func (d nvmlDevice) GetName() (string, error) {
8181
return name, nil
8282
}
8383

84+
// GetUUID returns the device UUID.
85+
func (d nvmlDevice) GetUUID() (string, error) {
86+
uuid, ret := d.Device.GetUUID()
87+
if ret != nvml.SUCCESS {
88+
return "", ret
89+
}
90+
return uuid, nil
91+
}
92+
8493
// GetTotalMemoryMB returns the total memory on a device in MB
8594
func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
8695
info, ret := d.Device.GetMemoryInfo()

internal/resource/nvml-mig-device.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@ func (d nvmlMigDevice) GetName() (string, error) {
104104
return resourceName, nil
105105
}
106106

107+
// GetUUID returns the UUID of the nvmlMigDevice.
108+
func (d nvmlMigDevice) GetUUID() (string, error) {
109+
uuid, ret := d.MigDevice.GetUUID()
110+
if ret != nvml.SUCCESS {
111+
return "", ret
112+
}
113+
return uuid, nil
114+
}
115+
107116
// GetTotalMemoryMB returns the total memory on a device in MB
108117
func (d nvmlMigDevice) GetTotalMemoryMB() (uint64, error) {
109118
attr, err := d.GetAttributes()

internal/resource/sysfs-device.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ func (d vfioDevice) GetName() (string, error) {
5151
return d.nvidiaPCIDevice.DeviceName, nil
5252
}
5353

54+
// GetUUID is unsupported for vfio devices
55+
func (d vfioDevice) GetUUID() (string, error) {
56+
return "", fmt.Errorf("GetUUID is not supported for vfio devices")
57+
}
58+
5459
// GetTotalMemoryMB returns the total memory on a device in MB
5560
func (d vfioDevice) GetTotalMemoryMB() (uint64, error) {
5661
_, val := d.nvidiaPCIDevice.Resources.GetTotalAddressableMemory(true)
@@ -72,6 +77,7 @@ func (d vfioDevice) GetPCIClass() (uint32, error) {
7277
func (d vfioDevice) IsFabricAttached() (bool, error) {
7378
return false, nil
7479
}
80+
7581
func (d vfioDevice) GetFabricIDs() (string, string, error) {
7682
return "", "", fmt.Errorf("GetFabricIDs is not supported for vfio devices")
7783
}

internal/resource/testing/resource-testing.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ func NewMigEnabledDevice(migs ...*resource.DeviceMock) resource.Device {
4141
func NewDeviceMock(migEnabled bool) *DeviceMock {
4242
d := DeviceMock{resource.DeviceMock{
4343
GetNameFunc: func() (string, error) { return "MOCKMODEL", nil },
44+
GetUUIDFunc: func() (string, error) { return "MOCKUUID", nil },
4445
GetCudaComputeCapabilityFunc: func() (int, int, error) {
4546
if migEnabled {
4647
return 0, 0, nil

internal/resource/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type Device interface {
3737
GetMigDevices() ([]Device, error)
3838
GetAttributes() (map[string]interface{}, error)
3939
GetName() (string, error)
40+
GetUUID() (string, error)
4041
GetTotalMemoryMB() (uint64, error)
4142
GetDeviceHandleFromMigDeviceHandle() (Device, error)
4243
GetCudaComputeCapability() (int, int, error)

tests/expected-output-mig-mixed.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,4 @@ nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.engines\.ofa=[0-9]+
3737
nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.slices\.gi=[0-9]+
3838
nvidia\.com\/mig-[0-9]+g\.[0-9]+gb\.slices\.ci=[0-9]+
3939
nvidia\.com\/mps\.capable=[true|false]
40+
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+

tests/expected-output-mig-none.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,4 @@ nvidia\.com\/mig\.capable=[true|false]
2323
nvidia\.com\/gpu\.compute\.major=[0-9]+
2424
nvidia\.com\/gpu\.compute\.minor=[0-9]+
2525
nvidia\.com\/mps\.capable=[true|false]
26+
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+

tests/expected-output-mig-single.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ nvidia\.com\/gpu\.slices\.gi=[0-9]+
3232
nvidia\.com\/gpu\.slices\.ci=[0-9]+
3333
nvidia\.com\/gpu\.mode=[compute]
3434
nvidia\.com\/mps\.capable=[true|false]
35+
nvidia\.com\/gpu-[0-9]+\.uuid=[0-9a-zA-Z-]+

0 commit comments

Comments
 (0)