Skip to content

Commit d1852f0

Browse files
committed
handle mig devices
Signed-off-by: Swati Gupta <[email protected]>
1 parent 407562d commit d1852f0

File tree

2 files changed

+126
-80
lines changed

2 files changed

+126
-80
lines changed
Lines changed: 120 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
/*
2-
* Copyright 2025 The Kubernetes Authors.
3-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
43
*
54
* Licensed under the Apache License, Version 2.0 (the "License");
65
* you may not use this file except in compliance with the License.
@@ -26,124 +25,146 @@ import (
2625
"k8s.io/klog/v2"
2726
)
2827

28+
const (
29+
InstanceID uint32 = 0xFFFFFFFF
30+
)
31+
2932
type deviceHealthMonitor struct {
30-
nvdevlib *deviceLib
31-
allocatable AllocatableDevices
32-
eventSet nvml.EventSet
33-
unhealthy chan *AllocatableDevice
34-
stop chan struct{}
35-
wg sync.WaitGroup
33+
nvmllib nvml.Interface
34+
eventSet nvml.EventSet
35+
unhealthy chan *AllocatableDevice
36+
stop chan struct{}
37+
uuidToDeviceMap map[string]*AllocatableDevice
38+
wg sync.WaitGroup
3639
}
3740

3841
func newDeviceHealthMonitor(ctx context.Context, config *Config, allocatable AllocatableDevices, nvdevlib *deviceLib) (*deviceHealthMonitor, error) {
39-
klog.Info("[SWATI DEBUG] initializing NVML..")
40-
if err := nvdevlib.Init(); err != nil {
41-
return nil, fmt.Errorf("failed to initialize NVML: %w", err)
42+
if nvdevlib.nvmllib == nil {
43+
return nil, fmt.Errorf("nvml library is nil")
4244
}
43-
//defer nvdevlib.alwaysShutdown()
4445

45-
//klog.Info("[SWATI DEBUG] getting all devices..")
46-
//allocatable, err := nvdevlib.enumerateAllPossibleDevices(config)
47-
//if err != nil {
48-
// return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
49-
//}
46+
m := &deviceHealthMonitor{
47+
nvmllib: nvdevlib.nvmllib,
48+
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
49+
stop: make(chan struct{}),
50+
}
5051

51-
klog.Info("[SWATI DEBUG] creating NVML events")
52-
eventSet, err := nvdevlib.nvmllib.EventSetCreate()
52+
if r := m.nvmllib.Init(); r != nvml.SUCCESS {
53+
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
54+
}
55+
56+
klog.V(6).Info("creating NVML events for device health monitor")
57+
eventSet, err := m.nvmllib.EventSetCreate()
5358
if err != nvml.SUCCESS {
59+
_ = m.nvmllib.Shutdown()
5460
return nil, fmt.Errorf("failed to create event set: %w", err)
5561
}
62+
m.eventSet = eventSet
5663

57-
monitor := &deviceHealthMonitor{
58-
nvdevlib: nvdevlib,
59-
allocatable: allocatable,
60-
eventSet: eventSet,
61-
unhealthy: make(chan *AllocatableDevice, len(allocatable)),
62-
stop: make(chan struct{}),
63-
}
64+
m.uuidToDeviceMap = getUUIDToDeviceMap(allocatable)
6465

65-
klog.Info("[SWATI DEBUG] registering NVML events")
66-
if err := monitor.registerDevicesForEvents(); err != nil {
67-
monitor.eventSet.Free()
68-
return nil, fmt.Errorf("failed to register devices for health monitoring: %w", err)
69-
}
66+
klog.V(6).Info("registering NVML events for device health monitor")
67+
m.registerDevicesForEvents()
7068

71-
monitor.start()
72-
return monitor, nil
69+
klog.V(6).Info("started device health monitoring")
70+
m.wg.Add(1)
71+
go m.run()
72+
73+
return m, nil
7374
}
7475

75-
func (m *deviceHealthMonitor) registerDevicesForEvents() error {
76-
nvmllib := m.nvdevlib.nvmllib
76+
func (m *deviceHealthMonitor) registerDevicesForEvents() {
77+
// TODO: add a list of xids to ignore
7778
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
7879

79-
for _, uuid := range m.allocatable.UUIDs() {
80-
gpu, err := nvmllib.DeviceGetHandleByUUID(uuid)
80+
processedUUIDs := make(map[string]bool)
81+
82+
for uuid, dev := range m.uuidToDeviceMap {
83+
var u string
84+
if dev.Type() == MigDeviceType {
85+
u = dev.Mig.parent.UUID
86+
} else {
87+
u = uuid
88+
}
89+
90+
if processedUUIDs[u] {
91+
continue
92+
}
93+
gpu, err := m.nvmllib.DeviceGetHandleByUUID(u)
94+
if err != nvml.SUCCESS {
95+
klog.Infof("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", u, err)
96+
m.unhealthy <- dev
97+
continue
98+
}
99+
100+
supportedEvents, err := gpu.GetSupportedEventTypes()
81101
if err != nvml.SUCCESS {
82-
klog.Infof("Unable to get NVML handle for UUID %s: %v; skipping health check for this device", uuid, err)
102+
klog.Infof("unable to determine the supported events for %s: %v; marking it as unhealthy", u, err)
103+
m.unhealthy <- dev
83104
continue
84105
}
85106

86-
if err := gpu.RegisterEvents(eventMask, m.eventSet); err != nvml.SUCCESS {
87-
klog.Infof("Failed to register events for device %s: %v; skipping health check for this device", uuid, err)
107+
err = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
108+
if err == nvml.ERROR_NOT_SUPPORTED {
109+
klog.Warningf("Device %v is too old to support healthchecking.", u)
110+
}
111+
if err != nvml.SUCCESS {
112+
klog.Infof("unable to register events for %s: %v; marking it as unhealthy", u, err)
113+
m.unhealthy <- dev
88114
}
115+
processedUUIDs[u] = true
89116
}
90-
return nil
91-
}
92-
93-
func (m *deviceHealthMonitor) start() {
94-
klog.Info("[SWATI DEBUG] starting health monitor")
95-
m.wg.Add(1)
96-
go m.run()
97117
}
98118

99119
func (m *deviceHealthMonitor) Stop() {
100120
if m == nil {
101121
return
102122
}
103-
klog.Info("[SWATI DEBUG] stopping health monitor")
123+
klog.V(6).Info("stopping health monitor")
124+
104125
close(m.stop)
105126
m.wg.Wait()
106-
close(m.unhealthy)
127+
107128
m.eventSet.Free()
108129

109-
if m.nvdevlib != nil {
110-
m.nvdevlib.alwaysShutdown()
130+
if r := m.nvmllib.Shutdown(); r != nvml.SUCCESS {
131+
klog.Warningf("failed to shutdown NVML: %v", r)
111132
}
133+
close(m.unhealthy)
112134
}
113135

114-
func (m *deviceHealthMonitor) run() {
115-
defer m.wg.Done()
116-
136+
func getUUIDToDeviceMap(allocatable AllocatableDevices) map[string]*AllocatableDevice {
117137
uuidToDeviceMap := make(map[string]*AllocatableDevice)
118-
for _, device := range m.allocatable {
119-
uuid := device.GetUUID()
120-
if uuid != "" {
121-
uuidToDeviceMap[uuid] = device
138+
139+
for _, d := range allocatable {
140+
if u := d.GetUUID(); u != "" {
141+
uuidToDeviceMap[u] = d
122142
}
123143
}
144+
return uuidToDeviceMap
145+
}
124146

125-
klog.Info("Starting event-driven GPU health monitor...")
126-
147+
func (m *deviceHealthMonitor) run() {
148+
defer m.wg.Done()
127149
for {
128150
select {
129151
case <-m.stop:
130-
klog.Info("Stopping event-driven GPU health monitor...")
152+
klog.V(6).Info("Stopping event-driven GPU health monitor...")
131153
return
132154
default:
133155
event, err := m.eventSet.Wait(5000)
134156
if err == nvml.ERROR_TIMEOUT {
135-
klog.Info("[SWATI DEBUG] timedout")
136157
continue
137158
}
138159
if err != nvml.SUCCESS {
139160
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", err)
140-
for _, dev := range m.allocatable {
161+
for _, dev := range m.uuidToDeviceMap {
141162
m.unhealthy <- dev
142163
}
143164
continue
144165
}
145166

146-
// Process health events
167+
klog.Infof("Processing event %+v", event)
147168
switch event.EventType {
148169
case nvml.EventTypeXidCriticalError:
149170
klog.Warningf("Critical XID error detected on device: %+v", event)
@@ -158,30 +179,53 @@ func (m *deviceHealthMonitor) run() {
158179
eventUUID, err := event.Device.GetUUID()
159180
if err != nvml.SUCCESS {
160181
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, err)
161-
for _, dev := range m.allocatable {
182+
for _, dev := range m.uuidToDeviceMap {
162183
m.unhealthy <- dev
163184
}
164185
continue
165186
}
166187

167-
device, exists := uuidToDeviceMap[eventUUID]
168-
if !exists {
169-
continue
188+
var affectedDevice *AllocatableDevice
189+
if event.GpuInstanceId != InstanceID && event.ComputeInstanceId != InstanceID {
190+
affectedDevice = m.findMigDevice(eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
191+
klog.Infof("Event for mig device: %v", affectedDevice)
192+
} else {
193+
affectedDevice = m.findGpuDevice(eventUUID)
170194
}
171195

172-
// Send notification to driver
173-
klog.Infof("Sending unhealthy notification for device %s due to event type %v", eventUUID, event.EventType)
174-
select {
175-
case m.unhealthy <- device:
176-
// Successfully sent notification
177-
default:
178-
// Channel full, log and continue
179-
klog.Warningf("Health notification channel full, dropping event for device %s", eventUUID)
196+
if affectedDevice == nil {
197+
klog.Infof("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)", eventUUID, event.GpuInstanceId, event.ComputeInstanceId)
198+
continue
180199
}
200+
klog.Infof("Sending unhealthy notification for device %s due to event type %v", eventUUID, event.EventType)
201+
m.unhealthy <- affectedDevice
181202
}
182203
}
183204
}
184205

185206
func (m *deviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
186207
return m.unhealthy
187208
}
209+
210+
func (m *deviceHealthMonitor) findMigDevice(parentUUID string, giID uint32, ciID uint32) *AllocatableDevice {
211+
for _, device := range m.uuidToDeviceMap {
212+
if device.Type() != MigDeviceType {
213+
continue
214+
}
215+
216+
if device.Mig.parent.UUID == parentUUID &&
217+
device.Mig.giInfo.Id == giID &&
218+
device.Mig.ciInfo.Id == ciID {
219+
return device
220+
}
221+
}
222+
return nil
223+
}
224+
225+
func (m *deviceHealthMonitor) findGpuDevice(uuid string) *AllocatableDevice {
226+
device, exists := m.uuidToDeviceMap[uuid]
227+
if exists && device.Type() == GpuDeviceType {
228+
return device
229+
}
230+
return nil
231+
}

cmd/gpu-kubelet-plugin/driver.go

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
105105
return nil, err
106106
}
107107

108-
go driver.handleHealthNotifications(ctx, config.flags.nodeName)
108+
go driver.deviceHealthEvents(ctx, config.flags.nodeName)
109109

110110
return driver, nil
111111
}
@@ -191,12 +191,12 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claimNs kubeletplugi
191191
return nil
192192
}
193193

194-
func (d *driver) handleHealthNotifications(ctx context.Context, nodeName string) {
195-
klog.Info("[SWATI DEBUG] handling Health Notifications")
194+
func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
195+
klog.Info("Processing device health notifications")
196196
for {
197197
select {
198198
case <-ctx.Done():
199-
klog.Info("Stopping health notification handler")
199+
klog.Info("Stop processing device health notifications")
200200
return
201201
case device, ok := <-d.deviceHealthMonitor.Unhealthy():
202202
if !ok {
@@ -222,6 +222,8 @@ func (d *driver) handleHealthNotifications(ctx context.Context, nodeName string)
222222
}
223223

224224
// Republish updated resources
225+
// TODO: 1. remove this.
226+
// 2. Add device taints
225227
klog.Info("[SWATI DEBUG] rebulishing resourceslice with healthy devices")
226228
resources := resourceslice.DriverResources{
227229
Pools: map[string]resourceslice.Pool{

0 commit comments

Comments
 (0)