@@ -20,130 +20,138 @@ package main
2020import (
2121 "context"
2222 "fmt"
23- "sync"
2423
2524 "github.com/NVIDIA/go-nvml/pkg/nvml"
2625 "k8s.io/klog/v2"
2726)
2827
// InstanceID is the sentinel value the health monitor compares against an NVML
// event's GpuInstanceId and ComputeInstanceId. An event is only matched to a
// specific MIG instance when both of its IDs differ from this value.
//
// NOTE(review): presumably this mirrors NVML's "not attributable to an
// instance" marker (0xFFFFFFFF) — confirm against the NVML event documentation.
const InstanceID uint32 = 0xFFFFFFFF
31+
2932type deviceHealthMonitor struct {
30- nvdevlib * deviceLib
31- allocatable AllocatableDevices
32- eventSet nvml.EventSet
33- unhealthy chan * AllocatableDevice
34- stop chan struct {}
35- wg sync.WaitGroup
33+ nvmllib nvml.Interface
34+ eventSet nvml.EventSet
35+ unhealthy chan * AllocatableDevice
36+ stop chan struct {}
37+ uuidToDeviceMap map [string ]* AllocatableDevice
3638}
3739
3840func newDeviceHealthMonitor (ctx context.Context , config * Config , allocatable AllocatableDevices , nvdevlib * deviceLib ) (* deviceHealthMonitor , error ) {
39- klog .Info ("[SWATI DEBUG] initializing NVML.." )
40- if err := nvdevlib .Init (); err != nil {
41- return nil , fmt .Errorf ("failed to initialize NVML: %w" , err )
41+ if nvdevlib .nvmllib == nil {
42+ return nil , fmt .Errorf ("nvml library is nil" )
43+ }
44+
45+ m := & deviceHealthMonitor {
46+ nvmllib : nvdevlib .nvmllib ,
47+ unhealthy : make (chan * AllocatableDevice , len (allocatable )),
48+ stop : make (chan struct {}),
4249 }
43- //defer nvdevlib.alwaysShutdown()
4450
45- //klog.Info("[SWATI DEBUG] getting all devices..")
46- //allocatable, err := nvdevlib.enumerateAllPossibleDevices(config)
47- //if err != nil {
48- // return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
49- //}
51+ if r := m .nvmllib .Init (); r != nvml .SUCCESS {
52+ return nil , fmt .Errorf ("failed to initialize NVML: %v" , r )
53+ }
5054
51- klog .Info ("[SWATI DEBUG] creating NVML events" )
52- eventSet , err := nvdevlib .nvmllib .EventSetCreate ()
55+ klog .V ( 6 ). Info ("creating NVML events for device health monitor " )
56+ eventSet , err := m .nvmllib .EventSetCreate ()
5357 if err != nvml .SUCCESS {
58+ _ = m .nvmllib .Shutdown ()
5459 return nil , fmt .Errorf ("failed to create event set: %w" , err )
5560 }
61+ m .eventSet = eventSet
5662
57- monitor := & deviceHealthMonitor {
58- nvdevlib : nvdevlib ,
59- allocatable : allocatable ,
60- eventSet : eventSet ,
61- unhealthy : make (chan * AllocatableDevice , len (allocatable )),
62- stop : make (chan struct {}),
63- }
63+ m .uuidToDeviceMap = getUuidToDeviceMap (allocatable )
6464
65- klog .Info ("[SWATI DEBUG] registering NVML events" )
66- if err := monitor .registerDevicesForEvents (); err != nil {
67- monitor .eventSet .Free ()
68- return nil , fmt .Errorf ("failed to register devices for health monitoring: %w" , err )
69- }
65+ klog .V (6 ).Info ("registering NVML events for device health monitor" )
66+ m .registerDevicesForEvents ()
7067
71- monitor .start ()
72- return monitor , nil
68+ klog .V (6 ).Info ("started device health monitoring" )
69+ go m .run ()
70+
71+ return m , nil
7372}
7473
75- func (m * deviceHealthMonitor ) registerDevicesForEvents () error {
76- nvmllib := m . nvdevlib . nvmllib
74+ func (m * deviceHealthMonitor ) registerDevicesForEvents () {
75+ // TODO: add a list of xids to ignore
7776 eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
7877
79- for _ , uuid := range m .allocatable .UUIDs () {
80- gpu , err := nvmllib .DeviceGetHandleByUUID (uuid )
78+ for uuid , dev := range m .uuidToDeviceMap {
79+ // if its a mig device, get its Parent UUID
80+ if dev .Type () == MigDeviceType {
81+ uuid = dev .Mig .parent .UUID
82+ }
83+ gpu , err := m .nvmllib .DeviceGetHandleByUUID (uuid )
8184 if err != nvml .SUCCESS {
82- klog .Infof ("Unable to get NVML handle for UUID %s: %v; skipping health check for this device" , uuid , err )
85+ klog .Infof ("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy" , uuid , err )
86+ m .unhealthy <- dev
8387 continue
8488 }
8589
86- if err := gpu .RegisterEvents (eventMask , m .eventSet ); err != nvml .SUCCESS {
87- klog .Infof ("Failed to register events for device %s: %v; skipping health check for this device" , uuid , err )
90+ supportedEvents , err := gpu .GetSupportedEventTypes ()
91+ if err != nvml .SUCCESS {
92+ klog .Infof ("unable to determine the supported events for %s: %v; marking it as unhealthy" , uuid , err )
93+ m .unhealthy <- dev
94+ continue
8895 }
89- }
90- return nil
91- }
9296
93- func (m * deviceHealthMonitor ) start () {
94- klog .Info ("[SWATI DEBUG] starting health monitor" )
95- m .wg .Add (1 )
96- go m .run ()
97+ err = gpu .RegisterEvents (eventMask & supportedEvents , m .eventSet )
98+ if err == nvml .ERROR_NOT_SUPPORTED {
99+ klog .Warningf ("Device %v is too old to support healthchecking." , uuid )
100+ }
101+ if err != nvml .SUCCESS {
102+ klog .Infof ("unable to register events for %s: %v; marking it as unhealthy" , uuid , err )
103+ m .unhealthy <- dev
104+ }
105+
106+ }
97107}
98108
99109func (m * deviceHealthMonitor ) Stop () {
100110 if m == nil {
101111 return
102112 }
103- klog .Info ("[SWATI DEBUG] stopping health monitor" )
113+ klog .V (6 ).Info ("stopping health monitor" )
114+
104115 close (m .stop )
105- m .wg .Wait ()
106- close (m .unhealthy )
107116 m .eventSet .Free ()
108117
109- if m . nvdevlib != nil {
110- m . nvdevlib . alwaysShutdown ( )
118+ if r := m . nvmllib . Shutdown (); r != nvml . SUCCESS {
119+ klog . Warningf ( "failed to shutdown NVML: %v" , r )
111120 }
121+ close (m .unhealthy )
112122}
113123
114- func (m * deviceHealthMonitor ) run () {
115- defer m .wg .Done ()
116-
124+ func getUuidToDeviceMap (allocatable AllocatableDevices ) map [string ]* AllocatableDevice {
117125 uuidToDeviceMap := make (map [string ]* AllocatableDevice )
118- for _ , device := range m . allocatable {
126+ for _ , device := range allocatable {
119127 uuid := device .GetUUID ()
120128 if uuid != "" {
121129 uuidToDeviceMap [uuid ] = device
122130 }
123131 }
132+ return uuidToDeviceMap
133+ }
124134
125- klog .Info ("Starting event-driven GPU health monitor..." )
126-
135+ func (m * deviceHealthMonitor ) run () {
127136 for {
128137 select {
129138 case <- m .stop :
130- klog .Info ("Stopping event-driven GPU health monitor..." )
139+ klog .V ( 6 ). Info ("Stopping event-driven GPU health monitor..." )
131140 return
132141 default :
133142 event , err := m .eventSet .Wait (5000 )
134143 if err == nvml .ERROR_TIMEOUT {
135- klog .Info ("[SWATI DEBUG] timedout" )
136144 continue
137145 }
138146 if err != nvml .SUCCESS {
139147 klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , err )
140- for _ , dev := range m .allocatable {
148+ for _ , dev := range m .uuidToDeviceMap {
141149 m .unhealthy <- dev
142150 }
143151 continue
144152 }
145153
146- // Process health events
154+ klog . Infof ( "Processing event %+v" , event )
147155 switch event .EventType {
148156 case nvml .EventTypeXidCriticalError :
149157 klog .Warningf ("Critical XID error detected on device: %+v" , event )
@@ -158,16 +166,25 @@ func (m *deviceHealthMonitor) run() {
158166 eventUUID , err := event .Device .GetUUID ()
159167 if err != nvml .SUCCESS {
160168 klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , event , err )
161- for _ , dev := range m .allocatable {
169+ for _ , dev := range m .uuidToDeviceMap {
162170 m .unhealthy <- dev
163171 }
164172 continue
165173 }
166174
167- device , exists := uuidToDeviceMap [eventUUID ]
175+ device , exists := m . uuidToDeviceMap [eventUUID ]
168176 if ! exists {
177+ klog .Infof ("Ignoring event for unexpected device: %v" , eventUUID )
169178 continue
170179 }
180+ if device .Type () == MigDeviceType && event .GpuInstanceId != InstanceID && event .ComputeInstanceId != InstanceID {
181+ giID := device .Mig .giInfo .Id
182+ ciID := device .Mig .ciInfo .Id
183+ if giID != event .GpuInstanceId || ciID != event .ComputeInstanceId {
184+ continue
185+ }
186+ klog .Infof ("Event for mig device %v (giID=%v, ciID=%v)" , device .Mig .UUID , giID , ciID )
187+ }
171188
172189 // Send notification to driver
173190 klog .Infof ("Sending unhealthy notification for device %s due to event type %v" , eventUUID , event .EventType )
0 commit comments