11/*
2- * Copyright 2025 The Kubernetes Authors.
3- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
43 *
54 * Licensed under the Apache License, Version 2.0 (the "License");
65 * you may not use this file except in compliance with the License.
@@ -26,124 +25,146 @@ import (
2625 "k8s.io/klog/v2"
2726)
2827
// InstanceID is the sentinel the health monitor compares against the
// GpuInstanceId and ComputeInstanceId carried by an NVML event: when both
// differ from it the event is attributed to a MIG device, otherwise it is
// attributed to a full GPU.
// NOTE(review): presumably this mirrors the 0xFFFFFFFF "not applicable"
// instance ID documented for NVML event data — confirm against the NVML API
// reference.
const InstanceID uint32 = 0xFFFFFFFF
31+
2932type deviceHealthMonitor struct {
30- nvdevlib * deviceLib
31- allocatable AllocatableDevices
32- eventSet nvml. EventSet
33- unhealthy chan * AllocatableDevice
34- stop chan struct {}
35- wg sync.WaitGroup
33+ nvmllib nvml. Interface
34+ eventSet nvml. EventSet
35+ unhealthy chan * AllocatableDevice
36+ stop chan struct {}
37+ uuidToDeviceMap map [ string ] * AllocatableDevice
38+ wg sync.WaitGroup
3639}
3740
3841func newDeviceHealthMonitor (ctx context.Context , config * Config , allocatable AllocatableDevices , nvdevlib * deviceLib ) (* deviceHealthMonitor , error ) {
39- klog .Info ("[SWATI DEBUG] initializing NVML.." )
40- if err := nvdevlib .Init (); err != nil {
41- return nil , fmt .Errorf ("failed to initialize NVML: %w" , err )
42+ if nvdevlib .nvmllib == nil {
43+ return nil , fmt .Errorf ("nvml library is nil" )
4244 }
43- //defer nvdevlib.alwaysShutdown()
4445
45- //klog.Info("[SWATI DEBUG] getting all devices..")
46- //allocatable, err := nvdevlib.enumerateAllPossibleDevices(config)
47- //if err != nil {
48- // return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
49- // }
46+ m := & deviceHealthMonitor {
47+ nvmllib : nvdevlib .nvmllib ,
48+ unhealthy : make ( chan * AllocatableDevice , len ( allocatable )),
49+ stop : make ( chan struct {}),
50+ }
5051
51- klog .Info ("[SWATI DEBUG] creating NVML events" )
52- eventSet , err := nvdevlib .nvmllib .EventSetCreate ()
52+ if r := m .nvmllib .Init (); r != nvml .SUCCESS {
53+ return nil , fmt .Errorf ("failed to initialize NVML: %v" , r )
54+ }
55+
56+ klog .V (6 ).Info ("creating NVML events for device health monitor" )
57+ eventSet , err := m .nvmllib .EventSetCreate ()
5358 if err != nvml .SUCCESS {
59+ _ = m .nvmllib .Shutdown ()
5460 return nil , fmt .Errorf ("failed to create event set: %w" , err )
5561 }
62+ m .eventSet = eventSet
5663
57- monitor := & deviceHealthMonitor {
58- nvdevlib : nvdevlib ,
59- allocatable : allocatable ,
60- eventSet : eventSet ,
61- unhealthy : make (chan * AllocatableDevice , len (allocatable )),
62- stop : make (chan struct {}),
63- }
64+ m .uuidToDeviceMap = getUUIDToDeviceMap (allocatable )
6465
65- klog .Info ("[SWATI DEBUG] registering NVML events" )
66- if err := monitor .registerDevicesForEvents (); err != nil {
67- monitor .eventSet .Free ()
68- return nil , fmt .Errorf ("failed to register devices for health monitoring: %w" , err )
69- }
66+ klog .V (6 ).Info ("registering NVML events for device health monitor" )
67+ m .registerDevicesForEvents ()
7068
71- monitor .start ()
72- return monitor , nil
69+ klog .V (6 ).Info ("started device health monitoring" )
70+ m .wg .Add (1 )
71+ go m .run ()
72+
73+ return m , nil
7374}
7475
75- func (m * deviceHealthMonitor ) registerDevicesForEvents () error {
76- nvmllib := m . nvdevlib . nvmllib
76+ func (m * deviceHealthMonitor ) registerDevicesForEvents () {
77+ // TODO: add a list of xids to ignore
7778 eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
7879
79- for _ , uuid := range m .allocatable .UUIDs () {
80- gpu , err := nvmllib .DeviceGetHandleByUUID (uuid )
80+ processedUUIDs := make (map [string ]bool )
81+
82+ for uuid , dev := range m .uuidToDeviceMap {
83+ var u string
84+ if dev .Type () == MigDeviceType {
85+ u = dev .Mig .parent .UUID
86+ } else {
87+ u = uuid
88+ }
89+
90+ if processedUUIDs [u ] {
91+ continue
92+ }
93+ gpu , err := m .nvmllib .DeviceGetHandleByUUID (u )
94+ if err != nvml .SUCCESS {
95+ klog .Infof ("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy" , u , err )
96+ m .unhealthy <- dev
97+ continue
98+ }
99+
100+ supportedEvents , err := gpu .GetSupportedEventTypes ()
81101 if err != nvml .SUCCESS {
82- klog .Infof ("Unable to get NVML handle for UUID %s: %v; skipping health check for this device" , uuid , err )
102+ klog .Infof ("unable to determine the supported events for %s: %v; marking it as unhealthy" , u , err )
103+ m .unhealthy <- dev
83104 continue
84105 }
85106
86- if err := gpu .RegisterEvents (eventMask , m .eventSet ); err != nvml .SUCCESS {
87- klog .Infof ("Failed to register events for device %s: %v; skipping health check for this device" , uuid , err )
107+ err = gpu .RegisterEvents (eventMask & supportedEvents , m .eventSet )
108+ if err == nvml .ERROR_NOT_SUPPORTED {
109+ klog .Warningf ("Device %v is too old to support healthchecking." , u )
110+ }
111+ if err != nvml .SUCCESS {
112+ klog .Infof ("unable to register events for %s: %v; marking it as unhealthy" , u , err )
113+ m .unhealthy <- dev
88114 }
115+ processedUUIDs [u ] = true
89116 }
90- return nil
91- }
92-
93- func (m * deviceHealthMonitor ) start () {
94- klog .Info ("[SWATI DEBUG] starting health monitor" )
95- m .wg .Add (1 )
96- go m .run ()
97117}
98118
99119func (m * deviceHealthMonitor ) Stop () {
100120 if m == nil {
101121 return
102122 }
103- klog .Info ("[SWATI DEBUG] stopping health monitor" )
123+ klog .V (6 ).Info ("stopping health monitor" )
124+
104125 close (m .stop )
105126 m .wg .Wait ()
106- close ( m . unhealthy )
127+
107128 m .eventSet .Free ()
108129
109- if m . nvdevlib != nil {
110- m . nvdevlib . alwaysShutdown ( )
130+ if r := m . nvmllib . Shutdown (); r != nvml . SUCCESS {
131+ klog . Warningf ( "failed to shutdown NVML: %v" , r )
111132 }
133+ close (m .unhealthy )
112134}
113135
114- func (m * deviceHealthMonitor ) run () {
115- defer m .wg .Done ()
116-
136+ func getUUIDToDeviceMap (allocatable AllocatableDevices ) map [string ]* AllocatableDevice {
117137 uuidToDeviceMap := make (map [string ]* AllocatableDevice )
118- for _ , device := range m . allocatable {
119- uuid := device . GetUUID ()
120- if uuid != "" {
121- uuidToDeviceMap [uuid ] = device
138+
139+ for _ , d := range allocatable {
140+ if u := d . GetUUID (); u != "" {
141+ uuidToDeviceMap [u ] = d
122142 }
123143 }
144+ return uuidToDeviceMap
145+ }
124146
125- klog . Info ( "Starting event-driven GPU health monitor..." )
126-
147+ func ( m * deviceHealthMonitor ) run () {
148+ defer m . wg . Done ()
127149 for {
128150 select {
129151 case <- m .stop :
130- klog .Info ("Stopping event-driven GPU health monitor..." )
152+ klog .V ( 6 ). Info ("Stopping event-driven GPU health monitor..." )
131153 return
132154 default :
133155 event , err := m .eventSet .Wait (5000 )
134156 if err == nvml .ERROR_TIMEOUT {
135- klog .Info ("[SWATI DEBUG] timedout" )
136157 continue
137158 }
138159 if err != nvml .SUCCESS {
139160 klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , err )
140- for _ , dev := range m .allocatable {
161+ for _ , dev := range m .uuidToDeviceMap {
141162 m .unhealthy <- dev
142163 }
143164 continue
144165 }
145166
146- // Process health events
167+ klog . Infof ( "Processing event %+v" , event )
147168 switch event .EventType {
148169 case nvml .EventTypeXidCriticalError :
149170 klog .Warningf ("Critical XID error detected on device: %+v" , event )
@@ -158,30 +179,53 @@ func (m *deviceHealthMonitor) run() {
158179 eventUUID , err := event .Device .GetUUID ()
159180 if err != nvml .SUCCESS {
160181 klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , event , err )
161- for _ , dev := range m .allocatable {
182+ for _ , dev := range m .uuidToDeviceMap {
162183 m .unhealthy <- dev
163184 }
164185 continue
165186 }
166187
167- device , exists := uuidToDeviceMap [eventUUID ]
168- if ! exists {
169- continue
188+ var affectedDevice * AllocatableDevice
189+ if event .GpuInstanceId != InstanceID && event .ComputeInstanceId != InstanceID {
190+ affectedDevice = m .findMigDevice (eventUUID , event .GpuInstanceId , event .ComputeInstanceId )
191+ klog .Infof ("Event for mig device: %v" , affectedDevice )
192+ } else {
193+ affectedDevice = m .findGpuDevice (eventUUID )
170194 }
171195
172- // Send notification to driver
173- klog .Infof ("Sending unhealthy notification for device %s due to event type %v" , eventUUID , event .EventType )
174- select {
175- case m .unhealthy <- device :
176- // Successfully sent notification
177- default :
178- // Channel full, log and continue
179- klog .Warningf ("Health notification channel full, dropping event for device %s" , eventUUID )
196+ if affectedDevice == nil {
197+ klog .Infof ("Ignoring event for unexpected device (UUID: %s, GI: %d, CI: %d)" , eventUUID , event .GpuInstanceId , event .ComputeInstanceId )
198+ continue
180199 }
200+ klog .Infof ("Sending unhealthy notification for device %s due to event type %v" , eventUUID , event .EventType )
201+ m .unhealthy <- affectedDevice
181202 }
182203 }
183204}
184205
185206func (m * deviceHealthMonitor ) Unhealthy () <- chan * AllocatableDevice {
186207 return m .unhealthy
187208}
209+
210+ func (m * deviceHealthMonitor ) findMigDevice (parentUUID string , giID uint32 , ciID uint32 ) * AllocatableDevice {
211+ for _ , device := range m .uuidToDeviceMap {
212+ if device .Type () != MigDeviceType {
213+ continue
214+ }
215+
216+ if device .Mig .parent .UUID == parentUUID &&
217+ device .Mig .giInfo .Id == giID &&
218+ device .Mig .ciInfo .Id == ciID {
219+ return device
220+ }
221+ }
222+ return nil
223+ }
224+
225+ func (m * deviceHealthMonitor ) findGpuDevice (uuid string ) * AllocatableDevice {
226+ device , exists := m .uuidToDeviceMap [uuid ]
227+ if exists && device .Type () == GpuDeviceType {
228+ return device
229+ }
230+ return nil
231+ }
0 commit comments