@@ -71,6 +71,10 @@ type nvidiaDevicePlugin struct {
7171 health chan * rm.Device
7272 stop chan interface {}
7373
74+ // deviceListUpdate is used to trigger ListAndWatch to send updated device
75+ // list to kubelet (e.g., when devices recover from unhealthy state)
76+ deviceListUpdate chan struct {}
77+
7478 imexChannels imex.Channels
7579
7680 mps mpsOptions
@@ -117,13 +121,18 @@ func (plugin *nvidiaDevicePlugin) initialize() {
117121 plugin .server = grpc .NewServer ([]grpc.ServerOption {}... )
118122 plugin .health = make (chan * rm.Device , healthChannelBufferSize )
119123 plugin .stop = make (chan interface {})
124+ plugin .deviceListUpdate = make (chan struct {}, 1 )
120125}
121126
122127func (plugin * nvidiaDevicePlugin ) cleanup () {
123128 close (plugin .stop )
129+ if plugin .deviceListUpdate != nil {
130+ close (plugin .deviceListUpdate )
131+ }
124132 plugin .server = nil
125133 plugin .health = nil
126134 plugin .stop = nil
135+ plugin .deviceListUpdate = nil
127136}
128137
129138// Devices returns the full set of devices associated with the plugin.
@@ -163,6 +172,9 @@ func (plugin *nvidiaDevicePlugin) Start(kubeletSocket string) error {
163172 }
164173 }()
165174
175+ // Start recovery worker to detect when unhealthy devices become healthy
176+ go plugin .runRecoveryWorker ()
177+
166178 return nil
167179}
168180
@@ -270,7 +282,9 @@ func (plugin *nvidiaDevicePlugin) GetDevicePluginOptions(context.Context, *plugi
270282 return options , nil
271283}
272284
273- // ListAndWatch lists devices and update that list according to the health status
285+ // ListAndWatch lists devices and updates that list according to their health
286+ // status. It also supports device recovery: when devices that were marked
287+ // unhealthy recover, they are automatically re-advertised to kubelet.
274288func (plugin * nvidiaDevicePlugin ) ListAndWatch (e * pluginapi.Empty , s pluginapi.DevicePlugin_ListAndWatchServer ) error {
275289 if err := s .Send (& pluginapi.ListAndWatchResponse {Devices : plugin .apiDevices ()}); err != nil {
276290 return err
@@ -281,9 +295,17 @@ func (plugin *nvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.D
281295 case <- plugin .stop :
282296 return nil
283297 case d := <- plugin .health :
284- // FIXME: there is no way to recover from the Unhealthy state.
298+ // Device marked unhealthy by health check
285299 d .Health = pluginapi .Unhealthy
286- klog .Infof ("'%s' device marked unhealthy: %s" , plugin .rm .Resource (), d .ID )
300+ klog .Infof ("'%s' device marked unhealthy: %s (reason: %s)" ,
301+ plugin .rm .Resource (), d .ID , d .UnhealthyReason )
302+ if err := s .Send (& pluginapi.ListAndWatchResponse {Devices : plugin .apiDevices ()}); err != nil {
303+ return nil
304+ }
305+ case <- plugin .deviceListUpdate :
306+ // Device recovery or other device list change
307+ klog .Infof ("'%s' device list updated, notifying kubelet" ,
308+ plugin .rm .Resource ())
287309 if err := s .Send (& pluginapi.ListAndWatchResponse {Devices : plugin .apiDevices ()}); err != nil {
288310 return nil
289311 }
@@ -519,6 +541,80 @@ func (plugin *nvidiaDevicePlugin) updateResponseForDeviceMounts(response *plugin
519541 }
520542}
521543
544+ // runRecoveryWorker periodically checks if unhealthy devices have recovered
545+ // and notifies kubelet when they do.
546+ func (plugin * nvidiaDevicePlugin ) runRecoveryWorker () {
547+ const recoveryInterval = 30 * time .Second
548+
549+ ticker := time .NewTicker (recoveryInterval )
550+ defer ticker .Stop ()
551+
552+ klog .V (2 ).Infof ("Recovery worker started for '%s' (interval=%v)" ,
553+ plugin .rm .Resource (), recoveryInterval )
554+
555+ for {
556+ select {
557+ case <- plugin .stop :
558+ klog .V (2 ).Info ("Recovery worker stopped" )
559+ return
560+ case <- ticker .C :
561+ plugin .checkForRecoveredDevices ()
562+ }
563+ }
564+ }
565+
566+ // checkForRecoveredDevices checks all unhealthy devices to see if they have
567+ // recovered. If any have recovered, triggers a device list update to
568+ // kubelet.
569+ func (plugin * nvidiaDevicePlugin ) checkForRecoveredDevices () {
570+ recoveredDevices := []* rm.Device {}
571+
572+ for _ , d := range plugin .rm .Devices () {
573+ if ! d .IsUnhealthy () {
574+ continue
575+ }
576+
577+ // Increment recovery attempts
578+ d .RecoveryAttempts ++
579+
580+ // Check if device has recovered
581+ healthy , err := plugin .rm .CheckDeviceHealth (d )
582+ if err != nil {
583+ klog .V (4 ).Infof ("Device %s recovery check failed (attempt %d): %v" ,
584+ d .ID , d .RecoveryAttempts , err )
585+ continue
586+ }
587+
588+ if healthy {
589+ klog .Infof ("Device %s has RECOVERED! Was unhealthy for %v (reason: %s)" ,
590+ d .ID , d .UnhealthyDuration (), d .UnhealthyReason )
591+ d .MarkHealthy ()
592+ recoveredDevices = append (recoveredDevices , d )
593+ } else {
594+ klog .V (3 ).Infof ("Device %s still unhealthy (attempt %d, duration %v)" ,
595+ d .ID , d .RecoveryAttempts , d .UnhealthyDuration ())
596+ }
597+ }
598+
599+ // If any devices recovered, notify ListAndWatch
600+ if len (recoveredDevices ) > 0 {
601+ klog .Infof ("Total recovered devices: %d" , len (recoveredDevices ))
602+ plugin .triggerDeviceListUpdate ()
603+ }
604+ }
605+
606+ // triggerDeviceListUpdate sends a signal to ListAndWatch to send an updated
607+ // device list to kubelet. Uses a buffered channel with non-blocking send to
608+ // avoid blocking the recovery worker.
609+ func (plugin * nvidiaDevicePlugin ) triggerDeviceListUpdate () {
610+ select {
611+ case plugin .deviceListUpdate <- struct {}{}:
612+ klog .V (3 ).Info ("Device list update triggered" )
613+ default :
614+ klog .V (4 ).Info ("Device list update already pending, skipping" )
615+ }
616+ }
617+
522618func (plugin * nvidiaDevicePlugin ) apiDeviceSpecs (devRoot string , ids []string ) []* pluginapi.DeviceSpec {
523619 optional := map [string ]bool {
524620 "/dev/nvidiactl" : true ,
0 commit comments