@@ -23,7 +23,10 @@ import (
2323 "sync"
2424
2525 resourceapi "k8s.io/api/resource/v1"
26+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627 "k8s.io/apimachinery/pkg/runtime"
28+ metav1apply "k8s.io/client-go/applyconfigurations/meta/v1"
29+ resourceapply "k8s.io/client-go/applyconfigurations/resource/v1"
2730 "k8s.io/dynamic-resource-allocation/kubeletplugin"
2831 "k8s.io/klog/v2"
2932 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
@@ -285,8 +288,11 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
285288 Config : configapi .DefaultMigDeviceConfig (),
286289 })
287290
291+ // Swati: Add resourceclaim status update
288292 // Look through the configs and figure out which one will be applied to
289293 // each device allocation result based on their order of precedence and type.
294+ resourceClaimStatus := resourceapply .ResourceClaimStatus ()
295+ var deviceStatuses []* resourceapply.AllocatedDeviceStatusApplyConfiguration
290296 configResultsMap := make (map [runtime.Object ][]* resourceapi.DeviceRequestAllocationResult )
291297 for _ , result := range claim .Status .Allocation .Devices .Results {
292298 if result .Driver != DriverName {
@@ -296,6 +302,41 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
296302 if ! exists {
297303 return nil , fmt .Errorf ("requested device is not allocatable: %v" , result .Device )
298304 }
305+ // Swati add health check
306+ klog .Info ("[SWATI DEBUG] adding device status" )
307+ deviceStatus := resourceapply .AllocatedDeviceStatus ().
308+ WithDevice (result .Device ).
309+ WithDriver (result .Driver ).
310+ WithPool (result .Pool )
311+
312+ if device .Health == Unhealthy {
313+ deviceStatus = deviceStatus .WithConditions (
314+ metav1apply .Condition ().
315+ WithType ("Ready" ).
316+ WithStatus (metav1 .ConditionFalse ).
317+ WithReason ("Unhealthy" ).
318+ WithMessage (fmt .Sprintf ("Device %s is not healthy" , result .Device )).
319+ WithLastTransitionTime (metav1 .Now ()),
320+ )
321+ klog .Warningf ("Device %s is unhealthy, marking as not ready" , result .Device )
322+ } else {
323+ deviceStatus = deviceStatus .WithConditions (
324+ metav1apply .Condition ().
325+ WithType ("Ready" ).
326+ WithStatus (metav1 .ConditionTrue ).
327+ WithReason ("Healthy" ).
328+ WithMessage ("Device is healthy and ready" ).
329+ WithLastTransitionTime (metav1 .Now ()),
330+ )
331+ klog .Infof ("Device %s is healthy, marking as ready" , result .Device )
332+ }
333+ deviceStatuses = append (deviceStatuses , deviceStatus )
334+
335+ // Only proceed with config mapping for healthy devices
336+ if device .Health == Unhealthy {
337+ continue
338+ }
339+
299340 for _ , c := range slices .Backward (configs ) {
300341 if slices .Contains (c .Requests , result .Request ) {
301342 if _ , ok := c .Config .(* configapi.GpuConfig ); ok && device .Type () != GpuDeviceType {
@@ -319,7 +360,25 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
319360 }
320361 }
321362 }
363+ resourceClaimStatus = resourceClaimStatus .WithDevices (deviceStatuses ... )
364+
365+ // Update the resource claim status
366+ resourceClaimApply := resourceapply .ResourceClaim (claim .Name , claim .Namespace ).WithStatus (resourceClaimStatus )
367+ _ , err = s .config .clientsets .Resource .ResourceClaims (claim .Namespace ).ApplyStatus (ctx ,
368+ resourceClaimApply ,
369+ metav1.ApplyOptions {FieldManager : DriverName , Force : true },
370+ )
371+
372+ if err != nil {
373+ klog .Infof ("failed to update status for claim %s/%s : %v" , claim .Namespace , claim .Name , err )
374+ } else {
375+ klog .Infof ("update status for claim %s/%s" , claim .Namespace , claim .Name )
376+ }
322377
378+ // If no healthy devices are available for configuration, return
379+ if len (configResultsMap ) == 0 {
380+ return nil , fmt .Errorf ("no healthy devices available for allocation" )
381+ }
323382 // Normalize, validate, and apply all configs associated with devices that
324383 // need to be prepared. Track device group configs generated from applying the
325384 // config to the set of device allocation results.
@@ -550,6 +609,21 @@ func GetOpaqueDeviceConfigs(
550609 return resultConfigs , nil
551610}
552611
612+ func (s * DeviceState ) MarkDeviceUnhealthy (unhealthyDevice * AllocatableDevice ) {
613+ s .Lock ()
614+ defer s .Unlock ()
615+
616+ uuid := unhealthyDevice .GetUUID ()
617+ device , ok := s .allocatable [uuid ]
618+ if ! ok {
619+ klog .Warningf ("Attempted to mark unknown device as unhealthy: %s" , uuid )
620+ return
621+ }
622+
623+ device .Health = Unhealthy
624+ klog .Infof ("Marked device:%s unhealthy" , uuid )
625+ }
626+
553627// TODO: Dynamic MIG is not yet supported with structured parameters.
554628// Refactor this to allow for the allocation of statically partitioned MIG
555629// devices.
0 commit comments