Commit c3b65e2

fix: resolve NVIDIADriver stuck in NotReady on nodeSelector changes with OnDelete strategy
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent fd65f07 commit c3b65e2

1 file changed: +151 -8 lines changed

internal/state/state_skel.go

Lines changed: 151 additions & 8 deletions
@@ -20,9 +20,11 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"strings"
 
 	"github.com/go-logr/logr"
 	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -402,7 +404,7 @@ func (s *stateSkel) getSyncState(ctx context.Context, objs []*unstructured.Unstr
 
 	// Object exists, check for Kind specific readiness
 	if found.GetKind() == "DaemonSet" {
-		if ready, err := s.isDaemonSetReady(found, reqLogger); err != nil || !ready {
+		if ready, err := s.isDaemonSetReady(ctx, found, reqLogger); err != nil || !ready {
 			reqLogger.V(consts.LogLevelInfo).Info("Object is not ready", "Kind:", obj.GetKind(), "Name", obj.GetName())
 			return SyncStateNotReady, err
 		}
@@ -413,7 +415,7 @@ func (s *stateSkel) getSyncState(ctx context.Context, objs []*unstructured.Unstr
 }
 
 // isDaemonSetReady checks if daemonset is ready
-func (s *stateSkel) isDaemonSetReady(uds *unstructured.Unstructured, reqLogger logr.Logger) (bool, error) {
+func (s *stateSkel) isDaemonSetReady(ctx context.Context, uds *unstructured.Unstructured, reqLogger logr.Logger) (bool, error) {
 	buf, err := uds.MarshalJSON()
 	if err != nil {
 		return false, fmt.Errorf("failed to marshall unstructured daemonset object: %w", err)
@@ -433,14 +435,33 @@ func (s *stateSkel) isDaemonSetReady(uds *unstructured.Unstructured, reqLogger l
 		"UpdatedPodsScheduled", ds.Status.UpdatedNumberScheduled,
 		"PodsReady:", ds.Status.NumberReady,
 		"Conditions:", ds.Status.Conditions)
-	// Note(adrianc): We check for DesiredNumberScheduled!=0 as we expect to have at least one node that would need
-	// to have DaemonSet Pods deployed onto it. DesiredNumberScheduled == 0 then indicates that this field was not yet
-	// updated by the DaemonSet controller
-	// TODO: Check if we can use another field maybe to indicate it was processed by the DaemonSet controller.
-	if ds.Status.DesiredNumberScheduled != 0 && ds.Status.DesiredNumberScheduled == ds.Status.NumberAvailable &&
-		ds.Status.UpdatedNumberScheduled == ds.Status.NumberAvailable {
+
+	if ds.Status.DesiredNumberScheduled == 0 {
+		reqLogger.V(consts.LogLevelDebug).Info("DesiredNumberScheduled is 0, DaemonSet not ready")
+		return false, nil
+	}
+
+	// Check basic availability
+	if ds.Status.NumberUnavailable != 0 {
+		reqLogger.V(consts.LogLevelInfo).Info("DaemonSet has unavailable pods")
+		return false, nil
+	}
+
+	if ds.Status.DesiredNumberScheduled != ds.Status.NumberAvailable {
+		reqLogger.V(consts.LogLevelInfo).Info("Not all desired pods are available")
+		return false, nil
+	}
+
+	// For OnDelete strategy, use revision hash checking with node placement verification
+	if ds.Spec.UpdateStrategy.Type == appsv1.OnDeleteDaemonSetStrategyType {
+		return s.isDaemonSetReadyOnDelete(ctx, ds, reqLogger)
+	}
+
+	// For RollingUpdate strategy, check UpdatedNumberScheduled
+	if ds.Status.UpdatedNumberScheduled == ds.Status.NumberAvailable {
 		return true, nil
 	}
+
 	return false, nil
 }
 
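To see the reworked gating end to end, here is a minimal standalone sketch. The gateReady helper below is hypothetical, not code from this commit; it mirrors the order of the new checks, with UpdatedNumberScheduled consulted only under RollingUpdate, and the OnDelete branch standing in for the isDaemonSetReadyOnDelete helper added later in this diff.

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// gateReady mirrors the checks above: availability first, then the
// update-strategy branch. Under OnDelete the real code defers to
// isDaemonSetReadyOnDelete instead of returning true outright.
func gateReady(ds *appsv1.DaemonSet) bool {
	st := ds.Status
	if st.DesiredNumberScheduled == 0 ||
		st.NumberUnavailable != 0 ||
		st.DesiredNumberScheduled != st.NumberAvailable {
		return false
	}
	if ds.Spec.UpdateStrategy.Type == appsv1.OnDeleteDaemonSetStrategyType {
		return true // placeholder for the revision/placement check
	}
	return st.UpdatedNumberScheduled == st.NumberAvailable
}

func main() {
	ds := &appsv1.DaemonSet{}
	ds.Spec.UpdateStrategy.Type = appsv1.OnDeleteDaemonSetStrategyType
	ds.Status = appsv1.DaemonSetStatus{DesiredNumberScheduled: 2, NumberAvailable: 2}
	// UpdatedNumberScheduled stays 0 under OnDelete until pods are recreated,
	// which is exactly what wedged the old single-expression check.
	fmt.Println(gateReady(ds)) // true
}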
@@ -454,3 +475,125 @@ func (s *stateSkel) checkAttributesExist(attrs nodeinfo.NodeAttributes, attrType
 	}
 	return nil
 }
+
+// isDaemonSetReadyOnDelete checks if a DaemonSet with OnDelete update strategy is ready
+// by verifying pod revision hashes and node placement
+func (s *stateSkel) isDaemonSetReadyOnDelete(ctx context.Context, ds *appsv1.DaemonSet, reqLogger logr.Logger) (bool, error) {
+	ownedPods, err := s.getOwnedPods(ctx, ds)
+	if err != nil {
+		reqLogger.Error(err, "Failed to list pods for DaemonSet")
+		return false, err
+	}
+	if len(ownedPods) == 0 {
+		return false, nil
+	}
+
+	if !s.arePodsHealthy(ownedPods) {
+		return false, nil
+	}
+
+	dsRevisionHash, err := s.getLatestRevisionHash(ctx, ds)
+	if err != nil {
+		// If revision hash unavailable, fall back to health-only check
+		reqLogger.V(consts.LogLevelWarning).Info("Could not get revision hash, using health-only check", "error", err)
+		return true, nil
+	}
+
+	// Check if all pods have the latest revision
+	for _, pod := range ownedPods {
+		if hash, ok := pod.Labels["controller-revision-hash"]; !ok || hash != dsRevisionHash {
+			// Pods have outdated revision - verify they're on nodes matching current nodeSelector
+			reqLogger.V(consts.LogLevelInfo).Info("Pods have outdated revision, verifying node placement")
+			return s.verifyNodePlacement(ctx, ds, ownedPods, reqLogger)
+		}
+	}
+
+	return true, nil
+}
+
+// getOwnedPods returns pods owned by the DaemonSet
+func (s *stateSkel) getOwnedPods(ctx context.Context, ds *appsv1.DaemonSet) ([]corev1.Pod, error) {
+	podList := &corev1.PodList{}
+	opts := []client.ListOption{
+		client.MatchingLabels(ds.Spec.Template.Labels),
+		client.InNamespace(ds.Namespace),
+	}
+	if err := s.client.List(ctx, podList, opts...); err != nil {
+		return nil, err
+	}
+
+	var ownedPods []corev1.Pod
+	for _, pod := range podList.Items {
+		if len(pod.OwnerReferences) > 0 && pod.OwnerReferences[0].UID == ds.UID {
+			ownedPods = append(ownedPods, pod)
+		}
+	}
+	return ownedPods, nil
+}
+
+// getLatestRevisionHash retrieves the latest ControllerRevision hash for a DaemonSet
+func (s *stateSkel) getLatestRevisionHash(ctx context.Context, ds *appsv1.DaemonSet) (string, error) {
+	revisionList := &appsv1.ControllerRevisionList{}
+	if err := s.client.List(ctx, revisionList,
+		client.MatchingLabels(ds.Spec.Selector.MatchLabels),
+		client.InNamespace(ds.Namespace)); err != nil {
+		return "", err
+	}
+
+	// Find revision with highest revision number in single pass
+	var latestRevision *appsv1.ControllerRevision
+	namePrefix := ds.Name + "-"
+
+	for i := range revisionList.Items {
+		rev := &revisionList.Items[i]
+		if !strings.HasPrefix(rev.Name, namePrefix) {
+			continue
+		}
+		if latestRevision == nil || rev.Revision > latestRevision.Revision {
+			latestRevision = rev
+		}
+	}
+
+	if latestRevision == nil {
+		return "", fmt.Errorf("no revisions found")
+	}
+
+	return strings.TrimPrefix(latestRevision.Name, namePrefix), nil
+}
+
+// arePodsHealthy checks if all pods are running with all containers ready
+func (s *stateSkel) arePodsHealthy(pods []corev1.Pod) bool {
+	for _, pod := range pods {
+		if pod.Status.Phase != corev1.PodRunning {
+			return false
+		}
+		for _, cs := range pod.Status.ContainerStatuses {
+			if !cs.Ready {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// verifyNodePlacement checks if all pods are on nodes matching the DaemonSet's nodeSelector
+func (s *stateSkel) verifyNodePlacement(ctx context.Context, ds *appsv1.DaemonSet, pods []corev1.Pod, reqLogger logr.Logger) (bool, error) {
+	for _, pod := range pods {
+		node := &corev1.Node{}
+		if err := s.client.Get(ctx, types.NamespacedName{Name: pod.Spec.NodeName}, node); err != nil {
+			return false, err
+		}
+
+		// Check if node matches nodeSelector
+		for key, value := range ds.Spec.Template.Spec.NodeSelector {
+			if node.Labels[key] != value {
+				reqLogger.V(consts.LogLevelInfo).Info("Pod on non-matching node",
+					"pod", pod.Name, "node", pod.Spec.NodeName, "selector", key, "expected", value, "actual", node.Labels[key])
+				return false, nil
+			}
+		}
+	}
+
+	reqLogger.V(consts.LogLevelInfo).Info("Pods healthy on correct nodes despite outdated revision")
+	return true, nil
+}
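The getLatestRevisionHash helper leans on the DaemonSet controller's naming convention: a ControllerRevision is named "<daemonset-name>-<hash>", and each pod records the hash it was created from in its controller-revision-hash label. Below is a minimal sketch of the same selection logic, using hypothetical names and no API client.

package main

import (
	"fmt"
	"strings"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// latestHash picks the highest-numbered revision whose name carries the
// "<ds-name>-" prefix and returns the trailing hash, like the helper above.
func latestHash(dsName string, revs []appsv1.ControllerRevision) (string, error) {
	prefix := dsName + "-"
	var latest *appsv1.ControllerRevision
	for i := range revs {
		if !strings.HasPrefix(revs[i].Name, prefix) {
			continue
		}
		if latest == nil || revs[i].Revision > latest.Revision {
			latest = &revs[i]
		}
	}
	if latest == nil {
		return "", fmt.Errorf("no revisions found")
	}
	return strings.TrimPrefix(latest.Name, prefix), nil
}

func main() {
	// Hypothetical revision history for a DaemonSet named "nvidia-driver".
	revs := []appsv1.ControllerRevision{
		{ObjectMeta: metav1.ObjectMeta{Name: "nvidia-driver-7f6d9c"}, Revision: 1},
		{ObjectMeta: metav1.ObjectMeta{Name: "nvidia-driver-9c2a4b"}, Revision: 2},
	}
	hash, _ := latestHash("nvidia-driver", revs)
	// A pod is current when its controller-revision-hash label equals hash.
	fmt.Println(hash) // 9c2a4b
}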

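verifyNodePlacement is what actually clears the stuck NotReady state: pods carrying a stale revision still count as ready, provided they are healthy and scheduled on nodes that match the current nodeSelector. The matching rule itself is a plain map-subset check, sketched here with hypothetical labels.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// nodeMatches mirrors the per-node loop in verifyNodePlacement: every
// key/value pair of the nodeSelector must appear verbatim in node.Labels.
func nodeMatches(selector map[string]string, node *corev1.Node) bool {
	for key, value := range selector {
		if node.Labels[key] != value {
			return false
		}
	}
	return true
}

func main() {
	node := &corev1.Node{}
	node.Labels = map[string]string{"nvidia.com/gpu.present": "true"}

	matching := map[string]string{"nvidia.com/gpu.present": "true"}
	changed := map[string]string{"example.com/driver-pool": "custom"}

	// After a nodeSelector edit, old pods on still-matching nodes keep the
	// DaemonSet Ready; pods stranded on non-matching nodes block readiness.
	fmt.Println(nodeMatches(matching, node), nodeMatches(changed, node)) // true false
}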