diff --git a/cmd/compute-domain-controller/cleanup.go b/cmd/compute-domain-controller/cleanup.go
index 75cc45f38..5a8a3ea86 100644
--- a/cmd/compute-domain-controller/cleanup.go
+++ b/cmd/compute-domain-controller/cleanup.go
@@ -26,6 +26,7 @@ type CleanupManager[T metav1.Object] struct {
 }
 
 func NewCleanupManager[T metav1.Object](informer cache.SharedIndexInformer, getComputeDomain GetComputeDomainFunc, callback CleanupCallback[T]) *CleanupManager[T] {
+	klog.Infof("Creating new Cleanup Manager for %T", *new(T))
 	return &CleanupManager[T]{
 		informer:         informer,
 		getComputeDomain: getComputeDomain,
@@ -61,19 +62,25 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
 		case <-ticker.C:
 			klog.V(6).Infof("Running periodic sync to remove %T objects owned by stale ComputeDomain", *new(T))
 			store := m.informer.GetStore()
-			for _, item := range store.List() {
+			items := store.List()
+			klog.V(6).Infof("Found %d items to check for cleanup", len(items))
+
+			for _, item := range items {
 				obj, ok := item.(T)
 				if !ok {
+					klog.V(6).Infof("Expected object %T but got %T, skipping..", *new(T), item)
 					continue
 				}
 
 				labels := obj.GetLabels()
 				if labels == nil {
+					klog.V(6).Infof("Object %T has no labels, skipping..", *new(T))
 					continue
 				}
 
 				uid, exists := labels[computeDomainLabelKey]
 				if !exists {
+					klog.V(6).Infof("Object %T does not have ComputeDomain label, skipping..", *new(T))
 					continue
 				}
 
@@ -84,6 +91,7 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
 				}
 
 				if computeDomain != nil {
+					klog.V(6).Infof("ComputeDomain with UID %s still exists, skipping cleanup", uid)
 					continue
 				}
 
diff --git a/cmd/compute-domain-controller/computedomain.go b/cmd/compute-domain-controller/computedomain.go
index 2e0829b6a..2ecd430bd 100644
--- a/cmd/compute-domain-controller/computedomain.go
+++ b/cmd/compute-domain-controller/computedomain.go
@@ -22,6 +22,7 @@ import (
 	"sync"
 	"time"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/klog/v2"
@@ -64,11 +65,13 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
 	factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod)
 	informer := factory.Resource().V1beta1().ComputeDomains().Informer()
 
+	klog.Infof("Creating new ComputeDomainManager for driver %s/%s", config.driverNamespace, config.driverName)
 	m := &ComputeDomainManager{
 		config:   config,
 		factory:  factory,
 		informer: informer,
 	}
+	// TODO (swati) add logs for daemonset and resourceClaimTemplate managers in verbose mode
 	m.daemonSetManager = NewDaemonSetManager(config, m.Get)
 	m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get)
 
@@ -147,6 +150,7 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
 		return nil, fmt.Errorf("error retrieving ComputeDomain by UID: %w", err)
 	}
 	if len(cds) == 0 {
+		klog.V(2).Infof("No ComputeDomain found with UID: %s", uid)
 		return nil, nil
 	}
 	if len(cds) != 1 {
@@ -166,11 +170,12 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
 		return fmt.Errorf("error retrieving ComputeDomain: %w", err)
 	}
 	if cd == nil {
+		klog.V(2).Infof("ComputeDomain with UID %s not found, nothing to do", uid)
 		return nil
 	}
 
 	if cd.GetDeletionTimestamp() == nil {
-		return fmt.Errorf("attempting to remove finalizer before ComputeDomain marked for deletion")
+		return fmt.Errorf("attempting to remove finalizer before ComputeDomain %s/%s with UID %s marked for deletion", cd.Namespace, cd.Name, uid)
 	}
 
 	newCD := cd.DeepCopy()
@@ -181,6 +186,7 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
 		}
 	}
 	if len(cd.Finalizers) == len(newCD.Finalizers) {
+		klog.V(2).Infof("Finalizer not found on ComputeDomain %s/%s, nothing to do", cd.Namespace, cd.Name)
 		return nil
 	}
 
@@ -191,6 +197,20 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
 	return nil
 }
 
+// logNodesWithComputeDomainLabel returns the names of the given nodes; it logs (and returns nil) only when the list is empty.
+func (m *ComputeDomainManager) logNodesWithComputeDomainLabel(nodes *corev1.NodeList, cdUID string) []string {
+	if len(nodes.Items) == 0 {
+		klog.Infof("No nodes found with label for ComputeDomain with UID %s", cdUID)
+		return nil
+	}
+
+	nodeNames := make([]string, 0, len(nodes.Items))
+	for i := range nodes.Items {
+		nodeNames = append(nodeNames, nodes.Items[i].Name)
+	}
+	return nodeNames
+}
+
 // AssertWorkloadsCompletes ensures that all workloads asssociated with a ComputeDomain have completed.
 //
 // TODO: We should probably also check to ensure that all ResourceClaims
@@ -215,9 +235,9 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU
 	}
 
 	if len(nodes.Items) != 0 {
-		return fmt.Errorf("nodes exist with label for ComputeDomain %s", cdUID)
+		nodeNames := m.logNodesWithComputeDomainLabel(nodes, cdUID)
+		return fmt.Errorf("nodes %v exist with label for ComputeDomain %s", nodeNames, cdUID)
 	}
-
 	return nil
 }
 
diff --git a/cmd/compute-domain-controller/controller.go b/cmd/compute-domain-controller/controller.go
index 5e29d54e2..4ea9552c0 100644
--- a/cmd/compute-domain-controller/controller.go
+++ b/cmd/compute-domain-controller/controller.go
@@ -21,6 +21,8 @@ import (
 	"context"
 	"fmt"
 
+	"k8s.io/klog/v2"
+
 	"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
 	"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue"
 )
@@ -57,6 +59,8 @@ func NewController(config *Config) *Controller {
 // It initializes the work queue, starts the ComputeDomain manager, and handles
 // graceful shutdown when the context is cancelled.
 func (c *Controller) Run(ctx context.Context) error {
+	klog.Info("Starting ComputeDomain Controller")
+
 	workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
 
 	managerConfig := &ManagerConfig{
@@ -78,5 +82,6 @@ func (c *Controller) Run(ctx context.Context) error {
 		return fmt.Errorf("error stopping ComputeDomain manager: %w", err)
 	}
 
+	klog.Info("ComputeDomain Controller is shutdown")
 	return nil
 }
diff --git a/cmd/compute-domain-controller/daemonset.go b/cmd/compute-domain-controller/daemonset.go
index 9b6157f92..a79f0b4e6 100644
--- a/cmd/compute-domain-controller/daemonset.go
+++ b/cmd/compute-domain-controller/daemonset.go
@@ -86,6 +86,7 @@ func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomai
 
 	informer := factory.Apps().V1().DaemonSets().Informer()
 
+	klog.Infof("Creating new DaemonSetManager for driver %s/%s", config.driverNamespace, config.driverName)
 	m := &DaemonSetManager{
 		config:           config,
 		getComputeDomain: getComputeDomain,
@@ -162,7 +163,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
 		return nil, fmt.Errorf("error retrieving DaemonSet: %w", err)
 	}
 	if len(ds) > 1 {
-		return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
+		return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cd.UID)
 	}
 	if len(ds) == 1 {
 		return ds[0], nil
@@ -209,6 +210,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
 		return nil, fmt.Errorf("error creating DaemonSet: %w", err)
 	}
 
+	klog.V(2).Infof("Successfully created DaemonSet %s/%s for ComputeDomain %s/%s", d.Namespace, d.Name, cd.Namespace, cd.Name)
 	return d, nil
 }
 
@@ -218,9 +220,10 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
 		return fmt.Errorf("error retrieving DaemonSet: %w", err)
 	}
 	if len(ds) > 1 {
-		return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
+		return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID %s", cdUID)
 	}
 	if len(ds) == 0 {
+		klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to delete", cdUID)
 		return nil
 	}
 
@@ -231,6 +234,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
 	}
 
 	if d.GetDeletionTimestamp() != nil {
+		klog.V(2).Infof("DaemonSet %s/%s is already marked for deletion", d.Namespace, d.Name)
 		return nil
 	}
 
@@ -239,6 +243,7 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
 		return fmt.Errorf("erroring deleting DaemonSet: %w", err)
 	}
 
+	klog.V(2).Infof("Successfully deleted DaemonSet %s/%s for ComputeDomain UID %s", d.Namespace, d.Name, cdUID)
 	return nil
 }
 
@@ -271,6 +276,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er
 		return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
 	}
 	if len(ds) == 0 {
+		klog.V(2).Infof("No DaemonSet found for ComputeDomain UID %s, nothing to remove finalizer from", cdUID)
 		return nil
 	}
 
@@ -288,6 +294,7 @@ func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) er
 		}
 	}
 	if len(d.Finalizers) == len(newD.Finalizers) {
+		klog.V(2).Infof("Finalizer %s not found on DaemonSet %s/%s", computeDomainFinalizer, d.Namespace, d.Name)
 		return nil
 	}
 
@@ -322,10 +329,12 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
 		return fmt.Errorf("error getting ComputeDomain: %w", err)
 	}
 	if cd == nil {
+		klog.V(2).Info("No ComputeDomain found, skipping processing")
 		return nil
 	}
 
 	if int(d.Status.NumberReady) != cd.Spec.NumNodes {
+		klog.V(2).Infof("DaemonSet %s/%s has %d ready nodes, expecting %d, waiting for all nodes to be ready", d.Namespace, d.Name, d.Status.NumberReady, cd.Spec.NumNodes)
 		return nil
 	}
 
diff --git a/cmd/compute-domain-controller/indexers.go b/cmd/compute-domain-controller/indexers.go
index 6f9a1f58b..068834552 100644
--- a/cmd/compute-domain-controller/indexers.go
+++ b/cmd/compute-domain-controller/indexers.go
@@ -22,6 +22,7 @@ import (
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog/v2"
 )
 
 func uidIndexer[T metav1.ObjectMetaAccessor](obj any) ([]string, error) {
@@ -43,6 +44,7 @@ func addComputeDomainLabelIndexer[T metav1.ObjectMetaAccessor](informer cache.Sh
 			if value, exists := labels[computeDomainLabelKey]; exists {
 				return []string{value}, nil
 			}
+			klog.V(6).Info("Object does not have ComputeDomain label, skipping indexing")
 			return nil, nil
 		},
 	})
@@ -58,6 +60,7 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S
 		return nil, fmt.Errorf("error getting %T via ComputeDomain label: %w", *new(T1), err)
 	}
 	if len(objs) == 0 {
+		klog.V(2).Infof("No object found with ComputeDomain Label with UID %s", cdUID)
 		return nil, nil
 	}
 
@@ -70,5 +73,6 @@ func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.S
 		ds = append(ds, d)
 	}
 
+	klog.V(2).Infof("Found %d objects with ComputeDomain Label with UID %s", len(ds), cdUID)
 	return ds, nil
 }
diff --git a/cmd/compute-domain-controller/resourceclaimtemplate.go b/cmd/compute-domain-controller/resourceclaimtemplate.go
index 5a06598c4..75c9a339a 100644
--- a/cmd/compute-domain-controller/resourceclaimtemplate.go
+++ b/cmd/compute-domain-controller/resourceclaimtemplate.go
@@ -89,6 +89,7 @@ func newBaseResourceClaimTemplateManager(config *ManagerConfig, getComputeDomain
 
 	informer := factory.Resource().V1beta1().ResourceClaimTemplates().Informer()
 
+	klog.Infof("Creating new ResourceClaimTemplateManager for driver %s/%s", config.driverNamespace, config.driverName)
 	m := &BaseResourceClaimTemplateManager{
 		config:           config,
 		getComputeDomain: getComputeDomain,
@@ -167,6 +168,7 @@ func (m *BaseResourceClaimTemplateManager) Create(ctx context.Context, templateP
 		return nil, fmt.Errorf("error creating ResourceClaimTemplate: %w", err)
 	}
 
+	klog.V(2).Infof("Successfully created ResourceClaimTemplate %s/%s for driver %s/%s", rct.Namespace, rct.Name, m.config.driverNamespace, m.config.driverName)
 	return rct, nil
 }
 
@@ -179,12 +181,14 @@ func (m *BaseResourceClaimTemplateManager) Delete(ctx context.Context, cdUID str
 		return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
 	}
 	if len(rcts) == 0 {
+		klog.V(2).Infof("No ResourceClaimTemplate found for ComputeDomain UID %s, nothing to delete", cdUID)
 		return nil
 	}
 
 	rct := rcts[0]
 
 	if rct.GetDeletionTimestamp() != nil {
+		klog.V(2).Infof("ResourceClaimTemplate %s/%s is already marked for deletion", rct.Namespace, rct.Name)
 		return nil
 	}
 
@@ -193,6 +197,7 @@ func (m *BaseResourceClaimTemplateManager) Delete(ctx context.Context, cdUID str
 		return fmt.Errorf("erroring deleting ResourceClaimTemplate: %w", err)
 	}
 
+	klog.V(2).Infof("Successfully deleted ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rct.Namespace, rct.Name, cdUID)
 	return nil
 }
 
@@ -205,6 +210,7 @@ func (m *BaseResourceClaimTemplateManager) RemoveFinalizer(ctx context.Context,
 		return fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
 	}
 	if len(rcts) == 0 {
+		klog.V(2).Infof("No ResourceClaimTemplate found for ComputeDomain UID %s, nothing to remove finalizer from", cdUID)
 		return nil
 	}
 
@@ -222,6 +228,7 @@ func (m *BaseResourceClaimTemplateManager) RemoveFinalizer(ctx context.Context,
 		}
 	}
 	if len(rct.Finalizers) == len(newRCT.Finalizers) {
+		klog.V(2).Infof("Finalizer %s not found on ResourceClaimTemplate %s/%s", computeDomainFinalizer, rct.Namespace, rct.Name)
 		return nil
 	}
 
@@ -286,6 +293,7 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
 		return nil, fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
 	}
 	if len(rcts) == 1 {
+		klog.V(2).Infof("Found ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rcts[0].Namespace, rcts[0].Name, cd.UID)
 		return rcts[0], nil
 	}
 
@@ -310,6 +318,7 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
 		return nil, fmt.Errorf("error creating ResourceClaimTemplate from base: %w", err)
 	}
 
+	klog.V(2).Infof("Successfully created ResourceClaimTemplate from base %s/%s for ComputeDomain %s/%s", rct.Namespace, rct.Name, cd.Namespace, cd.Name)
 	return rct, nil
 }
 
@@ -346,6 +355,7 @@ func (m *WorkloadResourceClaimTemplateManager) Create(ctx context.Context, names
 		return nil, fmt.Errorf("more than one ResourceClaimTemplate found with same ComputeDomain UID")
 	}
 	if len(rcts) == 1 {
+		klog.V(2).Infof("Found ResourceClaimTemplate %s/%s for ComputeDomain UID %s", rcts[0].Namespace, rcts[0].Name, cd.UID)
 		return rcts[0], nil
 	}
 