@@ -64,11 +64,13 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
6464 factory := nvinformers .NewSharedInformerFactory (config .clientsets .Nvidia , informerResyncPeriod )
6565 informer := factory .Resource ().V1beta1 ().ComputeDomains ().Informer ()
6666
67+ klog .Infof ("Creating new ComputeDomainManager with config %+v" , config )
6768 m := & ComputeDomainManager {
6869 config : config ,
6970 factory : factory ,
7071 informer : informer ,
7172 }
73+ // TODO (swati) add logs for daemonset and resourceClaimTemplate managers in verbose mode
7274 m .daemonSetManager = NewDaemonSetManager (config , m .Get )
7375 m .resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager (config , m .Get )
7476
@@ -97,6 +99,8 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
9799
98100 _ , err = m .informer .AddEventHandler (cache.ResourceEventHandlerFuncs {
99101 AddFunc : func (obj any ) {
102+ cd := obj .(* nvapi.ComputeDomain )
103+ klog .Infof ("ComputeDomain %s/%s added to work queue" , cd .Namespace , cd .Name )
100104 m .config .workQueue .Enqueue (obj , m .onAddOrUpdate )
101105 },
102106 UpdateFunc : func (oldObj , newObj any ) {
@@ -147,6 +151,7 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
147151 return nil , fmt .Errorf ("error retrieving ComputeDomain by UID: %w" , err )
148152 }
149153 if len (cds ) == 0 {
154+ klog .Infof ("No ComputeDomain found with UID: %s" , uid )
150155 return nil , nil
151156 }
152157 if len (cds ) != 1 {
@@ -166,11 +171,12 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
166171 return fmt .Errorf ("error retrieving ComputeDomain: %w" , err )
167172 }
168173 if cd == nil {
174+ klog .Infof ("ComputeDomain with UID %s not found, nothing to do" , uid )
169175 return nil
170176 }
171177
172178 if cd .GetDeletionTimestamp () == nil {
173- return fmt .Errorf ("attempting to remove finalizer before ComputeDomain marked for deletion" )
179+ return fmt .Errorf ("attempting to remove finalizer before ComputeDomain %s/%s with UID %s marked for deletion" , cd . Namespace , cd . Name , uid )
174180 }
175181
176182 newCD := cd .DeepCopy ()
@@ -181,13 +187,13 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
181187 }
182188 }
183189 if len (cd .Finalizers ) == len (newCD .Finalizers ) {
190+ klog .Infof ("Finalizer not found on ComputeDomain %s/%s, nothing to do" , cd .Namespace , cd .Name )
184191 return nil
185192 }
186193
187194 if _ , err = m .config .clientsets .Nvidia .ResourceV1beta1 ().ComputeDomains (cd .Namespace ).Update (ctx , newCD , metav1.UpdateOptions {}); err != nil {
188195 return fmt .Errorf ("error updating ComputeDomain: %w" , err )
189196 }
190-
191197 return nil
192198}
193199
@@ -215,9 +221,38 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU
215221 }
216222
217223 if len (nodes .Items ) != 0 {
224+ // collect the names of the nodes that still carry the ComputeDomain label so they can be logged
225+ nodeNames := []string {}
226+ for _ , node := range nodes .Items {
227+ nodeNames = append (nodeNames , node .Name )
228+ }
229+ klog .Errorf ("Found %d nodes with label for ComputeDomain with UID %s: %v" , len (nodes .Items ), cdUID , nodeNames )
218230 return fmt .Errorf ("nodes exist with label for ComputeDomain %s" , cdUID )
219231 }
220232
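233+ // check that all ResourceClaims for workloads are gone; NOTE(review): m.Get returns (nil, nil) when no ComputeDomain matches the UID, so cd needs a nil check before cd.Namespace is dereferenced below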
234+ cd , err := m .Get (cdUID )
235+ if err != nil {
236+ return fmt .Errorf ("error retrieving ComputeDomain: %w" , err )
237+ }
238+
239+ resourceClaims , err := m .config .clientsets .Core .ResourceV1beta1 ().ResourceClaims (cd .Namespace ).List (ctx , metav1.ListOptions {
240+ LabelSelector : metav1 .FormatLabelSelector (labelSelector ),
241+ })
242+ if err != nil {
243+ return fmt .Errorf ("error retrieving ResourceClaims: %w" , err )
244+ }
245+
246+ if len (resourceClaims .Items ) != 0 {
247+ claimNames := []string {}
248+ for _ , claim := range resourceClaims .Items {
249+ claimNames = append (claimNames , claim .Name )
250+ }
251+ klog .Errorf ("Found %d ResourceClaims for ComputeDomain with UID %s: %v" ,
252+ len (resourceClaims .Items ), cdUID , claimNames )
253+ return fmt .Errorf ("ResourceClaims exist for ComputeDomain %s" , cdUID )
254+ }
255+
221256 return nil
222257}
223258
0 commit comments