Skip to content

Commit cc65465

Browse files
committed
Add more logs to computedomain manager
Signed-off-by: Swati Gupta <[email protected]>
1 parent 085c179 commit cc65465

File tree

1 file changed

+37
-2
lines changed

1 file changed

+37
-2
lines changed

cmd/compute-domain-controller/computedomain.go

Lines changed: 37 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,11 +64,13 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
6464
factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod)
6565
informer := factory.Resource().V1beta1().ComputeDomains().Informer()
6666

67+
klog.Infof("Creating new ComputeDomainManager with config %+v", config)
6768
m := &ComputeDomainManager{
6869
config: config,
6970
factory: factory,
7071
informer: informer,
7172
}
73+
// TODO(swati): add verbose-mode logs for the DaemonSet and ResourceClaimTemplate managers
7274
m.daemonSetManager = NewDaemonSetManager(config, m.Get)
7375
m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get)
7476

@@ -97,6 +99,8 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
9799

98100
_, err = m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
99101
AddFunc: func(obj any) {
102+
cd := obj.(*nvapi.ComputeDomain)
103+
klog.Infof("ComputeDomain %s/%s added to work queue", cd.Namespace, cd.Name)
100104
m.config.workQueue.Enqueue(obj, m.onAddOrUpdate)
101105
},
102106
UpdateFunc: func(oldObj, newObj any) {
@@ -147,6 +151,7 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
147151
return nil, fmt.Errorf("error retrieving ComputeDomain by UID: %w", err)
148152
}
149153
if len(cds) == 0 {
154+
klog.Infof("No ComputeDomain found with UID: %s", uid)
150155
return nil, nil
151156
}
152157
if len(cds) != 1 {
@@ -166,11 +171,12 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
166171
return fmt.Errorf("error retrieving ComputeDomain: %w", err)
167172
}
168173
if cd == nil {
174+
klog.Infof("ComputeDomain with UID %s not found, nothing to do", uid)
169175
return nil
170176
}
171177

172178
if cd.GetDeletionTimestamp() == nil {
173-
return fmt.Errorf("attempting to remove finalizer before ComputeDomain marked for deletion")
179+
return fmt.Errorf("attempting to remove finalizer before ComputeDomain %s/%s with UID %s marked for deletion", cd.Namespace, cd.Name, uid)
174180
}
175181

176182
newCD := cd.DeepCopy()
@@ -181,13 +187,13 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
181187
}
182188
}
183189
if len(cd.Finalizers) == len(newCD.Finalizers) {
190+
klog.Infof("Finalizer not found on ComputeDomain %s/%s, nothing to do", cd.Namespace, cd.Name)
184191
return nil
185192
}
186193

187194
if _, err = m.config.clientsets.Nvidia.ResourceV1beta1().ComputeDomains(cd.Namespace).Update(ctx, newCD, metav1.UpdateOptions{}); err != nil {
188195
return fmt.Errorf("error updating ComputeDomain: %w", err)
189196
}
190-
191197
return nil
192198
}
193199

@@ -215,9 +221,38 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU
215221
}
216222

217223
if len(nodes.Items) != 0 {
224+
// Collect the names of the nodes that still carry this ComputeDomain's label so they can be reported in the log
225+
nodeNames := []string{}
226+
for _, node := range nodes.Items {
227+
nodeNames = append(nodeNames, node.Name)
228+
}
229+
klog.Errorf("Found %d nodes with label for ComputeDomain with UID %s: %v", len(nodes.Items), cdUID, nodeNames)
218230
return fmt.Errorf("nodes exist with label for ComputeDomain %s", cdUID)
219231
}
220232

233+
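// Check that all ResourceClaims for workloads are gone. NOTE(review): m.Get returns (nil, nil) when no ComputeDomain matches the UID, so cd should be nil-checked before cd.Namespace is used below.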
234+
cd, err := m.Get(cdUID)
235+
if err != nil {
236+
return fmt.Errorf("error retrieving ComputeDomain: %w", err)
237+
}
238+
239+
resourceClaims, err := m.config.clientsets.Core.ResourceV1beta1().ResourceClaims(cd.Namespace).List(ctx, metav1.ListOptions{
240+
LabelSelector: metav1.FormatLabelSelector(labelSelector),
241+
})
242+
if err != nil {
243+
return fmt.Errorf("error retrieving ResourceClaims: %w", err)
244+
}
245+
246+
if len(resourceClaims.Items) != 0 {
247+
claimNames := []string{}
248+
for _, claim := range resourceClaims.Items {
249+
claimNames = append(claimNames, claim.Name)
250+
}
251+
klog.Errorf("Found %d ResourceClaims for ComputeDomain with UID %s: %v",
252+
len(resourceClaims.Items), cdUID, claimNames)
253+
return fmt.Errorf("ResourceClaims exist for ComputeDomain %s", cdUID)
254+
}
255+
221256
return nil
222257
}
223258

0 commit comments

Comments
 (0)