@@ -108,6 +108,8 @@ type NicClusterPolicyReconciler struct {
108108
109109// Reconcile is part of the main kubernetes reconciliation loop which aims to
110110// move the current state of the cluster closer to the desired state.
111+ //
112+ //nolint:gocognit,funlen
111113func (r * NicClusterPolicyReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
112114 // Wait for migration flow to finish
113115 select {
@@ -126,14 +128,24 @@ func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Req
126128 // Request object not found, could have been deleted after reconcile request.
127129 // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
128130 // Return and don't requeue
129- shouldRequeue , err := r .handleMOFEDWaitLabelsNoConfig (ctx )
131+ shouldRequeue , err := r .handleWaitLabelsNoConfig (ctx , consts . OfedDriverLabel , nodeinfo . NodeLabelWaitOFED )
130132 if err != nil {
131133 reqLogger .V (consts .LogLevelError ).Error (err , "Fail to clear Mofed label on CR deletion." )
132134 return reconcile.Result {}, err
133135 }
134136 if shouldRequeue {
135137 return r .requeue ()
136138 }
139+
140+ shouldRequeue , err = r .handleWaitLabelsNoConfig (
141+ ctx , consts .NicConfigurationDaemonLabel , nodeinfo .NodeLabelWaitNicConfig )
142+ if err != nil {
143+ reqLogger .V (consts .LogLevelError ).Error (err , "Fail to clear NIC Configuration wait label on CR deletion." )
144+ return reconcile.Result {}, err
145+ }
146+ if shouldRequeue {
147+ return r .requeue ()
148+ }
137149 return reconcile.Result {}, nil
138150 }
139151 // Error reading the object - requeue the request.
@@ -175,16 +187,27 @@ func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Req
175187 } else {
176188 r .DocaDriverImagesProvider .SetImageSpec (nil )
177189 }
190+
178191 // Sync state and update status
179192 managerStatus := r .stateManager .SyncState (ctx , instance , sc )
180193 r .updateCrStatus (ctx , instance , managerStatus )
181194
182- shouldRequeue , err := r .handleMOFEDWaitLabels (ctx , instance )
195+ shouldRequeueMofed , err := r .handleMOFEDWaitLabels (ctx , instance )
183196 if err != nil {
184197 return reconcile.Result {}, err
185198 }
186199
187- if shouldRequeue || managerStatus .Status != state .SyncStateReady {
200+ shouldRequeueNicConfig := false
201+ // If NIC Configuration Operator is not configured, handle NIC Configuration wait labels
202+ if instance .Spec .NicConfigurationOperator == nil {
203+ shouldRequeueNicConfig , err = r .handleWaitLabelsNoConfig (
204+ ctx , consts .NicConfigurationDaemonLabel , nodeinfo .NodeLabelWaitNicConfig )
205+ if err != nil {
206+ return reconcile.Result {}, err
207+ }
208+ }
209+
210+ if shouldRequeueMofed || shouldRequeueNicConfig || managerStatus .Status != state .SyncStateReady {
188211 return r .requeue ()
189212 }
190213
@@ -206,7 +229,7 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels(
206229 reqLogger := log .FromContext (ctx )
207230 if cr .Spec .OFEDDriver == nil {
208231 reqLogger .V (consts .LogLevelDebug ).Info ("no OFED config in the policy, check OFED wait label on nodes" )
209- return r .handleMOFEDWaitLabelsNoConfig (ctx )
232+ return r .handleWaitLabelsNoConfig (ctx , consts . OfedDriverLabel , nodeinfo . NodeLabelWaitOFED )
210233 }
211234 pods := & corev1.PodList {}
212235 _ = r .Client .List (ctx , pods , client.MatchingLabels {"nvidia.com/ofed-driver" : "" })
@@ -224,31 +247,32 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels(
224247 "node" , pod .Spec .NodeName )
225248 labelValue = "false"
226249 }
227- if err := setOFEDWaitLabel (ctx , r .Client , pod .Spec .NodeName , labelValue ); err != nil {
250+ if err := setNodeLabel (ctx , r .Client , pod .Spec .NodeName , nodeinfo . NodeLabelWaitOFED , labelValue ); err != nil {
228251 return false , err
229252 }
230253 }
231254 return false , nil
232255}
233256
234- // handleMOFEDWaitLabelsNoConfig handles MOFED wait label for scenarios when OFED is
235- // not configured in NicClusterPolicy, do the following:
236- // - set "network.nvidia.com/operator.mofed.wait" to false on Nodes with NVIDIA NICs
237- // - remove "network.nvidia.com/operator.mofed.wait" which have no NVIDIA NICs anymore
238- // - set "network.nvidia.com/operator.mofed.wait" to true if detects OFED Pod
239- // on the node (probably in the terminating state).
257+ // handleWaitLabelsNoConfig handles wait labels for for scenarios when given component is
258+ // not configured in NicClusterPolicy and does the following:
259+ // - sets given label to false on Nodes with NVIDIA NICs
260+ // - removes given label from nodes which have no NVIDIA NICs anymore
261+ // - sets given label to true if detects a pod with a specific label on the node (probably in the terminating state).
240262// returns true if requeue (resync) is required
241- func (r * NicClusterPolicyReconciler ) handleMOFEDWaitLabelsNoConfig (ctx context.Context ) (bool , error ) {
263+ //
264+ //nolint:lll
265+ func (r * NicClusterPolicyReconciler ) handleWaitLabelsNoConfig (ctx context.Context , podLabel , waitLabel string ) (bool , error ) {
242266 reqLogger := log .FromContext (ctx )
243- nodesWithOFEDContainer := map [string ]struct {}{}
267+ nodesWithPod := map [string ]struct {}{}
244268 pods := & corev1.PodList {}
245- if err := r .Client .List (ctx , pods , client.MatchingLabels {consts . OfedDriverLabel : "" }); err != nil {
269+ if err := r .Client .List (ctx , pods , client.MatchingLabels {podLabel : "" }); err != nil {
246270 return false , errors .Wrap (err , "failed to list pods" )
247271 }
248272 for i := range pods .Items {
249273 pod := pods .Items [i ]
250274 if pod .Spec .NodeName != "" {
251- nodesWithOFEDContainer [pod .Spec .NodeName ] = struct {}{}
275+ nodesWithPod [pod .Spec .NodeName ] = struct {}{}
252276 }
253277 }
254278 nodes := & corev1.NodeList {}
@@ -258,40 +282,40 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabelsNoConfig(ctx context.C
258282 for i := range nodes .Items {
259283 node := nodes .Items [i ]
260284 labelValue := ""
261- if _ , hasOFED := nodesWithOFEDContainer [node .Name ]; hasOFED {
285+ if _ , hasPod := nodesWithPod [node .Name ]; hasPod {
262286 labelValue = "true"
263287 } else if node .GetLabels ()[nodeinfo .NodeLabelMlnxNIC ] == "true" {
264288 labelValue = "false"
265289 }
266- if err := setOFEDWaitLabel (ctx , r .Client , node .Name , labelValue ); err != nil {
290+ if err := setNodeLabel (ctx , r .Client , node .Name , waitLabel , labelValue ); err != nil {
267291 return false , err
268292 }
269293 }
270- if len (nodesWithOFEDContainer ) > 0 {
271- // There is no OFED spec in the NicClusterPolicy, but some OFED Pods are on nodes.
294+ if len (nodesWithPod ) > 0 {
295+ // There is no given component spec in the NicClusterPolicy, but some pods are on nodes.
272296 // These Pods should be eventually removed from the cluster,
273- // and we will need to update the OFED wait label for nodes.
297+ // and we will need to update the given wait label for nodes.
274298 // Here, we trigger resync explicitly to ensure that we will always handle the removal of the Pod.
275- // This explicit resync is required because we don't watch for Pods and can't rely on the OFED DaemonSet
276- // update in this case (cache with Pods can be outdated when we handle OFED DaemonSet removal event).
299+ // This explicit resync is required because we don't watch for Pods and can't rely on the DaemonSet
300+ // update in this case (cache with Pods can be outdated when we handle DaemonSet removal event).
277301 reqLogger .V (consts .LogLevelDebug ).Info (
278- "no OFED spec in NicClusterPolicy but there are OFED Pods on nodes, requeue" )
302+ "no given component spec in NicClusterPolicy but there are pods on nodes, requeue" , "component" , podLabel )
279303 return true , nil
280304 }
281305 return false , nil
282306}
283307
284- // set the value for the OFED wait label, remove the label if the value is ""
285- func setOFEDWaitLabel (ctx context.Context , c client.Client , node , value string ) error {
308+ // setNodeLabel sets the value for the given label, remove the label if the value is ""
309+ func setNodeLabel (ctx context.Context , c client.Client , node , label , value string ) error {
286310 reqLogger := log .FromContext (ctx )
287311 var patch []byte
288312 if value == "" {
289- patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: null}}}` , nodeinfo . NodeLabelWaitOFED ))
290- reqLogger .V (consts .LogLevelDebug ).Info ("remove OFED wait label from the node" , "node" , node )
313+ patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: null}}}` , label ))
314+ reqLogger .V (consts .LogLevelDebug ).Info ("remove given label from the node" , "node" , node , "label" , label )
291315 } else {
292- patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: %q}}}` , nodeinfo . NodeLabelWaitOFED , value ))
293- reqLogger .V (consts .LogLevelDebug ).Info ("update OFED wait label for the node" ,
294- "node" , node , "value" , value )
316+ patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: %q}}}` , label , value ))
317+ reqLogger .V (consts .LogLevelDebug ).Info ("update given label for the node" ,
318+ "node" , node , "label" , label , " value" , value )
295319 }
296320
297321 err := c .Patch (ctx , & corev1.Node {
@@ -301,8 +325,7 @@ func setOFEDWaitLabel(ctx context.Context, c client.Client, node, value string)
301325 }, client .RawPatch (types .StrategicMergePatchType , patch ))
302326
303327 if err != nil {
304- return errors .Wrapf (err , "unable to patch %s label for node %s" , nodeinfo .NodeLabelWaitOFED ,
305- node )
328+ return errors .Wrapf (err , "unable to patch %s label for node %s" , label , node )
306329 }
307330 return nil
308331}
0 commit comments