@@ -126,14 +126,22 @@ func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Req
126126 // Request object not found, could have been deleted after reconcile request.
127127 // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
128128 // Return and don't requeue
129- shouldRequeue , err := r .handleMOFEDWaitLabelsNoConfig (ctx )
129+ shouldRequeue , err := r .handleWaitLabelsNoConfig (ctx , consts . OfedDriverLabel , nodeinfo . NodeLabelWaitOFED )
130130 if err != nil {
131131 reqLogger .V (consts .LogLevelError ).Error (err , "Fail to clear Mofed label on CR deletion." )
132132 return reconcile.Result {}, err
133133 }
134134 if shouldRequeue {
135135 return r .requeue ()
136136 }
137+
138+ shouldRequeue , err = r .handleWaitLabelsNoConfig (ctx , consts .NicConfigurationDaemonLabel , nodeinfo .NodeLabelWaitNicConfig )
139+ if err != nil {
140+ return reconcile.Result {}, err
141+ }
142+ if shouldRequeue {
143+ return r .requeue ()
144+ }
137145 return reconcile.Result {}, nil
138146 }
139147 // Error reading the object - requeue the request.
@@ -175,6 +183,18 @@ func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Req
175183 } else {
176184 r .DocaDriverImagesProvider .SetImageSpec (nil )
177185 }
186+
187+ // If NIC Configuration Operator is not configured, handle NIC Configuration wait labels
188+ if instance .Spec .NicConfigurationOperator == nil {
189+ shouldRequeue , err := r .handleWaitLabelsNoConfig (ctx , consts .NicConfigurationDaemonLabel , nodeinfo .NodeLabelWaitNicConfig )
190+ if err != nil {
191+ return reconcile.Result {}, err
192+ }
193+ if shouldRequeue {
194+ return r .requeue ()
195+ }
196+ }
197+
178198 // Sync state and update status
179199 managerStatus := r .stateManager .SyncState (ctx , instance , sc )
180200 r .updateCrStatus (ctx , instance , managerStatus )
@@ -206,7 +226,7 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels(
206226 reqLogger := log .FromContext (ctx )
207227 if cr .Spec .OFEDDriver == nil {
208228 reqLogger .V (consts .LogLevelDebug ).Info ("no OFED config in the policy, check OFED wait label on nodes" )
209- return r .handleMOFEDWaitLabelsNoConfig (ctx )
229+ return r .handleWaitLabelsNoConfig (ctx , consts . OfedDriverLabel , nodeinfo . NodeLabelWaitOFED )
210230 }
211231 pods := & corev1.PodList {}
212232 _ = r .Client .List (ctx , pods , client.MatchingLabels {"nvidia.com/ofed-driver" : "" })
@@ -231,24 +251,23 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels(
231251 return false , nil
232252}
233253
234- // handleMOFEDWaitLabelsNoConfig handles MOFED wait label for scenarios when OFED is
235- // not configured in NicClusterPolicy, do the following:
236- // - set "network.nvidia.com/operator.mofed.wait" to false on Nodes with NVIDIA NICs
237- // - remove "network.nvidia.com/operator.mofed.wait" which have no NVIDIA NICs anymore
238- // - set "network.nvidia.com/operator.mofed.wait" to true if detects OFED Pod
239- // on the node (probably in the terminating state).
254+ // handleWaitLabelsNoConfig handles wait labels for for scenarios when given component is
255+ // not configured in NicClusterPolicy and does the following:
256+ // - sets given label to false on Nodes with NVIDIA NICs
257+ // - removes given label from nodes which have no NVIDIA NICs anymore
258+ // - sets given label to true if detects a pod with a specific label on the node (probably in the terminating state).
240259// returns true if requeue (resync) is required
241- func (r * NicClusterPolicyReconciler ) handleMOFEDWaitLabelsNoConfig (ctx context.Context ) (bool , error ) {
260+ func (r * NicClusterPolicyReconciler ) handleWaitLabelsNoConfig (ctx context.Context , podLabel string , waitLabel string ) (bool , error ) {
242261 reqLogger := log .FromContext (ctx )
243- nodesWithOFEDContainer := map [string ]struct {}{}
262+ nodesWithPod := map [string ]struct {}{}
244263 pods := & corev1.PodList {}
245- if err := r .Client .List (ctx , pods , client.MatchingLabels {consts . OfedDriverLabel : "" }); err != nil {
264+ if err := r .Client .List (ctx , pods , client.MatchingLabels {podLabel : "" }); err != nil {
246265 return false , errors .Wrap (err , "failed to list pods" )
247266 }
248267 for i := range pods .Items {
249268 pod := pods .Items [i ]
250269 if pod .Spec .NodeName != "" {
251- nodesWithOFEDContainer [pod .Spec .NodeName ] = struct {}{}
270+ nodesWithPod [pod .Spec .NodeName ] = struct {}{}
252271 }
253272 }
254273 nodes := & corev1.NodeList {}
@@ -258,29 +277,54 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabelsNoConfig(ctx context.C
258277 for i := range nodes .Items {
259278 node := nodes .Items [i ]
260279 labelValue := ""
261- if _ , hasOFED := nodesWithOFEDContainer [node .Name ]; hasOFED {
280+ if _ , hasPod := nodesWithPod [node .Name ]; hasPod {
262281 labelValue = "true"
263282 } else if node .GetLabels ()[nodeinfo .NodeLabelMlnxNIC ] == "true" {
264283 labelValue = "false"
265284 }
266- if err := setOFEDWaitLabel (ctx , r .Client , node .Name , labelValue ); err != nil {
285+ if err := setNodeLabel (ctx , r .Client , node .Name , waitLabel , labelValue ); err != nil {
267286 return false , err
268287 }
269288 }
270- if len (nodesWithOFEDContainer ) > 0 {
271- // There is no OFED spec in the NicClusterPolicy, but some OFED Pods are on nodes.
289+ if len (nodesWithPod ) > 0 {
290+ // There is no given component spec in the NicClusterPolicy, but some pods are on nodes.
272291 // These Pods should be eventually removed from the cluster,
273- // and we will need to update the OFED wait label for nodes.
292+ // and we will need to update the given wait label for nodes.
274293 // Here, we trigger resync explicitly to ensure that we will always handle the removal of the Pod.
275- // This explicit resync is required because we don't watch for Pods and can't rely on the OFED DaemonSet
276- // update in this case (cache with Pods can be outdated when we handle OFED DaemonSet removal event).
294+ // This explicit resync is required because we don't watch for Pods and can't rely on the DaemonSet
295+ // update in this case (cache with Pods can be outdated when we handle DaemonSet removal event).
277296 reqLogger .V (consts .LogLevelDebug ).Info (
278- "no OFED spec in NicClusterPolicy but there are OFED Pods on nodes, requeue" )
297+ "no given component spec in NicClusterPolicy but there are pods on nodes, requeue" , "component" , podLabel )
279298 return true , nil
280299 }
281300 return false , nil
282301}
283302
303+ // setNodeLabel sets the value for the given label, remove the label if the value is ""
304+ func setNodeLabel (ctx context.Context , c client.Client , node , label , value string ) error {
305+ reqLogger := log .FromContext (ctx )
306+ var patch []byte
307+ if value == "" {
308+ patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: null}}}` , label ))
309+ reqLogger .V (consts .LogLevelDebug ).Info ("remove given label from the node" , "node" , node , "label" , label )
310+ } else {
311+ patch = []byte (fmt .Sprintf (`{"metadata":{"labels":{%q: %q}}}` , label , value ))
312+ reqLogger .V (consts .LogLevelDebug ).Info ("update given label for the node" ,
313+ "node" , node , "label" , label , "value" , value )
314+ }
315+
316+ err := c .Patch (ctx , & corev1.Node {
317+ ObjectMeta : metav1.ObjectMeta {
318+ Name : node ,
319+ },
320+ }, client .RawPatch (types .StrategicMergePatchType , patch ))
321+
322+ if err != nil {
323+ return errors .Wrapf (err , "unable to patch %s label for node %s" , label , node )
324+ }
325+ return nil
326+ }
327+
284328// set the value for the OFED wait label, remove the label if the value is ""
285329func setOFEDWaitLabel (ctx context.Context , c client.Client , node , value string ) error {
286330 reqLogger := log .FromContext (ctx )
0 commit comments