Skip to content

Commit d44a9b7

Browse files
Merge pull request #182 from almaslennikov/wait-label
feat: manage nvidia.com/operator.nic-configuration.wait label
2 parents 5b6aae5 + 6df9178 commit d44a9b7

File tree

8 files changed

+302
-3
lines changed

8 files changed

+302
-3
lines changed

README.md

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ NVIDIA NIC Configuration Operator uses the [Maintenance Operator](https://github
1616
### Prerequisites
1717

1818
* Kubernetes cluster
19-
* [NVIDIA Network Operator](https://github.com/Mellanox/network-operator) deployed
19+
* [NVIDIA Network Operator](https://github.com/Mellanox/network-operator) deployed. It is recommended to deploy the [DOCA-OFED driver](https://github.com/Mellanox/network-operator?tab=readme-ov-file#driver-containers)
2020
* [Maintenance Operator](https://github.com/Mellanox/maintenance-operator) deployed
2121

2222
NVIDIA NIC Configuration Operator can be deployed as part of the [NIC Cluster Policy CRD](https://github.com/Mellanox/network-operator?tab=readme-ov-file#nicclusterpolicy-spec).
@@ -65,6 +65,7 @@ spec:
6565
qos:
6666
trust: dscp
6767
pfc: "0,0,0,1,0,0,0,0"
68+
tos: 0
6869
gpuDirectOptimized:
6970
enabled: true
7071
env: Baremetal
@@ -92,9 +93,9 @@ spec:
9293
* `ROCE_CC_PRIO_MASK_P1=255`, `ROCE_CC_PRIO_MASK_P2=255`
9394
* `CNP_DSCP_P1=4`, `CNP_DSCP_P2=4`
9495
* `CNP_802P_PRIO_P1=6`, `CNP_802P_PRIO_P2=6`
95-
* Configure pfc (Priority Flow Control) for priority 3 and set trust to dscp on each PF
96+
* Configure pfc (Priority Flow Control) for priority 3, set trust to dscp on each PF, and set ToS (Type of Service) to 0
9697
* Non-persistent (need to be applied after each boot)
97-
* Users can override values via `trust` and `pfc` parameters
98+
* Users can override values via `trust`, `pfc` and `tos` parameters
9899
* Can only be enabled with `linkType=Ethernet`
99100
* `gpuDirectOptimized`: performs gpu direct optimizations. ATM only optimizations for Baremetal environment are supported. If enabled perform the following:
100101
* Set nvconfig `ATS_ENABLED=0`
@@ -227,3 +228,11 @@ status:
227228
#### Implementation details:
228229

229230
The NicDevice CRD is created and reconciled by the configuration daemon. The reconciliation logic scheme can be found [here](docs/nic-configuration-reconcile-diagram.png).
231+
232+
## Order of operations
233+
234+
To include the NIC Configuration Operator as part of network configuration workflows, a strict order of operations might need to be enforced. For example, the [SR-IOV Network Configuration Daemon](https://github.com/k8snetworkplumbingwg/sriov-network-operator) pod should start only AFTER the NIC Configuration Daemon has finished.
235+
To indicate to the pods that depend on it when NIC configuration is in progress, the operator manages the `network.nvidia.com/operator.nic-configuration.wait` node label, which has the value `false` when the requested NIC configuration has successfully been applied, and the value `true` when the NIC configuration is in progress.
236+
To use this mechanism, the next pods in the pipeline can add `network.nvidia.com/operator.nic-configuration.wait=false` to their node label selectors. That way, they will automatically be evicted from the node when the NICs are being configured.
237+
238+
The NIC Configuration Daemon itself relies on the `network.nvidia.com/operator.mofed.wait=false` label being present on the node, as it requires the DOCA-OFED driver to be running for some of the configurations.

cmd/nic-configuration-daemon/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838
"github.com/Mellanox/nic-configuration-operator/api/v1alpha1"
3939
"github.com/Mellanox/nic-configuration-operator/internal/controller"
4040
"github.com/Mellanox/nic-configuration-operator/pkg/configuration"
41+
"github.com/Mellanox/nic-configuration-operator/pkg/consts"
4142
"github.com/Mellanox/nic-configuration-operator/pkg/devicediscovery"
4243
"github.com/Mellanox/nic-configuration-operator/pkg/dms"
4344
"github.com/Mellanox/nic-configuration-operator/pkg/firmware"
@@ -169,6 +170,13 @@ func main() {
169170

170171
ctx := ctrl.SetupSignalHandler()
171172

173+
// Set the nic configuration wait label on the node to true until desired configuration is confirmed to be applied
174+
err = maintenanceManager.SetNodeWaitLabel(ctx, consts.LabelValueTrue)
175+
if err != nil {
176+
log.Log.Error(err, "failed to set the nic configuration wait label on the node to true")
177+
os.Exit(1)
178+
}
179+
172180
err = mgr.GetCache().IndexField(ctx, &v1alpha1.NicDevice{}, "status.node", func(o client.Object) []string {
173181
return []string{o.(*v1alpha1.NicDevice).Status.Node}
174182
})

deployment/nic-configuration-operator-chart/templates/config-daemon.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ spec:
1919
kubectl.kubernetes.io/default-container: nic-configuration-daemon
2020
labels:
2121
control-plane: nic-configuration-daemon
22+
nvidia.com/nic-configuration-daemon: ""
2223
{{- include "nic-configuration-operator.selectorLabels" . | nindent 8 }}
2324
spec:
2425
nodeSelector: {{- toYaml .Values.configDaemon.nodeSelector | nindent 8 }}

pkg/consts/consts.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,8 @@ const (
137137

138138
OverlayNone = "none"
139139
OverlayL3 = "l3"
140+
141+
NodeNicConfigurationWaitLabel = "network.nvidia.com/operator.nic-configuration.wait"
142+
LabelValueTrue = "true"
143+
LabelValueFalse = "false"
140144
)

pkg/maintenance/maintenancemanager.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@ package maintenance
1717

1818
import (
1919
"context"
20+
"fmt"
2021

2122
maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
23+
corev1 "k8s.io/api/core/v1"
2224
"k8s.io/apimachinery/pkg/api/meta"
2325
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/types"
2427
"k8s.io/client-go/util/workqueue"
2528
"sigs.k8s.io/controller-runtime/pkg/client"
2629
"sigs.k8s.io/controller-runtime/pkg/event"
@@ -67,6 +70,7 @@ type MaintenanceManager interface {
6770
ScheduleMaintenance(ctx context.Context) error
6871
MaintenanceAllowed(ctx context.Context) (bool, error)
6972
ReleaseMaintenance(ctx context.Context) error
73+
SetNodeWaitLabel(ctx context.Context, value string) error
7074
Reboot() error
7175
}
7276

@@ -132,6 +136,12 @@ func (m maintenanceManager) ScheduleMaintenance(ctx context.Context) error {
132136
return err
133137
}
134138

139+
err = m.SetNodeWaitLabel(ctx, consts.LabelValueTrue)
140+
if err != nil {
141+
log.Log.Error(err, "failed to set the nic configuration wait label on the node to true")
142+
return err
143+
}
144+
135145
return nil
136146
}
137147

@@ -179,6 +189,12 @@ func (m maintenanceManager) ReleaseMaintenance(ctx context.Context) error {
179189
}
180190
}
181191

192+
err = m.SetNodeWaitLabel(ctx, consts.LabelValueFalse)
193+
if err != nil {
194+
log.Log.Error(err, "failed to set the nic configuration wait label on the node to false")
195+
return err
196+
}
197+
182198
return nil
183199
}
184200

@@ -188,6 +204,27 @@ func (m maintenanceManager) Reboot() error {
188204
return m.hostUtils.ScheduleReboot()
189205
}
190206

207+
// SetNodeWaitLabel sets the network.nvidia.com/operator.nic-configuration.wait label on the node to the provided value, or removes the label when value is empty.
208+
// It sends a strategic merge patch that contains only this label, so re-applying the value the label already has leaves the node unchanged.
209+
func (m maintenanceManager) SetNodeWaitLabel(ctx context.Context, value string) error {
210+
log.Log.Info("maintenanceManager.SetNodeLabel()", "node", m.nodeName, "key", consts.NodeNicConfigurationWaitLabel, "value", value)
211+
212+
var patch []byte
213+
if value == "" {
214+
// Remove label when value is empty
215+
patch = []byte(fmt.Sprintf(`{"metadata":{"labels":{%q: null}}}`, consts.NodeNicConfigurationWaitLabel))
216+
} else {
217+
// Set/update label
218+
patch = []byte(fmt.Sprintf(`{"metadata":{"labels":{%q: %q}}}`, consts.NodeNicConfigurationWaitLabel, value))
219+
}
220+
221+
if err := m.client.Patch(ctx, &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: m.nodeName}}, client.RawPatch(types.StrategicMergePatchType, patch)); err != nil {
222+
log.Log.Error(err, "failed to patch node label", "node", m.nodeName, "key", consts.NodeNicConfigurationWaitLabel, "value", value)
223+
return err
224+
}
225+
return nil
226+
}
227+
191228
func New(client client.Client, hostUtils host.HostUtils, nodeName string, namespace string) MaintenanceManager {
192229
return maintenanceManager{client: client, hostUtils: hostUtils, nodeName: nodeName, namespace: namespace}
193230
}
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
// Copyright 2025 NVIDIA CORPORATION & AFFILIATES
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
// SPDX-License-Identifier: Apache-2.0
16+
17+
package maintenance
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
. "github.com/onsi/ginkgo/v2"
24+
. "github.com/onsi/gomega"
25+
26+
maintenanceoperator "github.com/Mellanox/maintenance-operator/api/v1alpha1"
27+
corev1 "k8s.io/api/core/v1"
28+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/runtime"
30+
"k8s.io/apimachinery/pkg/types"
31+
"sigs.k8s.io/controller-runtime/pkg/client"
32+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
33+
34+
"github.com/Mellanox/nic-configuration-operator/pkg/consts"
35+
hostmocks "github.com/Mellanox/nic-configuration-operator/pkg/host/mocks"
36+
)
37+
38+
var _ = Describe("maintenanceManager", func() {
39+
var (
40+
ctx context.Context
41+
scheme *runtime.Scheme
42+
namespace string
43+
nodeName string
44+
)
45+
46+
BeforeEach(func() {
47+
ctx = context.Background()
48+
scheme = runtime.NewScheme()
49+
Expect(corev1.AddToScheme(scheme)).To(Succeed())
50+
Expect(maintenanceoperator.AddToScheme(scheme)).To(Succeed())
51+
namespace = "test-ns"
52+
nodeName = "test-node"
53+
})
54+
55+
It("SetNodeLabel adds, updates and deletes a label via strategic merge patch", func() {
56+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
57+
cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(node).Build()
58+
m := maintenanceManager{client: cl, nodeName: nodeName}
59+
60+
// add
61+
Expect(m.SetNodeWaitLabel(ctx, "value1")).To(Succeed())
62+
updated := &corev1.Node{}
63+
Expect(m.client.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
64+
Expect(updated.Labels).To(HaveKeyWithValue(consts.NodeNicConfigurationWaitLabel, "value1"))
65+
66+
// same value (no-op server-side)
67+
Expect(m.SetNodeWaitLabel(ctx, "value1")).To(Succeed())
68+
69+
// update
70+
Expect(m.SetNodeWaitLabel(ctx, "value2")).To(Succeed())
71+
Expect(m.client.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
72+
Expect(updated.Labels).To(HaveKeyWithValue(consts.NodeNicConfigurationWaitLabel, "value2"))
73+
74+
// delete
75+
Expect(m.SetNodeWaitLabel(ctx, "")).To(Succeed())
76+
Expect(m.client.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
77+
Expect(updated.Labels).ToNot(HaveKey(consts.NodeNicConfigurationWaitLabel))
78+
})
79+
80+
It("schedules maintenance and sets the wait label; second call is idempotent", func() {
81+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
82+
cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(node).Build()
83+
m := maintenanceManager{client: cl, nodeName: nodeName, namespace: namespace}
84+
85+
// first schedule creates one object and sets wait label true
86+
Expect(m.ScheduleMaintenance(ctx)).To(Succeed())
87+
88+
nmList := &maintenanceoperator.NodeMaintenanceList{}
89+
Expect(cl.List(ctx, nmList, clientInNamespace(namespace))).To(Succeed())
90+
Expect(nmList.Items).To(HaveLen(1))
91+
Expect(nmList.Items[0].Spec.NodeName).To(Equal(nodeName))
92+
Expect(nmList.Items[0].Spec.RequestorID).To(Equal(consts.MaintenanceRequestor))
93+
94+
updated := &corev1.Node{}
95+
Expect(cl.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
96+
Expect(updated.Labels).To(HaveKeyWithValue(consts.NodeNicConfigurationWaitLabel, consts.LabelValueTrue))
97+
98+
// second schedule is a no-op and label remains true
99+
Expect(m.ScheduleMaintenance(ctx)).To(Succeed())
100+
nmList = &maintenanceoperator.NodeMaintenanceList{}
101+
Expect(cl.List(ctx, nmList, clientInNamespace(namespace))).To(Succeed())
102+
Expect(nmList.Items).To(HaveLen(1))
103+
Expect(cl.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
104+
Expect(updated.Labels).To(HaveKeyWithValue(consts.NodeNicConfigurationWaitLabel, consts.LabelValueTrue))
105+
})
106+
107+
It("reports maintenance allowed only when Ready condition is true", func() {
108+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
109+
cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(node).Build()
110+
m := maintenanceManager{client: cl, nodeName: nodeName, namespace: namespace}
111+
112+
// no object
113+
allowed, err := m.MaintenanceAllowed(ctx)
114+
Expect(err).To(BeNil())
115+
Expect(allowed).To(BeFalse())
116+
117+
// object without Ready condition
118+
nm := &maintenanceoperator.NodeMaintenance{
119+
ObjectMeta: metav1.ObjectMeta{Name: consts.MaintenanceRequestName + "-" + nodeName, Namespace: namespace},
120+
Spec: maintenanceoperator.NodeMaintenanceSpec{RequestorID: consts.MaintenanceRequestor, NodeName: nodeName},
121+
}
122+
cl = fake.NewClientBuilder().WithScheme(scheme).WithObjects(node, nm).Build()
123+
m.client = cl
124+
125+
allowed, err = m.MaintenanceAllowed(ctx)
126+
Expect(err).To(BeNil())
127+
Expect(allowed).To(BeFalse())
128+
129+
// object with Ready=false
130+
nm.Status.Conditions = []metav1.Condition{{Type: maintenanceoperator.ConditionTypeReady, Status: metav1.ConditionFalse}}
131+
cl = fake.NewClientBuilder().WithScheme(scheme).WithObjects(node, nm).Build()
132+
m.client = cl
133+
allowed, err = m.MaintenanceAllowed(ctx)
134+
Expect(err).To(BeNil())
135+
Expect(allowed).To(BeFalse())
136+
137+
// object with Ready=true
138+
nm.Status.Conditions = []metav1.Condition{{Type: maintenanceoperator.ConditionTypeReady, Status: metav1.ConditionTrue}}
139+
cl = fake.NewClientBuilder().WithScheme(scheme).WithObjects(node, nm).Build()
140+
m.client = cl
141+
allowed, err = m.MaintenanceAllowed(ctx)
142+
Expect(err).To(BeNil())
143+
Expect(allowed).To(BeTrue())
144+
})
145+
146+
It("releases maintenance and clears the wait label when present", func() {
147+
node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
148+
nm := &maintenanceoperator.NodeMaintenance{
149+
ObjectMeta: metav1.ObjectMeta{Name: consts.MaintenanceRequestName + "-" + nodeName, Namespace: namespace},
150+
Spec: maintenanceoperator.NodeMaintenanceSpec{RequestorID: consts.MaintenanceRequestor, NodeName: nodeName},
151+
}
152+
cl := fake.NewClientBuilder().WithScheme(scheme).WithObjects(node, nm).Build()
153+
m := maintenanceManager{client: cl, nodeName: nodeName, namespace: namespace}
154+
155+
// ensure label is set true first (simulate schedule)
156+
Expect(m.SetNodeWaitLabel(ctx, consts.LabelValueTrue)).To(Succeed())
157+
158+
// release maintenance should delete object and set label false
159+
Expect(m.ReleaseMaintenance(ctx)).To(Succeed())
160+
nmList := &maintenanceoperator.NodeMaintenanceList{}
161+
Expect(cl.List(ctx, nmList, clientInNamespace(namespace))).To(Succeed())
162+
Expect(nmList.Items).To(HaveLen(0))
163+
164+
updated := &corev1.Node{}
165+
Expect(cl.Get(ctx, types.NamespacedName{Name: nodeName}, updated)).To(Succeed())
166+
Expect(updated.Labels).To(HaveKeyWithValue(consts.NodeNicConfigurationWaitLabel, consts.LabelValueFalse))
167+
})
168+
169+
It("calls host utils to reboot and propagates errors", func() {
170+
mockHU := &hostmocks.HostUtils{}
171+
mockHU.On("ScheduleReboot").Return(nil).Once()
172+
m := maintenanceManager{hostUtils: mockHU}
173+
Expect(m.Reboot()).To(Succeed())
174+
mockHU.AssertExpectations(GinkgoT())
175+
176+
mockHU2 := &hostmocks.HostUtils{}
177+
rebootErr := fmt.Errorf("reboot failed")
178+
mockHU2.On("ScheduleReboot").Return(rebootErr).Once()
179+
m = maintenanceManager{hostUtils: mockHU2}
180+
Expect(m.Reboot()).To(MatchError(rebootErr))
181+
mockHU2.AssertExpectations(GinkgoT())
182+
})
183+
})
184+
185+
// helpers
186+
func clientInNamespace(ns string) clientListOptionInNamespace {
187+
return clientListOptionInNamespace{Namespace: ns}
188+
}
189+
190+
type clientListOptionInNamespace struct{ Namespace string }
191+
192+
func (o clientListOptionInNamespace) ApplyToList(opts *client.ListOptions) {
193+
opts.Namespace = o.Namespace
194+
}

pkg/maintenance/mocks/MaintenanceManager.go

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)