Skip to content

Commit 979bd6d

Browse files
authored
feat: adding external-drainer nodestate annotation for drain-controller (Mellanox#1774)
2 parents 3178ca1 + c727f6c commit 979bd6d

File tree

7 files changed

+113
-69
lines changed

7 files changed

+113
-69
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ Setting above environment variables can be used in values.yaml, in case network-
259259
# drain controller requestor ID to be used in nodeMaintenance objects
260260
drainControllerRequestorID: "nvidia.network-operator-drain-controller"
261261
```
262+
> __Note__: in order to use network-operator drain-controller, make sure that `sriovnetwork.openshift.io/use-external-drainer=true`
263+
> exists under nodestate annotations. Is should be added by SRIOV operator
262264

263265
Using requestor mode supports a `shared-requestor` flow where multiple operators can coordinate node maintenance operations:
264266
Assumptions:

controllers/drain_controller.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,14 @@ func (r *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
124124
return ctrl.Result{}, nil
125125
}
126126

127+
// check if nodestate has annotation
128+
// TODO: Replace "sriovnetwork.openshift.io/use-external-drainer" with SRIOV constants.NodeExternalDrainerAnnotation
129+
if !utils.ObjectHasAnnotation(nodeNetworkState, "sriovnetwork.openshift.io/use-external-drainer", "true") {
130+
reqLogger.Info(`nodestate is not set with external drainer annotation, don't requeue the request. Make sure that
131+
SRIOV operator is configured to use external drainer`, "nodeState", req.Name)
132+
return ctrl.Result{}, nil
133+
}
134+
127135
// create the drain state annotation if it doesn't exist in the sriovNetworkNodeState object
128136
nodeStateDrainAnnotationCurrent, currentNodeStateExist,
129137
err := r.ensureAnnotationExists(ctx, nodeNetworkState, constants.NodeStateDrainAnnotationCurrent)
@@ -344,7 +352,7 @@ func (d DrainAnnotationPredicate) Update(e event.UpdateEvent) bool {
344352
oldAnno, hasOldAnno := e.ObjectOld.GetAnnotations()[constants.NodeDrainAnnotation]
345353
newAnno, hasNewAnno := e.ObjectNew.GetAnnotations()[constants.NodeDrainAnnotation]
346354

347-
d.log.V(consts.LogLevelDebug).Info("Update", "oldAnno", oldAnno, "newAnno", newAnno)
355+
d.log.V(consts.LogLevelDebug).Info("Update node state drain annotation", "oldAnno", oldAnno, "newAnno", newAnno)
348356
if !hasOldAnno && hasNewAnno {
349357
return true
350358
}
@@ -381,6 +389,16 @@ func (d DrainStateAnnotationPredicate) Update(e event.UpdateEvent) bool {
381389
return true
382390
}
383391

392+
// Check if the external drainer annotation has been added
393+
// TODO: Replace "sriovnetwork.openshift.io/use-external-drainer" with SRIOV constants.NodeExternalDrainerAnnotation
394+
_, hasOldAnno = e.ObjectOld.GetAnnotations()["sriovnetwork.openshift.io/use-external-drainer"]
395+
_, hasNewAnno = e.ObjectNew.GetAnnotations()["sriovnetwork.openshift.io/use-external-drainer"]
396+
if !hasOldAnno && hasNewAnno {
397+
d.log.V(consts.LogLevelDebug).Info("Update nodestate external drainer annotation", "oldAnno",
398+
oldAnno, "newAnno", newAnno)
399+
return true
400+
}
401+
384402
return oldAnno != newAnno
385403
}
386404

controllers/drain_controller_test.go

Lines changed: 48 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controllers //nolint:dupl
1818

1919
import (
2020
"context"
21+
"fmt"
2122
"sync/atomic"
2223
"time"
2324

@@ -38,13 +39,16 @@ import (
3839
sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
3940
constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
4041
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/utils"
42+
43+
"github.com/Mellanox/network-operator/pkg/drain"
4144
)
4245

4346
var (
44-
testRequestorID = "secondary.requestor.com"
45-
drainRequestorID = "primary-drain.requestor.com"
46-
drainRequestorNS = "default"
47-
maxParallelOperations = atomic.Int32{}
47+
testRequestorID = "secondary.requestor.com"
48+
drainRequestorID = "primary-drain.requestor.com"
49+
drainRequestorNS = "default"
50+
testNodeMaintenancePrefix = drain.DefaultNodeMaintenanceNamePrefix
51+
maxParallelOperations = atomic.Int32{}
4852
)
4953

5054
var _ = Describe("Drain Controller", Ordered, func() {
@@ -97,9 +101,19 @@ var _ = Describe("Drain Controller", Ordered, func() {
97101
})
98102

99103
Context("when there is only one node", func() {
104+
It("should not drain node on drain require while use-external-drainer annotation is not set",
105+
func(ctx context.Context) {
106+
node, nodeState := createNode(ctx, "node1", false)
107+
108+
simulateDaemonSetAnnotation(node, constants.DrainRequired)
109+
110+
expectNodeStateAnnotation(nodeState, constants.DrainIdle)
111+
expectNodeIsSchedulable(node)
112+
113+
})
100114

101115
It("should drain single node on drain require", func(ctx context.Context) {
102-
node, nodeState := createNode(ctx, "node1")
116+
node, nodeState := createNode(ctx, "node1", true)
103117

104118
simulateDaemonSetAnnotation(node, constants.DrainRequired)
105119

@@ -114,7 +128,7 @@ var _ = Describe("Drain Controller", Ordered, func() {
114128
})
115129

116130
It("should not drain on reboot for single node", func(ctx context.Context) {
117-
node, nodeState := createNode(ctx, "node1")
131+
node, nodeState := createNode(ctx, "node1", true)
118132

119133
simulateDaemonSetAnnotation(node, constants.RebootRequired)
120134

@@ -127,8 +141,8 @@ var _ = Describe("Drain Controller", Ordered, func() {
127141
})
128142

129143
It("should drain on reboot for multiple node", func(ctx context.Context) {
130-
node, nodeState := createNode(ctx, "node1")
131-
createNode(ctx, "node2")
144+
node, nodeState := createNode(ctx, "node1", true)
145+
createNode(ctx, "node2", true)
132146

133147
simulateDaemonSetAnnotation(node, constants.RebootRequired)
134148

@@ -141,25 +155,26 @@ var _ = Describe("Drain Controller", Ordered, func() {
141155
})
142156

143157
It("should drain single node on drain require, with additional requestor", func(ctx context.Context) {
144-
node, nodeState := createNode(ctx, "node1")
145-
_ = newNodeMaintenance(ctx, k8sClient, "node1", drainRequestorNS)
158+
node, nodeState := createNode(ctx, "node1", true)
159+
nmName := fmt.Sprintf("%s-%s", testNodeMaintenancePrefix, node.Name)
160+
_ = newNodeMaintenance(ctx, k8sClient, node.Name, drainRequestorNS)
146161

147162
simulateDaemonSetAnnotation(node, constants.DrainRequired)
148163
expectNodeStateAnnotation(nodeState, constants.DrainComplete)
149164
expectNodeIsNotSchedulable(node)
150165

151166
nm := maintenancev1alpha1.NodeMaintenance{}
152-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: node.Name, Namespace: drainRequestorNS},
167+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nmName, Namespace: drainRequestorNS},
153168
&nm)).ToNot(HaveOccurred())
154-
expectNodeMaintenanceUpdate(node.Name, []string{drainRequestorID})
169+
expectNodeMaintenanceUpdate(nmName, []string{drainRequestorID})
155170

156171
expectNodeStateAnnotation(nodeState, constants.DrainComplete)
157172
simulateDaemonSetAnnotation(node, constants.DrainIdle)
158-
expectNodeMaintenanceUpdate(node.Name, nil)
173+
expectNodeMaintenanceUpdate(nmName, nil)
159174
// expect node to be unschedulable
160175
expectNodeIsNotSchedulable(node)
161176

162-
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: node.Name, Namespace: drainRequestorNS},
177+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: nmName, Namespace: drainRequestorNS},
163178
&nm)).ToNot(HaveOccurred())
164179
Eventually(k8sClient.Delete(ctx, &nm)).ToNot(HaveOccurred())
165180
expectNodeStateAnnotation(nodeState, constants.DrainIdle)
@@ -170,9 +185,9 @@ var _ = Describe("Drain Controller", Ordered, func() {
170185
Context("when there are multiple nodes", func() {
171186

172187
It("should drain nodes serially", func(ctx context.Context) {
173-
node1, nodeState1 := createNode(ctx, "node1")
174-
node2, nodeState2 := createNode(ctx, "node2")
175-
node3, nodeState3 := createNode(ctx, "node3")
188+
node1, nodeState1 := createNode(ctx, "node1", true)
189+
node2, nodeState2 := createNode(ctx, "node2", true)
190+
node3, nodeState3 := createNode(ctx, "node3", true)
176191

177192
// Two nodes require to drain at the same time
178193
maxParallelOperations.Store(1)
@@ -211,9 +226,9 @@ var _ = Describe("Drain Controller", Ordered, func() {
211226
})
212227

213228
It("should drain nodes in parallel", func(ctx context.Context) {
214-
node1, nodeState1 := createNode(ctx, "node1")
215-
node2, nodeState2 := createNode(ctx, "node2")
216-
node3, nodeState3 := createNode(ctx, "node3")
229+
node1, nodeState1 := createNode(ctx, "node1", true)
230+
node2, nodeState2 := createNode(ctx, "node2", true)
231+
node3, nodeState3 := createNode(ctx, "node3", true)
217232

218233
// two nodes require to drain at the same time
219234
maxParallelOperations.Store(2)
@@ -309,7 +324,8 @@ func simulateDaemonSetAnnotation(node *corev1.Node, drainAnnotationValue string)
309324
ToNot(HaveOccurred())
310325
}
311326

312-
func createNode(ctx context.Context, nodeName string) (*corev1.Node, *sriovnetworkv1.SriovNetworkNodeState) {
327+
func createNode(ctx context.Context, nodeName string, useExternalDrainer bool) (*corev1.Node,
328+
*sriovnetworkv1.SriovNetworkNodeState) {
313329
node := corev1.Node{
314330
ObjectMeta: metav1.ObjectMeta{
315331
Name: nodeName,
@@ -327,11 +343,15 @@ func createNode(ctx context.Context, nodeName string) (*corev1.Node, *sriovnetwo
327343
ObjectMeta: metav1.ObjectMeta{
328344
Name: nodeName,
329345
Namespace: namespaceName,
330-
Labels: map[string]string{
346+
Annotations: map[string]string{
331347
constants.NodeStateDrainAnnotationCurrent: constants.DrainIdle,
332348
},
333349
},
334350
}
351+
if useExternalDrainer {
352+
// TODO: change annotation name to constants.NodeExternalDrainerAnnotation once SRIOV PR is merged
353+
nodeState.Annotations["sriovnetwork.openshift.io/use-external-drainer"] = "true"
354+
}
335355

336356
Expect(k8sClient.Create(ctx, &node)).ToNot(HaveOccurred())
337357
Expect(k8sClient.Create(ctx, &nodeState)).ToNot(HaveOccurred())
@@ -403,7 +423,7 @@ func processItems(ctx context.Context, c client.Client, maxParallel *atomic.Int3
403423

404424
// Handle deletion case
405425
if !nm.DeletionTimestamp.IsZero() {
406-
By("maintenance operator: remove finalizer")
426+
By("maintenance operator: remove node-maintenance finalizer")
407427
if controllerutil.ContainsFinalizer(&nm, maintenancev1alpha1.MaintenanceFinalizerName) {
408428
original := nm.DeepCopy()
409429
nm.SetFinalizers([]string{})
@@ -414,22 +434,23 @@ func processItems(ctx context.Context, c client.Client, maxParallel *atomic.Int3
414434
// uncordon node
415435
By("maintenance operator: uncordon node")
416436
node := &corev1.Node{}
417-
Expect(c.Get(ctx, types.NamespacedName{Name: nm.Name}, node)).ToNot(HaveOccurred())
437+
Expect(c.Get(ctx, types.NamespacedName{Name: nm.Spec.NodeName}, node)).ToNot(HaveOccurred())
418438
node.Spec.Unschedulable = false
419439
Expect(c.Update(ctx, node)).To(Succeed())
420440
continue
421441
}
422442

423443
// Handle creation/update case
424444
if !controllerutil.ContainsFinalizer(&nm, maintenancev1alpha1.MaintenanceFinalizerName) {
425-
By("maintenance operator: add finalizer")
445+
By("maintenance operator: add node-maintenance finalizer")
426446
nm.Finalizers = append(nm.Finalizers, maintenancev1alpha1.MaintenanceFinalizerName)
427447
Expect(c.Update(ctx, &nm)).To(Succeed())
428448
continue
429449
}
430450

431451
// Update status conditions
432452
if nm.Status.Conditions == nil {
453+
By("maintenance operator: add node-maintenance conditions")
433454
Expect(nm.Finalizers).To(ContainElement(maintenancev1alpha1.MaintenanceFinalizerName))
434455
// Update status conditions
435456
meta.SetStatusCondition(&nm.Status.Conditions, desiredCondition)
@@ -439,7 +460,7 @@ func processItems(ctx context.Context, c client.Client, maxParallel *atomic.Int3
439460
// cordon node
440461
By("maintenance operator: cordon node")
441462
node := &corev1.Node{}
442-
Expect(c.Get(ctx, types.NamespacedName{Name: nm.Name}, node)).ToNot(HaveOccurred())
463+
Expect(c.Get(ctx, types.NamespacedName{Name: nm.Spec.NodeName}, node)).ToNot(HaveOccurred())
443464
node.Spec.Unschedulable = true
444465
Expect(c.Update(ctx, node)).To(Succeed())
445466
}
@@ -450,7 +471,7 @@ func newNodeMaintenance(ctx context.Context, c client.Client,
450471
name, namespace string) *maintenancev1alpha1.NodeMaintenance {
451472
nm := &maintenancev1alpha1.NodeMaintenance{
452473
ObjectMeta: metav1.ObjectMeta{
453-
Name: name,
474+
Name: fmt.Sprintf("%s-%s", testNodeMaintenancePrefix, name),
454475
Namespace: namespace,
455476
},
456477
Spec: maintenancev1alpha1.NodeMaintenanceSpec{

controllers/suite_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ var _ = BeforeSuite(func() {
130130

131131
Expect(os.Setenv("DRAIN_CONTROLLER_REQUESTOR_NAMESPACE", drainRequestorNS)).NotTo(HaveOccurred())
132132
Expect(os.Setenv("DRAIN_CONTROLLER_REQUESTOR_ID", drainRequestorID)).NotTo(HaveOccurred())
133+
Expect(os.Setenv("DRAIN_CONTROLLER_NODE_MAINTENANCE_PREFIX", testNodeMaintenancePrefix)).NotTo(HaveOccurred())
133134
Expect(os.Setenv("DRAIN_CONTROLLER_SRIOV_NODE_STATE_NAMESPACE", namespaceName)).NotTo(HaveOccurred())
134135

135136
By("bootstrapping test environment")

deployment/network-operator/charts/sriov-network-operator/templates/operator.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ spec:
8686
value: {{ .Values.operator.metricsExporter.certificates.secretName }}
8787
- name: METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
8888
value: {{ .Values.images.metricsExporterKubeRbacProxy }}
89-
- name: USE_MAINTENANCE_OPERATOR_DRAINER
90-
value: {{ .Values.operator.maintenanceOperatorDrainer.enabled | quote }}
89+
- name: USE_EXTERNAL_DRAINER
90+
value: {{ .Values.operator.externalDrainer.enabled | quote }}
9191
{{- if .Values.operator.metricsExporter.prometheusOperator.enabled }}
9292
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
9393
value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}}

deployment/network-operator/charts/sriov-network-operator/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ operator:
4040
serviceAccount: "prometheus-k8s"
4141
namespace: "monitoring"
4242
deployRules: false
43-
# use external drain controller, utilizing NVIDIA maintenance operator
44-
maintenanceOperatorDrainer:
43+
# use external drain controller
44+
externalDrainer:
4545
enabled: false
4646
admissionControllers:
4747
enabled: false

0 commit comments

Comments
 (0)