Skip to content

Commit 16474e5

Browse files
committed
drain: adding 'USE_EXTERNAL_DRAINER', providing an option to enable/disable the SRIOV OP drain controller, in favor of using the maintenance OP to drive node drain aspects
Signed-off-by: Ido Heyvi <[email protected]>
1 parent 4fd8a59 commit 16474e5

File tree

6 files changed

+57
-27
lines changed

6 files changed

+57
-27
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ nodes in parallel from the pool the operator can drain in parallel. maxUnavailab
464464

465465
> **NOTE**: If a node is not part of any pool it will have a default configuration of maxUnavailable 1
466466

467+
> **NOTE**: The internal drain controller can be disabled by setting the `USE_EXTERNAL_DRAINER` environment variable to `true`. Drain operations are then performed externally by the [NVIDIA maintenance OP](https://github.com/Mellanox/maintenance-operator). In addition, `SriovNetworkPoolConfig` will have no effect during the drain procedure, since the maintenance operator will be in charge of [parallel node operations](https://github.com/Mellanox/maintenance-operator/blob/main/api/v1alpha1/maintenanceoperatorconfig_types.go#L38-L46).
467468

468469
#### RDMA Mode Configuration
469470

deployment/sriov-network-operator-chart/templates/operator.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ spec:
7878
value: {{ .Values.operator.metricsExporter.certificates.secretName }}
7979
- name: METRICS_EXPORTER_KUBE_RBAC_PROXY_IMAGE
8080
value: {{ .Values.images.metricsExporterKubeRbacProxy }}
81+
- name: USE_EXTERNAL_DRAINER
82+
value: {{ .Values.operator.externalDrainer.enabled | quote }}
8183
{{- if .Values.operator.metricsExporter.prometheusOperator.enabled }}
8284
- name: METRICS_EXPORTER_PROMETHEUS_OPERATOR_ENABLED
8385
value: {{ .Values.operator.metricsExporter.prometheusOperator.enabled | quote}}

deployment/sriov-network-operator-chart/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ operator:
4040
serviceAccount: "prometheus-k8s"
4141
namespace: "monitoring"
4242
deployRules: false
43+
# use external drain controller, utilizing NVIDIA maintenance operator
44+
externalDrainer:
45+
enabled: false
4346
admissionControllers:
4447
enabled: false
4548
networkPolicy:

doc/design/parallel-node-config.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ Node annotation, `sriovnetwork.openshift.io/state` and SriovNetworkNodeState ann
6060

6161
*NOTE:* In the future we are going to drop the node annotation and only use the SriovNetworkNodeState
6262

63+
*NOTE:* The internal drain controller can be disabled by setting the `USE_EXTERNAL_DRAINER` environment variable to `true`. Drain operations are then performed externally by the [NVIDIA maintenance OP](https://github.com/Mellanox/maintenance-operator). In addition, `SriovNetworkPoolConfig` will have no effect during the drain procedure, since the maintenance operator will be in charge of parallel node operations.
64+
6365
Draining procedure:
6466

6567
1. The config daemon marks the node as `Drain_Required` or `Reboot_Required` by adding that to both the Node annotation, `sriovnetwork.openshift.io/state`

main.go

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
// to ensure that exec-entrypoint and run can make use of them.
3232
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
3333
_ "k8s.io/client-go/plugin/pkg/client/auth"
34+
"k8s.io/client-go/rest"
3435

3536
corev1 "k8s.io/api/core/v1"
3637
"k8s.io/apimachinery/pkg/runtime"
@@ -207,33 +208,8 @@ func main() {
207208
os.Exit(1)
208209
}
209210

210-
// we need a client that doesn't use the local cache for the objects
211-
drainKClient, err := client.New(restConfig, client.Options{
212-
Scheme: scheme,
213-
Cache: &client.CacheOptions{
214-
DisableFor: []client.Object{
215-
&sriovnetworkv1.SriovNetworkNodeState{},
216-
&corev1.Node{},
217-
&mcfgv1.MachineConfigPool{},
218-
},
219-
},
220-
})
221-
if err != nil {
222-
setupLog.Error(err, "unable to create drain kubernetes client")
223-
os.Exit(1)
224-
}
225-
226-
drainController, err := controllers.NewDrainReconcileController(drainKClient,
227-
mgr.GetScheme(),
228-
mgr.GetEventRecorderFor("SR-IOV operator"),
229-
platformsHelper)
230-
if err != nil {
231-
setupLog.Error(err, "unable to create controller", "controller", "DrainReconcile")
232-
os.Exit(1)
233-
}
234-
235-
if err = drainController.SetupWithManager(mgr); err != nil {
236-
setupLog.Error(err, "unable to setup controller with manager", "controller", "DrainReconcile")
211+
if err := setupDrainController(mgr, restConfig, platformsHelper, scheme); err != nil {
212+
setupLog.Error(err, "unable to setup drain controller")
237213
os.Exit(1)
238214
}
239215
// +kubebuilder:scaffold:builder
@@ -295,6 +271,46 @@ func main() {
295271
}
296272
}
297273

274+
func setupDrainController(mgr ctrl.Manager, restConfig *rest.Config,
275+
platformsHelper platforms.Interface, scheme *runtime.Scheme) error {
276+
if vars.UseExternalDrainer {
277+
setupLog.Info("internal drain controller is disabled, draining will be done externally by the maintenance operator")
278+
return nil
279+
}
280+
281+
// we need a client that doesn't use the local cache for the objects
282+
drainKClient, err := client.New(restConfig, client.Options{
283+
Scheme: scheme,
284+
Cache: &client.CacheOptions{
285+
DisableFor: []client.Object{
286+
&sriovnetworkv1.SriovNetworkNodeState{},
287+
&corev1.Node{},
288+
&mcfgv1.MachineConfigPool{},
289+
},
290+
},
291+
})
292+
if err != nil {
293+
setupLog.Error(err, "unable to create drain kubernetes client")
294+
return err
295+
}
296+
297+
drainController, err := controllers.NewDrainReconcileController(drainKClient,
298+
mgr.GetScheme(),
299+
mgr.GetEventRecorderFor("SR-IOV operator"),
300+
platformsHelper)
301+
if err != nil {
302+
setupLog.Error(err, "unable to create controller", "controller", "DrainReconcile")
303+
return err
304+
}
305+
306+
if err = drainController.SetupWithManager(mgr); err != nil {
307+
setupLog.Error(err, "unable to setup controller with manager", "controller", "DrainReconcile")
308+
return err
309+
}
310+
311+
return nil
312+
}
313+
298314
func initNicIDMap() error {
299315
kubeclient := kubernetes.NewForConfigOrDie(ctrl.GetConfigOrDie())
300316
if err := sriovnetworkv1.InitNicIDMapFromConfigMap(kubeclient, vars.Namespace); err != nil {

pkg/vars/vars.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ var (
7979

8080
// FeatureGates interface to interact with feature gates
8181
FeatureGate featuregate.FeatureGate
82+
83+
// UseExternalDrainer controls if SRIOV operator will use an external drainer
84+
// for draining nodes or its internal drain controller (default)
85+
UseExternalDrainer bool
8286
)
8387

8488
func init() {
@@ -101,4 +105,6 @@ func init() {
101105
ResourcePrefix = os.Getenv("RESOURCE_PREFIX")
102106

103107
FeatureGate = featuregate.New()
108+
109+
UseExternalDrainer = os.Getenv("USE_EXTERNAL_DRAINER") == "true"
104110
}

0 commit comments

Comments
 (0)