Skip to content

Commit 29846b9

Browse files
committed
ctrl: kubeletConfig: avoid updating ConfigMap on Paused MCP
Having a paused MCP should prevent updating/creating the corresponding config map for the specified node group. So far, the code wasn't considering the case of paused MCPs, which lead to creating/updating the config map a thing that caused a mismatch between the configuration in the config map and the one reflected on the NRTs. In this commit, we modify the kubeletconfig controller to skip on updating RTE config maps that belong to paused MCPs. Signed-off-by: Shereen Haj <[email protected]>
1 parent 8725e19 commit 29846b9

File tree

4 files changed

+177
-37
lines changed

4 files changed

+177
-37
lines changed

internal/controller/kubeletconfig_controller.go

Lines changed: 49 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import (
4949

5050
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
5151
"github.com/openshift-kni/numaresources-operator/internal/machineconfigpools"
52+
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
5253
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5354
"github.com/openshift-kni/numaresources-operator/pkg/kubeletconfig"
5455
"github.com/openshift-kni/numaresources-operator/pkg/objectnames"
@@ -57,7 +58,8 @@ import (
5758
)
5859

5960
const (
60-
kubeletConfigRetryPeriod = 30 * time.Second
61+
kubeletConfigRetryPeriod = 30 * time.Second
62+
MachineConfigPoolPausedRetryPeriod = 2 * time.Minute
6163
)
6264

6365
const (
@@ -116,22 +118,20 @@ func (r *KubeletConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
116118

117119
// KubeletConfig changes are expected to be sporadic, yet are important enough
118120
// to be made visible at kubernetes level. So we generate events to handle them
119-
cm, err := r.reconcileConfigMap(ctx, instance, req.NamespacedName)
120-
if err != nil {
121-
var klErr *InvalidKubeletConfig
122-
if errors.As(err, &klErr) {
123-
r.Recorder.Event(instance, "Normal", "ProcessSkip", "ignored kubelet config "+klErr.ObjectName)
124-
return ctrl.Result{}, nil
125-
}
126-
127-
klog.ErrorS(err, "failed to reconcile configmap", "controller", "kubeletconfig")
121+
cm, step := r.reconcileConfigMap(ctx, instance, req.NamespacedName)
122+
if step.Error != nil && step.ConditionInfo.Reason != intreconcile.EventProcessSkip {
123+
klog.ErrorS(step.Error, "failed to reconcile configmap", "controller", "kubeletconfig")
124+
r.Recorder.Event(instance, step.ConditionInfo.Type, step.ConditionInfo.Reason, step.ConditionInfo.Message)
125+
return step.Result, step.Error
126+
}
128127

129-
r.Recorder.Event(instance, "Warning", "ProcessFailed", "Failed to update RTE config from kubelet config "+req.NamespacedName.String())
130-
return ctrl.Result{}, err
128+
if step.ConditionInfo.Reason == intreconcile.EventProcessSuccess {
129+
step = step.WithMessage(fmt.Sprintf("Updated RTE config %s/%s from kubelet config %s", cm.Namespace, cm.Name, req.NamespacedName.String()))
131130
}
132131

133-
r.Recorder.Event(instance, "Normal", "ProcessOK", fmt.Sprintf("Updated RTE config %s/%s from kubelet config %s", cm.Namespace, cm.Name, req.NamespacedName.String()))
134-
return ctrl.Result{}, nil
132+
r.Recorder.Event(instance, step.ConditionInfo.Type, step.ConditionInfo.Reason, step.ConditionInfo.Message)
133+
134+
return step.Result, nil
135135
}
136136

137137
func (r *KubeletConfigReconciler) SetupWithManager(mgr ctrl.Manager) error {
@@ -197,25 +197,29 @@ func (e *InvalidKubeletConfig) Unwrap() error {
197197
return e.Err
198198
}
199199

200-
func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, error) {
200+
func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, intreconcile.Step) {
201201
// first check if the ConfigMap should be deleted
202202
// to save all the additional work related for create/update
203203
cm, deleted, err := r.deleteConfigMap(ctx, instance, kcKey)
204204
if deleted {
205-
return cm, err
205+
return cm, intreconcile.StepWarning(fmt.Errorf("Failed to update RTE config from kubelet config %s: %v", kcKey.Name, err))
206206
}
207207

208-
kcHandler, err := r.makeKCHandlerForPlatform(ctx, instance, kcKey)
209-
if err != nil {
210-
return nil, err
208+
kcHandler, step := r.makeKCHandlerForPlatform(ctx, instance, kcKey)
209+
if step.Error != nil {
210+
return nil, step
211211
}
212+
212213
kubeletConfig, err := kubeletconfig.MCOKubeletConfToKubeletConf(kcHandler.mcoKc)
213214
if err != nil {
214215
klog.ErrorS(err, "cannot extract KubeletConfiguration from MCO KubeletConfig", "name", kcKey.Name)
215-
return nil, err
216+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
216217
}
217-
218-
return r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler)
218+
cm, err = r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler)
219+
if err != nil {
220+
return cm, FailedConfigMapUpdateStep(kcKey.Name, err)
221+
}
222+
return cm, intreconcile.StepNormalSucess("")
219223
}
220224

221225
func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConfig *kubeletconfigv1beta1.KubeletConfiguration, instance *nropv1.NUMAResourcesOperator, kcHandler *kubeletConfigHandler) (*corev1.ConfigMap, error) {
@@ -244,63 +248,68 @@ func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConf
244248
return rendered, nil
245249
}
246250

247-
func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, error) {
251+
func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, intreconcile.Step) {
248252
switch r.Platform {
249253
case platform.OpenShift:
250254
mcoKc := &mcov1.KubeletConfig{}
251255
if err := r.Client.Get(ctx, kcKey, mcoKc); err != nil {
252-
return nil, err
256+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
253257
}
254258

255259
mcps, err := machineconfigpools.GetListByNodeGroupsV1(ctx, r.Client, instance.Spec.NodeGroups)
256260
if err != nil {
257-
return nil, err
261+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
258262
}
259263

260264
mcp, err := machineconfigpools.FindBySelector(mcps, mcoKc.Spec.MachineConfigPoolSelector)
261265
if err != nil {
262266
klog.ErrorS(err, "cannot find a matching mcp for MCO KubeletConfig", "name", kcKey.Name)
263267
var notFound *machineconfigpools.NotFound
264268
if errors.As(err, &notFound) {
265-
return nil, &InvalidKubeletConfig{
266-
ObjectName: kcKey.Name,
267-
Err: notFound,
268-
}
269+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("%s: %v", kcKey, notFound))
269270
}
270-
return nil, err
271+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
271272
}
272273

273274
klog.V(3).InfoS("matched MCP to MCO KubeletConfig", "kubeletconfig name", kcKey.Name, "MCP name", mcp.Name)
274275

275276
// nothing we care about, and we can't do much anyway
276277
if mcoKc.Spec.KubeletConfig == nil {
277278
klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name)
278-
return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name}
279+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("Invalid KubeletConfig %s", kcKey.Name))
279280
}
281+
282+
if mcp.Spec.Paused {
283+
klog.InfoS("detected paused MCP", "name", mcp.Name)
284+
step := intreconcile.StepNormalSkip(fmt.Errorf("MachineConfigPool of KubeletConfig %s is paused", kcKey.Name))
285+
step.Result = ctrl.Result{RequeueAfter: MachineConfigPoolPausedRetryPeriod}
286+
return nil, step
287+
}
288+
280289
return &kubeletConfigHandler{
281290
ownerObject: mcoKc,
282291
mcoKc: mcoKc,
283292
poolName: mcp.Name,
284293
setCtrlRef: controllerutil.SetControllerReference,
285-
}, nil
294+
}, intreconcile.StepNormalSucess("")
286295

287296
case platform.HyperShift:
288297
cmKc := &corev1.ConfigMap{}
289298
if err := r.Client.Get(ctx, kcKey, cmKc); err != nil {
290-
return nil, err
299+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
291300
}
292301

293302
nodePoolName := cmKc.Labels[HyperShiftNodePoolLabel]
294303
kcData := cmKc.Data[HyperShiftConfigMapConfigKey]
295304
mcoKc, err := kubeletconfig.DecodeFromData([]byte(kcData), r.Scheme)
296305
if err != nil {
297-
return nil, err
306+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
298307
}
299308

300309
// nothing we care about, and we can't do much anyway
301310
if mcoKc.Spec.KubeletConfig == nil {
302311
klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name)
303-
return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name}
312+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("Invalid KubeletConfig %s", kcKey.Name))
304313
}
305314
return &kubeletConfigHandler{
306315
ownerObject: cmKc,
@@ -312,9 +321,12 @@ func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context,
312321
setCtrlRef: func(owner, controlled metav1.Object, scheme *runtime.Scheme, opts ...controllerutil.OwnerReferenceOption) error {
313322
return nil
314323
},
315-
}, nil
324+
}, intreconcile.StepNormalSucess("")
316325
}
317-
return nil, fmt.Errorf("unsupported platform: %s", r.Platform)
326+
return nil, FailedConfigMapUpdateStep(kcKey.Name, fmt.Errorf("unsupported platform: %s", r.Platform))
327+
}
328+
func FailedConfigMapUpdateStep(objName string, err error) intreconcile.Step {
329+
return intreconcile.StepWarning(fmt.Errorf("Failed to update RTE config from kubelet config %s: %v", objName, err))
318330
}
319331

320332
func (r *KubeletConfigReconciler) deleteConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, bool, error) {

internal/controller/kubeletconfig_controller_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,4 +272,90 @@ var _ = Describe("Test KubeletConfig Reconcile", func() {
272272
Entry("OpenShift Platform", NewFakeKubeletConfigReconciler, platform.OpenShift),
273273
Entry("HyperShift Platform", NewFakeKubeletConfigReconcilerForHyperShift, platform.HyperShift),
274274
)
275+
276+
Context("kubeletconfig updates with paused MCPs", func() {
277+
var nro *nropv1.NUMAResourcesOperator
278+
var mcp1, mcpPaused *machineconfigv1.MachineConfigPool
279+
var mcoKC1, mcoKCPaused *machineconfigv1.KubeletConfig
280+
var label1 map[string]string
281+
var kc1Key, kc2Key client.ObjectKey
282+
var poolName1, poolName2 string
283+
cmKc1 := &corev1.ConfigMap{}
284+
var reconciler *KubeletConfigReconciler
285+
var err error
286+
287+
BeforeEach(func() {
288+
label1 = map[string]string{
289+
"test1": "test1",
290+
}
291+
mcp1 = testobjs.NewMachineConfigPool("test1", label1, &metav1.LabelSelector{MatchLabels: label1}, &metav1.LabelSelector{MatchLabels: label1})
292+
poolName1 = mcp1.Name
293+
kubeletConfig := &kubeletconfigv1beta1.KubeletConfiguration{}
294+
mcoKC1 = testobjs.NewKubeletConfig(poolName1, label1, mcp1.Spec.MachineConfigSelector, kubeletConfig)
295+
kc1Key = client.ObjectKeyFromObject(mcoKC1)
296+
297+
label2 := map[string]string{
298+
"test2": "test2",
299+
}
300+
mcpPaused = testobjs.NewMachineConfigPool("test2", label2, &metav1.LabelSelector{MatchLabels: label2}, &metav1.LabelSelector{MatchLabels: label2})
301+
mcpPaused.Spec.Paused = true
302+
poolName2 = mcpPaused.Name
303+
kubeletConfigPaused := &kubeletconfigv1beta1.KubeletConfiguration{}
304+
mcoKCPaused = testobjs.NewKubeletConfig(poolName2, label2, mcpPaused.Spec.MachineConfigSelector, kubeletConfigPaused)
305+
kc2Key = client.ObjectKeyFromObject(mcoKCPaused)
306+
307+
ng1 := nropv1.NodeGroup{
308+
PoolName: &poolName1,
309+
}
310+
ng2 := nropv1.NodeGroup{
311+
PoolName: &poolName2,
312+
}
313+
nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng1, ng2)
314+
315+
reconciler, err = NewFakeKubeletConfigReconciler(nro, mcp1, mcoKC1, cmKc1, mcpPaused, mcoKCPaused) //TODO remove cmKc1
316+
Expect(err).ToNot(HaveOccurred())
317+
})
318+
It("should create configmap for active MCP", func() {
319+
result, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: kc1Key})
320+
Expect(err).ToNot(HaveOccurred())
321+
Expect(result).To(Equal(reconcile.Result{}))
322+
cm := &corev1.ConfigMap{}
323+
key := client.ObjectKey{
324+
Namespace: testNamespace,
325+
Name: objectnames.GetComponentName(nro.Name, poolName1),
326+
}
327+
Expect(reconciler.Client.Get(context.TODO(), key, cm)).To(Succeed())
328+
})
329+
330+
It("should not create configmap for paused MCP, then create it when un-paused", func() {
331+
result, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: kc2Key})
332+
Expect(err).ToNot(HaveOccurred())
333+
334+
fakeRecorder, ok := reconciler.Recorder.(*record.FakeRecorder)
335+
Expect(ok).To(BeTrue())
336+
event := <-fakeRecorder.Events
337+
Expect(event).To(ContainSubstring("ProcessSkip"))
338+
Expect(event).To(ContainSubstring("is paused"))
339+
Expect(result).To(Equal(reconcile.Result{RequeueAfter: MachineConfigPoolPausedRetryPeriod}))
340+
341+
cm := &corev1.ConfigMap{}
342+
key := client.ObjectKey{
343+
Namespace: testNamespace,
344+
Name: objectnames.GetComponentName(nro.Name, poolName2),
345+
}
346+
Expect(reconciler.Client.Get(context.TODO(), key, cm)).To(HaveOccurred())
347+
348+
mcpPaused.Spec.Paused = false
349+
Expect(reconciler.Client.Update(context.TODO(), mcpPaused)).To(Succeed())
350+
351+
result, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: kc2Key})
352+
Expect(err).ToNot(HaveOccurred())
353+
Expect(result).To(Equal(reconcile.Result{}))
354+
Expect(reconciler.Client.Get(context.TODO(), key, cm)).To(Succeed())
355+
event = <-fakeRecorder.Events
356+
Expect(event).To(ContainSubstring("ProcessOK"))
357+
Expect(event).To(ContainSubstring(mcoKCPaused.Name))
358+
})
359+
})
360+
275361
})

internal/reconcile/event.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package reconcile

internal/reconcile/step.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@ import (
2525
"github.com/openshift-kni/numaresources-operator/pkg/status/conditioninfo"
2626
)
2727

28+
const (
29+
EventTypeNormal = "Normal"
30+
EventTypeWarning = "Warning"
31+
32+
EventProcessSuccess = "ProcessOK"
33+
EventProcessSkip = "ProcessSkip"
34+
EventProcessFailed = "ProcessFailed"
35+
)
36+
2837
type Step struct {
2938
Result ctrl.Result
3039
ConditionInfo conditioninfo.ConditionInfo
@@ -92,3 +101,35 @@ func StepFailed(err error) Step {
92101
Error: err,
93102
}
94103
}
104+
105+
func StepWarning(err error) Step {
106+
return Step{
107+
ConditionInfo: conditioninfo.ConditionInfo{
108+
Type: EventTypeWarning,
109+
Message: status.MessageFromError(err),
110+
Reason: EventProcessFailed,
111+
},
112+
Error: err,
113+
}
114+
}
115+
116+
func StepNormalSkip(err error) Step {
117+
return Step{
118+
ConditionInfo: conditioninfo.ConditionInfo{
119+
Type: EventTypeNormal,
120+
Message: status.MessageFromError(err),
121+
Reason: EventProcessSkip,
122+
},
123+
Error: err,
124+
}
125+
}
126+
127+
func StepNormalSucess(msg string) Step {
128+
return Step{
129+
ConditionInfo: conditioninfo.ConditionInfo{
130+
Type: EventTypeNormal,
131+
Message: msg,
132+
Reason: EventProcessSuccess,
133+
},
134+
}
135+
}

0 commit comments

Comments
 (0)