Skip to content

Commit 1e86d9a

Browse files
committed
controllers: cleanup status conditions update
So far we had only base conditions for the NRO object, but we want to have a more flexible interface to interact with while supporting non-base conditions, just like we have for NRS controller. In this commit: 1. preserve the consistency of updating status conditions for numaresources CRs (operator and scheduler). 2. minimize Status.Update calls on degraded condition updates 3. keep using `conditioninfo` to maintain related commit modifications as much as possible, refactor later (switch to metav1 conditions) Signed-off-by: Shereen Haj <[email protected]>
1 parent 39a967f commit 1e86d9a

File tree

8 files changed

+121
-184
lines changed

8 files changed

+121
-184
lines changed

internal/controller/numaresourcesoperator_controller.go

Lines changed: 34 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,14 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
140140
return ctrl.Result{}, err
141141
}
142142

143+
initialStatus := *instance.Status.DeepCopy()
144+
if len(initialStatus.Conditions) == 0 {
145+
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
146+
}
147+
143148
if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
144149
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
145-
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
150+
return r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
146151
}
147152

148153
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
@@ -151,80 +156,69 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
151156
}
152157

153158
if err := validation.NodeGroups(instance.Spec.NodeGroups, r.Platform); err != nil {
154-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
159+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
155160
}
156161

157162
trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups, r.Platform)
158163
if err != nil {
159-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
164+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
160165
}
161166

162167
tolerable, err := validation.NodeGroupsTree(instance, trees, r.Platform)
163168
if err != nil {
164-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
169+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
165170
}
166171

167172
for idx := range trees {
168173
conf := trees[idx].NodeGroup.NormalizeConfig()
169174
trees[idx].NodeGroup.Config = &conf
170175
}
171176

172-
curStatus := instance.Status.DeepCopy()
173-
174177
step := r.reconcileResource(ctx, instance, trees)
175178

176179
if step.Done() && tolerable != nil {
177-
return r.degradeStatus(ctx, instance, tolerable.Reason, tolerable.Error)
180+
return r.degradeStatus(ctx, initialStatus, instance, tolerable.Reason, tolerable.Error)
181+
}
182+
183+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
184+
klog.InfoS("Failed to update numaresources-operator status", "error", err)
178185
}
179186

180-
if !status.NUMAResourceOperatorNeedsUpdate(curStatus, &instance.Status) {
181-
return step.Result, step.Error
187+
return step.Result, step.Error
188+
}
189+
190+
func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator) error {
191+
if !status.NUMAResourceOperatorNeedsUpdate(initialStatus, instance.Status) {
192+
return nil
182193
}
183194

184195
updErr := r.Client.Status().Update(ctx, instance)
185196
if updErr != nil {
186-
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
187-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
197+
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
188198
}
189-
190-
return step.Result, step.Error
199+
return nil
191200
}
192201

193-
// updateStatusConditionsIfNeeded returns true if conditions were updated.
194-
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
195-
if cond.Type == "" { // backward (=legacy) compatibility
196-
return
197-
}
202+
// updateStatusConditionsWithConditionInfo returns true if conditions were updated.
203+
func updateStatusConditionsWithConditionInfo(conds []metav1.Condition, gen int64, cond conditioninfo.ConditionInfo) bool {
198204
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
199-
conditions, ok := status.ComputeConditions(instance.Status.Conditions, metav1.Condition{
200-
Type: cond.Type,
201-
Reason: cond.Reason,
202-
Message: cond.Message,
203-
ObservedGeneration: instance.Generation,
204-
}, time.Now())
205-
if ok {
206-
instance.Status.Conditions = conditions
207-
}
205+
return status.UpdateConditionsInPlace(conds, cond.ToMetav1Condition(gen), time.Now())
208206
}
209207

210-
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
208+
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
211209
info := conditioninfo.DegradedFromError(stErr)
212210
if reason != "" { // intentionally overwrite
213211
info.Reason = reason
214212
}
215213

216-
updateStatusConditionsIfNeeded(instance, info)
217-
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver
214+
updateStatusConditionsWithConditionInfo(instance.Status.Conditions, instance.Generation, info)
218215

219-
err := r.Client.Status().Update(ctx, instance)
216+
err := r.updateStatus(ctx, initialStatus, instance)
220217
if err != nil {
221218
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
222-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), err)
223219
}
224220

225-
// we do not return an error here because to pass the validation error a user will need to update NRO CR
226-
// that will anyway initiate to reconcile loop
227-
return ctrl.Result{}, nil
221+
return ctrl.Result{}, err
228222
}
229223

230224
func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
@@ -300,34 +294,31 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context
300294

301295
func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
302296
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
303-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
297+
updateStatusConditionsWithConditionInfo(instance.Status.Conditions, instance.Generation, step.ConditionInfo)
304298
return step
305299
}
306300

307301
existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
308302

309303
if r.Platform == platform.OpenShift {
310304
if step := r.reconcileResourceMachineConfig(ctx, instance, existing, trees); step.EarlyStop() {
311-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
305+
updateStatusConditionsWithConditionInfo(instance.Status.Conditions, instance.Generation, step.ConditionInfo)
312306
return step
313307
}
314308
}
315309

316310
dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, existing, trees)
317311
if step.EarlyStop() {
318-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
312+
updateStatusConditionsWithConditionInfo(instance.Status.Conditions, instance.Generation, step.ConditionInfo)
319313
return step
320314
}
321315

322316
// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
323317
// is a certain thing if we got to this point otherwise the function would have returned already
324318
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerPool)
325319

326-
updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
327-
return intreconcile.Step{
328-
Result: ctrl.Result{},
329-
ConditionInfo: conditioninfo.Available(),
330-
}
320+
updateStatusConditionsWithConditionInfo(instance.Status.Conditions, instance.Generation, conditioninfo.Available())
321+
return intreconcile.StepSuccess()
331322
}
332323

333324
func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, string, error) {

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import (
4848
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
4949
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
5050
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
51+
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
5152
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
5253
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5354
"github.com/openshift-kni/numaresources-operator/pkg/hash"
@@ -115,23 +116,44 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
115116
}
116117

117118
initialStatus := *instance.Status.DeepCopy()
119+
if len(initialStatus.Conditions) == 0 {
120+
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
121+
}
118122

119123
if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {
120124
message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name)
121-
return ctrl.Result{}, r.updateStatus(ctx, initialStatus, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
125+
return ctrl.Result{}, r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
122126
}
123127

124128
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
125129
klog.InfoS("Pause reconciliation enabled", "object", req.NamespacedName)
126130
return ctrl.Result{}, nil
127131
}
128132

129-
result, condition, err := r.reconcileResource(ctx, instance)
130-
if err := r.updateStatus(ctx, initialStatus, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
131-
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
133+
step := r.reconcileResource(ctx, instance)
134+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
135+
klog.InfoS("Failed to update numaresourcesscheduler status", "error", err)
136+
}
137+
138+
return step.Result, step.Error
139+
}
140+
141+
func (r *NUMAResourcesSchedulerReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, instance *nropv1.NUMAResourcesScheduler, reason string, message string) error {
142+
condition := metav1.Condition{
143+
Type: status.ConditionDegraded,
144+
Status: metav1.ConditionTrue,
145+
Reason: reason,
146+
Message: message,
147+
ObservedGeneration: instance.Generation,
132148
}
133149

134-
return result, err
150+
status.UpdateConditionsInPlace(instance.Status.Conditions, condition, time.Now())
151+
152+
err := r.updateStatus(ctx, initialStatus, instance)
153+
if err != nil {
154+
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
155+
}
156+
return err
135157
}
136158

137159
// SetupWithManager sets up the controller with the Manager.
@@ -183,24 +205,30 @@ func (r *NUMAResourcesSchedulerReconciler) nodeToNUMAResourcesScheduler(ctx cont
183205
return requests
184206
}
185207

186-
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) (reconcile.Result, string, error) {
208+
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) intreconcile.Step {
187209
schedStatus, err := r.syncNUMASchedulerResources(ctx, instance)
188210
if err != nil {
189-
return ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedSchedulerSync: %w", err)
211+
step := intreconcile.StepFailed(fmt.Errorf("FailedSchedulerSync: %w", err))
212+
status.UpdateConditionsInPlace(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
213+
return step
190214
}
191215

192216
instance.Status = schedStatus
193217
instance.Status.RelatedObjects = relatedobjects.Scheduler(r.Namespace, instance.Status.Deployment)
194218

195219
ok, err := isDeploymentRunning(ctx, r.Client, schedStatus.Deployment)
196220
if err != nil {
197-
return ctrl.Result{}, status.ConditionDegraded, err
221+
step := intreconcile.StepFailed(err)
222+
status.UpdateConditionsInPlace(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
223+
return step
198224
}
199225
if !ok {
200-
return ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
226+
step := intreconcile.StepOngoing(5 * time.Second)
227+
status.UpdateConditionsInPlace(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
228+
return step
201229
}
202230

203-
return ctrl.Result{}, status.ConditionAvailable, nil
231+
return intreconcile.StepSuccess()
204232
}
205233

206234
func isDeploymentRunning(ctx context.Context, c client.Client, key nropv1.NamespacedName) (bool, error) {
@@ -303,7 +331,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
303331
Duration: cacheResyncPeriod,
304332
}
305333

306-
schedStatus.Conditions = updateDedicatedInformerCondition(status.NewNUMAResourcesSchedulerBaseConditions(), *instance, schedSpec)
334+
updateDedicatedInformerCondition(schedStatus.Conditions, *instance, schedSpec)
307335

308336
r.SchedulerManifests.Deployment.Spec.Replicas = schedSpec.Replicas
309337
klog.V(4).InfoS("using scheduler replicas", "replicas", *r.SchedulerManifests.Deployment.Spec.Replicas)
@@ -367,52 +395,28 @@ func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platfor
367395
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
368396
}
369397

370-
func updateDedicatedInformerCondition(conds []metav1.Condition, instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
371-
condition := status.FindCondition(conds, status.ConditionDedicatedInformerActive)
372-
if condition == nil { // should never happen
373-
klog.InfoS("missing condition: %q", status.ConditionDedicatedInformerActive)
374-
return conds
375-
}
376-
if normalized.SchedulerInformer == nil { // should never happen
377-
klog.InfoS("nil SchedulerInformer in spec")
378-
condition.Status = metav1.ConditionUnknown
379-
return conds
380-
}
398+
func updateDedicatedInformerCondition(conds []metav1.Condition, instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) {
399+
dedicatedStatus := metav1.ConditionFalse
381400
if *normalized.SchedulerInformer == nropv1.SchedulerInformerDedicated {
382-
condition.Status = metav1.ConditionTrue
383-
} else {
384-
condition.Status = metav1.ConditionFalse
401+
dedicatedStatus = metav1.ConditionTrue
385402
}
386-
condition.ObservedGeneration = instance.Generation
387-
return conds
388-
}
389403

390-
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error {
391-
conds := status.CloneConditions(sched.Status.Conditions)
392-
if len(conds) == 0 {
393-
conds = status.NewNUMAResourcesSchedulerBaseConditions()
394-
}
395-
ok := status.UpdateConditionsInPlace(conds, metav1.Condition{
396-
Type: condition,
397-
Status: metav1.ConditionTrue,
398-
ObservedGeneration: sched.Generation,
399-
Reason: reason,
400-
Message: message,
401-
}, time.Now())
402-
if !ok {
403-
klog.InfoS("fail to update condition", "conditionType", condition, "reason", reason, "message", message)
404+
informerCondition := metav1.Condition{
405+
Type: status.ConditionDedicatedInformerActive,
406+
Status: dedicatedStatus,
407+
Reason: status.ConditionDedicatedInformerActive,
408+
ObservedGeneration: instance.Generation,
409+
LastTransitionTime: metav1.Now(),
404410
}
405-
// we need to set something anyway
406-
sched.Status.Conditions = conds
407411

412+
status.UpdateConditionsInPlace(conds, informerCondition, time.Time{})
413+
}
414+
415+
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler) error {
408416
if !status.NUMAResourcesSchedulerNeedsUpdate(initialStatus, sched.Status) {
409417
return nil
410418
}
411419

412-
if status.EqualConditions(initialStatus.Conditions, sched.Status.Conditions) {
413-
sched.Status.Conditions = initialStatus.Conditions
414-
}
415-
416420
if err := r.Client.Status().Update(ctx, sched); err != nil {
417421
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(sched), err)
418422
}

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
121121

122122
node := &corev1.Node{
123123
ObjectMeta: metav1.ObjectMeta{
124-
Name: fmt.Sprintf("master-node-0"),
124+
Name: "master-node-0",
125125
Labels: map[string]string{
126126
"node-role.kubernetes.io/control-plane": "",
127127
},
@@ -132,20 +132,14 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
132132
Expect(err).ToNot(HaveOccurred())
133133

134134
Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(Succeed())
135+
136+
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
137+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
135138
degradedCondition = getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
136139
Expect(degradedCondition.Status).To(Equal(metav1.ConditionFalse), "scheduler reported as Degraded: %v", degradedCondition)
137140
Expect(degradedCondition.Reason).To(Equal(status.ConditionDegraded))
138141
Expect(degradedCondition.Message).To(BeEmpty())
139-
140-
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
141-
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
142-
})
143-
144-
It("should report progressing condition for un-paused MCPs but in updating state", func() {
145142
})
146-
It("should not report progressing condition for paused MCPs", func() {
147-
})
148-
149143
})
150144

151145
Context("with correct NRS CR", func() {

pkg/kubeletconfig/kubeletconfig.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"k8s.io/apimachinery/pkg/runtime"
2424
serializer "k8s.io/apimachinery/pkg/runtime/serializer/json"
2525
kubeletconfigv1beta1 "k8s.io/kubelet/config/v1beta1"
26+
2627
"sigs.k8s.io/yaml"
2728

2829
mcov1 "github.com/openshift/api/machineconfiguration/v1"

pkg/status/conditioninfo/conditioninfo.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
package conditioninfo
1818

1919
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
2022
"github.com/openshift-kni/numaresources-operator/pkg/status"
2123
)
2224

@@ -73,3 +75,13 @@ func DegradedFromError(err error) ConditionInfo {
7375
Reason: status.ReasonFromError(err),
7476
}
7577
}
78+
79+
func (ci ConditionInfo) ToMetav1Condition(gen int64) metav1.Condition {
80+
return metav1.Condition{
81+
Type: ci.Type,
82+
Status: metav1.ConditionTrue,
83+
Reason: ci.Reason,
84+
Message: ci.Message,
85+
ObservedGeneration: gen,
86+
}
87+
}

0 commit comments

Comments
 (0)