Skip to content

Commit 0058ce8

Browse files
committed
controllers: cleanup status conditions update
So far we had only base conditions for the NRO object, but we want a more flexible interface to interact with while supporting non-base conditions, just like we have for the NRS controller. In this commit: 1. preserve the consistency of updating status conditions for numaresources CRs (operator and scheduler). 2. minimize Status.Update calls on degraded condition updates. 3. keep using `conditioninfo` to keep this commit's modifications minimal; refactor later (switch to metav1 conditions). 4. update conditions and return them instead of mutating in place. Signed-off-by: Shereen Haj <[email protected]>
1 parent 7413437 commit 0058ce8

File tree

7 files changed

+146
-185
lines changed

7 files changed

+146
-185
lines changed

internal/controller/numaresourcesoperator_controller.go

Lines changed: 27 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,14 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
139139
return ctrl.Result{}, err
140140
}
141141

142+
initialStatus := *instance.Status.DeepCopy()
143+
if len(initialStatus.Conditions) == 0 {
144+
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
145+
}
146+
142147
if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
143148
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
144-
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
149+
return r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
145150
}
146151

147152
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
@@ -150,80 +155,64 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
150155
}
151156

152157
if err := validation.NodeGroups(instance.Spec.NodeGroups, r.Platform); err != nil {
153-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
158+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
154159
}
155160

156161
trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups, r.Platform)
157162
if err != nil {
158-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
163+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
159164
}
160165

161166
tolerable, err := validation.NodeGroupsTree(instance, trees, r.Platform)
162167
if err != nil {
163-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
168+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
164169
}
165170

166171
for idx := range trees {
167172
conf := trees[idx].NodeGroup.NormalizeConfig()
168173
trees[idx].NodeGroup.Config = &conf
169174
}
170175

171-
curStatus := instance.Status.DeepCopy()
172-
173176
step := r.reconcileResource(ctx, instance, trees)
177+
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
174178

175179
if step.Done() && tolerable != nil {
176-
return r.degradeStatus(ctx, instance, tolerable.Reason, tolerable.Error)
177-
}
178-
179-
if !status.NUMAResourceOperatorNeedsUpdate(curStatus, &instance.Status) {
180-
return step.Result, step.Error
180+
return r.degradeStatus(ctx, initialStatus, instance, tolerable.Reason, tolerable.Error)
181181
}
182182

183-
updErr := r.Client.Status().Update(ctx, instance)
184-
if updErr != nil {
185-
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
186-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
183+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
184+
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
187185
}
188186

189187
return step.Result, step.Error
190188
}
191189

192-
// updateStatusConditionsIfNeeded returns true if conditions were updated.
193-
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
194-
if cond.Type == "" { // backward (=legacy) compatibility
195-
return
196-
}
197-
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
198-
conditions, ok := status.ComputeConditions(instance.Status.Conditions, metav1.Condition{
199-
Type: cond.Type,
200-
Reason: cond.Reason,
201-
Message: cond.Message,
202-
ObservedGeneration: instance.Generation,
203-
}, time.Now())
204-
if ok {
205-
instance.Status.Conditions = conditions
190+
func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator) error {
191+
if !status.NUMAResourceOperatorNeedsUpdate(initialStatus, instance.Status) {
192+
return nil
206193
}
194+
195+
updErr := r.Client.Status().Update(ctx, instance)
196+
if updErr != nil {
197+
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
198+
}
199+
return nil
207200
}
208201

209-
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
202+
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
210203
info := conditioninfo.DegradedFromError(stErr)
211204
if reason != "" { // intentionally overwrite
212205
info.Reason = reason
213206
}
214207

215-
updateStatusConditionsIfNeeded(instance, info)
216-
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver
208+
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, info.ToMetav1Condition(instance.Generation), time.Now())
217209

218-
err := r.Client.Status().Update(ctx, instance)
210+
err := r.updateStatus(ctx, initialStatus, instance)
219211
if err != nil {
220212
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
221-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), err)
222213
}
223214

224-
// we do not return an error here because to pass the validation error a user will need to update NRO CR
225-
// that will anyway initiate to reconcile loop
226-
return ctrl.Result{}, nil
215+
return ctrl.Result{}, err
227216
}
228217

229218
func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
@@ -298,34 +287,27 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context
298287

299288
func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
300289
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
301-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
302290
return step
303291
}
304292

305293
existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
306294

307295
if r.Platform == platform.OpenShift {
308296
if step := r.reconcileResourceMachineConfig(ctx, instance, existing, trees); step.EarlyStop() {
309-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
310297
return step
311298
}
312299
}
313300

314301
dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, existing, trees)
315302
if step.EarlyStop() {
316-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
317303
return step
318304
}
319305

320306
// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
321307
// is a certain thing if we got to this point otherwise the function would have returned already
322308
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerPool)
323309

324-
updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
325-
return intreconcile.Step{
326-
Result: ctrl.Result{},
327-
ConditionInfo: conditioninfo.Available(),
328-
}
310+
return intreconcile.StepSuccess()
329311
}
330312

331313
func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, string, error) {

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 48 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import (
4848
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
4949
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
5050
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
51+
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
5152
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
5253
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5354
"github.com/openshift-kni/numaresources-operator/pkg/hash"
@@ -115,23 +116,45 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
115116
}
116117

117118
initialStatus := *instance.Status.DeepCopy()
119+
if len(initialStatus.Conditions) == 0 {
120+
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
121+
}
118122

119123
if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {
120124
message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name)
121-
return ctrl.Result{}, r.updateStatus(ctx, initialStatus, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
125+
return ctrl.Result{}, r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
122126
}
123127

124128
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
125129
klog.InfoS("Pause reconciliation enabled", "object", req.NamespacedName)
126130
return ctrl.Result{}, nil
127131
}
128132

129-
result, condition, err := r.reconcileResource(ctx, instance)
130-
if err := r.updateStatus(ctx, initialStatus, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
131-
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
133+
step := r.reconcileResource(ctx, instance)
134+
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
135+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
136+
klog.InfoS("Failed to update numaresourcesscheduler status", "error", err)
132137
}
133138

134-
return result, err
139+
return step.Result, step.Error
140+
}
141+
142+
func (r *NUMAResourcesSchedulerReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, instance *nropv1.NUMAResourcesScheduler, reason string, message string) error {
143+
condition := metav1.Condition{
144+
Type: status.ConditionDegraded,
145+
Status: metav1.ConditionTrue,
146+
Reason: reason,
147+
Message: message,
148+
ObservedGeneration: instance.Generation,
149+
}
150+
151+
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, condition, time.Now())
152+
153+
err := r.updateStatus(ctx, initialStatus, instance)
154+
if err != nil {
155+
klog.InfoS("Failed to update numaresourcesscheduler status", "error", err)
156+
}
157+
return err
135158
}
136159

137160
// SetupWithManager sets up the controller with the Manager.
@@ -183,24 +206,24 @@ func (r *NUMAResourcesSchedulerReconciler) nodeToNUMAResourcesScheduler(ctx cont
183206
return requests
184207
}
185208

186-
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) (reconcile.Result, string, error) {
209+
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) intreconcile.Step {
187210
schedStatus, err := r.syncNUMASchedulerResources(ctx, instance)
188211
if err != nil {
189-
return ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedSchedulerSync: %w", err)
212+
return intreconcile.StepFailed(fmt.Errorf("FailedSchedulerSync: %w", err))
190213
}
191214

192215
instance.Status = schedStatus
193216
instance.Status.RelatedObjects = relatedobjects.Scheduler(r.Namespace, instance.Status.Deployment)
194217

195218
ok, err := isDeploymentRunning(ctx, r.Client, schedStatus.Deployment)
196219
if err != nil {
197-
return ctrl.Result{}, status.ConditionDegraded, err
220+
return intreconcile.StepFailed(err)
198221
}
199222
if !ok {
200-
return ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
223+
return intreconcile.StepOngoing(5 * time.Second)
201224
}
202225

203-
return ctrl.Result{}, status.ConditionAvailable, nil
226+
return intreconcile.StepSuccess()
204227
}
205228

206229
func isDeploymentRunning(ctx context.Context, c client.Client, key nropv1.NamespacedName) (bool, error) {
@@ -303,7 +326,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
303326
Duration: cacheResyncPeriod,
304327
}
305328

306-
schedStatus.Conditions = updateDedicatedInformerCondition(status.NewNUMAResourcesSchedulerBaseConditions(), *instance, schedSpec)
329+
schedStatus.Conditions = updateDedicatedInformerCondition(schedStatus.Conditions, instance.Generation, schedSpec)
307330

308331
r.SchedulerManifests.Deployment.Spec.Replicas = schedSpec.Replicas
309332
klog.V(4).InfoS("using scheduler replicas", "replicas", *r.SchedulerManifests.Deployment.Spec.Replicas)
@@ -367,52 +390,29 @@ func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platfor
367390
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
368391
}
369392

370-
func updateDedicatedInformerCondition(conds []metav1.Condition, instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
371-
condition := status.FindCondition(conds, status.ConditionDedicatedInformerActive)
372-
if condition == nil { // should never happen
373-
klog.InfoS("missing condition: %q", status.ConditionDedicatedInformerActive)
374-
return conds
375-
}
376-
if normalized.SchedulerInformer == nil { // should never happen
377-
klog.InfoS("nil SchedulerInformer in spec")
378-
condition.Status = metav1.ConditionUnknown
379-
return conds
380-
}
393+
func updateDedicatedInformerCondition(conds []metav1.Condition, instanceGeneration int64, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
394+
dedicatedStatus := metav1.ConditionFalse
381395
if *normalized.SchedulerInformer == nropv1.SchedulerInformerDedicated {
382-
condition.Status = metav1.ConditionTrue
383-
} else {
384-
condition.Status = metav1.ConditionFalse
396+
dedicatedStatus = metav1.ConditionTrue
385397
}
386-
condition.ObservedGeneration = instance.Generation
387-
return conds
388-
}
389398

390-
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error {
391-
conds := status.CloneConditions(sched.Status.Conditions)
392-
if len(conds) == 0 {
393-
conds = status.NewNUMAResourcesSchedulerBaseConditions()
399+
informerCondition := metav1.Condition{
400+
Type: status.ConditionDedicatedInformerActive,
401+
Status: dedicatedStatus,
402+
Reason: status.ConditionDedicatedInformerActive,
403+
ObservedGeneration: instanceGeneration,
404+
LastTransitionTime: metav1.Now(),
394405
}
395-
ok := status.UpdateConditionsInPlace(conds, metav1.Condition{
396-
Type: condition,
397-
Status: metav1.ConditionTrue,
398-
ObservedGeneration: sched.Generation,
399-
Reason: reason,
400-
Message: message,
401-
}, time.Now())
402-
if !ok {
403-
klog.InfoS("fail to update condition", "conditionType", condition, "reason", reason, "message", message)
404-
}
405-
// we need to set something anyway
406-
sched.Status.Conditions = conds
407406

407+
conds, _ = status.UpdateConditions(conds, informerCondition, time.Time{})
408+
return conds
409+
}
410+
411+
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler) error {
408412
if !status.NUMAResourcesSchedulerNeedsUpdate(initialStatus, sched.Status) {
409413
return nil
410414
}
411415

412-
if status.EqualConditions(initialStatus.Conditions, sched.Status.Conditions) {
413-
sched.Status.Conditions = initialStatus.Conditions
414-
}
415-
416416
if err := r.Client.Status().Update(ctx, sched); err != nil {
417417
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(sched), err)
418418
}

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
121121

122122
node := &corev1.Node{
123123
ObjectMeta: metav1.ObjectMeta{
124-
Name: fmt.Sprintf("master-node-0"),
124+
Name: "master-node-0",
125125
Labels: map[string]string{
126126
"node-role.kubernetes.io/control-plane": "",
127127
},
@@ -132,13 +132,13 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
132132
Expect(err).ToNot(HaveOccurred())
133133

134134
Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(Succeed())
135+
136+
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
137+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
135138
degradedCondition = getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
136139
Expect(degradedCondition.Status).To(Equal(metav1.ConditionFalse), "scheduler reported as Degraded: %v", degradedCondition)
137140
Expect(degradedCondition.Reason).To(Equal(status.ConditionDegraded))
138141
Expect(degradedCondition.Message).To(BeEmpty())
139-
140-
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
141-
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
142142
})
143143
})
144144

pkg/status/conditioninfo/conditioninfo.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
package conditioninfo
1818

1919
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
2022
"github.com/openshift-kni/numaresources-operator/pkg/status"
2123
)
2224

@@ -73,3 +75,13 @@ func DegradedFromError(err error) ConditionInfo {
7375
Reason: status.ReasonFromError(err),
7476
}
7577
}
78+
79+
func (ci ConditionInfo) ToMetav1Condition(gen int64) metav1.Condition {
80+
return metav1.Condition{
81+
Type: ci.Type,
82+
Status: metav1.ConditionTrue,
83+
Reason: ci.Reason,
84+
Message: ci.Message,
85+
ObservedGeneration: gen,
86+
}
87+
}

0 commit comments

Comments
 (0)