Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 27 additions & 45 deletions internal/controller/numaresourcesoperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,14 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
return ctrl.Result{}, err
}

initialStatus := *instance.Status.DeepCopy()
if len(initialStatus.Conditions) == 0 {
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
}

if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
return r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
}

if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
Expand All @@ -150,80 +155,64 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
}

if err := validation.NodeGroups(instance.Spec.NodeGroups, r.Platform); err != nil {
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
}

trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups, r.Platform)
if err != nil {
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
}

tolerable, err := validation.NodeGroupsTree(instance, trees, r.Platform)
if err != nil {
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
}

for idx := range trees {
conf := trees[idx].NodeGroup.NormalizeConfig()
trees[idx].NodeGroup.Config = &conf
}

curStatus := instance.Status.DeepCopy()

step := r.reconcileResource(ctx, instance, trees)
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())

if step.Done() && tolerable != nil {
return r.degradeStatus(ctx, instance, tolerable.Reason, tolerable.Error)
}

if !status.NUMAResourceOperatorNeedsUpdate(curStatus, &instance.Status) {
return step.Result, step.Error
return r.degradeStatus(ctx, initialStatus, instance, tolerable.Reason, tolerable.Error)
}

updErr := r.Client.Status().Update(ctx, instance)
if updErr != nil {
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
klog.InfoS("Failed to update numaresources-operator status", "error", err)
}

return step.Result, step.Error
}

// updateStatusConditionsIfNeeded returns true if conditions were updated.
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
if cond.Type == "" { // backward (=legacy) compatibility
return
}
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
conditions, ok := status.ComputeConditions(instance.Status.Conditions, metav1.Condition{
Type: cond.Type,
Reason: cond.Reason,
Message: cond.Message,
ObservedGeneration: instance.Generation,
}, time.Now())
if ok {
instance.Status.Conditions = conditions
func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator) error {
if !status.NUMAResourceOperatorNeedsUpdate(initialStatus, instance.Status) {
return nil
}

updErr := r.Client.Status().Update(ctx, instance)
if updErr != nil {
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
}
return nil
}

func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
info := conditioninfo.DegradedFromError(stErr)
if reason != "" { // intentionally overwrite
info.Reason = reason
}

updateStatusConditionsIfNeeded(instance, info)
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, info.ToMetav1Condition(instance.Generation), time.Now())

err := r.Client.Status().Update(ctx, instance)
err := r.updateStatus(ctx, initialStatus, instance)
if err != nil {
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), err)
}

// we do not return an error here because to pass the validation error a user will need to update NRO CR
// that will anyway initiate to reconcile loop
return ctrl.Result{}, nil
return ctrl.Result{}, err
}

func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
Expand Down Expand Up @@ -298,34 +287,27 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context

func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)

if r.Platform == platform.OpenShift {
if step := r.reconcileResourceMachineConfig(ctx, instance, existing, trees); step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}
}

dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, existing, trees)
if step.EarlyStop() {
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
return step
}

// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
// is a certain thing if we got to this point otherwise the function would have returned already
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerPool)

updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
return intreconcile.Step{
Result: ctrl.Result{},
ConditionInfo: conditioninfo.Available(),
}
return intreconcile.StepSuccess()
}

func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, string, error) {
Expand Down
96 changes: 48 additions & 48 deletions internal/controller/numaresourcesscheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import (
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
"github.com/openshift-kni/numaresources-operator/pkg/apply"
"github.com/openshift-kni/numaresources-operator/pkg/hash"
Expand Down Expand Up @@ -115,23 +116,45 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
}

initialStatus := *instance.Status.DeepCopy()
if len(initialStatus.Conditions) == 0 {
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
}

if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {
message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name)
return ctrl.Result{}, r.updateStatus(ctx, initialStatus, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
return ctrl.Result{}, r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
}

if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
klog.InfoS("Pause reconciliation enabled", "object", req.NamespacedName)
return ctrl.Result{}, nil
}

result, condition, err := r.reconcileResource(ctx, instance)
if err := r.updateStatus(ctx, initialStatus, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
step := r.reconcileResource(ctx, instance)
instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
klog.InfoS("Failed to update numaresourcesscheduler status", "error", err)
}

return result, err
return step.Result, step.Error
}

func (r *NUMAResourcesSchedulerReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, instance *nropv1.NUMAResourcesScheduler, reason string, message string) error {
condition := metav1.Condition{
Type: status.ConditionDegraded,
Status: metav1.ConditionTrue,
Reason: reason,
Message: message,
ObservedGeneration: instance.Generation,
}

instance.Status.Conditions, _ = status.UpdateConditions(instance.Status.Conditions, condition, time.Now())

err := r.updateStatus(ctx, initialStatus, instance)
if err != nil {
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
}
return err
}

// SetupWithManager sets up the controller with the Manager.
Expand Down Expand Up @@ -183,24 +206,24 @@ func (r *NUMAResourcesSchedulerReconciler) nodeToNUMAResourcesScheduler(ctx cont
return requests
}

func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) (reconcile.Result, string, error) {
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) intreconcile.Step {
schedStatus, err := r.syncNUMASchedulerResources(ctx, instance)
if err != nil {
return ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedSchedulerSync: %w", err)
return intreconcile.StepFailed(fmt.Errorf("FailedSchedulerSync: %w", err))
}

instance.Status = schedStatus
instance.Status.RelatedObjects = relatedobjects.Scheduler(r.Namespace, instance.Status.Deployment)

ok, err := isDeploymentRunning(ctx, r.Client, schedStatus.Deployment)
if err != nil {
return ctrl.Result{}, status.ConditionDegraded, err
return intreconcile.StepFailed(err)
}
if !ok {
return ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
return intreconcile.StepOngoing(5 * time.Second)
}

return ctrl.Result{}, status.ConditionAvailable, nil
return intreconcile.StepSuccess()
}

func isDeploymentRunning(ctx context.Context, c client.Client, key nropv1.NamespacedName) (bool, error) {
Expand Down Expand Up @@ -303,7 +326,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
Duration: cacheResyncPeriod,
}

schedStatus.Conditions = updateDedicatedInformerCondition(status.NewNUMAResourcesSchedulerBaseConditions(), *instance, schedSpec)
schedStatus.Conditions = updateDedicatedInformerCondition(schedStatus.Conditions, instance.Generation, schedSpec)

r.SchedulerManifests.Deployment.Spec.Replicas = schedSpec.Replicas
klog.V(4).InfoS("using scheduler replicas", "replicas", *r.SchedulerManifests.Deployment.Spec.Replicas)
Expand Down Expand Up @@ -367,52 +390,29 @@ func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platfor
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
}

func updateDedicatedInformerCondition(conds []metav1.Condition, instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
condition := status.FindCondition(conds, status.ConditionDedicatedInformerActive)
if condition == nil { // should never happen
klog.InfoS("missing condition: %q", status.ConditionDedicatedInformerActive)
return conds
}
if normalized.SchedulerInformer == nil { // should never happen
klog.InfoS("nil SchedulerInformer in spec")
condition.Status = metav1.ConditionUnknown
return conds
}
func updateDedicatedInformerCondition(conds []metav1.Condition, instanceGeneration int64, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
dedicatedStatus := metav1.ConditionFalse
if *normalized.SchedulerInformer == nropv1.SchedulerInformerDedicated {
condition.Status = metav1.ConditionTrue
} else {
condition.Status = metav1.ConditionFalse
dedicatedStatus = metav1.ConditionTrue
}
condition.ObservedGeneration = instance.Generation
return conds
}

func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error {
conds := status.CloneConditions(sched.Status.Conditions)
if len(conds) == 0 {
conds = status.NewNUMAResourcesSchedulerBaseConditions()
informerCondition := metav1.Condition{
Type: status.ConditionDedicatedInformerActive,
Status: dedicatedStatus,
Reason: status.ConditionDedicatedInformerActive,
ObservedGeneration: instanceGeneration,
LastTransitionTime: metav1.Now(),
}
ok := status.UpdateConditionsInPlace(conds, metav1.Condition{
Type: condition,
Status: metav1.ConditionTrue,
ObservedGeneration: sched.Generation,
Reason: reason,
Message: message,
}, time.Now())
if !ok {
klog.InfoS("fail to update condition", "conditionType", condition, "reason", reason, "message", message)
}
// we need to set something anyway
sched.Status.Conditions = conds

conds, _ = status.UpdateConditions(conds, informerCondition, time.Time{})
return conds
}

func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler) error {
if !status.NUMAResourcesSchedulerNeedsUpdate(initialStatus, sched.Status) {
return nil
}

if status.EqualConditions(initialStatus.Conditions, sched.Status.Conditions) {
sched.Status.Conditions = initialStatus.Conditions
}

if err := r.Client.Status().Update(ctx, sched); err != nil {
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(sched), err)
}
Expand Down
8 changes: 4 additions & 4 deletions internal/controller/numaresourcesscheduler_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {

node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("master-node-0"),
Name: "master-node-0",
Labels: map[string]string{
"node-role.kubernetes.io/control-plane": "",
},
Expand All @@ -132,13 +132,13 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
Expect(err).ToNot(HaveOccurred())

Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(Succeed())

progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
degradedCondition = getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
Expect(degradedCondition.Status).To(Equal(metav1.ConditionFalse), "scheduler reported as Degraded: %v", degradedCondition)
Expect(degradedCondition.Reason).To(Equal(status.ConditionDegraded))
Expect(degradedCondition.Message).To(BeEmpty())

progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
})
})

Expand Down
12 changes: 12 additions & 0 deletions pkg/status/conditioninfo/conditioninfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
package conditioninfo

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/openshift-kni/numaresources-operator/pkg/status"
)

Expand Down Expand Up @@ -73,3 +75,13 @@ func DegradedFromError(err error) ConditionInfo {
Reason: status.ReasonFromError(err),
}
}

func (ci ConditionInfo) ToMetav1Condition(gen int64) metav1.Condition {
return metav1.Condition{
Type: ci.Type,
Status: metav1.ConditionTrue,
Reason: ci.Reason,
Message: ci.Message,
ObservedGeneration: gen,
}
}
Loading
Loading