Skip to content

Commit 7bb3062

Browse files
Add in-place updates support for machine controller
Signed-off-by: Alexandr Demicev <[email protected]>
1 parent 82f5743 commit 7bb3062

File tree

6 files changed

+330
-7
lines changed

6 files changed

+330
-7
lines changed

api/core/v1beta2/common_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,11 @@ const (
181181
// This annotation can be used to inform MachinePool status during in-progress scaling scenarios.
182182
ReplicasManagedByAnnotation = "cluster.x-k8s.io/replicas-managed-by"
183183

184+
// InPlaceUpdateInProgressAnnotation is set on Machine, InfraMachine, and BootstrapConfig when an in-place update is in progress.
185+
// The Machine controller waits for the Machine, the InfraMachine and, if the Machine has one, the BootstrapConfig to have this annotation before starting the update.
186+
// The Machine controller removes this annotation when the update is complete.
187+
InPlaceUpdateInProgressAnnotation = "cluster.x-k8s.io/in-place-update-in-progress"
188+
184189
// AutoscalerMinSizeAnnotation defines the minimum node group size.
185190
// The annotation is used by autoscaler.
186191
// The annotation is copied from kubernetes/autoscaler.

api/core/v1beta2/machine_types.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,31 @@ const (
153153
MachineNotUpToDateReason = "NotUpToDate"
154154
)
155155

156+
// Machine's Updating condition and corresponding reasons.
157+
// Note: Updating condition is set by the Machine controller during in-place updates.
158+
const (
159+
// MachineUpdatingCondition is true while an in-place update is in progress on the Machine.
160+
// The condition is owned by the Machine controller and is used to track the progress of in-place updates.
161+
// This condition is considered when computing the UpToDate condition.
162+
MachineUpdatingCondition = "Updating"
163+
164+
// MachineNotUpdatingReason surfaces when the Machine is not performing an in-place update.
165+
MachineNotUpdatingReason = "NotUpdating"
166+
167+
// MachineWaitingForInPlaceUpdateAnnotationsReason surfaces when the Machine is waiting for
168+
// InfraMachine and BootstrapConfig to be annotated for in-place update, or for the UpdateMachine hook to be marked as pending.
169+
MachineWaitingForInPlaceUpdateAnnotationsReason = "WaitingForInPlaceUpdateAnnotations"
170+
171+
// MachineWaitingForUpdateMachineHookReason surfaces when the Machine is waiting for the UpdateMachine hook to complete.
172+
MachineWaitingForUpdateMachineHookReason = "WaitingForUpdateMachineHook"
173+
174+
// MachineUpdateFailedReason surfaces when the in-place update has failed.
175+
MachineUpdateFailedReason = "UpdateFailed"
176+
177+
// MachineUpdatingInternalErrorReason surfaces unexpected failures during in-place update.
178+
MachineUpdatingInternalErrorReason = InternalErrorReason
179+
)
180+
156181
// Machine's BootstrapConfigReady condition and corresponding reasons.
157182
// Note: when possible, BootstrapConfigReady condition will use reasons surfaced from the underlying bootstrap config object.
158183
const (

controllers/alias.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ func (r *ClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
7272

7373
// MachineReconciler reconciles a Machine object.
7474
type MachineReconciler struct {
75-
Client client.Client
76-
APIReader client.Reader
77-
ClusterCache clustercache.ClusterCache
75+
Client client.Client
76+
APIReader client.Reader
77+
ClusterCache clustercache.ClusterCache
78+
RuntimeClient runtimeclient.Client
7879

7980
// WatchFilterValue is the label value used to filter events prior to reconciliation.
8081
WatchFilterValue string
@@ -90,6 +91,7 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
9091
Client: r.Client,
9192
APIReader: r.APIReader,
9293
ClusterCache: r.ClusterCache,
94+
RuntimeClient: r.RuntimeClient,
9395
WatchFilterValue: r.WatchFilterValue,
9496
RemoteConditionsGracePeriod: r.RemoteConditionsGracePeriod,
9597
AdditionalSyncMachineLabels: r.AdditionalSyncMachineLabels,

internal/controllers/machine/machine_controller.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import (
5252
"sigs.k8s.io/cluster-api/controllers/clustercache"
5353
"sigs.k8s.io/cluster-api/controllers/external"
5454
"sigs.k8s.io/cluster-api/controllers/noderefutil"
55+
runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client"
5556
"sigs.k8s.io/cluster-api/feature"
5657
"sigs.k8s.io/cluster-api/internal/contract"
5758
"sigs.k8s.io/cluster-api/internal/controllers/machine/drain"
@@ -93,9 +94,10 @@ var (
9394

9495
// Reconciler reconciles a Machine object.
9596
type Reconciler struct {
96-
Client client.Client
97-
APIReader client.Reader
98-
ClusterCache clustercache.ClusterCache
97+
Client client.Client
98+
APIReader client.Reader
99+
ClusterCache clustercache.ClusterCache
100+
RuntimeClient runtimeclient.Client
99101

100102
// WatchFilterValue is the label value used to filter events prior to reconciliation.
101103
WatchFilterValue string
@@ -129,6 +131,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
129131
// to have some buffer.
130132
return errors.New("Client, APIReader and ClusterCache must not be nil and RemoteConditionsGracePeriod must not be < 2m")
131133
}
134+
if feature.Gates.Enabled(feature.InPlaceUpdates) && r.RuntimeClient == nil {
135+
return errors.New("RuntimeClient must not be nil when InPlaceUpdates feature gate is enabled")
136+
}
132137

133138
r.predicateLog = ptr.To(ctrl.LoggerFrom(ctx).WithValues("controller", "machine"))
134139
clusterToMachines, err := util.ClusterToTypedObjectsMapper(mgr.GetClient(), &clusterv1.MachineList{}, mgr.GetScheme())
@@ -282,7 +287,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
282287
}
283288

284289
// Handle normal reconciliation loop.
285-
return doReconcile(ctx, alwaysReconcile, s)
290+
reconcileNormal := append(
291+
alwaysReconcile,
292+
r.reconcileInPlaceUpdate,
293+
)
294+
295+
return doReconcile(ctx, reconcileNormal, s)
286296
}
287297

288298
func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -326,6 +336,7 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
326336
clusterv1.MachineNodeReadyCondition,
327337
clusterv1.MachineNodeHealthyCondition,
328338
clusterv1.MachineDeletingCondition,
339+
clusterv1.MachineUpdatingCondition,
329340
}},
330341
)
331342

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package machine
18+
19+
import (
20+
"context"
21+
"encoding/json"
22+
"fmt"
23+
"time"
24+
25+
"github.com/pkg/errors"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
ctrl "sigs.k8s.io/controller-runtime"
29+
"sigs.k8s.io/controller-runtime/pkg/client"
30+
31+
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
32+
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
33+
"sigs.k8s.io/cluster-api/feature"
34+
"sigs.k8s.io/cluster-api/internal/hooks"
35+
"sigs.k8s.io/cluster-api/util/conditions"
36+
"sigs.k8s.io/cluster-api/util/patch"
37+
)
38+
39+
// reconcileInPlaceUpdate handles the in-place update workflow for a Machine.
40+
func (r *Reconciler) reconcileInPlaceUpdate(ctx context.Context, s *scope) (ctrl.Result, error) {
41+
if !feature.Gates.Enabled(feature.InPlaceUpdates) {
42+
return ctrl.Result{}, nil
43+
}
44+
45+
log := ctrl.LoggerFrom(ctx)
46+
47+
machineAnnotations := s.machine.GetAnnotations()
48+
_, inPlaceUpdateInProgress := machineAnnotations[clusterv1.InPlaceUpdateInProgressAnnotation]
49+
hasUpdateMachinePending := hooks.IsPending(runtimehooksv1.UpdateMachine, s.machine)
50+
51+
if !inPlaceUpdateInProgress {
52+
// Clean up any orphaned pending hooks before exiting.
53+
if hasUpdateMachinePending {
54+
log.Info("In-place update annotation removed but UpdateMachine hook still pending, cleaning up orphaned hook")
55+
if err := hooks.MarkAsDone(ctx, r.Client, s.machine, runtimehooksv1.UpdateMachine); err != nil { // this patches the machine, should we defer that patch until the end of reconciliation?
56+
return ctrl.Result{}, errors.Wrap(err, "failed to clean up orphaned UpdateMachine hook")
57+
}
58+
}
59+
60+
conditions.Set(s.machine, metav1.Condition{
61+
Type: clusterv1.MachineUpdatingCondition,
62+
Status: metav1.ConditionFalse,
63+
Reason: clusterv1.MachineNotUpdatingReason,
64+
})
65+
return ctrl.Result{}, nil
66+
}
67+
68+
infraReady, infraErr := r.isInfraMachineReadyForUpdate(s)
69+
if infraErr != nil {
70+
return ctrl.Result{}, errors.Wrap(infraErr, "failed to check if InfraMachine is ready for in-place update")
71+
}
72+
73+
bootstrapReady, bootstrapErr := r.isBootstrapConfigReadyForUpdate(s)
74+
if bootstrapErr != nil {
75+
return ctrl.Result{}, errors.Wrap(bootstrapErr, "failed to check if BootstrapConfig is ready for in-place update")
76+
}
77+
78+
if !infraReady || !bootstrapReady {
79+
log.Info("Waiting for InfraMachine and BootstrapConfig to be marked for in-place update")
80+
conditions.Set(s.machine, metav1.Condition{
81+
Type: clusterv1.MachineUpdatingCondition,
82+
Status: metav1.ConditionFalse,
83+
Reason: clusterv1.MachineWaitingForInPlaceUpdateAnnotationsReason,
84+
Message: "Waiting for InfraMachine and BootstrapConfig to be marked for update",
85+
})
86+
return ctrl.Result{}, nil
87+
}
88+
89+
if hasUpdateMachinePending {
90+
log.Info("UpdateMachine hook is pending, calling runtime hook")
91+
result, err := r.callUpdateMachineHook(ctx, s)
92+
if err != nil {
93+
conditions.Set(s.machine, metav1.Condition{
94+
Type: clusterv1.MachineUpdatingCondition,
95+
Status: metav1.ConditionFalse,
96+
Reason: clusterv1.MachineUpdateFailedReason,
97+
Message: fmt.Sprintf("UpdateMachine hook failed: %v", err),
98+
})
99+
return ctrl.Result{}, err
100+
}
101+
102+
if result.RequeueAfter > 0 {
103+
conditions.Set(s.machine, metav1.Condition{
104+
Type: clusterv1.MachineUpdatingCondition,
105+
Status: metav1.ConditionTrue,
106+
Reason: clusterv1.MachineWaitingForUpdateMachineHookReason,
107+
Message: "UpdateMachine hook in progress",
108+
})
109+
return result, nil
110+
}
111+
112+
if err := hooks.MarkAsDone(ctx, r.Client, s.machine, runtimehooksv1.UpdateMachine); err != nil { // this patches the machine, should we defer that patch until the end of reconciliation?
113+
return ctrl.Result{}, errors.Wrap(err, "failed to mark UpdateMachine hook as done")
114+
}
115+
116+
log.Info("In-place update completed successfully")
117+
if err := r.completeInPlaceUpdate(ctx, s); err != nil {
118+
return ctrl.Result{}, errors.Wrap(err, "failed to complete in-place update")
119+
}
120+
121+
conditions.Set(s.machine, metav1.Condition{
122+
Type: clusterv1.MachineUpdatingCondition,
123+
Status: metav1.ConditionFalse,
124+
Reason: clusterv1.MachineNotUpdatingReason,
125+
})
126+
127+
return ctrl.Result{}, nil
128+
}
129+
130+
// If we reach here, annotations are set but hook is not pending.
131+
// This means we're waiting for the owner controller to mark the hook as pending.
132+
log.Info("In-place update annotations are set, waiting for UpdateMachine hook to be marked as pending")
133+
conditions.Set(s.machine, metav1.Condition{
134+
Type: clusterv1.MachineUpdatingCondition,
135+
Status: metav1.ConditionFalse,
136+
Reason: clusterv1.MachineWaitingForInPlaceUpdateAnnotationsReason,
137+
Message: "Waiting for UpdateMachine hook to be marked as pending",
138+
})
139+
140+
return ctrl.Result{}, nil
141+
}
142+
143+
// isInfraMachineReadyForUpdate checks if the InfraMachine has the in-place update annotation.
144+
func (r *Reconciler) isInfraMachineReadyForUpdate(s *scope) (bool, error) {
145+
if s.infraMachine == nil {
146+
return false, nil
147+
}
148+
infraMachineAnnotations := s.infraMachine.GetAnnotations()
149+
if infraMachineAnnotations == nil {
150+
return false, nil
151+
}
152+
_, hasAnnotation := infraMachineAnnotations[clusterv1.InPlaceUpdateInProgressAnnotation]
153+
return hasAnnotation, nil
154+
}
155+
156+
// isBootstrapConfigReadyForUpdate checks if the BootstrapConfig has the in-place update annotation.
157+
func (r *Reconciler) isBootstrapConfigReadyForUpdate(s *scope) (bool, error) {
158+
if s.bootstrapConfig == nil {
159+
return true, nil
160+
}
161+
bootstrapConfigAnnotations := s.bootstrapConfig.GetAnnotations()
162+
if bootstrapConfigAnnotations == nil {
163+
return false, nil
164+
}
165+
_, hasAnnotation := bootstrapConfigAnnotations[clusterv1.InPlaceUpdateInProgressAnnotation]
166+
return hasAnnotation, nil
167+
}
168+
169+
// callUpdateMachineHook calls the UpdateMachine runtime hook for the machine.
170+
func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl.Result, error) {
171+
log := ctrl.LoggerFrom(ctx)
172+
173+
request := &runtimehooksv1.UpdateMachineRequest{}
174+
request.Desired.Machine = *s.machine.DeepCopy()
175+
176+
if s.infraMachine != nil { // should it return an error if infraMachine is nil?
177+
infraMachineRaw, err := runtime.DefaultUnstructuredConverter.ToUnstructured(s.infraMachine)
178+
if err != nil {
179+
return ctrl.Result{}, errors.Wrap(err, "failed to convert InfraMachine to unstructured")
180+
}
181+
request.Desired.InfrastructureMachine.Raw, err = json.Marshal(infraMachineRaw)
182+
if err != nil {
183+
return ctrl.Result{}, errors.Wrap(err, "failed to marshal InfraMachine")
184+
}
185+
}
186+
187+
if s.bootstrapConfig != nil { // should it return an error if bootstrapConfig is nil?
188+
bootstrapConfigRaw, err := runtime.DefaultUnstructuredConverter.ToUnstructured(s.bootstrapConfig)
189+
if err != nil {
190+
return ctrl.Result{}, errors.Wrap(err, "failed to convert BootstrapConfig to unstructured")
191+
}
192+
request.Desired.BootstrapConfig.Raw, err = json.Marshal(bootstrapConfigRaw)
193+
if err != nil {
194+
return ctrl.Result{}, errors.Wrap(err, "failed to marshal BootstrapConfig")
195+
}
196+
}
197+
198+
response := &runtimehooksv1.UpdateMachineResponse{}
199+
200+
if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.UpdateMachine, s.machine, request, response); err != nil {
201+
return ctrl.Result{}, errors.Wrap(err, "failed to call UpdateMachine hook")
202+
}
203+
204+
if response.GetRetryAfterSeconds() != 0 {
205+
log.Info(fmt.Sprintf("UpdateMachine hook requested retry after %d seconds", response.GetRetryAfterSeconds()))
206+
return ctrl.Result{RequeueAfter: time.Duration(response.GetRetryAfterSeconds()) * time.Second}, nil
207+
}
208+
209+
log.Info("UpdateMachine hook completed successfully")
210+
return ctrl.Result{}, nil
211+
}
212+
213+
// completeInPlaceUpdate removes in-place update annotations from InfraMachine, BootstrapConfig and Machine.
214+
func (r *Reconciler) completeInPlaceUpdate(ctx context.Context, s *scope) error {
215+
log := ctrl.LoggerFrom(ctx)
216+
217+
if s.infraMachine != nil {
218+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.infraMachine); err != nil {
219+
return errors.Wrap(err, "failed to remove in-place update annotation from InfraMachine")
220+
}
221+
}
222+
223+
if s.bootstrapConfig != nil {
224+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.bootstrapConfig); err != nil {
225+
return errors.Wrap(err, "failed to remove in-place update annotation from BootstrapConfig")
226+
}
227+
}
228+
229+
// Only remove from Machine if all child object patches succeeded.
230+
if err := r.removeInPlaceUpdateAnnotationFromMachine(s.machine); err != nil {
231+
return errors.Wrap(err, "failed to remove in-place update annotation from Machine")
232+
}
233+
234+
log.Info("Removed in-place update annotations from all objects")
235+
return nil
236+
}
237+
238+
// removeInPlaceUpdateAnnotationFromMachine removes the in-place update annotation from the Machine.
239+
func (r *Reconciler) removeInPlaceUpdateAnnotationFromMachine(machine *clusterv1.Machine) error {
240+
annotations := machine.GetAnnotations()
241+
if annotations == nil {
242+
return nil
243+
}
244+
245+
if _, exists := annotations[clusterv1.InPlaceUpdateInProgressAnnotation]; !exists {
246+
return nil
247+
}
248+
249+
delete(annotations, clusterv1.InPlaceUpdateInProgressAnnotation)
250+
machine.SetAnnotations(annotations)
251+
252+
return nil
253+
}
254+
255+
// removeInPlaceUpdateAnnotation removes the in-place update annotation from an object and patches it.
256+
func (r *Reconciler) removeInPlaceUpdateAnnotation(ctx context.Context, obj client.Object) error {
257+
annotations := obj.GetAnnotations()
258+
if annotations == nil {
259+
return nil
260+
}
261+
262+
if _, exists := annotations[clusterv1.InPlaceUpdateInProgressAnnotation]; !exists {
263+
return nil
264+
}
265+
266+
patchHelper, err := patch.NewHelper(obj, r.Client)
267+
if err != nil {
268+
return errors.Wrap(err, "failed to create patch helper")
269+
}
270+
271+
delete(annotations, clusterv1.InPlaceUpdateInProgressAnnotation)
272+
obj.SetAnnotations(annotations)
273+
274+
if err := patchHelper.Patch(ctx, obj); err != nil {
275+
return errors.Wrap(err, "failed to patch object")
276+
}
277+
278+
return nil
279+
}

0 commit comments

Comments
 (0)