Skip to content

Commit 0d96713

Browse files
Add in-place updates support for machine controller
Signed-off-by: Alexandr Demicev <[email protected]>
1 parent bb764cc commit 0d96713

File tree

7 files changed

+836
-7
lines changed

api/core/v1beta2/machine_types.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,31 @@ const (
164164
MachineNotUpToDateReason = "NotUpToDate"
165165
)
166166

167+
// Machine's Updating condition and corresponding reasons.
168+
// Note: Updating condition is set by the Machine controller during in-place updates.
169+
const (
170+
// MachineUpdatingCondition is true while an in-place update is in progress on the Machine.
171+
// The condition is owned by the Machine controller and is used to track the progress of in-place updates.
172+
// This condition is considered when computing the UpToDate condition.
173+
MachineUpdatingCondition = "Updating"
174+
175+
// MachineNotUpdatingReason surfaces when the Machine is not performing an in-place update.
176+
MachineNotUpdatingReason = "NotUpdating"
177+
178+
// MachineWaitingForInPlaceUpdateAnnotationsReason surfaces when the Machine is waiting for
179+
// InfraMachine and BootstrapConfig to be annotated for in-place update.
180+
MachineWaitingForInPlaceUpdateAnnotationsReason = "WaitingForInPlaceUpdateAnnotations"
181+
182+
// MachineWaitingForUpdateMachineHookReason surfaces when the Machine is waiting for the UpdateMachine hook to complete.
183+
MachineWaitingForUpdateMachineHookReason = "WaitingForUpdateMachineHook"
184+
185+
// MachineUpdateFailedReason surfaces when the in-place update has failed.
186+
MachineUpdateFailedReason = "UpdateFailed"
187+
188+
// MachineUpdatingInternalErrorReason surfaces unexpected failures during in-place update.
189+
MachineUpdatingInternalErrorReason = InternalErrorReason
190+
)
191+
167192
// Machine's BootstrapConfigReady condition and corresponding reasons.
168193
// Note: when possible, BootstrapConfigReady condition will use reasons surfaced from the underlying bootstrap config object.
169194
const (

controllers/alias.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ func (r *ClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
7272

7373
// MachineReconciler reconciles a Machine object.
7474
type MachineReconciler struct {
75-
Client client.Client
76-
APIReader client.Reader
77-
ClusterCache clustercache.ClusterCache
75+
Client client.Client
76+
APIReader client.Reader
77+
ClusterCache clustercache.ClusterCache
78+
RuntimeClient runtimeclient.Client
7879

7980
// WatchFilterValue is the label value used to filter events prior to reconciliation.
8081
WatchFilterValue string
@@ -90,6 +91,7 @@ func (r *MachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manag
9091
Client: r.Client,
9192
APIReader: r.APIReader,
9293
ClusterCache: r.ClusterCache,
94+
RuntimeClient: r.RuntimeClient,
9395
WatchFilterValue: r.WatchFilterValue,
9496
RemoteConditionsGracePeriod: r.RemoteConditionsGracePeriod,
9597
AdditionalSyncMachineLabels: r.AdditionalSyncMachineLabels,

internal/controllers/machine/machine_controller.go

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import (
5252
"sigs.k8s.io/cluster-api/controllers/clustercache"
5353
"sigs.k8s.io/cluster-api/controllers/external"
5454
"sigs.k8s.io/cluster-api/controllers/noderefutil"
55+
runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client"
5556
"sigs.k8s.io/cluster-api/feature"
5657
"sigs.k8s.io/cluster-api/internal/contract"
5758
"sigs.k8s.io/cluster-api/internal/controllers/machine/drain"
@@ -93,9 +94,10 @@ var (
9394

9495
// Reconciler reconciles a Machine object.
9596
type Reconciler struct {
96-
Client client.Client
97-
APIReader client.Reader
98-
ClusterCache clustercache.ClusterCache
97+
Client client.Client
98+
APIReader client.Reader
99+
ClusterCache clustercache.ClusterCache
100+
RuntimeClient runtimeclient.Client
99101

100102
// WatchFilterValue is the label value used to filter events prior to reconciliation.
101103
WatchFilterValue string
@@ -129,6 +131,9 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
129131
// to have some buffer.
130132
return errors.New("Client, APIReader and ClusterCache must not be nil and RemoteConditionsGracePeriod must not be < 2m")
131133
}
134+
if feature.Gates.Enabled(feature.InPlaceUpdates) && r.RuntimeClient == nil {
135+
return errors.New("RuntimeClient must not be nil when InPlaceUpdates feature gate is enabled")
136+
}
132137

133138
r.predicateLog = ptr.To(ctrl.LoggerFrom(ctx).WithValues("controller", "machine"))
134139
clusterToMachines, err := util.ClusterToTypedObjectsMapper(mgr.GetClient(), &clusterv1.MachineList{}, mgr.GetScheme())
@@ -282,7 +287,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
282287
}
283288

284289
// Handle normal reconciliation loop.
285-
return doReconcile(ctx, alwaysReconcile, s)
290+
reconcileNormal := append(
291+
alwaysReconcile,
292+
r.reconcileInPlaceUpdate,
293+
)
294+
295+
return doReconcile(ctx, reconcileNormal, s)
286296
}
287297

288298
func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine, options ...patch.Option) error {
@@ -326,6 +336,7 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
326336
clusterv1.MachineNodeReadyCondition,
327337
clusterv1.MachineNodeHealthyCondition,
328338
clusterv1.MachineDeletingCondition,
339+
clusterv1.MachineUpdatingCondition,
329340
}},
330341
)
331342

@@ -397,6 +408,12 @@ type scope struct {
397408

398409
// deletingMessage is the message that should be used when setting the Deleting condition.
399410
deletingMessage string
411+
412+
// updatingReason is the reason that should be used when setting the Updating condition.
413+
updatingReason string
414+
415+
// updatingMessage is the message that should be used when setting the Updating condition.
416+
updatingMessage string
400417
}
401418

402419
func (r *Reconciler) reconcileMachineOwnerAndLabels(_ context.Context, s *scope) (ctrl.Result, error) {
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package machine
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
"github.com/pkg/errors"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/klog/v2"
29+
"k8s.io/utils/ptr"
30+
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/client"
32+
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
33+
34+
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
35+
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
36+
"sigs.k8s.io/cluster-api/feature"
37+
"sigs.k8s.io/cluster-api/internal/hooks"
38+
)
39+
40+
// reconcileInPlaceUpdate handles the in-place update workflow for a Machine.
41+
func (r *Reconciler) reconcileInPlaceUpdate(ctx context.Context, s *scope) (ctrl.Result, error) {
42+
if !feature.Gates.Enabled(feature.InPlaceUpdates) {
43+
return ctrl.Result{}, nil
44+
}
45+
46+
log := ctrl.LoggerFrom(ctx)
47+
48+
machineAnnotations := s.machine.GetAnnotations()
49+
_, inPlaceUpdateInProgress := machineAnnotations[clusterv1.UpdateInProgressAnnotation]
50+
hasUpdateMachinePending := hooks.IsPending(runtimehooksv1.UpdateMachine, s.machine)
51+
52+
if !inPlaceUpdateInProgress {
53+
// Clean up any orphaned pending hooks and annotations before exiting.
54+
// This can happen if the in-place update annotation was removed from Machine
55+
// but the UpdateMachine hook is still pending or annotations are still on InfraMachine/BootstrapConfig.
56+
if hasUpdateMachinePending {
57+
log.Info("In-place update annotation removed but UpdateMachine hook still pending, cleaning up orphaned hook and annotations")
58+
if err := r.completeInPlaceUpdate(ctx, s); err != nil {
59+
return ctrl.Result{}, errors.Wrap(err, "failed to clean up orphaned UpdateMachine hook and annotations")
60+
}
61+
}
62+
63+
return ctrl.Result{}, nil
64+
}
65+
66+
// If hook is not pending, we're waiting for the owner controller to mark it as pending.
67+
if !hasUpdateMachinePending {
68+
log.Info("In-place update annotations are set, waiting for UpdateMachine hook to be marked as pending")
69+
return ctrl.Result{}, nil
70+
}
71+
72+
if !ptr.Deref(s.machine.Status.Initialization.InfrastructureProvisioned, false) {
73+
log.V(5).Info("Infrastructure not yet provisioned, skipping in-place update")
74+
return ctrl.Result{}, nil
75+
}
76+
if !ptr.Deref(s.machine.Status.Initialization.BootstrapDataSecretCreated, false) {
77+
log.V(5).Info("Bootstrap data secret not yet created, skipping in-place update")
78+
return ctrl.Result{}, nil
79+
}
80+
81+
if s.infraMachine == nil {
82+
s.updatingReason = clusterv1.MachineUpdateFailedReason
83+
s.updatingMessage = "In-place update not possible: InfraMachine not found"
84+
return ctrl.Result{}, errors.New("in-place update failed: InfraMachine not found")
85+
}
86+
87+
infraReady := r.isInfraMachineReadyForUpdate(s)
88+
bootstrapReady := r.isBootstrapConfigReadyForUpdate(s)
89+
90+
if !infraReady || !bootstrapReady {
91+
log.Info("Waiting for InfraMachine and BootstrapConfig to be marked for in-place update")
92+
return ctrl.Result{}, nil
93+
}
94+
95+
log.Info("UpdateMachine hook is pending, calling runtime hook")
96+
result, message, err := r.callUpdateMachineHook(ctx, s)
97+
if err != nil {
98+
s.updatingReason = clusterv1.MachineUpdateFailedReason
99+
s.updatingMessage = fmt.Sprintf("UpdateMachine hook failed: %v", err)
100+
return ctrl.Result{}, err
101+
}
102+
103+
if result.RequeueAfter > 0 {
104+
s.updatingReason = clusterv1.MachineWaitingForUpdateMachineHookReason
105+
if message != "" {
106+
s.updatingMessage = fmt.Sprintf("UpdateMachine hook in progress: %s", message)
107+
} else {
108+
s.updatingMessage = "UpdateMachine hook in progress"
109+
}
110+
return result, nil
111+
}
112+
113+
log.Info("In-place update completed successfully")
114+
if err := r.completeInPlaceUpdate(ctx, s); err != nil {
115+
return ctrl.Result{}, errors.Wrap(err, "failed to complete in-place update")
116+
}
117+
118+
return ctrl.Result{}, nil
119+
}
120+
121+
// isInfraMachineReadyForUpdate checks if the InfraMachine has the in-place update annotation.
122+
func (r *Reconciler) isInfraMachineReadyForUpdate(s *scope) bool {
123+
_, hasAnnotation := s.infraMachine.GetAnnotations()[clusterv1.UpdateInProgressAnnotation]
124+
return hasAnnotation
125+
}
126+
127+
// isBootstrapConfigReadyForUpdate checks if the BootstrapConfig has the in-place update annotation.
128+
func (r *Reconciler) isBootstrapConfigReadyForUpdate(s *scope) bool {
129+
if s.bootstrapConfig == nil {
130+
return true
131+
}
132+
_, hasAnnotation := s.bootstrapConfig.GetAnnotations()[clusterv1.UpdateInProgressAnnotation]
133+
return hasAnnotation
134+
}
135+
136+
// callUpdateMachineHook calls the UpdateMachine runtime hook for the machine.
137+
func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl.Result, string, error) {
138+
log := ctrl.LoggerFrom(ctx)
139+
140+
// Validate that exactly one extension is registered for the UpdateMachine hook.
141+
// For the current iteration, we only support a single extension to ensure safe behavior.
142+
// Support for multiple extensions will be introduced in a future iteration.
143+
extensions, err := r.RuntimeClient.GetAllExtensions(ctx, runtimehooksv1.UpdateMachine, s.machine)
144+
if err != nil {
145+
return ctrl.Result{}, "", err
146+
}
147+
148+
if len(extensions) == 0 {
149+
return ctrl.Result{}, "", errors.New("no extensions registered for UpdateMachine hook")
150+
}
151+
152+
if len(extensions) > 1 {
153+
return ctrl.Result{}, "", errors.Errorf("multiple extensions registered for UpdateMachine hook: only one extension is supported in the current iteration, found %d extensions: %v", len(extensions), extensions)
154+
}
155+
156+
request := &runtimehooksv1.UpdateMachineRequest{
157+
Desired: runtimehooksv1.UpdateMachineRequestObjects{
158+
Machine: *cleanupMachine(s.machine),
159+
InfrastructureMachine: runtime.RawExtension{Object: cleanupUnstructured(s.infraMachine)},
160+
},
161+
}
162+
163+
if s.bootstrapConfig != nil {
164+
request.Desired.BootstrapConfig = runtime.RawExtension{Object: cleanupUnstructured(s.bootstrapConfig)}
165+
}
166+
167+
response := &runtimehooksv1.UpdateMachineResponse{}
168+
169+
if err := r.RuntimeClient.CallAllExtensions(ctx, runtimehooksv1.UpdateMachine, s.machine, request, response); err != nil {
170+
return ctrl.Result{}, "", errors.Wrap(err, "failed to call UpdateMachine hook")
171+
}
172+
173+
if response.GetRetryAfterSeconds() != 0 {
174+
log.Info(fmt.Sprintf("UpdateMachine hook requested retry after %d seconds", response.GetRetryAfterSeconds()))
175+
return ctrl.Result{RequeueAfter: time.Duration(response.GetRetryAfterSeconds()) * time.Second}, response.GetMessage(), nil
176+
}
177+
178+
log.Info("UpdateMachine hook completed successfully")
179+
return ctrl.Result{}, response.GetMessage(), nil
180+
}
181+
182+
// completeInPlaceUpdate removes in-place update annotations from InfraMachine, BootstrapConfig, Machine,
183+
// and then marks the UpdateMachine hook as done (removes it from pending-hooks annotation).
184+
func (r *Reconciler) completeInPlaceUpdate(ctx context.Context, s *scope) error {
185+
log := ctrl.LoggerFrom(ctx)
186+
187+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.machine); err != nil {
188+
return err
189+
}
190+
191+
if s.infraMachine == nil {
192+
return errors.New("InfraMachine must exist to complete in-place update")
193+
}
194+
195+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.infraMachine); err != nil {
196+
return err
197+
}
198+
199+
if s.bootstrapConfig != nil {
200+
if err := r.removeInPlaceUpdateAnnotation(ctx, s.bootstrapConfig); err != nil {
201+
return err
202+
}
203+
}
204+
205+
if err := hooks.MarkAsDone(ctx, r.Client, s.machine, runtimehooksv1.UpdateMachine); err != nil {
206+
return err
207+
}
208+
209+
log.Info("Removed in-place update annotations and marked hook as done")
210+
return nil
211+
}
212+
213+
// removeInPlaceUpdateAnnotation removes the in-place update annotation from an object and patches it immediately.
214+
func (r *Reconciler) removeInPlaceUpdateAnnotation(ctx context.Context, obj client.Object) error {
215+
annotations := obj.GetAnnotations()
216+
if _, exists := annotations[clusterv1.UpdateInProgressAnnotation]; !exists {
217+
return nil
218+
}
219+
220+
gvk, err := apiutil.GVKForObject(obj, r.Client.Scheme())
221+
if err != nil {
222+
return errors.Wrapf(err, "failed to remove %s annotation from object %s", clusterv1.UpdateInProgressAnnotation, klog.KObj(obj))
223+
}
224+
225+
orig := obj.DeepCopyObject().(client.Object)
226+
delete(annotations, clusterv1.UpdateInProgressAnnotation)
227+
obj.SetAnnotations(annotations)
228+
229+
if err := r.Client.Patch(ctx, obj, client.MergeFrom(orig)); err != nil {
230+
return errors.Wrapf(err, "failed to remove %s annotation from %s %s", clusterv1.UpdateInProgressAnnotation, gvk.Kind, klog.KObj(obj))
231+
}
232+
233+
return nil
234+
}
235+
236+
func cleanupMachine(machine *clusterv1.Machine) *clusterv1.Machine {
237+
return &clusterv1.Machine{
238+
// Set GVK because object is later marshalled with json.Marshal when the hook request is sent.
239+
TypeMeta: metav1.TypeMeta{
240+
APIVersion: clusterv1.GroupVersion.String(),
241+
Kind: "Machine",
242+
},
243+
ObjectMeta: metav1.ObjectMeta{
244+
Name: machine.Name,
245+
Namespace: machine.Namespace,
246+
Labels: machine.Labels,
247+
Annotations: machine.Annotations,
248+
},
249+
Spec: *machine.Spec.DeepCopy(),
250+
}
251+
}
252+
253+
func cleanupUnstructured(u *unstructured.Unstructured) *unstructured.Unstructured {
254+
cleanedUpU := &unstructured.Unstructured{
255+
Object: map[string]interface{}{
256+
"apiVersion": u.GetAPIVersion(),
257+
"kind": u.GetKind(),
258+
"spec": u.Object["spec"],
259+
},
260+
}
261+
cleanedUpU.SetName(u.GetName())
262+
cleanedUpU.SetNamespace(u.GetNamespace())
263+
cleanedUpU.SetLabels(u.GetLabels())
264+
cleanedUpU.SetAnnotations(u.GetAnnotations())
265+
return cleanedUpU
266+
}

0 commit comments

Comments
 (0)