Skip to content

Commit 5cfdba4

Browse files
committed
Refine VMG member update to avoid race condition
Signed-off-by: Gong Zhang <[email protected]>
1 parent b981168 commit 5cfdba4

File tree

2 files changed

+54
-75
lines changed

2 files changed

+54
-75
lines changed

controllers/vmware/virtualmachinegroup_reconciler.go

Lines changed: 41 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -227,31 +227,31 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
227227
return err
228228
}
229229

230-
// Member Update:
231-
// The VirtualMachineGroup's BootOrder.Members list, is only allowed to be set or added
232-
// during two phases to maintain control over VM placement:
230+
// The core purpose of isMemberUpdateAllowed is to prevent the VirtualMachineGroup from being updated with new members
231+
// that require placement, unless the VirtualMachineGroup
232+
// has successfully completed its initial placement and added the required
233+
// placement annotations. This stabilizes placement decisions before allowing new VMs
234+
// to be created under the group.
233235
//
234-
// 1. Initial Creation: When the VirtualMachineGroup object does not yet exist.
235-
// 2. Post-Placement: After the VirtualMachineGroup exists AND is marked Ready which means all members are placed successfully,
236-
// and critically, all MachineDeployments have a corresponding zone placement annotation recorded on the VMG.
237-
//
238-
// For member removal, this is always allowed since it doesn't impact ongoing placement or rely on the placement annotation.
236+
// The update is allowed if:
237+
// 1. The VirtualMachineGroup is being initially created
238+
// 2. The update is a scale-down operation (fewer target members).
239+
// 3. The new member's underlying CAPI Machine has a FailureDomain set (will skip placement process).
240+
// 4. The new member requires placement annotation AND the VirtualMachineGroup has the corresponding
241+
// placement annotation for the member's MachineDeployment.
239242
//
240243
// This prevents member updates that could lead to new VMs being created
241244
// without necessary zone labels, resulting in undesired placement, such as VM within a MachineDeployment but are
242245
// placed to different Zones.
243-
244-
isMemberUpdateAllowed, err := isMemberUpdateAllowed(ctx, r.Client, members, vmg)
246+
err := isMemberUpdateAllowed(ctx, r.Client, members, vmg)
245247
if err != nil {
246248
return err
247249
}
248250

249-
if isMemberUpdateAllowed {
250-
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
251-
{
252-
Members: members,
253-
},
254-
}
251+
vmg.Spec.BootOrder = []vmoprv1.VirtualMachineGroupBootOrderGroup{
252+
{
253+
Members: members,
254+
},
255255
}
256256

257257
// Set the owner reference
@@ -262,9 +262,8 @@ func (r *VirtualMachineGroupReconciler) reconcileVirtualMachineGroup(ctx context
262262
return nil
263263
}
264264

265-
// isMemberUpdateAllowed determines if the BootOrder.Members field can be safely updated on the VirtualMachineGroup.
266-
// It allows updates only during initial creation or after all member placement are completed successfully.
267-
func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) (bool, error) {
265+
// isMemberUpdateAllowed checks if a VirtualMachineGroup's BootOrder.Members update is allowed.
266+
func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, targetMember []vmoprv1.GroupMember, vmg *vmoprv1.VirtualMachineGroup) error {
268267
logger := log.FromContext(ctx)
269268
key := client.ObjectKey{
270269
Namespace: vmg.Namespace,
@@ -275,30 +274,30 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
275274
currentVMG := &vmoprv1.VirtualMachineGroup{}
276275
if err := kubeClient.Get(ctx, key, currentVMG); err != nil {
277276
if apierrors.IsNotFound(err) {
278-
// If VirtualMachineGroup is not found, allow member update as it should be in initial creation phase.
279-
logger.V(5).Info("VirtualMachineGroup not found, allowing member update for initial creation.")
280-
return true, nil
277+
// 1. If VirtualMachineGroup is not found, allow member update as it should be in initial creation phase.
278+
logger.V(5).Info("VirtualMachineGroup not created yet, allowing member update for initial creation.")
279+
return nil
281280
}
282-
return false, errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s", vmg.Namespace, vmg.Name)
281+
return errors.Wrapf(err, "failed to get VirtualMachineGroup %s/%s, blocking member update", vmg.Namespace, vmg.Name)
283282
}
284-
// Copy retrieved data back to the input pointer for consistency
283+
// Copy retrieved data back to the input pointer for consistency.
285284
*vmg = *currentVMG
286285

287-
// Get current member names from VirtualMachineGroup Spec.BootOrder
286+
// Get current member names from VirtualMachineGroup Spec.BootOrder.
288287
currentMemberNames := make(map[string]struct{})
289288
if len(vmg.Spec.BootOrder) > 0 {
290289
for _, m := range vmg.Spec.BootOrder[0].Members {
291290
currentMemberNames[m.Name] = struct{}{}
292291
}
293292
}
294293

295-
// 1. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
294+
// 2. If removing members, allow immediately since it doesn't impact placement or placement annotation set.
296295
if len(targetMember) < len(currentMemberNames) {
297296
logger.V(5).Info("Scaling down detected (fewer target members), allowing member update.")
298-
return true, nil
297+
return nil
299298
}
300299

301-
// 2. If adding members, continue following checks.
300+
// If adding members, continue following checks.
302301
var newMembers []vmoprv1.GroupMember
303302
for _, m := range targetMember {
304303
if _, exists := currentMemberNames[m.Name]; !exists {
@@ -317,10 +316,9 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
317316
vsphereMachine := &vmwarev1.VSphereMachine{}
318317
if err := kubeClient.Get(ctx, vsphereMachineKey, vsphereMachine); err != nil {
319318
if apierrors.IsNotFound(err) {
320-
logger.V(5).Info("VSphereMachine for new member not found, temporarily blocking update.", "VSphereMachineName", newMember.Name)
321-
return false, nil
319+
return errors.Wrapf(err, "VSphereMachine for new member %s not found, temporarily blocking member update", newMember.Name)
322320
}
323-
return false, errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
321+
return errors.Wrapf(err, "failed to get VSphereMachine %s", klog.KRef(newMember.Name, vmg.Namespace))
324322
}
325323

326324
var machineOwnerName string
@@ -333,8 +331,7 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
333331

334332
if machineOwnerName == "" {
335333
// VSphereMachine found but owner Machine reference is missing
336-
logger.V(5).Info("VSphereMachine found but owner Machine reference is missing, temporarily blocking update.", "VSphereMachineName", newMember.Name)
337-
return false, nil
334+
return errors.Wrapf(nil, "VSphereMachine %s found but owner Machine reference is missing, temporarily blocking member update", newMember.Name)
338335
}
339336

340337
machineKey := types.NamespacedName{
@@ -345,46 +342,38 @@ func isMemberUpdateAllowed(ctx context.Context, kubeClient client.Client, target
345342

346343
if err := kubeClient.Get(ctx, machineKey, machine); err != nil {
347344
if apierrors.IsNotFound(err) {
348-
logger.V(5).Info("CAPI Machine not found via owner reference, temporarily blocking update.", "Machine", klog.KRef(machineOwnerName, vmg.Namespace))
349-
return false, nil
345+
return errors.Wrapf(err, "Machine %s not found via owner reference, temporarily blocking member update", klog.KRef(machineOwnerName, vmg.Namespace))
350346
}
351-
return false, errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
347+
return errors.Wrapf(err, "failed to get CAPI Machine %s", klog.KRef(machineOwnerName, vmg.Namespace))
352348
}
353349

354-
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update.
350+
// If FailureDomain is set on CAPI Machine, placement process will be skipped. Allow update for this member.
355351
fd := machine.Spec.FailureDomain
356352
if fd != "" {
357-
logger.V(5).Info("New member's Machine has FailureDomain specified. Allowing VMG update for this member.")
353+
logger.V(5).Info("New member's Machine has FailureDomain specified. Allowing VMG update for this member.", "Member", newMember.Name)
358354
continue
359355
}
360356

361-
// If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to full VMG Annotation check.
362-
logger.V(5).Info("New member's CAPI Machine lacks FailureDomain. Falling through to VMG Annotation check.", "MachineName", machineOwnerName)
363-
364-
// If no Placement Annotations, skip member update and wait for it.
357+
// 4. If FailureDomain is NOT set. Requires placement or placement Annotation. Fall through to Annotation check.
358+
// If no Placement Annotations, block member update and wait for it.
365359
annotations := vmg.GetAnnotations()
366360
if len(annotations) == 0 {
367-
return false, nil
361+
return errors.Wrapf(nil, "waiting for placement annotation to update VMG member %s, temporarily blocking member update", newMember.Name)
368362
}
369363

370364
mdLabelName := vsphereMachine.Labels[clusterv1.MachineDeploymentNameLabel]
371365
if mdLabelName == "" {
372-
return false, errors.Wrapf(nil, "VSphereMachine doesn't have MachineDeployment name label %s", klog.KObj(vsphereMachine))
366+
return errors.Wrapf(nil, "VSphereMachine doesn't have MachineDeployment name label %s, blocking member update", klog.KObj(vsphereMachine))
373367
}
374368

375369
annotationKey := fmt.Sprintf("%s/%s", ZoneAnnotationPrefix, mdLabelName)
376-
377370
if _, found := annotations[annotationKey]; !found {
378-
logger.V(5).Info("Required placement annotation is missing.",
379-
"Member", newMember, "Annotation", annotationKey)
380-
return false, nil
371+
return errors.Wrapf(nil, "waiting for placement annotation %s to update VMG member %s, temporarily blocking member update", annotationKey, newMember.Name)
381372
}
382-
383-
logger.V(5).Info("New member requires placement annotation and it is present. Allowing this member.", "Member", newMember)
384373
}
385374

386-
logger.V(5).Info("Either no new members, or all newly added members existed or have satisfied placement requirements, allowing update.")
387-
return true, nil
375+
logger.V(5).Info("All newly added members either existed or have satisfied placement requirements, allowing member update.")
376+
return nil
388377
}
389378

390379
// getExpectedVSphereMachineCount get expected total count of Machines belonging to the Cluster.

controllers/vmware/virtualmachinegroup_reconciler_test.go

Lines changed: 13 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
126126
vmgInput: baseVMG.DeepCopy(),
127127
mdNames: []string{mdName1},
128128
existingObjects: nil,
129-
wantAllowed: true,
130129
wantErr: false,
131130
},
132131
{
@@ -144,8 +143,7 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
144143
}}}}
145144
return []runtime.Object{v}
146145
}(),
147-
wantAllowed: true,
148-
wantErr: false,
146+
wantErr: false,
149147
},
150148
{
151149
name: "Allow member update when VMG Ready and All Annotations Present",
@@ -168,11 +166,10 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
168166

169167
return []runtime.Object{v}
170168
}(),
171-
wantAllowed: true,
172-
wantErr: false,
169+
wantErr: false,
173170
},
174171
{
175-
name: "Skip member update if new member VSphereMachine Not Found",
172+
name: "Block member update if new member VSphereMachine Not Found",
176173
targetMember: []vmoprv1.GroupMember{member(memberName1), member(memberName2)}, // vm-02 is new
177174
vmgInput: baseVMG.DeepCopy(),
178175
mdNames: []string{mdName1},
@@ -187,11 +184,10 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
187184
// vm-02 VSphereMachine is missing
188185
return []runtime.Object{v, makeVSphereMachineOwned(memberName1, vmgNamespace, ownerMachineName1, mdName1), makeCAPIMachine(ownerMachineName1, vmgNamespace, ptr.To(failureDomainA))}
189186
}(),
190-
wantAllowed: false,
191-
wantErr: false,
187+
wantErr: true,
192188
},
193189
{
194-
name: "Skip member update if VSphereMachine found but CAPI Machine missing",
190+
name: "Block member update if VSphereMachine found but CAPI Machine missing",
195191
targetMember: []vmoprv1.GroupMember{member(memberName1), member(memberName2)}, // vm-02 is new
196192
vmgInput: baseVMG.DeepCopy(),
197193
mdNames: []string{mdName1},
@@ -206,8 +202,7 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
206202
// vm-02 VSphereMachine exists but has no owner ref
207203
return []runtime.Object{v, makeVSphereMachineOwned(memberName1, vmgNamespace, "ownerMachineName1", mdName1), makeCAPIMachine("ownerMachineName1", vmgNamespace, ptr.To(failureDomainA)), makeVSphereMachineNoOwner(memberName2, vmgNamespace)}
208204
}(),
209-
wantAllowed: false,
210-
wantErr: false,
205+
wantErr: true,
211206
},
212207
{
213208
name: "Allow member update if all new members have Machine FailureDomain specified",
@@ -222,15 +217,14 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
222217
Kind: memberKind,
223218
},
224219
}}}
225-
// m-02 (owner of vownerMachineName2) has FailureDomain set
220+
// m-02 (owner of ownerMachineName2) has FailureDomain set
226221
return []runtime.Object{
227222
v,
228223
makeVSphereMachineOwned(memberName1, vmgNamespace, "ownerMachineName1", mdName1), makeCAPIMachine("ownerMachineName1", vmgNamespace, nil),
229224
makeVSphereMachineOwned(memberName2, vmgNamespace, "ownerMachineName2", mdName2), makeCAPIMachine("ownerMachineName2", vmgNamespace, ptr.To(failureDomainA)),
230225
}
231226
}(),
232-
wantAllowed: true, // Allowed because new members don't require VMO placement
233-
wantErr: false,
227+
wantErr: false, // Allowed because new members don't require placement
234228
},
235229
{
236230
name: "Allow member update if no new member",
@@ -248,11 +242,10 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
248242
}}}
249243
return []runtime.Object{v}
250244
}(),
251-
wantAllowed: true,
252-
wantErr: false,
245+
wantErr: false,
253246
},
254247
{
255-
name: "Skip member update if new member Machine requires placement annotation",
248+
name: "Block member update if new member Machine requires placement annotation",
256249
targetMember: []vmoprv1.GroupMember{member(memberName1), member(memberName2)}, // vm-02 is new and requires placement
257250
vmgInput: baseVMG.DeepCopy(),
258251
mdNames: []string{mdName1},
@@ -271,8 +264,7 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
271264
makeVSphereMachineOwned(memberName2, vmgNamespace, "ownerMachineName2", mdName2), makeUnplacedCAPIMachine("ownerMachineName2", vmgNamespace),
272265
}
273266
}(),
274-
wantAllowed: false,
275-
wantErr: false,
267+
wantErr: true,
276268
},
277269
{
278270
name: "Allow new member Machine since required placement annotation exists",
@@ -297,8 +289,7 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
297289
makeVSphereMachineOwned(memberName2, vmgNamespace, "ownerMachineName2", mdName2), makeUnplacedCAPIMachine("ownerMachineName2", vmgNamespace),
298290
}
299291
}(),
300-
wantAllowed: true,
301-
wantErr: false,
292+
wantErr: false,
302293
},
303294
}
304295

@@ -311,13 +302,12 @@ func TestIsMemberUpdateAllowed(t *testing.T) {
311302

312303
vmgInput := tt.vmgInput.DeepCopy()
313304

314-
gotAllowed, err := isMemberUpdateAllowed(ctx, kubeClient, tt.targetMember, vmgInput)
305+
err := isMemberUpdateAllowed(ctx, kubeClient, tt.targetMember, vmgInput)
315306

316307
if tt.wantErr {
317308
g.Expect(err).To(HaveOccurred())
318309
} else {
319310
g.Expect(err).NotTo(HaveOccurred())
320-
g.Expect(gotAllowed).To(Equal(tt.wantAllowed))
321311
}
322312
})
323313
}

0 commit comments

Comments
 (0)