Skip to content

Commit d7fc72f

Browse files
authored
Merge pull request #4441 from Azure/dbizau/disable_vmss_repairs_before_cleanup
disable automatic vmss repairs before cleanup and add retry
2 parents 6e55b76 + 4549611 commit d7fc72f

File tree

7 files changed

+328
-4
lines changed

7 files changed

+328
-4
lines changed

pkg/deploy/deploy.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ import (
99
"fmt"
1010
"reflect"
1111
"strings"
12+
"time"
1213

1314
"github.com/jongio/azidext/go/azidext"
1415
"github.com/sirupsen/logrus"
1516

1617
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
18+
mgmtcompute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute"
1719
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
1820
"github.com/Azure/go-autorest/autorest/azure"
1921

@@ -29,6 +31,7 @@ import (
2931
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/features"
3032
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/msi"
3133
"github.com/Azure/ARO-RP/pkg/util/azureclient/mgmt/storage"
34+
"github.com/Azure/ARO-RP/pkg/util/pointerutils"
3235
)
3336

3437
var _ Deployer = (*deployer)(nil)
@@ -266,3 +269,61 @@ func (d *deployer) checkForKnownError(serviceErr *azure.ServiceError, deployAtte
266269

267270
return "", nil
268271
}
272+
273+
// disableAutomaticRepairsOnVMSS disables automatic repairs on a VMSS to prevent
274+
// race conditions during teardown when stopping services causes health probe failures.
275+
// This is a best-effort operation - we log errors as warnings but don't fail the
276+
// teardown, allowing cleanup to proceed even if we can't disable repairs.
277+
func (d *deployer) disableAutomaticRepairsOnVMSS(ctx context.Context, resourceGroupName, vmssName string) error {
278+
d.log.Printf("disabling automatic repairs on scaleset %s", vmssName)
279+
start := time.Now()
280+
err := d.vmss.UpdateAndWait(ctx, resourceGroupName, vmssName, mgmtcompute.VirtualMachineScaleSetUpdate{
281+
VirtualMachineScaleSetUpdateProperties: &mgmtcompute.VirtualMachineScaleSetUpdateProperties{
282+
AutomaticRepairsPolicy: &mgmtcompute.AutomaticRepairsPolicy{
283+
Enabled: pointerutils.ToPtr(false),
284+
},
285+
},
286+
})
287+
d.log.Printf("disabling automatic repairs on %s took %v", vmssName, time.Since(start))
288+
289+
if err != nil {
290+
// Log as warning but don't fail the teardown - we want to proceed with
291+
// stopping services and deleting the VMSS even if we couldn't disable repairs
292+
d.log.Warnf("failed to disable automatic repairs on %s, proceeding with teardown anyway: %v", vmssName, err)
293+
return nil
294+
}
295+
return nil
296+
}
297+
298+
// runCommandWithRetry runs a command on a VMSS instance with retry logic for
299+
// OperationPreempted errors that can occur when Azure repair operations race
300+
// with our shutdown commands.
301+
func (d *deployer) runCommandWithRetry(ctx context.Context, resourceGroupName, vmssName, instanceID string, input mgmtcompute.RunCommandInput) error {
302+
const maxRetries = 3
303+
const retryDelaySecs = 10
304+
305+
var err error
306+
for i := 0; i < maxRetries; i++ {
307+
err = d.vmssvms.RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input)
308+
if err == nil || !isOperationPreemptedError(err) {
309+
break
310+
}
311+
if i == maxRetries-1 {
312+
break
313+
}
314+
d.log.Printf("RunCommand preempted on instance %s, retrying (%d/%d)", instanceID, i+1, maxRetries)
315+
timer := time.NewTimer(retryDelaySecs * time.Second)
316+
select {
317+
case <-ctx.Done():
318+
if !timer.Stop() {
319+
<-timer.C
320+
}
321+
if ctxErr := ctx.Err(); ctxErr != nil {
322+
return ctxErr
323+
}
324+
return err
325+
case <-timer.C:
326+
}
327+
}
328+
return err
329+
}

pkg/deploy/deploy_test.go

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,22 @@ package deploy
55

66
import (
77
"context"
8+
"errors"
89
"fmt"
910
"reflect"
1011
"testing"
12+
"time"
1113

1214
"github.com/sirupsen/logrus"
1315
"go.uber.org/mock/gomock"
1416

17+
mgmtcompute "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-06-01/compute"
1518
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
19+
"github.com/Azure/go-autorest/autorest"
1620
"github.com/Azure/go-autorest/autorest/azure"
1721

1822
"github.com/Azure/ARO-RP/pkg/util/arm"
23+
mock_compute "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/mgmt/compute"
1924
mock_features "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/mgmt/features"
2025
mock_vmsscleaner "github.com/Azure/ARO-RP/pkg/util/mocks/vmsscleaner"
2126
"github.com/Azure/ARO-RP/pkg/util/pointerutils"
@@ -379,3 +384,199 @@ func TestGetParameters(t *testing.T) {
379384
})
380385
}
381386
}
387+
388+
func TestDisableAutomaticRepairsOnVMSS(t *testing.T) {
389+
ctx := context.Background()
390+
resourceGroupName := "rg"
391+
vmssName := "vmss"
392+
393+
for _, tt := range []struct {
394+
name string
395+
updateErr error
396+
}{
397+
{
398+
name: "success",
399+
},
400+
{
401+
name: "azure error returns nil",
402+
updateErr: errors.New("update failed"),
403+
},
404+
} {
405+
t.Run(tt.name, func(t *testing.T) {
406+
controller := gomock.NewController(t)
407+
defer controller.Finish()
408+
409+
mockVMSS := mock_compute.NewMockVirtualMachineScaleSetsClient(controller)
410+
411+
mockVMSS.EXPECT().UpdateAndWait(ctx, resourceGroupName, vmssName, gomock.AssignableToTypeOf(mgmtcompute.VirtualMachineScaleSetUpdate{})).DoAndReturn(
412+
func(_ context.Context, rg, name string, update mgmtcompute.VirtualMachineScaleSetUpdate) error {
413+
if update.VirtualMachineScaleSetUpdateProperties == nil {
414+
t.Fatalf("expected VirtualMachineScaleSetUpdateProperties to be set")
415+
}
416+
policy := update.AutomaticRepairsPolicy
417+
if policy == nil || policy.Enabled == nil {
418+
t.Fatalf("expected AutomaticRepairsPolicy.Enabled to be set")
419+
}
420+
if *policy.Enabled {
421+
t.Fatalf("expected AutomaticRepairsPolicy.Enabled to be false")
422+
}
423+
return tt.updateErr
424+
},
425+
)
426+
427+
d := deployer{
428+
log: logrus.NewEntry(logrus.StandardLogger()),
429+
vmss: mockVMSS,
430+
}
431+
432+
if err := d.disableAutomaticRepairsOnVMSS(ctx, resourceGroupName, vmssName); err != nil {
433+
t.Fatalf("unexpected error: %v", err)
434+
}
435+
})
436+
}
437+
}
438+
439+
func TestRunCommandWithRetrySuccess(t *testing.T) {
440+
ctx := context.Background()
441+
controller := gomock.NewController(t)
442+
defer controller.Finish()
443+
444+
input := mgmtcompute.RunCommandInput{}
445+
resourceGroupName := "rg"
446+
vmssName := "vmss"
447+
instanceID := "1"
448+
449+
mockVMSSVMs := mock_compute.NewMockVirtualMachineScaleSetVMsClient(controller)
450+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).Return(nil)
451+
452+
d := deployer{
453+
log: logrus.NewEntry(logrus.StandardLogger()),
454+
vmssvms: mockVMSSVMs,
455+
}
456+
457+
if err := d.runCommandWithRetry(ctx, resourceGroupName, vmssName, instanceID, input); err != nil {
458+
t.Fatalf("unexpected error: %v", err)
459+
}
460+
}
461+
462+
func TestRunCommandWithRetryRetriesOnOperationPreempted(t *testing.T) {
463+
ctx := context.Background()
464+
controller := gomock.NewController(t)
465+
defer controller.Finish()
466+
467+
input := mgmtcompute.RunCommandInput{}
468+
resourceGroupName := "rg"
469+
vmssName := "vmss"
470+
instanceID := "1"
471+
472+
mockVMSSVMs := mock_compute.NewMockVirtualMachineScaleSetVMsClient(controller)
473+
gomock.InOrder(
474+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).Return(newOperationPreemptedError()),
475+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).Return(nil),
476+
)
477+
478+
d := deployer{
479+
log: logrus.NewEntry(logrus.StandardLogger()),
480+
vmssvms: mockVMSSVMs,
481+
}
482+
483+
start := time.Now()
484+
if err := d.runCommandWithRetry(ctx, resourceGroupName, vmssName, instanceID, input); err != nil {
485+
t.Fatalf("unexpected error: %v", err)
486+
}
487+
duration := time.Since(start)
488+
if duration < 10*time.Second {
489+
t.Fatalf("expected retry delay of at least 10s, got %v", duration)
490+
}
491+
}
492+
493+
func TestRunCommandWithRetryReturnsLastError(t *testing.T) {
494+
ctx := context.Background()
495+
controller := gomock.NewController(t)
496+
defer controller.Finish()
497+
498+
input := mgmtcompute.RunCommandInput{}
499+
resourceGroupName := "rg"
500+
vmssName := "vmss"
501+
instanceID := "1"
502+
wantErr := newOperationPreemptedError()
503+
504+
mockVMSSVMs := mock_compute.NewMockVirtualMachineScaleSetVMsClient(controller)
505+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).Return(wantErr).Times(3)
506+
507+
d := deployer{
508+
log: logrus.NewEntry(logrus.StandardLogger()),
509+
vmssvms: mockVMSSVMs,
510+
}
511+
512+
start := time.Now()
513+
err := d.runCommandWithRetry(ctx, resourceGroupName, vmssName, instanceID, input)
514+
duration := time.Since(start)
515+
if err != wantErr {
516+
t.Fatalf("expected %v, got %v", wantErr, err)
517+
}
518+
if duration < 20*time.Second {
519+
t.Fatalf("expected retry delay of at least 20s (2 retries), got %v", duration)
520+
}
521+
}
522+
523+
func TestRunCommandWithRetryReturnsNonRetryableError(t *testing.T) {
524+
ctx := context.Background()
525+
controller := gomock.NewController(t)
526+
defer controller.Finish()
527+
528+
input := mgmtcompute.RunCommandInput{}
529+
resourceGroupName := "rg"
530+
vmssName := "vmss"
531+
instanceID := "1"
532+
wantErr := errors.New("boom")
533+
534+
mockVMSSVMs := mock_compute.NewMockVirtualMachineScaleSetVMsClient(controller)
535+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).Return(wantErr)
536+
537+
d := deployer{
538+
log: logrus.NewEntry(logrus.StandardLogger()),
539+
vmssvms: mockVMSSVMs,
540+
}
541+
542+
err := d.runCommandWithRetry(ctx, resourceGroupName, vmssName, instanceID, input)
543+
if err != wantErr {
544+
t.Fatalf("expected %v, got %v", wantErr, err)
545+
}
546+
}
547+
548+
func TestRunCommandWithRetryContextCancelled(t *testing.T) {
549+
ctx, cancel := context.WithCancel(context.Background())
550+
controller := gomock.NewController(t)
551+
defer controller.Finish()
552+
553+
input := mgmtcompute.RunCommandInput{}
554+
resourceGroupName := "rg"
555+
vmssName := "vmss"
556+
instanceID := "1"
557+
558+
mockVMSSVMs := mock_compute.NewMockVirtualMachineScaleSetVMsClient(controller)
559+
mockVMSSVMs.EXPECT().RunCommandAndWait(ctx, resourceGroupName, vmssName, instanceID, input).DoAndReturn(
560+
func(context.Context, string, string, string, mgmtcompute.RunCommandInput) error {
561+
// Cancel context during the first call to simulate cancellation during retry
562+
cancel()
563+
return newOperationPreemptedError()
564+
},
565+
)
566+
567+
d := deployer{
568+
log: logrus.NewEntry(logrus.StandardLogger()),
569+
vmssvms: mockVMSSVMs,
570+
}
571+
572+
err := d.runCommandWithRetry(ctx, resourceGroupName, vmssName, instanceID, input)
573+
if !errors.Is(err, context.Canceled) {
574+
t.Fatalf("expected context.Canceled error, got %v", err)
575+
}
576+
}
577+
578+
func newOperationPreemptedError() error {
579+
return &autorest.DetailedError{
580+
Original: &azure.ServiceError{Code: OperationPreemptedCode},
581+
}
582+
}

pkg/deploy/error.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,14 @@ package deploy
44
// Licensed under the Apache License 2.0.
55

66
import (
7+
"strings"
8+
79
"github.com/Azure/go-autorest/autorest"
810
"github.com/Azure/go-autorest/autorest/azure"
911
)
1012

13+
const OperationPreemptedCode = "OperationPreempted"
14+
1115
func isDeploymentNotFoundError(err error) bool {
1216
if detailedErr, ok := err.(autorest.DetailedError); ok {
1317
if requestErr, ok := detailedErr.Original.(*azure.RequestError); ok &&
@@ -18,3 +22,19 @@ func isDeploymentNotFoundError(err error) bool {
1822
}
1923
return false
2024
}
25+
26+
func isOperationPreemptedError(err error) bool {
27+
if detailedErr, ok := err.(autorest.DetailedError); ok {
28+
if requestErr, ok := detailedErr.Original.(*azure.RequestError); ok &&
29+
requestErr.ServiceError != nil &&
30+
requestErr.ServiceError.Code == OperationPreemptedCode {
31+
return true
32+
}
33+
if serviceErr, ok := detailedErr.Original.(*azure.ServiceError); ok &&
34+
serviceErr.Code == OperationPreemptedCode {
35+
return true
36+
}
37+
}
38+
// Also check error message for OperationPreempted
39+
return err != nil && strings.Contains(err.Error(), OperationPreemptedCode)
40+
}

pkg/deploy/upgrade_gateway.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,10 @@ func (d *deployer) UpgradeGateway(ctx context.Context) error {
3030
return err
3131
}
3232

33-
return d.gatewayRemoveOldScalesets(ctx)
33+
// Create a separate timeout for cleanup phase
34+
cleanupCtx, cleanupCancel := context.WithTimeout(ctx, 30*time.Minute)
35+
defer cleanupCancel()
36+
return d.gatewayRemoveOldScalesets(cleanupCtx)
3437
}
3538

3639
func (d *deployer) gatewayWaitForReadiness(ctx context.Context, vmssName string) error {
@@ -73,6 +76,12 @@ func (d *deployer) gatewayRemoveOldScalesets(ctx context.Context) error {
7376
}
7477

7578
func (d *deployer) gatewayRemoveOldScaleset(ctx context.Context, vmssName string) error {
79+
// Disable automatic repairs on the old VMSS to prevent race conditions during teardown
80+
err := d.disableAutomaticRepairsOnVMSS(ctx, d.config.GatewayResourceGroupName, vmssName)
81+
if err != nil {
82+
return err
83+
}
84+
7685
scalesetVMs, err := d.vmssvms.List(ctx, d.config.GatewayResourceGroupName, vmssName, "", "", "")
7786
if err != nil {
7887
return err
@@ -84,7 +93,7 @@ func (d *deployer) gatewayRemoveOldScaleset(ctx context.Context, vmssName string
8493
if d.isVMInstanceHealthy(ctx, d.config.GatewayResourceGroupName, vmssName, *vm.InstanceID) {
8594
d.log.Printf("stopping gateway service on %s", *vm.Name)
8695
go func(id string) {
87-
errors <- d.vmssvms.RunCommandAndWait(ctx, d.config.GatewayResourceGroupName, vmssName, id, mgmtcompute.RunCommandInput{
96+
errors <- d.runCommandWithRetry(ctx, d.config.GatewayResourceGroupName, vmssName, id, mgmtcompute.RunCommandInput{
8897
CommandID: pointerutils.ToPtr("RunShellScript"),
8998
Script: &[]string{"systemctl stop aro-gateway"},
9099
})

0 commit comments

Comments
 (0)