Skip to content

Commit b98b06b

Browse files
committed
feat(eksapi): enhance error messages for cfn failures
1 parent 270c7b8 commit b98b06b

File tree

3 files changed

+53
-2
lines changed

3 files changed

+53
-2
lines changed

internal/deployers/eksapi/infra.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
2626
"github.com/aws/aws-k8s-tester/internal/metrics"
27+
"github.com/aws/aws-k8s-tester/internal/util"
2728
)
2829

2930
const (
@@ -531,7 +532,7 @@ func (m *InfrastructureManager) createCloudWatchInfrastructureStack(clusterName
531532
StackName: out.StackId,
532533
},
533534
infraStackCreationTimeout); err != nil {
534-
return "", fmt.Errorf("failed to wait for CloudWatch infrastructure stack creation: %w", err)
535+
return "", util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for CloudWatch infrastructure stack creation: %w", err), stackName)
535536
}
536537

537538
// Get the CloudWatch role ARN from stack outputs

internal/deployers/eksapi/node.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
2929

3030
"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
31+
"github.com/aws/aws-k8s-tester/internal/util"
3132
apierrors "k8s.io/apimachinery/pkg/api/errors"
3233
)
3334

@@ -504,7 +505,7 @@ func (m *nodeManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *C
504505
},
505506
opts.NodeCreationTimeout)
506507
if err != nil {
507-
return fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err)
508+
return util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err), stackName)
508509
}
509510
klog.Infof("created unmanaged nodegroup stack: %s", *out.StackId)
510511
if opts.ExpectedAMI != "" {

internal/util/cloudformation.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package util
2+
3+
import (
	"context"
	"fmt"
	"sort"
	"strings"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/cloudformation"
	types "github.com/aws/aws-sdk-go-v2/service/cloudformation/types"
)
12+
13+
// TODO: implement AWS client wrappers, and incorporate this into the cfn:CreateStack call
14+
func WrapCFNStackFailure(ctx context.Context, cfnClient *cloudformation.Client, createStackErr error, stackName string) error {
15+
if createStackErr == nil {
16+
return nil
17+
}
18+
resourceByFailureMode := make(map[string][]string)
19+
eventsPaginator := cloudformation.NewDescribeStackEventsPaginator(cfnClient, &cloudformation.DescribeStackEventsInput{
20+
StackName: &stackName,
21+
})
22+
for eventsPaginator.HasMorePages() {
23+
page, err := eventsPaginator.NextPage(ctx)
24+
if err != nil {
25+
return createStackErr
26+
}
27+
for _, event := range page.StackEvents {
28+
if event.ResourceStatus == types.ResourceStatusCreateFailed {
29+
if _, ok := resourceByFailureMode[aws.ToString(event.ResourceStatusReason)]; !ok {
30+
resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = []string{}
31+
}
32+
resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = append(resourceByFailureMode[aws.ToString(event.ResourceStatusReason)], aws.ToString(event.LogicalResourceId))
33+
}
34+
}
35+
}
36+
nonCancellationFailure := len(resourceByFailureMode) > 1
37+
var enhancedDetails []string
38+
for reason, resources := range resourceByFailureMode {
39+
if nonCancellationFailure && reason == "Resource creation cancelled" {
40+
// Ignore resource cancellation errors if there's another failure reported, those failures
41+
// would just be a consequence of that failure. If all the failures are resource cancellation,
42+
// then there was likely a user initiated delete of the whole stack based on a timeout
43+
// waiting for one of the resources to create
44+
continue
45+
}
46+
enhancedDetails = append(enhancedDetails, fmt.Sprintf("%s: %s", strings.Join(resources, ","), reason))
47+
}
48+
return fmt.Errorf("%w: %s", createStackErr, strings.Join(enhancedDetails, "--"))
49+
}

0 commit comments

Comments
 (0)