
Commit d899392

feat: add disruptive reboot test case (#692)
1 parent cefa55b commit d899392

2 files changed: +178 -0 lines changed


internal/e2e/conditions.go

Lines changed: 17 additions & 0 deletions
@@ -32,6 +32,23 @@ func (c *ConditionExtension) ResourceMatch(obj k8s.Object, matchFetcher func(obj
 	}
 }
 
+func (c *ConditionExtension) PodRunning(pod k8s.Object) apimachinerywait.ConditionWithContextFunc {
+	return func(ctx context.Context) (done bool, err error) {
+		if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil {
+			return false, err
+		}
+		status := pod.(*v1.Pod).Status
+		switch status.Phase {
+		case v1.PodRunning:
+			return true, nil
+		case v1.PodPending:
+			return false, nil
+		default:
+			return false, fmt.Errorf("pod cannot transition to running from current status: %s", status.Phase)
+		}
+	}
+}
+
 func (c *ConditionExtension) PodSucceeded(pod k8s.Object) apimachinerywait.ConditionWithContextFunc {
 	return func(ctx context.Context) (done bool, err error) {
 		if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil {
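The new PodRunning condition re-fetches the pod on each poll, reports done once the pod reaches the Running phase, keeps waiting while it is Pending, and fails fast on any other phase. A minimal usage sketch follows (the helper name, pod argument, and 5-minute timeout are illustrative, not part of this commit); the new test below drives the condition through the e2e-framework's wait.For in exactly this way.

// waitForPodRunning is a sketch only; it assumes the same imports as the new
// test below (context, testing, time, corev1, envconf, fwext, wait).
func waitForPodRunning(ctx context.Context, t *testing.T, cfg *envconf.Config, pod *corev1.Pod) {
	cond := fwext.NewConditionExtension(cfg.Client().Resources())
	if err := wait.For(cond.PodRunning(pod),
		wait.WithContext(ctx),
		wait.WithTimeout(5*time.Minute), // illustrative timeout
	); err != nil {
		t.Fatalf("pod %s never reached Running: %v", pod.GetName(), err)
	}
}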
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
//go:build e2e

package disruptive

import (
	"bytes"
	"context"
	"fmt"
	"strings"
	"testing"
	"time"

	fwext "github.com/aws/aws-k8s-tester/internal/e2e"

	"github.com/aws/aws-k8s-tester/internal/awssdk"
	"github.com/aws/aws-sdk-go-v2/service/ec2"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/util/exec"

	"sigs.k8s.io/e2e-framework/klient/wait"
	"sigs.k8s.io/e2e-framework/pkg/envconf"
	"sigs.k8s.io/e2e-framework/pkg/features"
)

func getSleepPodTemplate(name string, targetNodeName string, duration string) corev1.Pod {
	return corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: "default",
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{
				{
					Name:    name,
					Image:   "public.ecr.aws/amazonlinux/amazonlinux:2023",
					Command: []string{"sleep", duration},
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("250m"),
							corev1.ResourceMemory: resource.MustParse("64Mi"),
						},
						Limits: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("250m"),
							corev1.ResourceMemory: resource.MustParse("64Mi"),
						},
					},
				},
			},
			RestartPolicy: corev1.RestartPolicyNever,
			NodeName:      targetNodeName,
			Resources: &corev1.ResourceRequirements{
				// Set high pod-level limits to make sure the pod does not get
				// OOMKilled, and make requests equal to the limits to qualify
				// the pod for the Guaranteed Quality of Service class.
				Requests: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("250m"),
					corev1.ResourceMemory: resource.MustParse("64Mi"),
				},
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("250m"),
					corev1.ResourceMemory: resource.MustParse("64Mi"),
				},
			},
		},
	}
}

func TestGracefulReboot(t *testing.T) {
	terminationCanaryPodName := fmt.Sprintf("termination-canary-%d", time.Now().Unix())
	canaryPod := getSleepPodTemplate(terminationCanaryPodName, "", "infinity")
	bootIndicatorPodName := fmt.Sprintf("boot-detection-%d", time.Now().Unix())
	bootIndicatorPod := getSleepPodTemplate(bootIndicatorPodName, "", "infinity")

	feat := features.New("graceful-reboot").
		WithLabel("suite", "disruptive").
		Assess("Node gracefully reboots", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			if err := cfg.Client().Resources().Create(ctx, &canaryPod); err != nil {
				t.Fatalf("Failed to create termination canary pod: %v", err)
			}

			if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).PodRunning(&canaryPod),
				wait.WithContext(ctx),
				wait.WithTimeout(5*time.Minute),
			); err != nil {
				t.Fatalf("Failed to wait for pod %s to go into Running status: %v", terminationCanaryPodName, err)
			}

			var targetNode corev1.Node
			if err := cfg.Client().Resources().Get(ctx, canaryPod.Spec.NodeName, "", &targetNode); err != nil {
				t.Fatalf("Failed to get node %s: %v", canaryPod.Spec.NodeName, err)
			}

			providerIDParts := strings.Split(targetNode.Spec.ProviderID, "/")
			instanceID := providerIDParts[len(providerIDParts)-1]
			t.Logf("Node %s corresponds to EC2 instance: %s", targetNode.Name, instanceID)

			ec2Client := ec2.NewFromConfig(awssdk.NewConfig())

			// TODO: make sure the exec starts before the reboot to promote better determinism
			t.Logf("Rebooting instance %s to test graceful reboot...", instanceID)
			_, err := ec2Client.RebootInstances(ctx, &ec2.RebootInstancesInput{
				InstanceIds: []string{instanceID},
			})
			if err != nil {
				t.Fatalf("Failed to reboot EC2 instance %s: %v", instanceID, err)
			}
			t.Logf("Successfully initiated reboot of instance %s, waiting for pod %s to terminate...", instanceID, canaryPod.Name)

			t.Logf("Started exec into pod %s", terminationCanaryPodName)
			// Execute a blocking command in the pod and wait for it to exit with code 143, which indicates a SIGTERM.
			// This is a reliable way to detect termination since it requires a direct response from the kubelet.
			var execOut, execErr bytes.Buffer
			err = cfg.Client().Resources().ExecInPod(ctx, "default", terminationCanaryPodName, terminationCanaryPodName, []string{"sleep", "infinity"}, &execOut, &execErr)
			if err != nil {
				if execErr, ok := err.(exec.CodeExitError); ok && execErr.Code == 143 {
					t.Logf("Pod %s was terminated", terminationCanaryPodName)
				} else {
					t.Fatalf("Got unexpected error terminating pod: %v", err)
				}
			}

			t.Logf("Waiting up to 10 minutes for node %s to become schedulable again", targetNode.Name)

			// Create a second pod, under the assumption that a new pod cannot be scheduled by a shutting-down kubelet
			// that has already evicted other pods, so this one should only start under a fresh kubelet after the reboot.
			bootIndicatorPod.Spec.NodeName = targetNode.Name
			if err := cfg.Client().Resources().Create(ctx, &bootIndicatorPod); err != nil {
				t.Fatalf("Failed to create boot indicator pod: %v", err)
			}

			if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).PodRunning(&bootIndicatorPod),
				wait.WithContext(ctx),
				wait.WithTimeout(10*time.Minute), // TODO: bring down this value after collecting some more data
			); err != nil {
				t.Fatalf("Failed to wait for pod %s to go into Running status: %v", bootIndicatorPodName, err)
			}

			t.Logf("Node %s became ready and schedulable within %v!", targetNode.Name, time.Since(bootIndicatorPod.CreationTimestamp.Time))
			return ctx
		}).
		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			if err := cfg.Client().Resources().Delete(ctx, &canaryPod); err != nil {
				t.Logf("Failed to delete pod %s: %v", terminationCanaryPodName, err)
			} else {
				t.Logf("Successfully cleaned up pod %s", terminationCanaryPodName)
			}

			if err := cfg.Client().Resources().Delete(ctx, &bootIndicatorPod); err != nil {
				t.Logf("Failed to delete pod %s: %v", bootIndicatorPodName, err)
			} else {
				t.Logf("Successfully cleaned up pod %s", bootIndicatorPodName)
			}
			return ctx
		}).
		Feature()

	testenv.Test(t, feat)
}
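TestGracefulReboot hands the feature to a package-level testenv, which is not part of this diff and is presumably defined elsewhere in the disruptive suite. A minimal sketch of a TestMain that could provide it, assuming the stock sigs.k8s.io/e2e-framework wiring (envconf.NewFromFlags and env.NewWithConfig are assumptions here, not code from this commit):

//go:build e2e

package disruptive

import (
	"os"
	"testing"

	"sigs.k8s.io/e2e-framework/pkg/env"
	"sigs.k8s.io/e2e-framework/pkg/envconf"
)

var testenv env.Environment

// TestMain builds the shared environment that the test's
// testenv.Test(t, feat) call relies on. Sketch only, under the
// assumptions stated above.
func TestMain(m *testing.M) {
	cfg, err := envconf.NewFromFlags()
	if err != nil {
		panic(err)
	}
	testenv = env.NewWithConfig(cfg)
	os.Exit(testenv.Run(m))
}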
