
Commit 0e171f6

Merge pull request #4372 from Azure/alcasim/ARO-11486
E2E for FixEtcd Geneva action
2 parents 6ef77a8 + ea6b0c7 commit 0e171f6

File tree

1 file changed (+202 -0)

test/e2e/fix_etcd.go

Lines changed: 202 additions & 0 deletions

@@ -0,0 +1,202 @@
package e2e

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
	"context"
	"fmt"
	"net/http"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"

	corev1 "k8s.io/api/core/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"

	configv1 "github.com/openshift/api/config/v1"
	"github.com/openshift/api/machine/v1beta1"

	"github.com/Azure/ARO-RP/pkg/util/pointerutils"
)

const (
	masterMachineLabel = "machine.openshift.io/cluster-api-machine-role=master"
)
// Steps performed in this test:
// 1. Disable cluster-version-operator and etcd-operator
// 2. Check whether guardrails prevent machines from being deleted, and disable them if necessary
// 3. Delete the first master machine
// 4. Re-enable the operators
// 5. Recreate the machine
// 6. Wait for the new etcd pod
// 7. Run the fix
// 8. Wait until the operators recover from degraded
// 9. Re-enable guardrails if necessary
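
// This regression test exercises the FixEtcd Geneva action end to end: it
// simulates a master machine replacement and then calls the admin
// etcdrecovery endpoint to repair etcd if the replacement left it broken.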
var _ = Describe("Master replacement", Label(regressiontest), func() {
	BeforeEach(skipIfNotInDevelopmentEnv)

	It("should fix etcd automatically", Serial, func(ctx context.Context) {
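		// Scale cluster-version-operator down first: it owns the etcd-operator
		// Deployment and would otherwise reconcile it back to its original state.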
By("Disabling reconciliation")
47+
dep, err := clients.Kubernetes.AppsV1().Deployments("openshift-cluster-version").Get(ctx, "cluster-version-operator", metav1.GetOptions{})
48+
Expect(err).NotTo(HaveOccurred())
49+
dep.Spec.Replicas = pointerutils.ToPtr(int32(0))
50+
_, err = clients.Kubernetes.AppsV1().Deployments("openshift-cluster-version").Update(ctx, dep, metav1.UpdateOptions{})
51+
Expect(err).NotTo(HaveOccurred())
52+
53+
dep, err = clients.Kubernetes.AppsV1().Deployments("openshift-etcd-operator").Get(ctx, "etcd-operator", metav1.GetOptions{})
54+
Expect(err).NotTo(HaveOccurred())
55+
dep.Spec.Replicas = pointerutils.ToPtr(int32(0))
56+
_, err = clients.Kubernetes.AppsV1().Deployments("openshift-etcd-operator").Update(ctx, dep, metav1.UpdateOptions{})
57+
Expect(err).NotTo(HaveOccurred())
58+

		// Check if we have guardrails in the cluster, so we can disable them to delete machines if necessary
		templateAroConstraint := &unstructured.Unstructured{}
		templateAroConstraint.SetAPIVersion("constraints.gatekeeper.sh/v1beta1")
		templateAroConstraint.SetKind("ARODenyLabels")
		constraintPresent := true

		aroConstraintClient, err := clients.Dynamic.GetClient(templateAroConstraint)
		Expect(err).NotTo(HaveOccurred())
		_, err = aroConstraintClient.Get(ctx, "aro-machines-deny", metav1.GetOptions{})

		if err != nil {
			if kerrors.IsNotFound(err) {
				// This cluster does not have guardrails, so we don't need to disable and re-enable them
				constraintPresent = false
			} else {
				// Something else happened and we can't continue testing
				Expect(err).ToNot(HaveOccurred())
			}
		}
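
		// Guardrail policies are toggled via operator flags on the ARO Cluster
		// resource; setting "managed" to false causes the Gatekeeper constraint
		// to be removed, which the test waits for below.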
		if constraintPresent {
			patchPayload := `[
				{
					"op": "replace",
					"path": "/spec/operatorflags/aro.guardrails.policies.aro-machines-deny.managed",
					"value": "false"
				}
			]`
			patchBytes := []byte(patchPayload)
			By("Disabling guardrail policies for aro machines")
			_, err = clients.AROClusters.AroV1alpha1().Clusters().Patch(ctx, "cluster", types.JSONPatchType, patchBytes, metav1.PatchOptions{})
			Expect(err).NotTo(HaveOccurred())

			By("Waiting for constraint to be removed")
			Eventually(func(g Gomega, ctx context.Context) {
				_, err := aroConstraintClient.Get(ctx, "aro-machines-deny", metav1.GetOptions{})
				g.Expect(err).To(HaveOccurred())
				g.Expect(kerrors.IsNotFound(err)).To(BeTrue())
			}, 10*time.Minute, 10*time.Second, ctx).Should(Succeed())
		}
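
		// Deleting a master machine simulates the failed-control-plane-node
		// scenario that the FixEtcd action is meant to recover from.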
By("Deleting the first master machine")
101+
machines, err := clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").
102+
List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabel})
103+
Expect(err).NotTo(HaveOccurred())
104+
Expect(machines.Items).To(HaveLen(3))
105+
machine := machines.Items[0]
106+
107+
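
		// Clear the fields that tie the Machine object to the existing VM
		// (provider ID, status) and drop lifecycle hooks, which could
		// otherwise block deletion.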
		machine.Spec.ProviderID = nil
		machine.Status = v1beta1.MachineStatus{}
		machine.Spec.LifecycleHooks = v1beta1.LifecycleHooks{}
		_, err = clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").Update(ctx, &machine, metav1.UpdateOptions{})
		Expect(err).NotTo(HaveOccurred())
		err = clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").Delete(ctx, machine.Name, metav1.DeleteOptions{})
		Expect(err).NotTo(HaveOccurred())
By("Waiting for the machine to be deleted")
116+
Eventually(func(g Gomega, ctx context.Context) {
117+
machines, err := clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").
118+
List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabel})
119+
g.Expect(err).NotTo(HaveOccurred())
120+
g.Expect(machines.Items).To(HaveLen(2))
121+
}, 10*time.Minute, 10*time.Second, ctx).Should(Succeed())
122+
123+
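
		// Restoring only the CVO is enough: once running, it scales the
		// etcd-operator Deployment back up itself.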
By("Reverting deployments") // cluster-version-operator reconciles etcd-operator.
124+
dep, err = clients.Kubernetes.AppsV1().Deployments("openshift-cluster-version").Get(ctx, "cluster-version-operator", metav1.GetOptions{})
125+
Expect(err).NotTo(HaveOccurred())
126+
dep.Spec.Replicas = pointerutils.ToPtr(int32(1))
127+
_, err = clients.Kubernetes.AppsV1().Deployments("openshift-cluster-version").Update(ctx, dep, metav1.UpdateOptions{})
128+
Expect(err).NotTo(HaveOccurred())
129+
130+
By("Recreating the machine")
131+
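		// Reset ObjectMeta so the Create call doesn't carry over the UID,
		// resourceVersion, or finalizers of the deleted object.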
		machine.ObjectMeta = metav1.ObjectMeta{
			Labels:    machine.Labels,
			Name:      machine.Name,
			Namespace: machine.Namespace,
		}
		_, err = clients.MachineAPI.MachineV1beta1().Machines("openshift-machine-api").Create(ctx, &machine, metav1.CreateOptions{})
		Expect(err).NotTo(HaveOccurred())
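
		// On ARO, master node names match their Machine names, so machine.Name
		// is used to look up both the node and, below, its etcd pod.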
By("Waiting for the machine to be created and its node to be ready")
140+
Eventually(func(g Gomega, ctx context.Context) {
141+
node, err := clients.Kubernetes.CoreV1().Nodes().Get(ctx, machine.Name, metav1.GetOptions{})
142+
g.Expect(err).NotTo(HaveOccurred())
143+
for _, condition := range node.Status.Conditions {
144+
if condition.Type == corev1.NodeReady {
145+
g.Expect(condition.Status).To(Equal(corev1.ConditionTrue))
146+
return
147+
}
148+
}
149+
}, 15*time.Minute, 10*time.Second, ctx).Should(Succeed())
150+
151+
By("Waiting for the etcd pod to be created")
152+
Eventually(func(g Gomega, ctx context.Context) {
153+
_, err := clients.Kubernetes.CoreV1().Pods("openshift-etcd").Get(ctx, fmt.Sprintf("etcd-%s", machine.Name), metav1.GetOptions{})
154+
g.Expect(err).NotTo(HaveOccurred())
155+
}, 5*time.Minute, 10*time.Second, ctx).Should(Succeed())
156+
157+
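
		// The etcdrecovery admin endpoint is what backs the FixEtcd Geneva
		// action this test covers.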
By("Running etcd recovery API")
158+
resp, err := adminRequest(ctx, http.MethodPost, "/admin"+clusterResourceID+"/etcdrecovery", nil, true, nil, nil)
159+
Expect(err).NotTo(HaveOccurred())
160+
// The master replacement doesn't always break the etcd.
161+
// It returns 200 and fixes it if broken, and it returns 400 if not broken.
162+
// If it gets either of them, we can say that the etcd is fixed (or not broken).
163+
Expect(resp.StatusCode).To(Or(Equal(http.StatusOK), Equal(http.StatusBadRequest)))
164+
By(fmt.Sprintf("Status Code: %d", resp.StatusCode))
165+
166+
By("Waiting for the cluster operator not to be degraded")
167+
Eventually(func(g Gomega, ctx context.Context) {
168+
cos, err := clients.ConfigClient.ConfigV1().ClusterOperators().List(ctx, metav1.ListOptions{})
169+
g.Expect(err).NotTo(HaveOccurred())
170+
for _, co := range cos.Items {
171+
isDegraded := false
172+
isAvailable := false
173+
for _, condition := range co.Status.Conditions {
174+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionTrue {
175+
isAvailable = true
176+
}
177+
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
178+
isDegraded = true
179+
}
180+
}
181+
g.Expect(isAvailable).To(BeTrue(), "operator %s is not available", co.Name)
182+
g.Expect(isDegraded).To(BeFalse(), "operator %s is degraded", co.Name)
183+
}
184+
}, 10*time.Minute, 10*time.Second, ctx).Should(Succeed())
185+
186+

		if constraintPresent {
			// Re-enable the guardrail policy so Gatekeeper once again denies
			// deletion of machines carrying the cluster-api-machine-role label
			By("Enabling guardrail policies for aro machines")
			patchPayload := `[
				{
					"op": "replace",
					"path": "/spec/operatorflags/aro.guardrails.policies.aro-machines-deny.managed",
					"value": "true"
				}
			]`
			patchBytes := []byte(patchPayload)

			_, err = clients.AROClusters.AroV1alpha1().Clusters().Patch(ctx, "cluster", types.JSONPatchType, patchBytes, metav1.PatchOptions{})
			Expect(err).NotTo(HaveOccurred())
		}
	})
})
