Skip to content

Commit a60f3b1

Browse files
committed
chore: clean up node conditions before test
1 parent b8fe998 commit a60f3b1

File tree

3 files changed

+17
-6
lines changed

3 files changed

+17
-6
lines changed

health-events-analyzer/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ require (
2828
)
2929

3030
require (
31-
github.com/BurntSushi/toml v1.5.0
31+
github.com/BurntSushi/toml v1.5.0 // indirect
3232
github.com/beorn7/perks v1.0.1 // indirect
3333
github.com/caarlos0/env/v11 v11.3.1 // indirect
3434
github.com/cespare/xxhash/v2 v2.3.0 // indirect

tests/helpers/kube.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ func CreateRebootNodeCR(
613613
}
614614

615615
// WaitForNodeConditionWithCheckName waits for the node to have a condition with status True whose reason contains the specified checkName and whose message is contained in the specified message.
616-
func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName string) {
616+
func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klient.Client, nodeName, checkName, message string) {
617617
require.Eventually(t, func() bool {
618618
node, err := GetNodeByName(ctx, c, nodeName)
619619
if err != nil {
@@ -623,7 +623,7 @@ func WaitForNodeConditionWithCheckName(ctx context.Context, t *testing.T, c klie
623623

624624
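// Look for a True condition where the reason contains the check name and the condition message is a substring of the expected message.
// NOTE(review): strings.Contains(message, condition.Message) is also true when condition.Message is empty — confirm the argument order is intended.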
625625
for _, condition := range node.Status.Conditions {
626-
if condition.Status == v1.ConditionTrue && strings.Contains(condition.Reason, checkName) {
626+
if condition.Status == v1.ConditionTrue && strings.Contains(condition.Reason, checkName) && strings.Contains(message, condition.Message) {
627627
t.Logf("Found node condition: Type=%s, Reason=%s, Status=%s, Message=%s",
628628
condition.Type, condition.Reason, condition.Status, condition.Message)
629629
return true

tests/multiple_fatal_event_test.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,16 @@ func TestMultipleFatalEventRule(t *testing.T) {
5959

6060
client, err := c.NewClient()
6161
assert.NoError(t, err, "failed to create kubernetes client")
62+
63+
// clean up any existing node conditions
64+
t.Logf("Cleaning up any existing node conditions for node %s", gpuNodeName)
65+
err = helpers.SendHealthEventsToNodes(t, []string{gpuNodeName}, ERRORCODE_13, "data/non-fatal-health-event.json", "")
66+
assert.NoError(t, err, "failed to send non-fatal events")
67+
6268
xidsToInject := []string{ERRORCODE_13, ERRORCODE_48, ERRORCODE_13, ERRORCODE_48, ERRORCODE_13}
6369

6470
// inject 5 fatal errors and let the remediation cycle finish
65-
71+
t.Logf("Injecting fatal errors to node %s", gpuNodeName)
6672
for _, xid := range xidsToInject {
6773
// inject XID error
6874
err = helpers.SendHealthEventsToNodes(t, []string{gpuNodeName}, xid, "data/fatal-health-event.json", "")
@@ -92,7 +98,7 @@ func TestMultipleFatalEventRule(t *testing.T) {
9298
assert.NoError(t, err, "failed to send fatal events")
9399

94100
// Check node condition for matched ruleset
95-
helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName, "MultipleFatalError")
101+
helpers.WaitForNodeConditionWithCheckName(ctx, t, client, gpuNodeName, "MultipleFatalError", "ErrorCode:31 GPU:0 XID error occurred Recommended Action=CONTACT_SUPPORT;")
96102

97103
return ctx
98104
})
@@ -143,8 +149,13 @@ func TestMultipleNonFatalEventRule(t *testing.T) {
143149
assert.True(t, len(gpuNodes) > 0, "no gpu nodes found")
144150
gpuNodeName := gpuNodes[rand.Intn(len(gpuNodes))]
145151
ctx = context.WithValue(ctx, keyGpuNodeName, gpuNodeName)
146-
t.Logf("Injecting fatal events to node %s", gpuNodeName)
147152

153+
// clean up any existing node conditions
154+
t.Logf("Cleaning up any existing node conditions for node %s", gpuNodeName)
155+
err := helpers.SendHealthEventsToNodes(t, []string{gpuNodeName}, ERRORCODE_13, "data/non-fatal-health-event.json", "")
156+
assert.NoError(t, err, "failed to send non-fatal events")
157+
158+
t.Logf("Injecting non-fatal events to node %s", gpuNodeName)
148159
for i := 0; i < 5; i++ {
149160
// inject XID error
150161
err := helpers.SendHealthEventsToNodes(t, []string{gpuNodeName}, ERRORCODE_13, "data/non-fatal-health-event.json", "")

0 commit comments

Comments
 (0)