Skip to content

Commit c46cc4e

Browse files
committed
chore: few changes
Signed-off-by: Ajay Mishra <[email protected]>
1 parent c36b17f commit c46cc4e

11 files changed

+57
-61
lines changed

tests/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ test-ci:
2121
# Run end-to-end tests
2222
.PHONY: test
2323
test:
24-
gotestsum --format standard-verbose -- -timeout 60m -count 1 ./...
24+
gotestsum --format standard-verbose -- -timeout 40m -count 1 -failfast ./...
2525

2626
# Help target
2727
.PHONY: help

tests/fault_management_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func TestDryRunMode(t *testing.T) {
8989
}
9090
}
9191
return false
92-
}, helpers.WaitTimeout, helpers.WaitInterval)
92+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
9393

9494
return ctx
9595
})
@@ -229,7 +229,7 @@ func TestNodeDeletedDuringDrain(t *testing.T) {
229229
require.Eventually(t, func() bool {
230230
_, err := helpers.GetNodeByName(ctx, client, testCtx.NodeName)
231231
return err != nil
232-
}, helpers.WaitTimeout, helpers.WaitInterval, "node should be deleted")
232+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "node should be deleted")
233233

234234
return ctx
235235
})
@@ -258,7 +258,7 @@ func TestNodeDeletedDuringDrain(t *testing.T) {
258258
require.Eventually(t, func() bool {
259259
_, err := helpers.GetNodeByName(ctx, client, testCtx.NodeName)
260260
return err == nil
261-
}, helpers.WaitTimeout, helpers.WaitInterval, "recreated node should exist")
261+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "recreated node should exist")
262262
}
263263
}
264264

@@ -308,7 +308,7 @@ func TestNodeRecoveryDuringDrain(t *testing.T) {
308308
require.Eventually(t, func() bool {
309309
found, _ := helpers.CheckNodeEventExists(ctx, client, testCtx.NodeName, "NodeDraining", "WaitingBeforeForceDelete", time.Time{})
310310
return found
311-
}, helpers.WaitTimeout, helpers.WaitInterval, "WaitingBeforeForceDelete event should be created")
311+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "WaitingBeforeForceDelete event should be created")
312312

313313
return ctx
314314
})

tests/fault_quarantine_test.go

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ func TestManualUncordonBehavior(t *testing.T) {
130130
}
131131

132132
return true
133-
}, helpers.WaitTimeout, helpers.WaitInterval)
133+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
134134

135135
return ctx
136136
})
@@ -227,7 +227,7 @@ func TestPreCordonedNodeHandling(t *testing.T) {
227227
}
228228

229229
return hasFQTaint && hasManualTaint && node.Spec.Unschedulable
230-
}, helpers.WaitTimeout, helpers.WaitInterval)
230+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
231231
t.Log("FQ taint successfully added to pre-cordoned node")
232232

233233
return ctx
@@ -270,7 +270,7 @@ func TestPreCordonedNodeHandling(t *testing.T) {
270270
_, hasAnnotation := node.Annotations["quarantineHealthEvent"]
271271

272272
return !hasFQTaint && !hasAnnotation
273-
}, helpers.WaitTimeout, helpers.WaitInterval)
273+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
274274

275275
return ctx
276276
})
@@ -396,23 +396,19 @@ func TestManagedByNVSentinelLabel(t *testing.T) {
396396
client, err := c.NewClient()
397397
require.NoError(t, err)
398398

399-
event1 := helpers.NewHealthEvent(testNodeIgnored).
400-
WithHealthy(true).
401-
WithFatal(false).
402-
WithMessage("No Health Failures")
403-
tempFile1, _ := helpers.SendHealthEventWithTemplate(testNodeIgnored, event1)
404-
if tempFile1 != "" {
405-
defer os.Remove(tempFile1)
406-
}
399+
tempFile1 := helpers.SendHealthEvent(ctx, t,
400+
helpers.NewHealthEvent(testNodeIgnored).
401+
WithHealthy(true).
402+
WithFatal(false).
403+
WithMessage("No Health Failures"))
404+
defer os.Remove(tempFile1)
407405

408-
event2 := helpers.NewHealthEvent(testNodeProcessed).
409-
WithHealthy(true).
410-
WithFatal(false).
411-
WithMessage("No Health Failures")
412-
tempFile2, _ := helpers.SendHealthEventWithTemplate(testNodeProcessed, event2)
413-
if tempFile2 != "" {
414-
defer os.Remove(tempFile2)
415-
}
406+
tempFile2 := helpers.SendHealthEvent(ctx, t,
407+
helpers.NewHealthEvent(testNodeProcessed).
408+
WithHealthy(true).
409+
WithFatal(false).
410+
WithMessage("No Health Failures"))
411+
defer os.Remove(tempFile2)
416412

417413
require.Eventually(t, func() bool {
418414
node, err := helpers.GetNodeByName(ctx, client, testNodeProcessed)
@@ -428,7 +424,7 @@ func TestManagedByNVSentinelLabel(t *testing.T) {
428424
}
429425
}
430426
return true
431-
}, helpers.WaitTimeout, helpers.WaitInterval)
427+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
432428

433429
nodeIgnored, err := helpers.GetNodeByName(ctx, client, testNodeIgnored)
434430
if err == nil {

tests/gpu_health_monitor_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ func TestGPUHealthMonitorMultipleErrors(t *testing.T) {
123123
}
124124

125125
return allFound
126-
}, helpers.WaitTimeout, helpers.WaitInterval, "all injected error conditions should appear")
126+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "all injected error conditions should appear")
127127

128128
return ctx
129129
})
@@ -181,7 +181,7 @@ func TestGPUHealthMonitorMultipleErrors(t *testing.T) {
181181
}
182182
t.Logf(" %s condition still unhealthy: %s", clearCmd.condition, condition.Message)
183183
return false
184-
}, helpers.WaitTimeout, helpers.WaitInterval, "%s condition should be cleared", clearCmd.condition)
184+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "%s condition should be cleared", clearCmd.condition)
185185
}
186186

187187
t.Logf("Removing ManagedByNVSentinel label from node %s", nodeName)
@@ -255,7 +255,7 @@ func TestGPUHealthMonitorDCGMConnectionError(t *testing.T) {
255255
t.Logf("Found condition - Status: %s, Reason: %s, Message: %s",
256256
condition.Status, condition.Reason, condition.Message)
257257
return condition.Status == v1.ConditionTrue
258-
}, helpers.WaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure condition should appear")
258+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure condition should appear")
259259

260260
return ctx
261261
})
@@ -288,7 +288,7 @@ func TestGPUHealthMonitorDCGMConnectionError(t *testing.T) {
288288

289289
// Condition should have Status=False when healthy
290290
return condition.Status == v1.ConditionFalse
291-
}, helpers.WaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure should become healthy")
291+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure should become healthy")
292292

293293
return ctx
294294
})
@@ -325,7 +325,7 @@ func TestGPUHealthMonitorDCGMConnectionError(t *testing.T) {
325325
}
326326
t.Logf("Condition still present: Status=%s", condition.Status)
327327
return false
328-
}, helpers.WaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure should clear")
328+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval, "GpuDcgmConnectivityFailure should clear")
329329

330330
t.Logf("Removing ManagedByNVSentinel label from node %s", nodeName)
331331
err = helpers.RemoveNodeManagedByNVSentinelLabel(ctx, client, nodeName)

tests/helpers/fault_quarantine.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"fmt"
2020
"strings"
2121
"testing"
22-
"time"
2322

2423
"github.com/stretchr/testify/assert"
2524
"github.com/stretchr/testify/require"
@@ -203,7 +202,7 @@ func TeardownQuarantineTest(ctx context.Context, t *testing.T, c *envconf.Config
203202
}
204203

205204
return true
206-
}, WaitTimeout, WaitInterval)
205+
}, EventuallyWaitTimeout, WaitInterval)
207206
t.Logf("Node %s cleaned successfully", nodeName)
208207

209208
backupDataVal := ctx.Value(CELKeyConfigMapBackup)
@@ -246,7 +245,7 @@ func AssertNodeNeverQuarantined(ctx context.Context, t *testing.T, client klient
246245
}
247246

248247
return false
249-
}, 30*time.Second, 2*time.Second, "node %s should not be quarantined", nodeName)
248+
}, NeverWaitTimeout, WaitInterval, "node %s should not be quarantined", nodeName)
250249
}
251250

252251
// SendHealthyEventsAsync sends healthy events to multiple nodes and waits for quarantine cleanup on all of them.
@@ -282,7 +281,7 @@ func SendHealthyEventsAsync(ctx context.Context, t *testing.T, client klient.Cli
282281
}
283282

284283
return cleanedCount == len(nodeNames)
285-
}, WaitTimeout, WaitInterval)
284+
}, EventuallyWaitTimeout, WaitInterval)
286285
t.Logf("All %d nodes cleaned up successfully", len(nodeNames))
287286
}
288287

@@ -346,7 +345,7 @@ func AssertQuarantineState(ctx context.Context, t *testing.T, client klient.Clie
346345
}
347346

348347
return true
349-
}, WaitTimeout, WaitInterval)
348+
}, EventuallyWaitTimeout, WaitInterval)
350349

351350
t.Logf("Assertion passed for node %s", nodeName)
352351
}

tests/helpers/fault_remediation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ func TriggerFullRemediationFlow(ctx context.Context, t *testing.T, client klient
187187
return false
188188
}
189189
return node.Spec.Unschedulable
190-
}, WaitTimeout, WaitInterval)
190+
}, EventuallyWaitTimeout, WaitInterval)
191191
t.Log("Node cordoned successfully")
192192

193193
t.Log("Full remediation flow trigger completed")

tests/helpers/kube.go

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ import (
5454
)
5555

5656
const (
57-
WaitTimeout = 10 * time.Minute
58-
WaitInterval = 5 * time.Second
57+
EventuallyWaitTimeout = 10 * time.Minute
58+
NeverWaitTimeout = 30 * time.Second
59+
WaitInterval = 5 * time.Second
5960

6061
// NVSentinelNamespace is the default namespace where NVSentinel components are deployed
6162
NVSentinelNamespace = "nvsentinel"
@@ -83,7 +84,7 @@ func WaitForNodesCordonState(ctx context.Context, t *testing.T, c klient.Client,
8384

8485
t.Logf("Nodes with cordon state %v: %d/%d", shouldCordon, actualCount, targetCount)
8586
return actualCount == targetCount
86-
}, WaitTimeout, WaitInterval, "nodes should have cordon state %v", shouldCordon)
87+
}, EventuallyWaitTimeout, WaitInterval, "nodes should have cordon state %v", shouldCordon)
8788
}
8889

8990
// CreateNamespace creates a new Kubernetes namespace with the specified `name`.
@@ -123,7 +124,7 @@ func DeleteNamespace(ctx context.Context, t *testing.T, c klient.Client, name st
123124
var ns v1.Namespace
124125
err := c.Resources().Get(ctx, name, "", &ns)
125126
return err != nil && apierrors.IsNotFound(err)
126-
}, WaitTimeout, WaitInterval, "namespace %s should be deleted", name)
127+
}, EventuallyWaitTimeout, WaitInterval, "namespace %s should be deleted", name)
127128

128129
return nil
129130
}
@@ -270,7 +271,7 @@ func WaitForNodesWithLabel(ctx context.Context, t *testing.T, c klient.Client, n
270271

271272
t.Logf("Nodes with label %s=%s: %d/%d", labelKey, expectedValue, actualCount, targetCount)
272273
return actualCount == targetCount
273-
}, WaitTimeout, WaitInterval, "all nodes should have label %s=%s", labelKey, expectedValue)
274+
}, EventuallyWaitTimeout, WaitInterval, "all nodes should have label %s=%s", labelKey, expectedValue)
274275
}
275276

276277
func WaitForNodeEvent(ctx context.Context, t *testing.T, c klient.Client, nodeName string, expectedEvent v1.Event) {
@@ -290,7 +291,7 @@ func WaitForNodeEvent(ctx context.Context, t *testing.T, c klient.Client, nodeNa
290291
}
291292
t.Logf("Did not find any events for node %s matching event %v", nodeName, expectedEvent)
292293
return false
293-
}, WaitTimeout, WaitInterval, "node %s should have event %v", nodeName, expectedEvent)
294+
}, EventuallyWaitTimeout, WaitInterval, "node %s should have event %v", nodeName, expectedEvent)
294295
}
295296

296297
// SelectTestNodeFromUnusedPool selects an available test node from the cluster.
@@ -421,7 +422,7 @@ func WaitForRebootNodeCR(ctx context.Context, t *testing.T, c klient.Client, nod
421422
}
422423
t.Logf("No RebootNode CR found for node %s", nodeName)
423424
return false
424-
}, WaitTimeout, WaitInterval, "RebootNode CR should complete for node %s", nodeName)
425+
}, EventuallyWaitTimeout, WaitInterval, "RebootNode CR should complete for node %s", nodeName)
425426

426427
t.Logf("RebootNode CR created for node %s", nodeName)
427428
return resultCR
@@ -591,7 +592,7 @@ func waitForPodRunning(ctx context.Context, t *testing.T, c klient.Client, podNa
591592
return false
592593
}
593594
return isRunning
594-
}, WaitTimeout, WaitInterval, "pod %s should be running", podName)
595+
}, EventuallyWaitTimeout, WaitInterval, "pod %s should be running", podName)
595596

596597
}
597598

@@ -846,7 +847,7 @@ func WaitForDeploymentRollout(ctx context.Context, t *testing.T, c klient.Client
846847

847848
t.Logf("Rollout complete: all %d replicas are updated, ready, and available", expectedReplicas)
848849
return true
849-
}, WaitTimeout, WaitInterval, "deployment %s/%s rollout should complete", namespace, name)
850+
}, EventuallyWaitTimeout, WaitInterval, "deployment %s/%s rollout should complete", namespace, name)
850851

851852
t.Logf("Deployment %s/%s rollout completed successfully", namespace, name)
852853
}
@@ -1080,7 +1081,7 @@ func WaitForNodeLabel(ctx context.Context, t *testing.T, client klient.Client, n
10801081
return false
10811082
}
10821083
return value == expectedValue
1083-
}, WaitTimeout, WaitInterval)
1084+
}, EventuallyWaitTimeout, WaitInterval)
10841085
t.Logf("Node %s has label %s=%s", nodeName, labelKey, expectedValue)
10851086
}
10861087

@@ -1098,7 +1099,7 @@ func AssertPodsNeverDeleted(ctx context.Context, t *testing.T, client klient.Cli
10981099
}
10991100
}
11001101
return false
1101-
}, 15*time.Second, 5*time.Second, "pods should not be deleted")
1102+
}, NeverWaitTimeout, WaitInterval, "pods should not be deleted")
11021103
t.Logf("All %d pods remain running in namespace %s", len(podNames), namespace)
11031104
}
11041105

@@ -1116,7 +1117,7 @@ func WaitForPodsDeleted(ctx context.Context, t *testing.T, client klient.Client,
11161117
}
11171118
}
11181119
return true
1119-
}, WaitTimeout, WaitInterval)
1120+
}, EventuallyWaitTimeout, WaitInterval)
11201121
t.Logf("All pods deleted from namespace %s", namespace)
11211122
}
11221123

@@ -1132,7 +1133,7 @@ func WaitForPodsRunning(ctx context.Context, t *testing.T, client klient.Client,
11321133
return false
11331134
}
11341135
return pod.Status.Phase == v1.PodRunning
1135-
}, WaitTimeout, WaitInterval)
1136+
}, EventuallyWaitTimeout, WaitInterval)
11361137
}
11371138
t.Logf("All %d pods running", len(podNames))
11381139
}

tests/node_drainer_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,13 @@ func TestNodeDrainerEvictionModes(t *testing.T) {
104104
return false
105105
}
106106
return p.DeletionTimestamp != nil
107-
}, helpers.WaitTimeout, helpers.WaitInterval)
107+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
108108

109109
require.Never(t, func() bool {
110110
var p v1.Pod
111111
err := client.Resources().Get(ctx, finalizerPod, "immediate-test", &p)
112112
return err != nil
113-
}, 15*time.Second, 5*time.Second)
113+
}, helpers.NeverWaitTimeout, helpers.WaitInterval)
114114

115115
t.Log("Phase 2: Both allowCompletion and deleteAfterTimeout waiting (verify for 15s)")
116116
require.Never(t, func() bool {
@@ -127,7 +127,7 @@ func TestNodeDrainerEvictionModes(t *testing.T) {
127127
}
128128
}
129129
return false
130-
}, 15*time.Second, 5*time.Second, "both mode pods should wait, not be deleted immediately")
130+
}, helpers.NeverWaitTimeout, helpers.WaitInterval, "both mode pods should wait, not be deleted immediately")
131131

132132
t.Log("Phase 2: Manually completing allowCompletion pods to unblock drain")
133133
helpers.DeletePodsByNames(ctx, t, client, "allowcompletion-test", allowCompletionPods)
@@ -163,7 +163,7 @@ func TestNodeDrainerEvictionModes(t *testing.T) {
163163
t.Logf("Found event after restart: %s", event.Reason)
164164
}
165165
return found
166-
}, helpers.WaitTimeout, helpers.WaitInterval)
166+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
167167

168168
helpers.DeletePodsByNames(ctx, t, client, "allowcompletion-test", podNames)
169169
helpers.WaitForPodsDeleted(ctx, t, client, "allowcompletion-test", podNames)

tests/scale_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,15 @@ func TestScaleHealthEvents(t *testing.T) {
152152
cordonedCount, percentageOfTotal, totalNodesInCluster)
153153
}
154154
return false
155-
}, helpers.WaitTimeout, helpers.WaitInterval)
155+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
156156

157157
cbState := helpers.GetCircuitBreakerState(ctx, t, c)
158158
if cbState != "TRIPPED" {
159159
require.Eventually(t, func() bool {
160160
state := helpers.GetCircuitBreakerState(ctx, t, c)
161161
t.Logf("Circuit breaker state: %s", state)
162162
return state == "TRIPPED"
163-
}, helpers.WaitTimeout, helpers.WaitInterval)
163+
}, helpers.EventuallyWaitTimeout, helpers.WaitInterval)
164164
}
165165
t.Log("Circuit breaker auto-tripped after threshold exceeded")
166166

0 commit comments

Comments
 (0)