Skip to content

Commit 36edd76

Browse files
committed
re-add some deleted tests
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent 0f7d577 commit 36edd76

File tree

4 files changed

+936
-0
lines changed

4 files changed

+936
-0
lines changed

fault-quarantine/pkg/reconciler/reconciler_e2e_test.go

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3666,3 +3666,196 @@ func TestE2E_UnhealthyEventNotMatchingRulesNotPropagated(t *testing.T) {
36663666
require.NoError(t, err)
36673667
assert.Equal(t, 1, healthEventsMap.Count(), "Should still have only GpuXidError tracked")
36683668
}
3669+
3670+
// TestE2E_ManualUncordonWithCancellation tests that manual uncordon triggers proper cleanup
3671+
func TestE2E_ManualUncordonWithCancellation(t *testing.T) {
3672+
ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second)
3673+
defer cancel()
3674+
3675+
nodeName := "e2e-manual-uncordon-" + generateShortTestID()
3676+
createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false)
3677+
defer func() {
3678+
_ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
3679+
}()
3680+
3681+
tomlConfig := config.TomlConfig{
3682+
LabelPrefix: "k8s.nvidia.com/",
3683+
RuleSets: []config.RuleSet{
3684+
{
3685+
Name: "gpu-xid-errors",
3686+
Version: "1",
3687+
Priority: 10,
3688+
Match: config.Match{
3689+
Any: []config.Rule{
3690+
{Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError' && event.isFatal == true"},
3691+
},
3692+
},
3693+
Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"},
3694+
Cordon: config.Cordon{ShouldCordon: true},
3695+
},
3696+
},
3697+
}
3698+
3699+
_, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil)
3700+
3701+
beforeManualUncordon := getCounterVecValue(t, metrics.TotalNodesManuallyUncordoned, nodeName)
3702+
beforeCurrentQuarantined := getGaugeVecValue(t, metrics.CurrentQuarantinedNodes, nodeName)
3703+
3704+
t.Log("Sending unhealthy event to quarantine node")
3705+
eventID1 := generateTestID()
3706+
mockWatcher.EventsChan <- &TestEvent{Data: createHealthEventBSON(
3707+
eventID1,
3708+
nodeName,
3709+
"GpuXidError",
3710+
false,
3711+
true,
3712+
[]*protos.Entity{{EntityType: "GPU", EntityValue: "0"}},
3713+
model.StatusInProgress,
3714+
)}
3715+
3716+
t.Log("Waiting for node to be quarantined")
3717+
require.Eventually(t, func() bool {
3718+
status := getStatus(eventID1)
3719+
return status != nil && *status == model.Quarantined
3720+
}, statusCheckTimeout, statusCheckPollInterval, "Status should be Quarantined")
3721+
3722+
require.Eventually(t, func() bool {
3723+
node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3724+
return err == nil && node.Spec.Unschedulable
3725+
}, eventuallyTimeout, eventuallyPollInterval, "Node should be quarantined")
3726+
3727+
t.Log("Manually uncordon the node")
3728+
quarantinedNode, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3729+
require.NoError(t, err)
3730+
quarantinedNode.Spec.Unschedulable = false
3731+
_, err = e2eTestClient.CoreV1().Nodes().Update(ctx, quarantinedNode, metav1.UpdateOptions{})
3732+
require.NoError(t, err)
3733+
3734+
t.Log("Verify manual uncordon cleanup")
3735+
require.Eventually(t, func() bool {
3736+
node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3737+
if err != nil {
3738+
return false
3739+
}
3740+
3741+
return node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey] == common.QuarantinedNodeUncordonedManuallyAnnotationValue &&
3742+
node.Annotations[common.QuarantineHealthEventAnnotationKey] == ""
3743+
}, eventuallyTimeout, eventuallyPollInterval, "Manual uncordon should clean up annotations")
3744+
3745+
t.Log("Verify manual uncordon metric incremented")
3746+
afterManualUncordon := getCounterVecValue(t, metrics.TotalNodesManuallyUncordoned, nodeName)
3747+
assert.Equal(t, beforeManualUncordon+1, afterManualUncordon, "TotalNodesManuallyUncordoned should increment")
3748+
3749+
t.Log("Verify current quarantined nodes gauge updated")
3750+
afterCurrentQuarantined := getGaugeVecValue(t, metrics.CurrentQuarantinedNodes, nodeName)
3751+
assert.Equal(t, float64(0), afterCurrentQuarantined, "CurrentQuarantinedNodes should be 0")
3752+
assert.GreaterOrEqual(t, beforeCurrentQuarantined, float64(0), "Gauge should have been set before")
3753+
}
3754+
3755+
// TestE2E_ManualUncordonMultipleEvents tests that manual uncordon works with multiple events on the same node
3756+
func TestE2E_ManualUncordonMultipleEvents(t *testing.T) {
3757+
ctx, cancel := context.WithTimeout(e2eTestContext, 30*time.Second)
3758+
defer cancel()
3759+
3760+
nodeName := "e2e-manual-multi-" + generateShortTestID()
3761+
createE2ETestNode(ctx, t, nodeName, nil, nil, nil, false)
3762+
defer func() {
3763+
_ = e2eTestClient.CoreV1().Nodes().Delete(ctx, nodeName, metav1.DeleteOptions{})
3764+
}()
3765+
3766+
tomlConfig := config.TomlConfig{
3767+
LabelPrefix: "k8s.nvidia.com/",
3768+
RuleSets: []config.RuleSet{
3769+
{
3770+
Name: "gpu-xid-errors",
3771+
Version: "1",
3772+
Priority: 10,
3773+
Match: config.Match{
3774+
Any: []config.Rule{
3775+
{Kind: "HealthEvent", Expression: "event.checkName == 'GpuXidError'"},
3776+
},
3777+
},
3778+
Taint: config.Taint{Key: "nvidia.com/gpu-xid-error", Value: "true", Effect: "NoSchedule"},
3779+
Cordon: config.Cordon{ShouldCordon: true},
3780+
},
3781+
},
3782+
}
3783+
3784+
_, mockWatcher, getStatus, _ := setupE2EReconciler(t, ctx, tomlConfig, nil)
3785+
3786+
t.Log("Send first unhealthy event (Quarantined)")
3787+
eventID1 := generateTestID()
3788+
mockWatcher.EventsChan <- &TestEvent{Data: createHealthEventBSON(
3789+
eventID1,
3790+
nodeName,
3791+
"GpuXidError",
3792+
false,
3793+
true,
3794+
[]*protos.Entity{{EntityType: "GPU", EntityValue: "0"}},
3795+
model.StatusInProgress,
3796+
)}
3797+
3798+
require.Eventually(t, func() bool {
3799+
status := getStatus(eventID1)
3800+
return status != nil && *status == model.Quarantined
3801+
}, statusCheckTimeout, statusCheckPollInterval, "First event should be Quarantined")
3802+
3803+
t.Log("Send second unhealthy event (AlreadyQuarantined)")
3804+
eventID2 := generateTestID()
3805+
mockWatcher.EventsChan <- &TestEvent{Data: createHealthEventBSON(
3806+
eventID2,
3807+
nodeName,
3808+
"GpuXidError",
3809+
false,
3810+
true,
3811+
[]*protos.Entity{{EntityType: "GPU", EntityValue: "1"}},
3812+
model.StatusInProgress,
3813+
)}
3814+
3815+
require.Eventually(t, func() bool {
3816+
status := getStatus(eventID2)
3817+
return status != nil && *status == model.AlreadyQuarantined
3818+
}, statusCheckTimeout, statusCheckPollInterval, "Second event should be Quarantined")
3819+
3820+
t.Log("Send third unhealthy event (AlreadyQuarantined)")
3821+
eventID3 := generateTestID()
3822+
mockWatcher.EventsChan <- &TestEvent{Data: createHealthEventBSON(
3823+
eventID3,
3824+
nodeName,
3825+
"GpuXidError",
3826+
false,
3827+
true,
3828+
[]*protos.Entity{{EntityType: "GPU", EntityValue: "2"}},
3829+
model.StatusInProgress,
3830+
)}
3831+
3832+
require.Eventually(t, func() bool {
3833+
status := getStatus(eventID3)
3834+
return status != nil && *status == model.AlreadyQuarantined
3835+
}, statusCheckTimeout, statusCheckPollInterval, "Third event should be AlreadyQuarantined")
3836+
3837+
t.Log("Manually uncordon the node")
3838+
quarantinedNode, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3839+
require.NoError(t, err)
3840+
quarantinedNode.Spec.Unschedulable = false
3841+
_, err = e2eTestClient.CoreV1().Nodes().Update(ctx, quarantinedNode, metav1.UpdateOptions{})
3842+
require.NoError(t, err)
3843+
3844+
t.Log("Verify manual uncordon annotation is set")
3845+
require.Eventually(t, func() bool {
3846+
node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3847+
if err != nil {
3848+
return false
3849+
}
3850+
return node.Annotations[common.QuarantinedNodeUncordonedManuallyAnnotationKey] == common.QuarantinedNodeUncordonedManuallyAnnotationValue
3851+
}, eventuallyTimeout, eventuallyPollInterval, "Manual uncordon annotation should be set")
3852+
3853+
t.Log("Verify quarantine annotation cleared")
3854+
require.Eventually(t, func() bool {
3855+
node, err := e2eTestClient.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
3856+
if err != nil {
3857+
return false
3858+
}
3859+
return node.Annotations[common.QuarantineHealthEventAnnotationKey] == ""
3860+
}, eventuallyTimeout, eventuallyPollInterval, "Quarantine annotation should be cleared")
3861+
}

0 commit comments

Comments
 (0)