@@ -3666,3 +3666,196 @@ func TestE2E_UnhealthyEventNotMatchingRulesNotPropagated(t *testing.T) {
36663666 require .NoError (t , err )
36673667 assert .Equal (t , 1 , healthEventsMap .Count (), "Should still have only GpuXidError tracked" )
36683668}
3669+
3670+ // TestE2E_ManualUncordonWithCancellation tests that manual uncordon triggers proper cleanup
3671+ func TestE2E_ManualUncordonWithCancellation (t * testing.T ) {
3672+ ctx , cancel := context .WithTimeout (e2eTestContext , 30 * time .Second )
3673+ defer cancel ()
3674+
3675+ nodeName := "e2e-manual-uncordon-" + generateShortTestID ()
3676+ createE2ETestNode (ctx , t , nodeName , nil , nil , nil , false )
3677+ defer func () {
3678+ _ = e2eTestClient .CoreV1 ().Nodes ().Delete (ctx , nodeName , metav1.DeleteOptions {})
3679+ }()
3680+
3681+ tomlConfig := config.TomlConfig {
3682+ LabelPrefix : "k8s.nvidia.com/" ,
3683+ RuleSets : []config.RuleSet {
3684+ {
3685+ Name : "gpu-xid-errors" ,
3686+ Version : "1" ,
3687+ Priority : 10 ,
3688+ Match : config.Match {
3689+ Any : []config.Rule {
3690+ {Kind : "HealthEvent" , Expression : "event.checkName == 'GpuXidError' && event.isFatal == true" },
3691+ },
3692+ },
3693+ Taint : config.Taint {Key : "nvidia.com/gpu-xid-error" , Value : "true" , Effect : "NoSchedule" },
3694+ Cordon : config.Cordon {ShouldCordon : true },
3695+ },
3696+ },
3697+ }
3698+
3699+ _ , mockWatcher , getStatus , _ := setupE2EReconciler (t , ctx , tomlConfig , nil )
3700+
3701+ beforeManualUncordon := getCounterVecValue (t , metrics .TotalNodesManuallyUncordoned , nodeName )
3702+ beforeCurrentQuarantined := getGaugeVecValue (t , metrics .CurrentQuarantinedNodes , nodeName )
3703+
3704+ t .Log ("Sending unhealthy event to quarantine node" )
3705+ eventID1 := generateTestID ()
3706+ mockWatcher .EventsChan <- & TestEvent {Data : createHealthEventBSON (
3707+ eventID1 ,
3708+ nodeName ,
3709+ "GpuXidError" ,
3710+ false ,
3711+ true ,
3712+ []* protos.Entity {{EntityType : "GPU" , EntityValue : "0" }},
3713+ model .StatusInProgress ,
3714+ )}
3715+
3716+ t .Log ("Waiting for node to be quarantined" )
3717+ require .Eventually (t , func () bool {
3718+ status := getStatus (eventID1 )
3719+ return status != nil && * status == model .Quarantined
3720+ }, statusCheckTimeout , statusCheckPollInterval , "Status should be Quarantined" )
3721+
3722+ require .Eventually (t , func () bool {
3723+ node , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3724+ return err == nil && node .Spec .Unschedulable
3725+ }, eventuallyTimeout , eventuallyPollInterval , "Node should be quarantined" )
3726+
3727+ t .Log ("Manually uncordon the node" )
3728+ quarantinedNode , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3729+ require .NoError (t , err )
3730+ quarantinedNode .Spec .Unschedulable = false
3731+ _ , err = e2eTestClient .CoreV1 ().Nodes ().Update (ctx , quarantinedNode , metav1.UpdateOptions {})
3732+ require .NoError (t , err )
3733+
3734+ t .Log ("Verify manual uncordon cleanup" )
3735+ require .Eventually (t , func () bool {
3736+ node , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3737+ if err != nil {
3738+ return false
3739+ }
3740+
3741+ return node .Annotations [common .QuarantinedNodeUncordonedManuallyAnnotationKey ] == common .QuarantinedNodeUncordonedManuallyAnnotationValue &&
3742+ node .Annotations [common .QuarantineHealthEventAnnotationKey ] == ""
3743+ }, eventuallyTimeout , eventuallyPollInterval , "Manual uncordon should clean up annotations" )
3744+
3745+ t .Log ("Verify manual uncordon metric incremented" )
3746+ afterManualUncordon := getCounterVecValue (t , metrics .TotalNodesManuallyUncordoned , nodeName )
3747+ assert .Equal (t , beforeManualUncordon + 1 , afterManualUncordon , "TotalNodesManuallyUncordoned should increment" )
3748+
3749+ t .Log ("Verify current quarantined nodes gauge updated" )
3750+ afterCurrentQuarantined := getGaugeVecValue (t , metrics .CurrentQuarantinedNodes , nodeName )
3751+ assert .Equal (t , float64 (0 ), afterCurrentQuarantined , "CurrentQuarantinedNodes should be 0" )
3752+ assert .GreaterOrEqual (t , beforeCurrentQuarantined , float64 (0 ), "Gauge should have been set before" )
3753+ }
3754+
3755+ // TestE2E_ManualUncordonMultipleEvents tests that manual uncordon works with multiple events on the same node
3756+ func TestE2E_ManualUncordonMultipleEvents (t * testing.T ) {
3757+ ctx , cancel := context .WithTimeout (e2eTestContext , 30 * time .Second )
3758+ defer cancel ()
3759+
3760+ nodeName := "e2e-manual-multi-" + generateShortTestID ()
3761+ createE2ETestNode (ctx , t , nodeName , nil , nil , nil , false )
3762+ defer func () {
3763+ _ = e2eTestClient .CoreV1 ().Nodes ().Delete (ctx , nodeName , metav1.DeleteOptions {})
3764+ }()
3765+
3766+ tomlConfig := config.TomlConfig {
3767+ LabelPrefix : "k8s.nvidia.com/" ,
3768+ RuleSets : []config.RuleSet {
3769+ {
3770+ Name : "gpu-xid-errors" ,
3771+ Version : "1" ,
3772+ Priority : 10 ,
3773+ Match : config.Match {
3774+ Any : []config.Rule {
3775+ {Kind : "HealthEvent" , Expression : "event.checkName == 'GpuXidError'" },
3776+ },
3777+ },
3778+ Taint : config.Taint {Key : "nvidia.com/gpu-xid-error" , Value : "true" , Effect : "NoSchedule" },
3779+ Cordon : config.Cordon {ShouldCordon : true },
3780+ },
3781+ },
3782+ }
3783+
3784+ _ , mockWatcher , getStatus , _ := setupE2EReconciler (t , ctx , tomlConfig , nil )
3785+
3786+ t .Log ("Send first unhealthy event (Quarantined)" )
3787+ eventID1 := generateTestID ()
3788+ mockWatcher .EventsChan <- & TestEvent {Data : createHealthEventBSON (
3789+ eventID1 ,
3790+ nodeName ,
3791+ "GpuXidError" ,
3792+ false ,
3793+ true ,
3794+ []* protos.Entity {{EntityType : "GPU" , EntityValue : "0" }},
3795+ model .StatusInProgress ,
3796+ )}
3797+
3798+ require .Eventually (t , func () bool {
3799+ status := getStatus (eventID1 )
3800+ return status != nil && * status == model .Quarantined
3801+ }, statusCheckTimeout , statusCheckPollInterval , "First event should be Quarantined" )
3802+
3803+ t .Log ("Send second unhealthy event (AlreadyQuarantined)" )
3804+ eventID2 := generateTestID ()
3805+ mockWatcher .EventsChan <- & TestEvent {Data : createHealthEventBSON (
3806+ eventID2 ,
3807+ nodeName ,
3808+ "GpuXidError" ,
3809+ false ,
3810+ true ,
3811+ []* protos.Entity {{EntityType : "GPU" , EntityValue : "1" }},
3812+ model .StatusInProgress ,
3813+ )}
3814+
3815+ require .Eventually (t , func () bool {
3816+ status := getStatus (eventID2 )
3817+ return status != nil && * status == model .AlreadyQuarantined
3818+ }, statusCheckTimeout , statusCheckPollInterval , "Second event should be Quarantined" )
3819+
3820+ t .Log ("Send third unhealthy event (AlreadyQuarantined)" )
3821+ eventID3 := generateTestID ()
3822+ mockWatcher .EventsChan <- & TestEvent {Data : createHealthEventBSON (
3823+ eventID3 ,
3824+ nodeName ,
3825+ "GpuXidError" ,
3826+ false ,
3827+ true ,
3828+ []* protos.Entity {{EntityType : "GPU" , EntityValue : "2" }},
3829+ model .StatusInProgress ,
3830+ )}
3831+
3832+ require .Eventually (t , func () bool {
3833+ status := getStatus (eventID3 )
3834+ return status != nil && * status == model .AlreadyQuarantined
3835+ }, statusCheckTimeout , statusCheckPollInterval , "Third event should be AlreadyQuarantined" )
3836+
3837+ t .Log ("Manually uncordon the node" )
3838+ quarantinedNode , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3839+ require .NoError (t , err )
3840+ quarantinedNode .Spec .Unschedulable = false
3841+ _ , err = e2eTestClient .CoreV1 ().Nodes ().Update (ctx , quarantinedNode , metav1.UpdateOptions {})
3842+ require .NoError (t , err )
3843+
3844+ t .Log ("Verify manual uncordon annotation is set" )
3845+ require .Eventually (t , func () bool {
3846+ node , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3847+ if err != nil {
3848+ return false
3849+ }
3850+ return node .Annotations [common .QuarantinedNodeUncordonedManuallyAnnotationKey ] == common .QuarantinedNodeUncordonedManuallyAnnotationValue
3851+ }, eventuallyTimeout , eventuallyPollInterval , "Manual uncordon annotation should be set" )
3852+
3853+ t .Log ("Verify quarantine annotation cleared" )
3854+ require .Eventually (t , func () bool {
3855+ node , err := e2eTestClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
3856+ if err != nil {
3857+ return false
3858+ }
3859+ return node .Annotations [common .QuarantineHealthEventAnnotationKey ] == ""
3860+ }, eventuallyTimeout , eventuallyPollInterval , "Quarantine annotation should be cleared" )
3861+ }
0 commit comments