@@ -59,10 +59,16 @@ func TestMultipleFatalEventRule(t *testing.T) {
5959
6060 client , err := c .NewClient ()
6161 assert .NoError (t , err , "failed to create kubernetes client" )
62+
63+ // clean up any existing node conditions
64+ t .Logf ("Cleaning up any existing node conditions for node %s" , gpuNodeName )
65+ err = helpers .SendHealthEventsToNodes (t , []string {gpuNodeName }, ERRORCODE_13 , "data/non-fatal-health-event.json" , "" )
66+ assert .NoError (t , err , "failed to send non-fatal events" )
67+
6268 xidsToInject := []string {ERRORCODE_13 , ERRORCODE_48 , ERRORCODE_13 , ERRORCODE_48 , ERRORCODE_13 }
6369
6470 // inject 5 fatal errors and let the remediation cycle finish
65-
71+ t . Logf ( "Injecting fatal errors to node %s" , gpuNodeName )
6672 for _ , xid := range xidsToInject {
6773 // inject XID error
6874 err = helpers .SendHealthEventsToNodes (t , []string {gpuNodeName }, xid , "data/fatal-health-event.json" , "" )
@@ -92,7 +98,7 @@ func TestMultipleFatalEventRule(t *testing.T) {
9298 assert .NoError (t , err , "failed to send fatal events" )
9399
94100 // Check node condition for matched ruleset
95- helpers .WaitForNodeConditionWithCheckName (ctx , t , client , gpuNodeName , "MultipleFatalError" )
101+ helpers .WaitForNodeConditionWithCheckName (ctx , t , client , gpuNodeName , "MultipleFatalError" , "ErrorCode:31 GPU:0 XID error occurred Recommended Action=CONTACT_SUPPORT;" )
96102
97103 return ctx
98104 })
@@ -143,8 +149,13 @@ func TestMultipleNonFatalEventRule(t *testing.T) {
143149 assert .True (t , len (gpuNodes ) > 0 , "no gpu nodes found" )
144150 gpuNodeName := gpuNodes [rand .Intn (len (gpuNodes ))]
145151 ctx = context .WithValue (ctx , keyGpuNodeName , gpuNodeName )
146- t .Logf ("Injecting fatal events to node %s" , gpuNodeName )
147152
153+ // clean up any existing node conditions
154+ t .Logf ("Cleaning up any existing node conditions for node %s" , gpuNodeName )
155+ err := helpers .SendHealthEventsToNodes (t , []string {gpuNodeName }, ERRORCODE_13 , "data/non-fatal-health-event.json" , "" )
156+ assert .NoError (t , err , "failed to send non-fatal events" )
157+
158+ t .Logf ("Injecting non-fatal events to node %s" , gpuNodeName )
148159 for i := 0 ; i < 5 ; i ++ {
149160 // inject XID error
150161 err := helpers .SendHealthEventsToNodes (t , []string {gpuNodeName }, ERRORCODE_13 , "data/non-fatal-health-event.json" , "" )
0 commit comments