Skip to content

Commit 5f3c25e

Browse files
committed
chore: more fixes
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 381db14 commit 5f3c25e

File tree

7 files changed

+145
-169
lines changed

7 files changed

+145
-169
lines changed

health-monitors/syslog-health-monitor/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,4 @@ COPY --from=builder /go/src/nvsentinel/health-monitors/syslog-health-monitor/sys
5959
RUN apt-get update && \
6060
apt-get install -y libsystemd0 liblz4-1 libzstd1
6161

62-
ENTRYPOINT ["/app/syslog-health-monitor"]
62+
ENTRYPOINT ["/app/syslog-health-monitor"]

tests/fault_management_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
115
package tests
216

317
import (

tests/helpers/fault_quarantine.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,45 @@ type QuarantineAssertion struct {
4949
ExpectAnnotation bool
5050
}
5151

52+
// ApplyQuarantineConfig swaps in a test configuration for the fault-quarantine
// component and returns a context that carries the previous configuration.
//
// It builds a client from c, calls BackupConfigMap for the
// "fault-quarantine-config" ConfigMap in NVSentinelNamespace, stores the result
// in the returned context under CELKeyConfigMapBackup, applies the file at
// configMapPath via createConfigMapFromFilePath, and finally calls
// RestartDeployment on "nvsentinel-fault-quarantine".
//
// Every step is checked with require.NoError, so any error fails the calling
// test at that point. Callers must use the returned context (not the one they
// passed in) if they later want RestoreQuarantineConfig to find the backup.
func ApplyQuarantineConfig(ctx context.Context, t *testing.T, c *envconf.Config, configMapPath string) context.Context {
53+
t.Helper()
54+
client, err := c.NewClient()
55+
require.NoError(t, err)
56+
57+
t.Log("Backing up current fault-quarantine configmap")
58+
backupData, err := BackupConfigMap(ctx, client, "fault-quarantine-config", NVSentinelNamespace)
59+
require.NoError(t, err)
60+
// Keep the backup in the context so it travels with the test.
// NOTE(review): RestoreQuarantineConfig asserts this value to []byte, so
// BackupConfigMap presumably returns []byte — confirm against its definition.
ctx = context.WithValue(ctx, CELKeyConfigMapBackup, backupData)
61+
62+
t.Logf("Applying test configmap: %s", configMapPath)
63+
err = createConfigMapFromFilePath(ctx, client, configMapPath, "fault-quarantine-config", NVSentinelNamespace)
64+
require.NoError(t, err)
65+
66+
// The deployment is restarted after the ConfigMap change; per the log message
// below this is so that it loads the new configuration.
t.Log("Restarting fault-quarantine deployment to load configuration")
67+
err = RestartDeployment(ctx, t, client, "nvsentinel-fault-quarantine", NVSentinelNamespace)
68+
require.NoError(t, err)
69+
70+
return ctx
71+
}
72+
73+
// RestoreQuarantineConfig re-applies the fault-quarantine configuration that
// was saved in ctx under CELKeyConfigMapBackup.
//
// It builds a client from c and looks up the backup in ctx. When a backup is
// present it is passed to createConfigMapFromBytes for the
// "fault-quarantine-config" ConfigMap in NVSentinelNamespace, and
// RestartDeployment is then called on "nvsentinel-fault-quarantine". When ctx
// holds no backup the function does nothing beyond creating the client.
//
// Errors from any step fail the calling test through require.NoError.
func RestoreQuarantineConfig(ctx context.Context, t *testing.T, c *envconf.Config) {
74+
t.Helper()
75+
client, err := c.NewClient()
76+
require.NoError(t, err)
77+
78+
backupDataVal := ctx.Value(CELKeyConfigMapBackup)
79+
if backupDataVal != nil {
80+
// NOTE(review): single-value type assertion — this panics if the stored value
// is not a []byte. A checked form (value, ok) would turn that into a test
// failure instead.
backupData := backupDataVal.([]byte)
81+
t.Log("Restoring fault-quarantine configmap from memory")
82+
err = createConfigMapFromBytes(ctx, client, backupData, "fault-quarantine-config", NVSentinelNamespace)
83+
require.NoError(t, err)
84+
85+
t.Log("Restarting fault-quarantine deployment to load restored configuration")
86+
err = RestartDeployment(ctx, t, client, "nvsentinel-fault-quarantine", NVSentinelNamespace)
87+
require.NoError(t, err)
88+
}
89+
}
90+
5291
func SetupQuarantineTest(ctx context.Context, t *testing.T, c *envconf.Config, configMapPath string) (context.Context, *QuarantineTestContext) {
5392
ctx, testCtx, _ := SetupQuarantineTestWithOptions(ctx, t, c, configMapPath, nil)
5493
return ctx, testCtx

tests/helpers/kube.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1099,7 +1099,7 @@ func AssertPodsNeverDeleted(ctx context.Context, t *testing.T, client klient.Cli
10991099
}
11001100
}
11011101
return false
1102-
}, 30*WaitInterval, WaitInterval, "pods should not be deleted")
1102+
}, 15*time.Second, 5*time.Second, "pods should not be deleted")
11031103
t.Logf("All %d pods remain running in namespace %s", len(podNames), namespace)
11041104
}
11051105

tests/helpers/node_drainer.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"strings"
2121
"testing"
2222

23+
"github.com/nvidia/nvsentinel/commons/pkg/statemanager"
2324
"github.com/stretchr/testify/assert"
2425
"github.com/stretchr/testify/require"
2526
v1 "k8s.io/api/core/v1"
@@ -93,6 +94,25 @@ func SetupNodeDrainerTest(ctx context.Context, t *testing.T, c *envconf.Config,
9394
return ctx, testCtx
9495
}
9596

97+
// ResetNodeAndTriggerDrain puts nodeName back into a clean state, schedules
// test pods on it, and then injects a fault so that the node enters draining.
//
// Sequence, as written below:
//  1. SendHealthyEvent for the node, then WaitForNodesCordonState with false
//     (presumably "wait until uncordoned" — confirm against that helper).
//  2. CreatePodsFromTemplate using "data/busybox-pods.yaml" in namespace, then
//     WaitForPodsRunning on the created pods.
//  3. SendHealthEvent with error code "79" and message "GPU Fallen off the bus".
//  4. WaitForNodeLabel until statemanager.NVSentinelStateLabelKey equals
//     DrainingLabelValue.
//
// It returns the names produced by CreatePodsFromTemplate and the value
// returned by SendHealthEvent (named tempFile here; presumably a path the
// caller is expected to clean up — TODO confirm).
func ResetNodeAndTriggerDrain(ctx context.Context, t *testing.T, client klient.Client, nodeName, namespace string) ([]string, string) {
98+
t.Helper()
99+
100+
SendHealthyEvent(ctx, t, nodeName)
101+
WaitForNodesCordonState(ctx, t, client, []string{nodeName}, false)
102+
103+
podNames := CreatePodsFromTemplate(ctx, t, client, "data/busybox-pods.yaml", nodeName, namespace)
104+
WaitForPodsRunning(ctx, t, client, namespace, podNames)
105+
106+
// Build the fault event only after the pods are running, so the pods exist
// before the fault is sent.
event := NewHealthEvent(nodeName).
107+
WithErrorCode("79").
108+
WithMessage("GPU Fallen off the bus")
109+
tempFile := SendHealthEvent(ctx, t, event)
110+
111+
WaitForNodeLabel(ctx, t, client, nodeName, statemanager.NVSentinelStateLabelKey, DrainingLabelValue)
112+
113+
return podNames, tempFile
114+
}
115+
96116
func TeardownNodeDrainer(ctx context.Context, t *testing.T, c *envconf.Config) context.Context {
97117
t.Helper()
98118
client, err := c.NewClient()

0 commit comments

Comments
 (0)