Skip to content

Commit a28f47c

Browse files
committed
fix: kubernetes-object-monitor chart flags and health probes, Tilt wiring, and TestCondition-based e2e test
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 887f0b7 commit a28f47c

File tree

8 files changed

+103
-29
lines changed

8 files changed

+103
-29
lines changed

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/templates/deployment.yaml

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,31 +48,31 @@ spec:
4848
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
4949
imagePullPolicy: {{ .Values.image.pullPolicy }}
5050
args:
51-
- "--config"
52-
- "/config/config.toml"
53-
- "--platform-connector-socket"
54-
- "{{ .Values.platformConnector.socket }}"
55-
- "--metrics-port"
56-
- "{{ (.Values.global).metricsPort | default 8080 }}"
57-
- "--max-concurrent-reconciles"
58-
- "{{ .Values.maxConcurrentReconciles }}"
51+
- "--policy-config-path=/config/config.toml"
52+
- "--platform-connector-socket=unix://{{ ((.Values.global).socketPath) | default "/var/run/nvsentinel.sock" }}"
53+
- "--metrics-bind-address=:{{ ((.Values.global).metricsPort) | default 2112 }}"
54+
- "--health-probe-bind-address=:8081"
55+
- "--max-concurrent-reconciles={{ .Values.maxConcurrentReconciles }}"
56+
- "--resync-period={{ .Values.resyncPeriod }}"
5957
resources:
6058
{{- toYaml .Values.resources | nindent 12 }}
6159
ports:
6260
- name: metrics
63-
containerPort: {{ (.Values.global).metricsPort | default 8080 }}
61+
containerPort: {{ ((.Values.global).metricsPort) | default 2112 }}
62+
- name: health
63+
containerPort: 8081
6464
livenessProbe:
6565
httpGet:
6666
path: /healthz
67-
port: metrics
67+
port: health
6868
initialDelaySeconds: 15
6969
periodSeconds: 20
7070
timeoutSeconds: 5
7171
failureThreshold: 3
7272
readinessProbe:
7373
httpGet:
74-
path: /healthz
75-
port: metrics
74+
path: /readyz
75+
port: health
7676
initialDelaySeconds: 5
7777
periodSeconds: 10
7878
timeoutSeconds: 3
@@ -84,8 +84,8 @@ spec:
8484
- name: config
8585
mountPath: /config
8686
readOnly: true
87-
- name: platform-connector-socket
88-
mountPath: /var/run/platform-connector
87+
- name: socket
88+
mountPath: /var/run
8989
volumes:
9090
- name: config
9191
configMap:

distros/kubernetes/nvsentinel/charts/kubernetes-object-monitor/values.yaml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ image:
2828
podAnnotations: {}
2929

3030
maxConcurrentReconciles: 1
31-
32-
platformConnector:
33-
socket: "unix:///var/run/platform-connector/platform-connector.sock"
31+
resyncPeriod: 5m
3432

3533
policies:
3634
- name: node-not-ready
@@ -59,8 +57,8 @@ resources:
5957
memory: 256Mi
6058

6159
volumes:
62-
- name: platform-connector-socket
60+
- name: socket
6361
hostPath:
64-
path: /var/run/platform-connector
62+
path: /var/run/nvsentinel
6563
type: DirectoryOrCreate
6664

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,24 @@ platformConnector:
184184
kubernetes-object-monitor:
185185
logLevel: debug
186186

187+
policies:
188+
- name: node-test-condition
189+
enabled: true
190+
resource:
191+
group: ""
192+
version: v1
193+
kind: Node
194+
predicate:
195+
expression: |
196+
resource.status.conditions.filter(c, c.type == "TestCondition" && c.status == "False").size() > 0
197+
healthEvent:
198+
componentClass: Node
199+
isFatal: false
200+
message: "Node test condition is not ready"
201+
recommendedAction: CONTACT_SUPPORT
202+
errorCode:
203+
- NODE_TEST_CONDITION_NOT_READY
204+
187205
affinity:
188206
podAntiAffinity:
189207
requiredDuringSchedulingIgnoredDuringExecution:

health-monitors/kubernetes-object-monitor/Tiltfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
docker_build(
16+
"ghcr.io/nvidia/nvsentinel/kubernetes-object-monitor",
17+
context="../..",
18+
dockerfile="./Dockerfile"
19+
)
20+

health-monitors/kubernetes-object-monitor/pkg/initializer/initializer.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"strings"
2424
"time"
2525

26+
"github.com/go-logr/logr"
2627
pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
2728
"github.com/nvidia/nvsentinel/health-monitors/kubernetes-object-monitor/pkg/annotations"
2829
celenv "github.com/nvidia/nvsentinel/health-monitors/kubernetes-object-monitor/pkg/cel"
@@ -38,6 +39,7 @@ import (
3839
"sigs.k8s.io/controller-runtime/pkg/cache"
3940
"sigs.k8s.io/controller-runtime/pkg/client"
4041
ctrlcontroller "sigs.k8s.io/controller-runtime/pkg/controller"
42+
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
4143
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
4244
)
4345

@@ -59,6 +61,10 @@ type Components struct {
5961
}
6062

6163
func InitializeAll(ctx context.Context, params Params) (*Components, error) {
64+
slogHandler := slog.Default().Handler()
65+
logrLogger := logr.FromSlogHandler(slogHandler)
66+
ctrllog.SetLogger(logrLogger)
67+
6268
cfg, err := config.Load(params.PolicyConfigPath)
6369
if err != nil {
6470
return nil, fmt.Errorf("failed to load policy config: %w", err)

tests/helpers/kube.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,10 +1594,13 @@ func SetNodeConditionStatus(
15941594
return err
15951595
}
15961596

1597+
found := false
15971598
modified := false
15981599

15991600
for i := range node.Status.Conditions {
16001601
if node.Status.Conditions[i].Type == conditionType {
1602+
found = true
1603+
16011604
if node.Status.Conditions[i].Status != status {
16021605
node.Status.Conditions[i].Status = status
16031606
node.Status.Conditions[i].LastTransitionTime = metav1.Now()
@@ -1609,6 +1612,19 @@ func SetNodeConditionStatus(
16091612
}
16101613
}
16111614

1615+
if !found {
1616+
now := metav1.Now()
1617+
node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
1618+
Type: conditionType,
1619+
Status: status,
1620+
LastTransitionTime: now,
1621+
LastHeartbeatTime: now,
1622+
Reason: "TestCondition",
1623+
Message: "Set by test",
1624+
})
1625+
modified = true
1626+
}
1627+
16121628
if !modified {
16131629
return nil
16141630
}

tests/kubernetes_object_monitor_test.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ type k8sObjectMonitorContextKey int
3030

3131
const (
3232
k8sMonitorKeyNodeName k8sObjectMonitorContextKey = iota
33+
34+
annotationKey = "nvsentinel.nvidia.com/k8s-object-monitor-policy-matches"
35+
testConditionType = "TestCondition"
3336
)
3437

3538
func TestKubernetesObjectMonitor(t *testing.T) {
@@ -41,12 +44,19 @@ func TestKubernetesObjectMonitor(t *testing.T) {
4144
client, err := c.NewClient()
4245
require.NoError(t, err)
4346

44-
pod, err := helpers.GetPodOnWorkerNode(ctx, t, client, helpers.NVSentinelNamespace, "kubernetes-object-monitor")
47+
nodeList := &v1.NodeList{}
48+
err = client.Resources().List(ctx, nodeList)
4549
require.NoError(t, err)
46-
require.NotNil(t, pod)
4750

48-
testNodeName := pod.Spec.NodeName
49-
t.Logf("Using kubernetes-object-monitor pod: %s on node: %s", pod.Name, testNodeName)
51+
var testNodeName string
52+
for _, node := range nodeList.Items {
53+
if node.Labels["type"] != "kwok" {
54+
testNodeName = node.Name
55+
break
56+
}
57+
}
58+
require.NotEmpty(t, testNodeName, "no real (non-KWOK) nodes found in cluster")
59+
t.Logf("Using test node: %s", testNodeName)
5060

5161
return context.WithValue(ctx, k8sMonitorKeyNodeName, testNodeName)
5262
})
@@ -56,9 +66,9 @@ func TestKubernetesObjectMonitor(t *testing.T) {
5666
require.NoError(t, err)
5767

5868
nodeName := ctx.Value(k8sMonitorKeyNodeName).(string)
59-
t.Logf("Marking node %s as NotReady", nodeName)
69+
t.Logf("Setting TestCondition to False on node %s", nodeName)
6070

61-
helpers.SetNodeConditionStatus(ctx, t, client, nodeName, v1.NodeReady, v1.ConditionFalse)
71+
helpers.SetNodeConditionStatus(ctx, t, client, nodeName, v1.NodeConditionType(testConditionType), v1.ConditionFalse)
6272

6373
t.Log("Waiting for policy match annotation on node")
6474
require.Eventually(t, func() bool {
@@ -68,7 +78,7 @@ func TestKubernetesObjectMonitor(t *testing.T) {
6878
return false
6979
}
7080

71-
annotation, exists := node.Annotations["nvsentinel.nvidia.com/k8s-object-monitor-policy-matches"]
81+
annotation, exists := node.Annotations[annotationKey]
7282
if !exists {
7383
return false
7484
}
@@ -85,9 +95,9 @@ func TestKubernetesObjectMonitor(t *testing.T) {
8595
require.NoError(t, err)
8696

8797
nodeName := ctx.Value(k8sMonitorKeyNodeName).(string)
88-
t.Logf("Marking node %s as Ready", nodeName)
98+
t.Logf("Setting TestCondition to True on node %s", nodeName)
8999

90-
helpers.SetNodeConditionStatus(ctx, t, client, nodeName, v1.NodeReady, v1.ConditionTrue)
100+
helpers.SetNodeConditionStatus(ctx, t, client, nodeName, v1.NodeConditionType(testConditionType), v1.ConditionTrue)
91101

92102
t.Log("Waiting for policy match annotation to be cleared")
93103
require.Eventually(t, func() bool {
@@ -97,7 +107,7 @@ func TestKubernetesObjectMonitor(t *testing.T) {
97107
return false
98108
}
99109

100-
annotation, exists := node.Annotations["nvsentinel.nvidia.com/k8s-object-monitor-policy-matches"]
110+
annotation, exists := node.Annotations[annotationKey]
101111
if exists && annotation != "" {
102112
t.Logf("Annotation still exists: %s", annotation)
103113
return false

tilt/Tiltfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ include('../platform-connectors/Tiltfile')
9696
include('./simple-health-client/Tiltfile')
9797
include('../health-events-analyzer/Tiltfile')
9898
include('../health-monitors/gpu-health-monitor/Tiltfile')
99+
include('../health-monitors/kubernetes-object-monitor/Tiltfile')
99100
include('../health-monitors/syslog-health-monitor/Tiltfile')
100101
include('../labeler/Tiltfile')
101102

@@ -160,6 +161,11 @@ k8s_resource(
160161
resource_deps=['platform-connectors']
161162
)
162163

164+
k8s_resource(
165+
'kubernetes-object-monitor',
166+
resource_deps=['platform-connectors']
167+
)
168+
163169
k8s_resource(
164170
'syslog-health-monitor-regular',
165171
resource_deps=['platform-connectors']

0 commit comments

Comments
 (0)