Skip to content

Commit 2e99424

Browse files
committed
chore: update store-client-sdk and nvsentinel charts to make them pluggable
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent f61d7b6 commit 2e99424

File tree

107 files changed

+8351
-5848
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+8351
-5848
lines changed

Tiltfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ kwok_node_names = ['kwok-node-' + str(i) + ':node' for i in range(num_gpu_nodes)
120120
k8s_resource(
121121
new_name='kwok-fake-nodes',
122122
objects=kwok_node_names,
123-
resource_deps=['kwok', 'nvsentinel-platform-connector', 'nvsentinel-fault-quarantine', 'nvsentinel-fault-remediation',
123+
resource_deps=['kwok', 'nvsentinel-platform-connector', 'nvsentinel-fault-quarantine', 'nvsentinel-fault-remediation',
124124
'nvsentinel-labeler', 'nvsentinel-node-drainer', 'nvsentinel-mongodb', 'simple-health-client'
125-
],
125+
],
126126
)
127127
k8s_resource(
128128
'kwok-stage-fast',

distros/kubernetes/nvsentinel/charts/csp-health-monitor/templates/deployment.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ spec:
5959
args:
6060
- "--config=/etc/config/config.toml"
6161
- "--metrics-port={{ .Values.global.metricsPort }}"
62-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
6362
- "-v={{ .Values.logLevel }}"
6463
resources:
6564
{{- toYaml .Values.resources | nindent 12 }}
@@ -86,7 +85,6 @@ spec:
8685
command: ["/app/maintenance-notifier"]
8786
args:
8887
- "--config=/etc/config/config.toml"
89-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
9088
- "--uds-path={{ .Values.udsPath }}"
9189
- "--metrics-port={{ .Values.quarantineTriggerEngine.metricsPort }}"
9290
- "-v={{ .Values.quarantineTriggerEngine.logLevel | default .Values.logLevel }}"

distros/kubernetes/nvsentinel/charts/fault-quarantine/templates/deployment.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ spec:
4444
{{- toYaml .Values.resources | nindent 12 }}
4545
args:
4646
- "--metrics-port={{ .Values.global.metricsPort }}"
47-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
4847
- "--dry-run={{ .Values.global.dryRun }}"
4948
- "--circuit-breaker-percentage={{ .Values.circuitBreaker.percentage }}"
5049
- "--circuit-breaker-duration={{ .Values.circuitBreaker.duration }}"
@@ -65,6 +64,8 @@ spec:
6564
valueFrom:
6665
fieldRef:
6766
fieldPath: metadata.namespace
67+
- name: MONGODB_CLIENT_CERT_MOUNT_PATH
68+
value: {{ .Values.clientCertMountPath }}
6869
envFrom:
6970
- configMapRef:
7071
name: mongodb-config

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/deployment.yaml

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,13 @@ spec:
3838
serviceAccountName: {{ include "nvsentinel.serviceAccountName" . }}
3939
containers:
4040
- name: fault-remediation
41-
env:
42-
- name: MAINTENANCE_NAMESPACE
43-
value: {{ .Values.maintenanceResource.namespace }}
44-
- name: MAINTENANCE_VERSION
45-
value: {{ .Values.maintenanceResource.version }}
46-
- name: MAINTENANCE_API_GROUP
47-
value: {{ .Values.maintenanceResource.apiGroup }}
48-
4941
image: "{{ .Values.global.faultRemediationModule.image.repository }}:{{ .Values.global.image.tag | default .Chart.AppVersion }}"
5042
imagePullPolicy: {{ .Values.global.faultRemediationModule.image.pullPolicy | default "IfNotPresent" }}
5143
resources:
5244
{{- toYaml .Values.resources | nindent 12 }}
5345
args:
54-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
5546
- "--dry-run={{ .Values.global.dryRun }}"
47+
- "-v=4"
5648
ports:
5749
- name: metrics
5850
containerPort: {{ .Values.global.metricsPort }}
@@ -69,6 +61,8 @@ spec:
6961
value: {{ .Values.maintenanceResource.version }}
7062
- name: MAINTENANCE_API_GROUP
7163
value: {{ .Values.maintenanceResource.apiGroup }}
64+
- name: MONGODB_CLIENT_CERT_MOUNT_PATH
65+
value: {{ .Values.clientCertMountPath }}
7266
- name: TEMPLATE_MOUNT_PATH
7367
value: {{ .Values.maintenanceResource.template.mountPath }}
7468
- name: TEMPLATE_FILE_NAME

distros/kubernetes/nvsentinel/charts/health-events-analyzer/templates/deployment.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ spec:
4545
{{- toYaml .Values.resources | nindent 12 }}
4646
args:
4747
- "--metrics-port={{ .Values.global.metricsPort }}"
48-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
4948
- "-v={{ .Values.logLevel }}"
5049
securityContext:
5150
{{- toYaml .Values.securityContext | nindent 12 }}
@@ -58,6 +57,9 @@ spec:
5857
readOnly: true
5958
- name: var-run-vol
6059
mountPath: /var/run/
60+
env:
61+
- name: MONGODB_CLIENT_CERT_MOUNT_PATH
62+
value: {{ .Values.clientCertMountPath }}
6163
envFrom:
6264
- configMapRef:
6365
name: mongodb-config

distros/kubernetes/nvsentinel/charts/mongodb-store/templates/configmap.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ kind: ConfigMap
1717
metadata:
1818
name: mongodb-config
1919
data:
20+
# Datastore provider configuration for new abstraction layer
21+
DATASTORE_PROVIDER: "mongodb"
22+
# Legacy MongoDB configuration (maintained for backward compatibility)
2023
MONGODB_URI: {{ $uri := "" }}{{ if .Values.mongodb.configMapValues }}{{ $uri = .Values.mongodb.configMapValues.mongodbUri }}{{ end }}{{ default (printf "mongodb://%s-mongodb-headless.%s.svc.cluster.local:27017/?replicaSet=rs0&tls=true" .Release.Name .Release.Namespace) $uri | quote }}
2124
MONGODB_DATABASE_NAME: "HealthEventsDatabase"
2225
MONGODB_COLLECTION_NAME: "HealthEvents"

distros/kubernetes/nvsentinel/charts/node-drainer/templates/deployment.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ spec:
4848
args:
4949
- "--metrics-port={{ .Values.global.metricsPort }}"
5050
- "--config-path={{ .Values.configPath }}"
51-
- "--mongo-client-cert-mount-path={{ .Values.clientCertMountPath }}"
5251
- "--dry-run={{ .Values.global.dryRun }}"
5352
volumeMounts:
5453
- name: config-volume
@@ -57,6 +56,9 @@ spec:
5756
- name: mongo-app-client-cert
5857
mountPath: {{ .Values.clientCertMountPath }}
5958
readOnly: true
59+
env:
60+
- name: MONGODB_CLIENT_CERT_MOUNT_PATH
61+
value: {{ .Values.clientCertMountPath }}
6062
envFrom:
6163
- configMapRef:
6264
name: mongodb-config
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
{{/*
Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
{{- /*
Datastore ConfigMap.

Rendered only when .Values.global.datastore is set. It exposes the same
settings in two forms:
  * datastore.yaml - a structured document (provider, connection, options);
  * flat DATASTORE_* and MONGODB_* keys, suitable for consumption via envFrom.

Values read (all under .Values.global.datastore):
  provider                         - datastore provider name
  connection.host / port / database
  connection.username              - optional
  connection.sslmode / sslcert / sslkey / sslrootcert - optional
  connection.extraParams           - optional map, datastore.yaml only
  connection.collection            - optional, MongoDB collection name
  options                          - optional map, datastore.yaml only

Port default: when connection.port is unset, the generic port falls back to
27017 for provider "mongodb" and to 5432 for every other provider, so that
datastore.yaml / DATASTORE_PORT agree with the MONGODB_URI rendered below.
*/ -}}
{{- if .Values.global.datastore }}
{{- $ds := .Values.global.datastore }}
{{- $conn := $ds.connection }}
{{- /* "default" guards the comparison against an unset provider. */}}
{{- $isMongo := eq ($ds.provider | default "") "mongodb" }}
{{- $port := $conn.port | default (ternary 27017 5432 $isMongo) }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "nvsentinel.fullname" . }}-datastore-config
  labels:
    {{- include "nvsentinel.labels" . | nindent 4 }}
data:
  datastore.yaml: |
    provider: {{ $ds.provider | quote }}
    connection:
      host: {{ $conn.host | quote }}
      port: {{ $port }}
      database: {{ $conn.database | quote }}
      {{- if $conn.username }}
      username: {{ $conn.username | quote }}
      {{- end }}
      {{- if $conn.sslmode }}
      sslmode: {{ $conn.sslmode | quote }}
      {{- end }}
      {{- if $conn.sslcert }}
      sslcert: {{ $conn.sslcert | quote }}
      {{- end }}
      {{- if $conn.sslkey }}
      sslkey: {{ $conn.sslkey | quote }}
      {{- end }}
      {{- if $conn.sslrootcert }}
      sslrootcert: {{ $conn.sslrootcert | quote }}
      {{- end }}
      {{- if $conn.extraParams }}
      extraParams:
        {{- toYaml $conn.extraParams | nindent 8 }}
      {{- end }}
    {{- if $ds.options }}
    options:
      {{- toYaml $ds.options | nindent 6 }}
    {{- end }}

  # Environment variables for components.
  # ConfigMap data values must be strings, hence the port is quoted here.
  DATASTORE_PROVIDER: {{ $ds.provider | quote }}
  DATASTORE_HOST: {{ $conn.host | quote }}
  DATASTORE_PORT: {{ $port | quote }}
  DATASTORE_DATABASE: {{ $conn.database | quote }}
  {{- if $conn.username }}
  DATASTORE_USERNAME: {{ $conn.username | quote }}
  {{- end }}
  {{- if $conn.sslmode }}
  DATASTORE_SSLMODE: {{ $conn.sslmode | quote }}
  {{- end }}
  {{- if $conn.sslcert }}
  DATASTORE_SSLCERT: {{ $conn.sslcert | quote }}
  {{- end }}
  {{- if $conn.sslkey }}
  DATASTORE_SSLKEY: {{ $conn.sslkey | quote }}
  {{- end }}
  {{- if $conn.sslrootcert }}
  DATASTORE_SSLROOTCERT: {{ $conn.sslrootcert | quote }}
  {{- end }}

  # MongoDB specific.
  # These keys are emitted for every provider; the URI port always falls back
  # to 27017 when connection.port is unset.
  MONGODB_URI: "mongodb://{{ $conn.host }}:{{ $conn.port | default 27017 }}"
  MONGODB_DATABASE_NAME: {{ $conn.database | quote }}
  MONGODB_CLIENT_CERT_MOUNT_PATH: "/etc/ssl/mongo-client"
  # NOTE(review): the mongodb-store chart's mongodb-config ConfigMap uses
  # "HealthEvents" as the collection name; confirm which default consumers
  # of this ConfigMap expect.
  MONGODB_COLLECTION_NAME: {{ $conn.collection | default "health_events" | quote }}
{{- end }}

distros/kubernetes/nvsentinel/templates/daemonset.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ spec:
7373
args:
7474
- "--config=/etc/config/config.json"
7575
- "--metrics-port={{ .Values.global.metricsPort }}"
76-
- "--mongo-client-cert-mount-path={{ .Values.platformConnector.mongodbStore.clientCertMountPath }}"
7776
- "--socket={{ .Values.socketPath }}"
77+
- "-v={{ .Values.platformConnector.logVerbosity | default 2 }}"
7878
resources:
7979
{{- toYaml .Values.platformConnector.resources | nindent 12 }}
8080
volumeMounts:
@@ -91,6 +91,8 @@ spec:
9191
fieldRef:
9292
apiVersion: v1
9393
fieldPath: spec.nodeName
94+
- name: MONGODB_CLIENT_CERT_MOUNT_PATH
95+
value: {{ .Values.platformConnector.mongodbStore.clientCertMountPath }}
9496
envFrom:
9597
- configMapRef:
9698
name: mongodb-config

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,21 @@
1515
global:
1616
dryRun: false
1717
kubeVersion: 1.31.0
18+
clusterType: "standalone"
19+
tiltMode: true # Flag to identify Tilt-based development environments
20+
21+
# Datastore provider configuration (can be overridden)
22+
datastore:
23+
provider: "mongodb" # Default to MongoDB for backward compatibility
24+
connection:
25+
host: "mongodb"
26+
port: 27017
27+
database: "nvsentinel"
28+
options:
29+
maxConnections: "25"
30+
maxIdleConnections: "10"
31+
connectionMaxLifetime: "1h"
32+
pollInterval: "5s"
1833

1934
nodeSelector: {}
2035

@@ -60,31 +75,113 @@ global:
6075
inclusterFileServer:
6176
enabled: false
6277

78+
# Service-specific configurations for MongoDB mode (Tilt defaults)
79+
6380
mongodbStore:
6481
images:
6582
kubectl:
66-
repository: bitnamilegacy/kubectl
83+
repository: docker.io/bitnamilegacy/kubectl
6784
tag: "1.30.6"
6885
pullPolicy: IfNotPresent
6986
mongosh:
70-
repository: rtsp/mongosh
87+
repository: docker.io/rtsp/mongosh
7188
tag: "2.5.2"
7289
pullPolicy: IfNotPresent
7390

7491
kata:
7592
enabled: false
7693

94+
# Enable syslog monitoring for both XID and SXID errors
95+
# Disabled in Tilt mode as it requires NVIDIA drivers
96+
xidUsingSyslogMonitor: false
97+
sxidUsingSyslogMonitor: false
98+
7799
platformConnector:
100+
logVerbosity: 2
78101
mongodbStore:
79102
enabled: true
80103
clientCertMountPath: "/etc/ssl/mongo-client"
81104

105+
gpu-health-monitor:
106+
nodeSelector:
107+
nvidia.com/gpu.present: "true"
108+
109+
systemNodeSelector:
110+
node-role.kubernetes.io/control-plane: ""
111+
112+
tolerations:
113+
- operator: Exists
114+
115+
systemNodeTolerations:
116+
- operator: Exists
117+
118+
csp-health-monitor:
119+
clientCertMountPath: "/etc/ssl/mongo-client"
120+
nodeSelector:
121+
node-role.kubernetes.io/control-plane: ""
122+
123+
tolerations:
124+
- operator: Exists
125+
126+
fault-quarantine:
127+
clientCertMountPath: "/etc/ssl/mongo-client"
128+
nodeSelector:
129+
node-role.kubernetes.io/control-plane: ""
130+
131+
tolerations:
132+
- operator: Exists
133+
134+
fault-remediation:
135+
clientCertMountPath: "/etc/ssl/mongo-client"
136+
nodeSelector:
137+
node-role.kubernetes.io/control-plane: ""
138+
139+
tolerations:
140+
- operator: Exists
141+
142+
node-drainer:
143+
clientCertMountPath: "/etc/ssl/mongo-client"
144+
nodeSelector:
145+
node-role.kubernetes.io/control-plane: ""
146+
147+
tolerations:
148+
- operator: Exists
149+
150+
syslog-health-monitor:
151+
nodeSelector:
152+
nvidia.com/gpu.present: "true"
153+
154+
tolerations:
155+
- operator: Exists
156+
157+
systemNodeSelector:
158+
node-role.kubernetes.io/control-plane: ""
159+
160+
systemNodeTolerations:
161+
- operator: Exists
162+
163+
health-events-analyzer:
164+
logLevel: 2
165+
clientCertMountPath: "/etc/ssl/mongo-client"
166+
nodeSelector:
167+
node-role.kubernetes.io/control-plane: ""
168+
169+
tolerations:
170+
- operator: Exists
171+
172+
tolerations:
173+
- operator: Exists
82174

83175
mongodb-store:
84176
mongodb:
177+
178+
global:
179+
imagePullSecrets:
180+
- nvidia-ngcuser-pull-secret
181+
85182
nodeSelector:
86183
node-role.kubernetes.io/control-plane: ""
87-
184+
88185
tolerations:
89186
- operator: Exists
90187

@@ -105,9 +202,26 @@ mongodb-store:
105202
pullPolicy: "IfNotPresent"
106203

107204
metrics:
108-
enabled: true
205+
enabled: false
109206
image:
110207
registry: docker.io
111208
repository: bitnamilegacy/mongodb-exporter
112209
tag: 0.41.2-debian-12-r1
113210

211+
# # Reduce MongoDB log verbosity - using configuration for fine-grained control
212+
# configuration: |-
213+
# # Reduce overall log verbosity
214+
# systemLog:
215+
# quiet: true
216+
# verbosity: 0
217+
# # Reduce noisy component logs (network, replication)
218+
# component:
219+
# network:
220+
# verbosity: 0
221+
# replication:
222+
# verbosity: 0
223+
224+
# Enable syslog monitoring for both XID and SXID errors
225+
# Disabled in Tilt mode as it requires NVIDIA drivers
226+
xidUsingSyslogMonitor: false
227+
sxidUsingSyslogMonitor: false

0 commit comments

Comments
 (0)