Skip to content

Commit 053dc08

Browse files
committed
Add postgresql as alternative datastore
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent 4c9563a commit 053dc08

File tree

43 files changed

+3027
-40
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+3027
-40
lines changed

Tiltfile.postgresql

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
load('ext://helm_resource', 'helm_resource', 'helm_repo')
16+
load('ext://namespace', 'namespace_create', 'namespace_inject')
17+
18+
update_settings(k8s_upsert_timeout_secs=600)
19+
20+
num_gpu_nodes = int(os.getenv('NUM_GPU_NODES', '10'))
21+
22+
# Install cert-manager for TLS certificates
23+
helm_repo('jetstack', 'https://charts.jetstack.io')
24+
helm_resource(
25+
'cert-manager',
26+
chart='jetstack/cert-manager',
27+
namespace='cert-manager',
28+
flags=[
29+
'--create-namespace',
30+
'--set=installCRDs=true',
31+
],
32+
)
33+
34+
# Install only the Prometheus operator (Prometheus, Alertmanager, Grafana, kube-state-metrics and node-exporter are disabled below)
35+
helm_repo('prometheus-community', 'https://prometheus-community.github.io/helm-charts')
36+
helm_resource(
37+
'prometheus-operator',
38+
chart='prometheus-community/kube-prometheus-stack',
39+
namespace='monitoring',
40+
flags=[
41+
'--create-namespace',
42+
'--set=prometheus.enabled=false',
43+
'--set=alertmanager.enabled=false',
44+
'--set=grafana.enabled=false',
45+
'--set=kubeStateMetrics.enabled=false',
46+
'--set=nodeExporter.enabled=false',
47+
'--set=prometheusOperator.enabled=true',
48+
],
49+
)
50+
51+
# Install KWOK for fake GPU nodes
52+
helm_repo('sigs-kwok', 'https://kwok.sigs.k8s.io/charts/')
53+
helm_resource(
54+
'kwok',
55+
chart='sigs-kwok/kwok',
56+
namespace='kube-system',
57+
flags=[
58+
'--set=hostNetwork=true'
59+
]
60+
)
61+
helm_resource(
62+
'kwok-stage-fast',
63+
chart='sigs-kwok/stage-fast',
64+
resource_deps=['kwok'],
65+
pod_readiness='ignore'
66+
)
67+
68+
# Create namespaces
69+
namespace_create('gpu-operator')
70+
namespace_create('nvsentinel')
71+
72+
# Create fake GPU nodes using KWOK
73+
kwok_node_template = str(read_file('./tilt/kwok-node-template.yaml'))
74+
for i in range(num_gpu_nodes):
75+
node_yaml = kwok_node_template.replace('PLACEHOLDER', str(i))
76+
k8s_yaml(blob(node_yaml))
77+
78+
# Apply additional Kubernetes resources
79+
k8s_yaml('./tilt/nvidia-driver-daemonset.yaml')
80+
k8s_yaml('./tilt/nvidia-dcgm-daemonset.yaml')
81+
k8s_yaml('./tilt/janitor.dgxc.nvidia.com_rebootnodes.yaml')
82+
83+
# Include component-specific Tiltfiles
84+
include('./fault-quarantine-module/Tiltfile')
85+
include('./fault-remediation-module/Tiltfile')
86+
include('./node-drainer-module/Tiltfile')
87+
include('./platform-connectors/Tiltfile')
88+
include('./health-events-analyzer/Tiltfile')
89+
include('./health-monitors/csp-health-monitor/Tiltfile')
90+
include('./labeler-module/Tiltfile')
91+
include('./tilt/simple-health-client/Tiltfile')
92+
include('./health-monitors/gpu-health-monitor/Tiltfile')
93+
include('./health-monitors/syslog-health-monitor/Tiltfile')
94+
95+
# Deploy NVSentinel with PostgreSQL configuration using certificate authentication
96+
yaml = helm(
97+
'./distros/kubernetes/nvsentinel',
98+
name='nvsentinel',
99+
namespace='nvsentinel',
100+
values=[
101+
'./distros/kubernetes/nvsentinel/values.yaml',
102+
'./distros/kubernetes/nvsentinel/values-tilt-postgresql.yaml'
103+
],
104+
)
105+
k8s_yaml(yaml)
106+
107+
# Configure cert-manager resources for PostgreSQL
108+
k8s_resource(
109+
new_name='cert-manager-resources',
110+
objects=[
111+
'postgresql-root-ca:certificate',
112+
'postgresql-ca-issuer:issuer',
113+
'selfsigned-ca-issuer:issuer',
114+
'postgresql-server-cert:certificate',
115+
'postgresql-client-cert:certificate'
116+
],
117+
resource_deps=['cert-manager'],
118+
)
119+
120+
# Configure Prometheus monitoring resources
121+
k8s_resource(
122+
new_name='prometheus-resources',
123+
objects=['nvsentinel-pod-monitor:podmonitor'],
124+
resource_deps=['prometheus-operator'],
125+
)
126+
k8s_resource(
127+
'prometheus-operator',
128+
port_forwards='9090:9090',
129+
)
130+
131+
# Configure KWOK fake nodes
132+
kwok_node_names = ['kwok-node-' + str(i) + ':node' for i in range(num_gpu_nodes)]
133+
k8s_resource(
134+
new_name='kwok-fake-nodes',
135+
objects=kwok_node_names,
136+
resource_deps=['kwok', 'nvsentinel-platform-connector', 'nvsentinel-fault-quarantine', 'nvsentinel-fault-remediation',
137+
'nvsentinel-labeler', 'nvsentinel-node-drainer', 'nvsentinel-postgresql', 'simple-health-client'
138+
],
139+
)
140+
k8s_resource(
141+
'kwok-stage-fast',
142+
pod_readiness='ignore',
143+
resource_deps=['kwok']
144+
)
145+
146+
# Ignore pod readiness for the GPU health monitor (DCGM 3.x) workload
147+
k8s_resource(
148+
workload='nvsentinel-gpu-health-monitor-dcgm-3.x',
149+
pod_readiness='ignore'
150+
)
151+
152+
# PostgreSQL-specific resources
153+
k8s_resource(
154+
workload='nvsentinel-postgresql',
155+
port_forwards=['5432:5432'], # Port-forward PostgreSQL for debugging
156+
resource_deps=['cert-manager-resources'],
157+
)

distros/kubernetes/nvsentinel/Chart.lock

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ dependencies:
55
- name: mongodb-store
66
repository: ""
77
version: 0.1.0
8+
- name: postgresql
9+
repository: https://charts.bitnami.com/bitnami
10+
version: 15.5.38
811
- name: fault-quarantine
912
repository: ""
1013
version: 0.1.0
@@ -29,5 +32,5 @@ dependencies:
2932
- name: labeler
3033
repository: ""
3134
version: 0.1.0
32-
digest: sha256:c10f6e7fdb0b99a47f38e210c25e610da182c94fe32d89753a34352d12c0bb22
33-
generated: "2025-10-15T10:37:19.739789+05:30"
35+
digest: sha256:7a790375389163a770b2796f9b771d0ac5744e66df098336bfa67113bd740318
36+
generated: "2025-10-21T14:58:37.80844-04:00"

distros/kubernetes/nvsentinel/Chart.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ dependencies:
2525
- name: mongodb-store
2626
version: "0.1.0"
2727
condition: platformConnector.mongodbStore.enabled
28+
- name: postgresql
29+
version: "15.5.38"
30+
repository: https://charts.bitnami.com/bitnami
31+
condition: postgresql.enabled
2832
- name: fault-quarantine
2933
version: "0.1.0"
3034
condition: global.faultQuarantineModule.enabled

distros/kubernetes/nvsentinel/charts/csp-health-monitor/templates/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ spec:
7474
readOnly: true
7575
envFrom:
7676
- configMapRef:
77-
name: mongodb-config
77+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7878
optional: true
7979

8080
- name: maintenance-notifier
@@ -104,7 +104,7 @@ spec:
104104
mountPath: /run/nvsentinel
105105
envFrom:
106106
- configMapRef:
107-
name: mongodb-config
107+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
108108
optional: true
109109
restartPolicy: Always
110110
{{- with (.Values.global.systemNodeSelector | default .Values.nodeSelector) }}

distros/kubernetes/nvsentinel/charts/fault-quarantine/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ spec:
6868
value: {{ .Values.clientCertMountPath }}
6969
envFrom:
7070
- configMapRef:
71-
name: mongodb-config
71+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7272
optional: true
7373
volumes:
7474
- name: config-volume

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ spec:
7575
value: "{{ .Values.logCollector.enableGcpSosCollection }}"
7676
envFrom:
7777
- configMapRef:
78-
name: mongodb-config
78+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7979
optional: true
8080
volumes:
8181
- name: mongo-app-client-cert

distros/kubernetes/nvsentinel/charts/health-events-analyzer/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ spec:
6262
value: {{ .Values.clientCertMountPath }}
6363
envFrom:
6464
- configMapRef:
65-
name: mongodb-config
65+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
6666
optional: true
6767
volumes:
6868
- name: config-volume

distros/kubernetes/nvsentinel/charts/mongodb-store/charts/mongodb/values.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ diagnosticMode:
130130
##
131131
image:
132132
registry: docker.io
133-
repository: bitnami/mongodb
133+
repository: bitnamilegacy/mongodb
134134
tag: 8.0.3-debian-12-r0
135135
digest: ""
136136
## Specify an imagePullPolicy
@@ -256,7 +256,7 @@ tls:
256256
##
257257
image:
258258
registry: docker.io
259-
repository: bitnami/nginx
259+
repository: bitnamilegacy/nginx
260260
tag: 1.27.2-debian-12-r2
261261
digest: ""
262262
pullPolicy: IfNotPresent
@@ -844,7 +844,7 @@ externalAccess:
844844
##
845845
image:
846846
registry: docker.io
847-
repository: bitnami/kubectl
847+
repository: bitnamilegacy/kubectl
848848
tag: 1.31.2-debian-12-r3
849849
digest: ""
850850
## Specify an imagePullPolicy
@@ -893,7 +893,7 @@ externalAccess:
893893
##
894894
image:
895895
registry: docker.io
896-
repository: bitnami/os-shell
896+
repository: bitnamilegacy/os-shell
897897
tag: 12-debian-12-r32
898898
digest: ""
899899
## Specify an imagePullPolicy
@@ -1488,7 +1488,7 @@ volumePermissions:
14881488
##
14891489
image:
14901490
registry: docker.io
1491-
repository: bitnami/os-shell
1491+
repository: bitnamilegacy/os-shell
14921492
tag: 12-debian-12-r32
14931493
digest: ""
14941494
## Specify an imagePullPolicy
@@ -2288,7 +2288,7 @@ metrics:
22882288
##
22892289
image:
22902290
registry: docker.io
2291-
repository: bitnami/mongodb-exporter
2291+
repository: bitnamilegacy/mongodb-exporter
22922292
tag: 0.41.2-debian-12-r1
22932293
digest: ""
22942294
pullPolicy: IfNotPresent

distros/kubernetes/nvsentinel/charts/node-drainer/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ spec:
6161
value: {{ .Values.clientCertMountPath }}
6262
envFrom:
6363
- configMapRef:
64-
name: mongodb-config
64+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
6565
optional: true
6666
volumes:
6767
- name: config-volume
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- if or (eq .Values.global.clusterType "standalone") (eq .Values.global.clusterType "mgmt") }}
16+
{{- if eq .Values.global.datastore.provider "postgresql" }}
17+
---
18+
apiVersion: cert-manager.io/v1
19+
kind: Issuer
20+
metadata:
21+
name: postgresql-ca-issuer
22+
namespace: {{ .Release.Namespace }}
23+
spec:
24+
ca:
25+
secretName: postgresql-root-ca-secret
26+
---
27+
apiVersion: cert-manager.io/v1
28+
kind: Issuer
29+
metadata:
30+
name: selfsigned-ca-issuer
31+
namespace: {{ .Release.Namespace }}
32+
spec:
33+
selfSigned: {}
34+
---
35+
apiVersion: cert-manager.io/v1
36+
kind: Certificate
37+
metadata:
38+
name: postgresql-root-ca
39+
namespace: {{ .Release.Namespace }}
40+
spec:
41+
isCA: true
42+
commonName: postgresql-root-ca
43+
secretName: postgresql-root-ca-secret
44+
duration: 87600h # 10 years
45+
renewBefore: 720h # 30 days before expiration
46+
privateKey:
47+
algorithm: RSA
48+
size: 4096
49+
issuerRef:
50+
name: selfsigned-ca-issuer
51+
kind: Issuer
52+
---
53+
# PostgreSQL Server Certificate
54+
apiVersion: cert-manager.io/v1
55+
kind: Certificate
56+
metadata:
57+
name: postgresql-server-cert
58+
namespace: {{ .Release.Namespace }}
59+
spec:
60+
secretName: postgresql-server-cert
61+
duration: 8760h # 1 year
62+
renewBefore: 360h # 15 days before expiration
63+
commonName: postgresql
64+
dnsNames:
65+
- postgresql
66+
- {{ printf "%s-postgresql.%s.svc.cluster.local" .Release.Name .Release.Namespace }}
67+
- {{ printf "%s-postgresql.%s.svc" .Release.Name .Release.Namespace }}
68+
- {{ printf "postgresql.%s.svc.cluster.local" .Release.Namespace }}
69+
- {{ printf "postgresql.%s.svc" .Release.Namespace }}
70+
issuerRef:
71+
name: postgresql-ca-issuer
72+
kind: Issuer
73+
---
74+
# PostgreSQL Client Certificate
75+
apiVersion: cert-manager.io/v1
76+
kind: Certificate
77+
metadata:
78+
name: postgresql-client-cert
79+
namespace: {{ .Release.Namespace }}
80+
spec:
81+
secretName: postgresql-client-cert
82+
duration: 8760h # 1 year
83+
renewBefore: 360h # 15 days before expiration
84+
commonName: postgresql
85+
issuerRef:
86+
name: postgresql-ca-issuer
87+
kind: Issuer
88+
{{- end }}
89+
{{- end }}

0 commit comments

Comments
 (0)