Skip to content

Commit bc19efb

Browse files
committed
Add postgresql as alternative datastore
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent 4c9563a commit bc19efb

File tree

41 files changed

+3036
-17
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+3036
-17
lines changed

Tiltfile.postgresql

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
load('ext://helm_resource', 'helm_resource', 'helm_repo')
16+
load('ext://namespace', 'namespace_create', 'namespace_inject')
17+
18+
# Give Kubernetes applies up to 600 seconds before Tilt times them out.
update_settings(k8s_upsert_timeout_secs=600)

# Number of fake GPU nodes to create; taken from NUM_GPU_NODES, defaulting to 10.
gpu_node_count_env = os.getenv('NUM_GPU_NODES', '10')
num_gpu_nodes = int(gpu_node_count_env)
21+
22+
# cert-manager issues the TLS certificates; install it (with its CRDs) from
# the jetstack chart repository into a namespace created on demand.
helm_repo('jetstack', 'https://charts.jetstack.io')
cert_manager_flags = ['--create-namespace', '--set=installCRDs=true']
helm_resource(
    'cert-manager',
    chart='jetstack/cert-manager',
    namespace='cert-manager',
    flags=cert_manager_flags,
)
33+
34+
# Install only the Prometheus operator from kube-prometheus-stack: every other
# bundled component is switched off and the operator is switched on explicitly.
helm_repo('prometheus-community', 'https://prometheus-community.github.io/helm-charts')
prometheus_disabled_components = [
    'prometheus',
    'alertmanager',
    'grafana',
    'kubeStateMetrics',
    'nodeExporter',
]
prometheus_flags = (
    ['--create-namespace'] +
    ['--set=%s.enabled=false' % component for component in prometheus_disabled_components] +
    ['--set=prometheusOperator.enabled=true']
)
helm_resource(
    'prometheus-operator',
    chart='prometheus-community/kube-prometheus-stack',
    namespace='monitoring',
    flags=prometheus_flags,
)
50+
51+
# KWOK provides the fake GPU nodes. The controller goes into kube-system with
# host networking; the stage-fast chart is installed once the controller
# resource exists, and its pod readiness is ignored.
helm_repo('sigs-kwok', 'https://kwok.sigs.k8s.io/charts/')
kwok_flags = ['--set=hostNetwork=true']
helm_resource(
    'kwok',
    chart='sigs-kwok/kwok',
    namespace='kube-system',
    flags=kwok_flags,
)
helm_resource(
    'kwok-stage-fast',
    chart='sigs-kwok/stage-fast',
    pod_readiness='ignore',
    resource_deps=['kwok'],
)
67+
68+
# Namespaces used by the rest of this Tiltfile.
for ns in ['gpu-operator', 'nvsentinel']:
    namespace_create(ns)

# Render one fake GPU node per index by substituting the node index for every
# PLACEHOLDER token in the KWOK node template, then register the manifest.
kwok_node_template = str(read_file('./tilt/kwok-node-template.yaml'))
for node_index in range(num_gpu_nodes):
    rendered_node = kwok_node_template.replace('PLACEHOLDER', str(node_index))
    k8s_yaml(blob(rendered_node))
77+
78+
# Static manifests applied alongside the chart, in this order.
extra_manifests = [
    './tilt/nvidia-driver-daemonset.yaml',
    './tilt/nvidia-dcgm-daemonset.yaml',
    './tilt/janitor.dgxc.nvidia.com_rebootnodes.yaml',
]
for manifest_path in extra_manifests:
    k8s_yaml(manifest_path)

# Component-specific Tiltfiles, included in this order.
component_tiltfiles = [
    './fault-quarantine-module/Tiltfile',
    './fault-remediation-module/Tiltfile',
    './node-drainer-module/Tiltfile',
    './platform-connectors/Tiltfile',
    './health-events-analyzer/Tiltfile',
    './health-monitors/csp-health-monitor/Tiltfile',
    './labeler-module/Tiltfile',
    './tilt/simple-health-client/Tiltfile',
    './health-monitors/gpu-health-monitor/Tiltfile',
    './health-monitors/syslog-health-monitor/Tiltfile',
]
for component_tiltfile in component_tiltfiles:
    include(component_tiltfile)
94+
95+
# Generate dynamic passwords for PostgreSQL using local commands.
#
# Every local() call that touches a secret passes quiet=True / echo_off=True so
# that neither the command line nor its output (the passwords) is written to
# the Tilt log.
local('mkdir -p /tmp/tilt', quiet=True)

# 32 random bytes, base64-encoded, with '=', '+' and '/' stripped and the
# result capped at 32 characters, so the value is safe inside the
# double-quoted YAML scalars below.
password_command = 'openssl rand -base64 32 | tr -d "=+/" | cut -c1-32'
datastore_password = str(local(password_command, quiet=True, echo_off=True)).strip()
postgres_admin_password = str(local(password_command, quiet=True, echo_off=True)).strip()

# The pipeline's exit status is that of `cut`, so a missing or failing openssl
# would otherwise go unnoticed and produce empty passwords.
if not datastore_password or not postgres_admin_password:
    fail('Failed to generate PostgreSQL passwords; is openssl installed and on PATH?')

# Do not print the secrets themselves; point at the file that holds them.
print("Generated PostgreSQL passwords (stored in /tmp/tilt/postgresql-dynamic-values.yaml)")

# Create dynamic values YAML content manually (since yaml module may not be available).
# The datastore password is used twice: once for the NVSentinel datastore
# connection and once as the PostgreSQL application user's password.
dynamic_values_yaml = """global:
  datastore:
    connection:
      password: "%s"
postgresql:
  auth:
    postgresPassword: "%s"
    password: "%s"
""" % (datastore_password, postgres_admin_password, datastore_password)

# Write the dynamic values to a temporary file using a quoted heredoc (no shell
# expansion of the content). umask 077 keeps a newly created file readable by
# the current user only.
local(
    'umask 077 && cat > /tmp/tilt/postgresql-dynamic-values.yaml << "EOF"\n' + dynamic_values_yaml + 'EOF',
    quiet=True,
    echo_off=True,
)
119+
120+
# Render the NVSentinel chart for PostgreSQL. Values files are applied in
# order: chart defaults, the Tilt PostgreSQL overrides, then the generated
# file carrying the dynamic passwords.
nvsentinel_values_files = [
    './distros/kubernetes/nvsentinel/values.yaml',
    './distros/kubernetes/nvsentinel/values-tilt-postgresql.yaml',
    '/tmp/tilt/postgresql-dynamic-values.yaml',  # Override with dynamic passwords
]
yaml = helm(
    './distros/kubernetes/nvsentinel',
    name='nvsentinel',
    namespace='nvsentinel',
    values=nvsentinel_values_files,
)
k8s_yaml(yaml)
132+
133+
# Group the PostgreSQL Issuer/Certificate objects into one Tilt resource that
# depends on the cert-manager resource.
postgresql_cert_objects = [
    'postgresql-root-ca:certificate',
    'postgresql-ca-issuer:issuer',
    'selfsigned-ca-issuer:issuer',
    'postgresql-server-cert:certificate',
    'postgresql-client-cert:certificate',
]
k8s_resource(
    new_name='cert-manager-resources',
    objects=postgresql_cert_objects,
    resource_deps=['cert-manager'],
)

# Group the PodMonitor into its own resource that depends on the
# prometheus-operator resource.
k8s_resource(
    new_name='prometheus-resources',
    objects=['nvsentinel-pod-monitor:podmonitor'],
    resource_deps=['prometheus-operator'],
)
# Forward local port 9090 to port 9090 of the prometheus-operator resource.
k8s_resource(
    'prometheus-operator',
    port_forwards='9090:9090',
)
156+
157+
# Collect the fake Node objects (kwok-node-0 .. kwok-node-<N-1>) into a single
# Tilt resource that depends on KWOK and the listed NVSentinel workloads.
kwok_node_names = ['kwok-node-%d:node' % node_index for node_index in range(num_gpu_nodes)]
kwok_fake_node_deps = [
    'kwok',
    'nvsentinel-platform-connector',
    'nvsentinel-fault-quarantine',
    'nvsentinel-fault-remediation',
    'nvsentinel-labeler',
    'nvsentinel-node-drainer',
    'nvsentinel-postgresql',
    'simple-health-client',
]
k8s_resource(
    new_name='kwok-fake-nodes',
    objects=kwok_node_names,
    resource_deps=kwok_fake_node_deps,
)
# Same readiness/dependency settings as given when kwok-stage-fast was installed.
k8s_resource(
    'kwok-stage-fast',
    resource_deps=['kwok'],
    pod_readiness='ignore',
)
171+
172+
# The DCGM 3.x GPU health monitor workload is not waited on for pod readiness.
k8s_resource(
    workload='nvsentinel-gpu-health-monitor-dcgm-3.x',
    pod_readiness='ignore',
)

# PostgreSQL-specific settings: the workload depends on the certificate
# resources and is port-forwarded on 5432 for debugging.
postgresql_port_forwards = ['5432:5432']
k8s_resource(
    workload='nvsentinel-postgresql',
    resource_deps=['cert-manager-resources'],
    port_forwards=postgresql_port_forwards,
)

distros/kubernetes/nvsentinel/Chart.lock

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ dependencies:
55
- name: mongodb-store
66
repository: ""
77
version: 0.1.0
8+
- name: postgresql
9+
repository: https://charts.bitnami.com/bitnami
10+
version: 15.5.38
811
- name: fault-quarantine
912
repository: ""
1013
version: 0.1.0
@@ -29,5 +32,5 @@ dependencies:
2932
- name: labeler
3033
repository: ""
3134
version: 0.1.0
32-
digest: sha256:c10f6e7fdb0b99a47f38e210c25e610da182c94fe32d89753a34352d12c0bb22
33-
generated: "2025-10-15T10:37:19.739789+05:30"
35+
digest: sha256:7a790375389163a770b2796f9b771d0ac5744e66df098336bfa67113bd740318
36+
generated: "2025-10-21T14:58:37.80844-04:00"

distros/kubernetes/nvsentinel/Chart.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ dependencies:
2525
- name: mongodb-store
2626
version: "0.1.0"
2727
condition: platformConnector.mongodbStore.enabled
28+
- name: postgresql
29+
version: "15.5.38"
30+
repository: https://charts.bitnami.com/bitnami
31+
condition: postgresql.enabled
2832
- name: fault-quarantine
2933
version: "0.1.0"
3034
condition: global.faultQuarantineModule.enabled

distros/kubernetes/nvsentinel/charts/csp-health-monitor/templates/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ spec:
7474
readOnly: true
7575
envFrom:
7676
- configMapRef:
77-
name: mongodb-config
77+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7878
optional: true
7979

8080
- name: maintenance-notifier
@@ -104,7 +104,7 @@ spec:
104104
mountPath: /run/nvsentinel
105105
envFrom:
106106
- configMapRef:
107-
name: mongodb-config
107+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
108108
optional: true
109109
restartPolicy: Always
110110
{{- with (.Values.global.systemNodeSelector | default .Values.nodeSelector) }}

distros/kubernetes/nvsentinel/charts/fault-quarantine/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ spec:
6868
value: {{ .Values.clientCertMountPath }}
6969
envFrom:
7070
- configMapRef:
71-
name: mongodb-config
71+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7272
optional: true
7373
volumes:
7474
- name: config-volume

distros/kubernetes/nvsentinel/charts/fault-remediation/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ spec:
7575
value: "{{ .Values.logCollector.enableGcpSosCollection }}"
7676
envFrom:
7777
- configMapRef:
78-
name: mongodb-config
78+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
7979
optional: true
8080
volumes:
8181
- name: mongo-app-client-cert

distros/kubernetes/nvsentinel/charts/health-events-analyzer/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ spec:
6262
value: {{ .Values.clientCertMountPath }}
6363
envFrom:
6464
- configMapRef:
65-
name: mongodb-config
65+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
6666
optional: true
6767
volumes:
6868
- name: config-volume

distros/kubernetes/nvsentinel/charts/node-drainer/templates/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ spec:
6161
value: {{ .Values.clientCertMountPath }}
6262
envFrom:
6363
- configMapRef:
64-
name: mongodb-config
64+
name: {{- if eq .Values.global.datastore.provider "postgresql" }} nvsentinel-datastore-config{{- else }} mongodb-config{{- end }}
6565
optional: true
6666
volumes:
6767
- name: config-volume
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- /*
  TLS material for the PostgreSQL datastore, managed by cert-manager.
  Rendered only when global.clusterType is "standalone" or "mgmt" AND
  global.datastore.provider is "postgresql".
  Chain: selfsigned-ca-issuer -> postgresql-root-ca (CA certificate, stored in
  postgresql-root-ca-secret) -> postgresql-ca-issuer -> server and client
  certificates.
*/ -}}
{{- if or (eq .Values.global.clusterType "standalone") (eq .Values.global.clusterType "mgmt") }}
{{- if eq .Values.global.datastore.provider "postgresql" }}
---
# CA issuer: signs certificates with the key pair held in
# postgresql-root-ca-secret (written by the postgresql-root-ca Certificate).
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  name: postgresql-ca-issuer
  namespace: {{ .Release.Namespace }}
spec:
  ca:
    secretName: postgresql-root-ca-secret
---
# Self-signed issuer: referenced only by the root CA certificate below.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  name: selfsigned-ca-issuer
  namespace: {{ .Release.Namespace }}
spec:
  selfSigned: {}
---
# Root CA certificate (self-signed); its secret backs postgresql-ca-issuer.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: postgresql-root-ca
  namespace: {{ .Release.Namespace }}
spec:
  isCA: true
  commonName: postgresql-root-ca
  secretName: postgresql-root-ca-secret
  duration: 87600h # 10 years
  renewBefore: 720h # 30 days before expiration
  privateKey:
    algorithm: RSA
    size: 4096
  issuerRef:
    name: selfsigned-ca-issuer
    kind: Issuer
---
# PostgreSQL Server Certificate
# The DNS names cover both "<release>-postgresql" and plain "postgresql"
# service names in the release namespace.
# NOTE(review): presumably these match the Service names created by the
# postgresql subchart - confirm against the rendered chart.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: postgresql-server-cert
  namespace: {{ .Release.Namespace }}
spec:
  secretName: postgresql-server-cert
  duration: 8760h # 1 year
  renewBefore: 360h # 15 days before expiration
  commonName: postgresql
  dnsNames:
    - postgresql
    - {{ printf "%s-postgresql.%s.svc.cluster.local" .Release.Name .Release.Namespace }}
    - {{ printf "%s-postgresql.%s.svc" .Release.Name .Release.Namespace }}
    - {{ printf "postgresql.%s.svc.cluster.local" .Release.Namespace }}
    - {{ printf "postgresql.%s.svc" .Release.Namespace }}
  issuerRef:
    name: postgresql-ca-issuer
    kind: Issuer
---
# PostgreSQL Client Certificate
# NOTE(review): if PostgreSQL certificate authentication is used, the CN
# typically has to match the database role - confirm that "postgresql" is the
# intended datastore username.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: postgresql-client-cert
  namespace: {{ .Release.Namespace }}
spec:
  secretName: postgresql-client-cert
  duration: 8760h # 1 year
  renewBefore: 360h # 15 days before expiration
  commonName: postgresql
  issuerRef:
    name: postgresql-ca-issuer
    kind: Issuer
{{- end }}
{{- end }}

distros/kubernetes/nvsentinel/templates/configmap-datastore.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ data:
5959
{{- if .Values.global.datastore.connection.username }}
6060
DATASTORE_USERNAME: {{ .Values.global.datastore.connection.username | quote }}
6161
{{- end }}
62+
{{- if .Values.global.datastore.connection.password }}
63+
DATASTORE_PASSWORD: {{ .Values.global.datastore.connection.password | quote }}
64+
{{- end }}
6265
{{- if .Values.global.datastore.connection.sslmode }}
6366
DATASTORE_SSLMODE: {{ .Values.global.datastore.connection.sslmode | quote }}
6467
{{- end }}
@@ -72,6 +75,8 @@ data:
7275
DATASTORE_SSLROOTCERT: {{ .Values.global.datastore.connection.sslrootcert | quote }}
7376
{{- end }}
7477

78+
# Provider-specific configuration
79+
{{- if eq .Values.global.datastore.provider "mongodb" }}
7580
# MongoDB specific
7681
MONGODB_URI: "mongodb://{{ .Values.global.datastore.connection.host }}:{{ .Values.global.datastore.connection.port | default 27017 }}"
7782
MONGODB_DATABASE_NAME: {{ .Values.global.datastore.connection.database | quote }}
@@ -81,4 +86,16 @@ data:
8186
{{- else }}
8287
MONGODB_COLLECTION_NAME: "health_events"
8388
{{- end }}
89+
{{- else if eq .Values.global.datastore.provider "postgresql" }}
90+
# PostgreSQL specific
91+
POSTGRES_HOST: {{ .Values.global.datastore.connection.host | quote }}
92+
POSTGRES_PORT: {{ .Values.global.datastore.connection.port | default 5432 | quote }}
93+
POSTGRES_DATABASE: {{ .Values.global.datastore.connection.database | quote }}
94+
{{- if .Values.global.datastore.connection.username }}
95+
POSTGRES_USER: {{ .Values.global.datastore.connection.username | quote }}
96+
{{- end }}
97+
{{- if .Values.global.datastore.connection.sslmode }}
98+
POSTGRES_SSLMODE: {{ .Values.global.datastore.connection.sslmode | quote }}
99+
{{- end }}
100+
{{- end }}
84101
{{- end }}

0 commit comments

Comments
 (0)