Skip to content

Commit 727c185

Browse files
fix: csp monitor fixes (#333)
Signed-off-by: Tanisha goyal <[email protected]> Co-authored-by: Lalit Adithya <[email protected]>
1 parent e90bffe commit 727c185

File tree

14 files changed

+582
-334
lines changed

14 files changed

+582
-334
lines changed

distros/kubernetes/nvsentinel/charts/csp-health-monitor/templates/deployment.yaml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ spec:
5555
- name: {{ .Chart.Name }}
5656
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
5757
imagePullPolicy: {{ .Values.image.pullPolicy }}
58-
command: ["/app/csp-health-monitor"]
5958
args:
6059
- "--config=/etc/config/config.toml"
6160
- "--metrics-port={{ ((.Values.global).metricsPort) | default 2112 }}"
@@ -81,11 +80,10 @@ spec:
8180
optional: true
8281

8382
- name: maintenance-notifier
84-
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
85-
imagePullPolicy: {{ .Values.image.pullPolicy }}
83+
image: "{{ .Values.quarantineTriggerEngine.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}"
84+
imagePullPolicy: {{ .Values.quarantineTriggerEngine.image.pullPolicy | default .Values.image.pullPolicy }}
8685
securityContext:
8786
runAsUser: 0
88-
command: ["/app/maintenance-notifier"]
8987
args:
9088
- "--config=/etc/config/config.toml"
9189
- "--mongo-client-cert-mount-path=/etc/ssl/mongo-client"

distros/kubernetes/nvsentinel/charts/csp-health-monitor/templates/serviceaccount.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,6 @@ metadata:
2323
iam.gke.io/gcp-service-account: {{ .Values.configToml.gcp.gcpServiceAccountName }}@{{ .Values.configToml.gcp.targetProjectId }}.iam.gserviceaccount.com
2424
{{- end }}
2525
{{- if and (eq .Values.cspName "aws") .Values.configToml.aws .Values.configToml.aws.accountId .Values.configToml.clusterName }}
26-
eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.configToml.aws.accountId }}:role/{{ .Values.configToml.clusterName }}-health-monitor
26+
eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.configToml.aws.accountId }}:role/{{ .Values.configToml.clusterName }}-nvsentinel-health-monitor-assume-role-policy
2727
{{- end }}
28-
{{- end }}
28+
{{- end }}

distros/kubernetes/nvsentinel/charts/csp-health-monitor/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ resources:
2929

3030
# Configuration for the Quarantine Trigger Engine sidecar (deployed as the maintenance-notifier container)
3131
quarantineTriggerEngine:
32+
image:
33+
repository: ghcr.io/nvidia/nvsentinel/maintenance-notifier
34+
pullPolicy: IfNotPresent
3235
resources:
3336
limits:
3437
cpu: "500m"

health-monitors/csp-health-monitor/Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ IS_KO_MODULE := 1
1616
include ../../make/common.mk
1717
include ../../make/go.mk
1818

19+
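# Test setup commands for kubebuilder envtest: install setup-envtest, then eval its env output (the chain ends with && so a following command can be appended)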
20+
# SETUP_ENVTEST_VERSION is centrally managed in .versions.yaml
21+
TEST_SETUP_COMMANDS := \
22+
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(SETUP_ENVTEST_VERSION) && \
23+
eval $$(setup-envtest use --use-env -p env) &&
24+
1925
# =============================================================================
2026
# DEFAULT TARGET
2127
# =============================================================================

health-monitors/csp-health-monitor/go.mod

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ require (
1111
github.com/aws/aws-sdk-go-v2 v1.39.6
1212
github.com/aws/aws-sdk-go-v2/config v1.31.18
1313
github.com/aws/aws-sdk-go-v2/service/health v1.34.10
14+
github.com/hashicorp/go-multierror v1.1.1
1415
github.com/nvidia/nvsentinel/commons v0.0.0
1516
github.com/nvidia/nvsentinel/data-models v0.0.0
1617
github.com/nvidia/nvsentinel/store-client v0.0.0
@@ -27,6 +28,7 @@ require (
2728
k8s.io/api v0.34.1
2829
k8s.io/apimachinery v0.34.1
2930
k8s.io/client-go v0.34.1
31+
sigs.k8s.io/controller-runtime v0.22.4
3032
)
3133

3234
require (
@@ -52,6 +54,7 @@ require (
5254
github.com/cespare/xxhash/v2 v2.3.0 // indirect
5355
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
5456
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
57+
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
5558
github.com/felixge/httpsnoop v1.0.4 // indirect
5659
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
5760
github.com/go-logr/logr v1.4.3 // indirect
@@ -73,11 +76,13 @@ require (
7376
github.com/gogo/protobuf v1.3.2 // indirect
7477
github.com/golang/snappy v1.0.0 // indirect
7578
github.com/google/gnostic-models v0.7.0 // indirect
79+
github.com/google/go-cmp v0.7.0 // indirect
7680
github.com/google/pprof v0.0.0-20251007162407-5df77e3f7d1d // indirect
7781
github.com/google/s2a-go v0.1.9 // indirect
7882
github.com/google/uuid v1.6.0 // indirect
7983
github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
8084
github.com/googleapis/gax-go/v2 v2.15.0 // indirect
85+
github.com/hashicorp/errwrap v1.0.0 // indirect
8186
github.com/json-iterator/go v1.1.12 // indirect
8287
github.com/klauspost/compress v1.18.0 // indirect
8388
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
@@ -116,6 +121,7 @@ require (
116121
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
117122
gopkg.in/inf.v0 v0.9.1 // indirect
118123
gopkg.in/yaml.v3 v3.0.1 // indirect
124+
k8s.io/apiextensions-apiserver v0.34.1 // indirect
119125
k8s.io/klog/v2 v2.130.1 // indirect
120126
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
121127
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect

health-monitors/csp-health-monitor/go.sum

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,12 @@ github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8k
7777
github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw=
7878
github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
7979
github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
80+
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
81+
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
8082
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
8183
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
84+
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
85+
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
8286
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
8387
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
8488
github.com/go-jose/go-jose/v4 v4.1.2 h1:TK/7NqRQZfgAh+Td8AlsrvtPoUyiHh0LqVvokh+1vHI=
@@ -88,6 +92,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
8892
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
8993
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
9094
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
95+
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
96+
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
9197
github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk=
9298
github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM=
9399
github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU=
@@ -143,6 +149,10 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU
143149
github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA=
144150
github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo=
145151
github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
152+
github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
153+
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
154+
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
155+
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
146156
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
147157
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
148158
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
@@ -235,6 +245,10 @@ go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
235245
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
236246
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
237247
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
248+
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
249+
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
250+
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
251+
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
238252
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
239253
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
240254
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
@@ -298,6 +312,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T
298312
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
299313
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
300314
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
315+
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
316+
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
301317
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
302318
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
303319
google.golang.org/api v0.255.0 h1:OaF+IbRwOottVCYV2wZan7KUq7UeNUQn1BcPc4K7lE4=
@@ -323,6 +339,8 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
323339
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
324340
k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM=
325341
k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk=
342+
k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI=
343+
k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc=
326344
k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4=
327345
k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
328346
k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY=
@@ -333,6 +351,8 @@ k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZ
333351
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
334352
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
335353
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
354+
sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=
355+
sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
336356
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
337357
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
338358
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=

health-monitors/csp-health-monitor/pkg/config/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ type AWSConfig struct {
6565
// LoadConfig reads the configuration from a TOML file.
6666
func LoadConfig(filePath string) (*Config, error) {
6767
var cfg Config
68+
6869
// Read the file content
6970
content, err := os.ReadFile(filePath)
7071
if err != nil {

0 commit comments

Comments
 (0)