Skip to content

Commit 5fe1ab6

Browse files
Merge branch 'main' into workflow_implementation
2 parents 46c200d + 1584015 commit 5fe1ab6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+4627
-67
lines changed
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# End-to-end integration test on GCP: provisions a cluster with Terraform,
# installs NVSentinel, runs the UAT suite, and tears the cluster down again.
name: Integration Tests - GCP

on:
  workflow_dispatch: {}  # allow manual runs for testing
  schedule:
    - cron: '30 14 * * *'  # daily at 14:30 UTC, runs on default branch only
  push:
    branches:
      - main
      - feature/oidc-gcp

permissions:
  contents: read
  actions: read
  id-token: write  # lets the auth step request an OIDC token for workload identity federation

jobs:
  integration-test-gcp:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      CSP: "gcp"
      PREFIX: "nvs"
      PROJECT_ID: "nv-dgxck8s-20250306"
      IDENTITY_PROVIDER: "projects/1015254933832/locations/global/workloadIdentityPools/github-pool/providers/github-provider"
      SERVICE_ACCOUNT: "github-actions-user"
      # Terraform Vars
      TF_VAR_deployment_id: "d${{ github.run_id }}"
      TF_VAR_project_id: "nv-dgxck8s-20250306"
      TF_VAR_region: "europe-west4"
      TF_VAR_zone: "europe-west4-b"
      TF_VAR_system_node_type: "e2-standard-4"
      TF_VAR_system_node_count: "3"
      TF_VAR_gpu_node_pool_name: "gpu-pool"
      TF_VAR_gpu_machine_type: "a3-megagpu-8g"
      TF_VAR_gpu_node_count: "1"
      TF_VAR_gpu_reservation_project: "nv-dgxcloudprodgsc-20240206"
      TF_VAR_gpu_reservation_name: "gsc-a3-megagpu-8g-shared-res-2"
      TF_VAR_gpu_driver_version: "INSTALLATION_DISABLED"
      TF_VAR_resource_labels: '{"environment":"test","team":"nvsentinel","managed_by":"terraform"}'
      # Debug
      SKIP_DELETE: "false"  # set to "true" to skip cluster deletion
      TEST_TAG: "main-33c1d03"  # image tag used when the ref is neither a tag nor main

    steps:
      # Checkout
      - name: Checkout
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0

      # Terraform
      - name: Terraform
        uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd  # v3.1.2
        with:
          terraform_version: "1.13.5"

      # Auth
      - name: Get AuthN Token
        id: auth
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093  # v3
        with:
          token_format: access_token
          workload_identity_provider: ${{ env.IDENTITY_PROVIDER }}
          service_account: "${{ env.SERVICE_ACCOUNT }}@${{ env.PROJECT_ID }}.iam.gserviceaccount.com"

      # Gcloud
      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db  # v3.0.1

      # Cluster
      # NOTE(review): with continue-on-error the job still ends green when
      # cluster creation fails (every later step is skipped via its `if:`).
      # Confirm that is intended; otherwise add a final step that fails when
      # steps.cluster.outcome == 'failure'.
      - name: Create Cluster
        id: cluster
        shell: bash
        continue-on-error: true
        run: |
          set -euo pipefail
          cd tests/uat/gcp/cluster
          terraform init
          terraform apply -auto-approve

      # Connect
      - name: Connect to Cluster
        id: client
        if: steps.cluster.outcome == 'success'
        shell: bash
        run: |
          set -euo pipefail
          # Job-level env values are exported to the shell, so read them as
          # quoted shell variables instead of expanding expressions inline.
          echo "Installing GKE auth plugin..."
          gcloud components install gke-gcloud-auth-plugin --quiet --project "${TF_VAR_project_id}"
          echo "Getting cluster credentials..."
          gcloud container clusters get-credentials "${PREFIX}-${TF_VAR_deployment_id}" \
            --zone "${TF_VAR_zone}" --project "${TF_VAR_project_id}"

      # Image Tag
      # Resolves the image tag to install:
      #   tag ref  -> the tag name
      #   main     -> main-<7-char commit SHA>
      #   other    -> TEST_TAG
      - name: Compute ref name with short SHA
        id: ref-name
        shell: bash
        env:
          # Pass ref data through the environment rather than expanding
          # ${{ }} inside the script: ref names are user-controlled and inline
          # expansion would let shell metacharacters in them be executed.
          REF_TYPE: "${{ github.ref_type }}"
          REF_NAME: "${{ github.ref_name }}"
          COMMIT_SHA: "${{ github.sha }}"
        run: |
          set -euo pipefail
          if [[ "${REF_TYPE}" == "tag" ]]; then
            SAFE_REF="${REF_NAME}"
          elif [[ "${REF_NAME}" == "main" ]]; then
            SAFE_REF="${REF_NAME}-${COMMIT_SHA:0:7}"
          else
            SAFE_REF="${TEST_TAG}"
          fi
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "$GITHUB_OUTPUT"

      # Apps
      - name: Install NVS
        id: apps
        if: steps.client.outcome == 'success'
        shell: bash
        env:
          GCP_PROJECT_ID: "${{ env.PROJECT_ID }}"
          GCP_ZONE: "${{ env.TF_VAR_zone }}"
          GCP_SERVICE_ACCOUNT: "${{ env.SERVICE_ACCOUNT }}"
          NVSENTINEL_VERSION: "${{ steps.ref-name.outputs.value }}"
        run: |
          set -euxo pipefail
          tests/uat/install-apps.sh

      # Test
      - name: Run UAT Tests
        id: tests
        if: steps.apps.outcome == 'success'
        shell: bash
        run: |
          set -euxo pipefail
          tests/uat/tests.sh

      # Teardown
      # Runs even after failures so a partially created cluster is cleaned up,
      # unless SKIP_DELETE is "true".
      - name: Destroy Cluster
        if: always() && steps.cluster.outcome != 'skipped' && env.SKIP_DELETE != 'true'
        shell: bash
        run: |
          set -euxo pipefail
          cd tests/uat/gcp/cluster
          terraform destroy -auto-approve

      # Summary
      - name: Test Summary
        if: always()
        shell: bash
        run: |
          {
            echo "## Test Results"
            echo "- Cluster: ${{ steps.cluster.outcome }}"
            echo "- Connection: ${{ steps.client.outcome }}"
            echo "- Apps: ${{ steps.apps.outcome }}"
            echo "- Tests: ${{ steps.tests.outcome }}"
          } >> "$GITHUB_STEP_SUMMARY"

.github/workflows/publish.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ jobs:
106106
- component: syslog-health-monitor
107107
make_command: 'make -C health-monitors/syslog-health-monitor docker-publish'
108108
container_name: 'nvsentinel/syslog-health-monitor'
109+
- component: kubernetes-object-monitor
110+
make_command: 'make -C health-monitors/kubernetes-object-monitor docker-publish'
111+
container_name: 'nvsentinel/kubernetes-object-monitor'
109112
- component: metadata-collector
110113
make_command: 'make -C metadata-collector docker-publish'
111114
container_name: 'nvsentinel/metadata-collector'

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,7 @@ fault-quarantine/fault-quarantine
446446
fault-remediation/fault-remediation
447447
health-events-analyzer/health-events-analyzer
448448
health-monitors/syslog-health-monitor/syslog-health-monitor
449+
health-monitors/kubernetes-object-monitor/kubernetes-object-monitor
449450
labeler/labeler
450451
metadata-collector/metadata-collector
451452
node-drainer/node-drainer

distros/kubernetes/nvsentinel/Chart.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,6 @@ dependencies:
5555
- name: metadata-collector
5656
version: "0.1.0"
5757
condition: global.metadataCollector.enabled
58+
- name: kubernetes-object-monitor
59+
version: "0.1.0"
60+
condition: global.kubernetesObjectMonitor.enabled
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Helm chart metadata for the kubernetes-object-monitor sub-chart.
apiVersion: v2
name: kubernetes-object-monitor
description: Monitors Kubernetes objects and publishes health events based on CEL policy predicates
type: application
# Chart version; the parent chart's dependency entry pins the same value.
version: "0.1.0"
# NOTE(review): "1.16.0" looks like the `helm create` scaffold default; confirm
# it reflects the monitor's actual application version.
appVersion: "1.16.0"
21+
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
{{/*
Chart name, capped at 63 characters with any trailing hyphen removed.
*/}}
{{- define "kubernetes-object-monitor.name" -}}
{{- trimSuffix "-" (trunc 63 .Chart.Name) }}
{{- end }}

{{/*
Fully qualified app name. This is a fixed string; it does not vary with the
release name.
*/}}
{{- define "kubernetes-object-monitor.fullname" -}}
{{- trimSuffix "-" (trunc 63 "kubernetes-object-monitor") }}
{{- end }}

{{/*
"<chart name>-<chart version>" as used by the helm.sh/chart label, with "+"
replaced by "_" and the result capped at 63 characters.
*/}}
{{- define "kubernetes-object-monitor.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version }}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels: chart label, selector labels, optional app version, and the
managing service.
*/}}
{{- define "kubernetes-object-monitor.labels" -}}
helm.sh/chart: {{ include "kubernetes-object-monitor.chart" . }}
{{ include "kubernetes-object-monitor.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels: chart name plus release instance.
*/}}
{{- define "kubernetes-object-monitor.selectorLabels" -}}
app.kubernetes.io/name: {{ include "kubernetes-object-monitor.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
41+
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- /*
Build a map of API group -> distinct resource names from .Values.policies so
the ClusterRole grants read access to every object kind a policy watches.

NOTE(review): the resource name is derived as lower(kind) + "s". That is wrong
for irregular plurals (e.g. Policy -> policies, Ingress -> ingresses); confirm
the configured kinds pluralise this way or add an explicit override.
*/}}
{{- $resourcesByGroup := dict }}
{{- range .Values.policies }}
  {{- $group := .resource.group | default "" }}
  {{- $resource := printf "%ss" (lower .resource.kind) }}
  {{- $existing := index $resourcesByGroup $group | default list }}
  {{- if not (has $resource $existing) }}
    {{- $_ := set $resourcesByGroup $group (append $existing $resource) }}
  {{- end }}
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "kubernetes-object-monitor.fullname" . }}
  labels:
    {{- include "kubernetes-object-monitor.labels" . | nindent 4 }}
rules:
  # Nodes are always granted, with write verbs in addition to read access.
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
      - patch
      - update
  {{- range $group, $resources := $resourcesByGroup }}
  {{- /*
  Read-only access to policy-referenced resources. In the core group "nodes"
  is already covered by the rule above, so only the remaining core resources
  are emitted; previously every core-group resource was dropped.
  */}}
  {{- $readOnly := ternary (without $resources "nodes") $resources (eq $group "") }}
  {{- if $readOnly }}
  - apiGroups:
      - {{ $group | quote }}
    resources:
      {{- range $readOnly }}
      - {{ . }}
      {{- end }}
    verbs:
      - get
      - list
      - watch
  {{- end }}
  {{- end }}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- $fullName := include "kubernetes-object-monitor.fullname" . }}
# Binds the ClusterRole of the same name to the ServiceAccount of the same
# name in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $fullName }}
  labels:
    {{- include "kubernetes-object-monitor.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ $fullName }}
subjects:
  - kind: ServiceAccount
    name: {{ $fullName }}
    namespace: {{ .Release.Namespace }}
29+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- /*
Renders .Values.policies into a TOML document (config.toml), one [[policies]]
table per entry. The nodeAssociation table and the errorCode array are emitted
only when set on the policy.
*/}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "kubernetes-object-monitor.fullname" . }}
  labels:
    {{- include "kubernetes-object-monitor.labels" . | nindent 4 }}
data:
  config.toml: |
    {{- range .Values.policies }}
    [[policies]]
    name = {{ .name | quote }}
    enabled = {{ .enabled }}

    [policies.resource]
    {{- /* An unset group means the core API group; default to "" so the line stays valid TOML (quote renders nothing for nil). */}}
    group = {{ .resource.group | default "" | quote }}
    version = {{ .resource.version | quote }}
    kind = {{ .resource.kind | quote }}

    [policies.predicate]
    expression = {{ if contains "\n" .predicate.expression }}'''
    {{ .predicate.expression | trim | nindent 10 }}
    '''{{ else }}{{ .predicate.expression | quote }}{{ end }}

    {{- if .nodeAssociation }}
    [policies.nodeAssociation]
    expression = {{ .nodeAssociation.expression | quote }}
    {{- end }}

    [policies.healthEvent]
    componentClass = {{ .healthEvent.componentClass | quote }}
    isFatal = {{ .healthEvent.isFatal }}
    message = {{ .healthEvent.message | quote }}
    recommendedAction = {{ .healthEvent.recommendedAction | quote }}
    {{- if .healthEvent.errorCode }}
    errorCode = [{{- range $index, $code := .healthEvent.errorCode }}{{- if $index }}, {{ end }}{{ $code | quote }}{{- end }}]
    {{- end }}

    {{- end }}
53+

0 commit comments

Comments
 (0)