# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Helm Charts E2E Test

# E2E test using Kind cluster with locally built images and KWOK fake GPU nodes

# Runs on pushes to main and to pull-request/<number> branches, but only when
# the Helm chart, the release values, this workflow, or the validation script
# change. Can also be started manually.
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
    paths:
      - 'distros/kubernetes/nvsentinel/**'
      - 'tilt/release/**'
      - '.github/workflows/helm-e2e-test.yml'
      - 'scripts/validate-nvsentinel.sh'
  workflow_dispatch:

# One run per workflow/ref; a newer run cancels an older one everywhere
# except on main, where every run is allowed to finish.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Read-only token: the job only checks out code and reads workflow data.
permissions:
  contents: read
  actions: read

# Workflow-wide settings. All values are quoted so they stay strings
# (versions and counts must not be re-typed by the YAML parser).
env:
  KIND_VERSION: '0.30.0'
  HELM_VERSION: 'v3.14.4'
  KWOK_VERSION: 'v0.7.0'
  CLUSTER_NAME: 'helm-e2e-test'
  NVSENTINEL_NAMESPACE: 'nvsentinel'
  VALIDATION_TIMEOUT_MINUTES: '10'
  VALIDATION_INTERVAL_SECONDS: '30'
  FAKE_GPU_NODE_COUNT: '3'

jobs:
  # Reusable workflow whose outputs (tool versions and safe_ref_name) are
  # consumed by the steps of helm-e2e-test below.
  prepare-environment:
    uses: ./.github/workflows/prepare-environment.yml

  # Builds the images locally, loads them into a Kind cluster with KWOK fake
  # GPU nodes, installs the chart, and validates the deployment.
  helm-e2e-test:
    name: Helm Charts E2E Test
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    needs: prepare-environment
    steps:
      - uses: actions/checkout@v4

      - name: Workaround for freeing up more disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force
          docker system prune -f

      - name: Setup build environment
        uses: ./.github/actions/setup-build-env
        with:
          go-version: ${{ needs.prepare-environment.outputs.go_version }}
          python-version: ${{ needs.prepare-environment.outputs.python_version }}
          poetry-version: ${{ needs.prepare-environment.outputs.poetry_version }}
          golangci-lint-version: ${{ needs.prepare-environment.outputs.golangci_lint_version }}
          protobuf-version: ${{ needs.prepare-environment.outputs.protobuf_version }}
          protoc-gen-go-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_version }}
          protoc-gen-go-grpc-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_grpc_version }}
          shellcheck-version: ${{ needs.prepare-environment.outputs.shellcheck_version }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Cache Helm e2e testing tools
        uses: actions/cache@v4
        with:
          path: |
            /usr/local/bin/kind
            /usr/local/bin/kubectl
            /usr/local/bin/helm
            /usr/local/bin/kwok
            /usr/local/bin/kwokctl
          key: ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-${{ env.KIND_VERSION }}-${{ env.HELM_VERSION }}-${{ env.KWOK_VERSION }}
          # A prefix match may restore binaries of other versions; the install
          # step below re-checks each version and re-downloads on mismatch.
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-

      - name: Install testing tools
        run: |
          # Map the kernel architecture name to the one used in release assets.
          case "$(uname -m)" in
            x86_64) ARCH=amd64 ;;
            aarch64 | arm64) ARCH=arm64 ;;
            *) ARCH="$(uname -m)" ;;
          esac

          # Install Kind (skipped when the requested version is already present)
          if ! command -v kind &> /dev/null || ! kind version | grep -qF "${KIND_VERSION}"; then
            curl -fsSL --retry 3 -o ./kind "https://github.com/kubernetes-sigs/kind/releases/download/v${KIND_VERSION}/kind-linux-${ARCH}"
            chmod +x ./kind && sudo mv ./kind /usr/local/bin/kind
          fi

          # Install kubectl (latest stable; only installed when missing)
          if ! command -v kubectl &> /dev/null; then
            curl -fsSL --retry 3 -O "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl"
            chmod +x kubectl && sudo mv ./kubectl /usr/local/bin/kubectl
          fi

          # Install Helm. Download and extract as separate commands so that a
          # failed download aborts the step instead of leaving an empty binary
          # behind (a pipeline would hide the curl failure without pipefail).
          if ! command -v helm &> /dev/null || ! helm version --short | grep -qF "${HELM_VERSION}"; then
            helm_tmp="$(mktemp -d)"
            curl -fsSL --retry 3 -o "${helm_tmp}/helm.tar.gz" "https://get.helm.sh/helm-${HELM_VERSION}-linux-${ARCH}.tar.gz"
            tar -xzf "${helm_tmp}/helm.tar.gz" -C "${helm_tmp}" "linux-${ARCH}/helm"
            sudo install -m 0755 "${helm_tmp}/linux-${ARCH}/helm" /usr/local/bin/helm
            rm -rf "${helm_tmp}"
          fi

          # Install KWOK (kwok and kwokctl are installed together, so a
          # missing kwokctl also triggers the download)
          if ! command -v kwok &> /dev/null || ! command -v kwokctl &> /dev/null \
            || ! kwok --version | grep -qF "${KWOK_VERSION}"; then
            curl -fsSL --retry 3 -o /tmp/kwok "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwok-linux-${ARCH}"
            curl -fsSL --retry 3 -o /tmp/kwokctl "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwokctl-linux-${ARCH}"
            chmod +x /tmp/kwok /tmp/kwokctl
            sudo mv /tmp/kwok /tmp/kwokctl /usr/local/bin/
          fi

          # Verify installations
          kind version
          kubectl version --client
          helm version --short
          kwok --version

      - name: Build container images locally
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
          NVCR_CONTAINER_REPO: localhost
          NGC_ORG: nvsentinel-e2e
          PLATFORMS: linux/amd64
          DISABLE_REGISTRY_CACHE: 'true'
        run: |
          make docker-all
          # Show the freshly built images; fall back to any nvsentinel image.
          docker images | grep "localhost/nvsentinel-e2e/nvsentinel-.*:${SAFE_REF_NAME}" || docker images | grep nvsentinel

      - name: Create Kind cluster
        run: |
          # Reuse the Docker daemon's first registry mirror, if one is configured.
          REGISTRY_MIRROR=$(jq -r '.["registry-mirrors"][0]? // empty' /etc/docker/daemon.json 2>/dev/null || echo "")

          # NOTE(review): indentation inside this Kind config (and inside the
          # echoed containerd patch) was reconstructed - confirm against the
          # intended cluster layout.
          cat > /tmp/kind-config.yaml <<EOF
          kind: Cluster
          apiVersion: kind.x-k8s.io/v1alpha4
          name: ${CLUSTER_NAME}
          $(if [[ -n "$REGISTRY_MIRROR" ]]; then
            echo "containerdConfigPatches:"
            echo "- |-"
            echo "  [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"docker.io\"]"
            echo "    endpoint = [\"${REGISTRY_MIRROR}\"]"
          fi)
          nodes:
          - role: control-plane
            kubeadmConfigPatches:
            - |
              kind: InitConfiguration
              nodeRegistration:
                kubeletExtraArgs:
                  node-labels: "ingress-ready=true"
            extraPortMappings:
            - containerPort: 80
              hostPort: 80
              protocol: TCP
            - containerPort: 443
              hostPort: 443
              protocol: TCP
          - role: worker
          - role: worker
          EOF

          kind create cluster --config=/tmp/kind-config.yaml
          kubectl cluster-info
          kubectl get nodes

      - name: Load images into Kind cluster
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
        run: |
          mapfile -t images < <(docker images --format "{{.Repository}}:{{.Tag}}" | grep "localhost/nvsentinel-e2e/nvsentinel-.*:.*${SAFE_REF_NAME}")
          if [ "${#images[@]}" -eq 0 ]; then
            echo "No images found"
            exit 1
          fi

          for image in "${images[@]}"; do
            kind load docker-image "$image" --name "${CLUSTER_NAME}"
          done

      - name: Install infrastructure dependencies
        run: |
          helm repo add jetstack https://charts.jetstack.io --force-update
          helm upgrade --install cert-manager jetstack/cert-manager \
            --namespace cert-manager --create-namespace \
            --version v1.19.1 --set installCRDs=true \
            --wait --timeout=5m

          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
          helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
            --namespace monitoring --create-namespace \
            --set prometheus.enabled=true \
            --set alertmanager.enabled=false \
            --set grafana.enabled=false \
            --set kubeStateMetrics.enabled=false \
            --set nodeExporter.enabled=false \
            --wait --timeout=5m

          helm repo add kwok https://raw.githubusercontent.com/kubernetes-sigs/kwok/refs/heads/main/site/static/charts/ --force-update
          helm upgrade --install kwok kwok/kwok \
            --namespace kube-system \
            --wait --timeout=5m

      - name: Create fake GPU nodes
        run: |
          # Instantiate the node template once per fake node.
          for i in $(seq 1 "${FAKE_GPU_NODE_COUNT}"); do
            sed "s/kwok-node-PLACEHOLDER/kwok-gpu-node-${i}/g" tilt/kwok-node-template.yaml | kubectl apply -f -
          done
          kubectl get nodes -l type=kwok

      - name: Patch values for E2E testing
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
        run: |
          # Replace repository URLs in chart files
          sed -i 's|ghcr\.io/nvidia/nvsentinel-|localhost/nvsentinel-e2e/nvsentinel-|g' \
            distros/kubernetes/nvsentinel/charts/*/values.yaml \
            distros/kubernetes/nvsentinel/values.yaml

          # Create patched values file
          # NOTE(review): the indentation of the appended lines was
          # reconstructed; it assumes values-release.yaml has a top-level
          # `global:` key whose children use 2-space indentation - confirm.
          cp tilt/release/values-release.yaml /tmp/values-patched.yaml
          sed -i '/^global:/a\
            image:\
              tag: "'"${SAFE_REF_NAME}"'"\
            imagePullSecrets: []' /tmp/values-patched.yaml

          # Add MongoDB scheduling for real nodes only
          # NOTE(review): nesting reconstructed (tolerations placed under
          # `mongodb`) - confirm against the chart's values schema.
          cat >> /tmp/values-patched.yaml <<'EOF'
          mongodb:
            affinity:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                  - matchExpressions:
                    - key: type
                      operator: NotIn
                      values: ["kwok"]
                    - key: node-role.kubernetes.io/control-plane
                      operator: DoesNotExist
            tolerations:
            - operator: Exists
          EOF

      - name: Install NVSentinel via Helm
        env:
          # Passed through the environment instead of being templated into the
          # script, so the value is never parsed as shell code.
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
        run: |
          helm upgrade --install nvsentinel ./distros/kubernetes/nvsentinel \
            --create-namespace \
            --namespace "${NVSENTINEL_NAMESPACE}" \
            --values /tmp/values-patched.yaml \
            --set global.image.tag="${SAFE_REF_NAME}" \
            --debug

          kubectl get pods -n "${NVSENTINEL_NAMESPACE}"

      - name: Validate deployment with retry
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
        run: |
          chmod +x scripts/validate-nvsentinel.sh
          # Number of tries = total timeout divided by the polling interval.
          max_attempts=$(( VALIDATION_TIMEOUT_MINUTES * 60 / VALIDATION_INTERVAL_SECONDS ))

          for attempt in $(seq 1 "${max_attempts}"); do
            if ./scripts/validate-nvsentinel.sh --version "${SAFE_REF_NAME}" --namespace "${NVSENTINEL_NAMESPACE}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose; then
              echo "Validation passed on attempt $attempt"
              exit 0
            fi
            # No point sleeping after the final attempt.
            if [ "${attempt}" -lt "${max_attempts}" ]; then
              sleep "${VALIDATION_INTERVAL_SECONDS}"
            fi
          done

          echo "Validation failed after $max_attempts attempts"
          # Diagnostics are best-effort; the step fails via the explicit exit.
          kubectl get pods -n "${NVSENTINEL_NAMESPACE}" -o wide || true
          kubectl get events -n "${NVSENTINEL_NAMESPACE}" --sort-by='.lastTimestamp' || true
          exit 1

      - name: Export Kind logs
        if: always()
        run: |
          mkdir -p /tmp/kind-logs
          kind export logs /tmp/kind-logs --name "${CLUSTER_NAME}" || true

      - name: Collect debug artifacts
        if: failure()
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
        run: |
          mkdir -p /tmp/debug-artifacts
          kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
          kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
          kubectl get pods -n "${NVSENTINEL_NAMESPACE}" -o yaml > /tmp/debug-artifacts/nvsentinel-pods.yaml || true

          # `kubectl logs` needs a pod (or a selector); without one it only
          # prints a usage error. Collect the logs pod by pod instead.
          : > /tmp/debug-artifacts/nvsentinel-logs.txt
          for pod in $(kubectl get pods -n "${NVSENTINEL_NAMESPACE}" -o name 2>/dev/null); do
            {
              echo "===== ${pod} ====="
              kubectl logs -n "${NVSENTINEL_NAMESPACE}" "${pod}" --all-containers=true --prefix --tail=500 2>&1 || true
            } >> /tmp/debug-artifacts/nvsentinel-logs.txt
          done

          ./scripts/validate-nvsentinel.sh --version "${SAFE_REF_NAME}" --namespace "${NVSENTINEL_NAMESPACE}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose > /tmp/debug-artifacts/validation-output.txt 2>&1 || true
          docker images > /tmp/debug-artifacts/docker-images.txt || true
          df -h > /tmp/debug-artifacts/disk-usage.txt || true

      - name: Upload Kind logs
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: helm-e2e-kind-logs-${{ github.run_id }}
          path: /tmp/kind-logs/
          retention-days: 7

      - name: Upload debugging artifacts
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: helm-e2e-debug-artifacts-${{ github.run_id }}
          path: /tmp/debug-artifacts/
          retention-days: 7

      - name: Cleanup
        if: always()
        run: |
          kind delete cluster --name "${CLUSTER_NAME}" || true
          docker system prune -f || true