Skip to content

Commit 8119485

Browse files
committed
New e2e test for helm charts
Signed-off-by: Davanum Srinivas <[email protected]>
1 parent 5d0eb79 commit 8119485

File tree

2 files changed

+336
-7
lines changed

2 files changed

+336
-7
lines changed
Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,325 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Helm Charts E2E Test

# End-to-end check of the Helm charts: images are built locally, loaded into a
# Kind cluster, and deployed next to KWOK-simulated (fake) GPU nodes.

on:
  push:
    branches:
      - main
      - 'pull-request/[0-9]+'
    # Only run when the chart, the release values, this workflow, or the
    # validation script change.
    paths:
      - 'distros/kubernetes/nvsentinel/**'
      - 'tilt/release/**'
      - '.github/workflows/helm-e2e-test.yml'
      - 'scripts/validate-nvsentinel.sh'
  workflow_dispatch:

# One run per workflow/ref; in-progress runs are cancelled on every ref except main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
  actions: read
  contents: read

env:
  # Pinned tool versions (also part of the tool cache key).
  KIND_VERSION: '0.30.0'
  HELM_VERSION: 'v3.14.4'
  KWOK_VERSION: 'v0.7.0'
  # Cluster / release settings shared by the steps below.
  CLUSTER_NAME: 'helm-e2e-test'
  NVSENTINEL_NAMESPACE: 'nvsentinel'
  FAKE_GPU_NODE_COUNT: '3'
  # Validation retry budget: attempts = timeout (minutes) * 60 / interval (seconds).
  VALIDATION_TIMEOUT_MINUTES: '10'
  VALIDATION_INTERVAL_SECONDS: '30'
49+
jobs:
  # Reusable workflow; its outputs (tool versions, safe_ref_name) are read by
  # the steps of helm-e2e-test via `needs.prepare-environment.outputs.*`.
  prepare-environment:
    uses: ./.github/workflows/prepare-environment.yml

  helm-e2e-test:
    name: 'Helm Charts E2E Test'
    needs: prepare-environment
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
60+
61+
# Removes large preinstalled toolchains and unused Docker data to free disk
# space before the image builds.
- name: Workaround for freeing up more disk space
  run: |
    sudo rm -rf \
      /usr/local/lib/android \
      /usr/share/dotnet \
      /opt/ghc \
      /opt/hostedtoolcache/CodeQL
    sudo docker image prune --all --force
    docker system prune -f
69+
70+
# Local composite action; every tool version comes from the outputs of the
# prepare-environment job.
- name: Setup build environment
  uses: ./.github/actions/setup-build-env
  with:
    # Language toolchains
    go-version: ${{ needs.prepare-environment.outputs.go_version }}
    python-version: ${{ needs.prepare-environment.outputs.python_version }}
    poetry-version: ${{ needs.prepare-environment.outputs.poetry_version }}
    # Protobuf code generation
    protobuf-version: ${{ needs.prepare-environment.outputs.protobuf_version }}
    protoc-gen-go-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_version }}
    protoc-gen-go-grpc-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_grpc_version }}
    # Linters
    golangci-lint-version: ${{ needs.prepare-environment.outputs.golangci_lint_version }}
    shellcheck-version: ${{ needs.prepare-environment.outputs.shellcheck_version }}

- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@v3
84+
85+
# Caches the CLI binaries placed in /usr/local/bin by the install step. The
# primary key contains the pinned Kind/Helm/KWOK versions; the restore key
# falls back to any earlier tool cache for the same OS and architecture.
- name: Cache Helm e2e testing tools
  uses: actions/cache@v4
  with:
    key: ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-${{ env.KIND_VERSION }}-${{ env.HELM_VERSION }}-${{ env.KWOK_VERSION }}
    restore-keys: |
      ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-
    path: |
      /usr/local/bin/kind
      /usr/local/bin/kubectl
      /usr/local/bin/helm
      /usr/local/bin/kwok
      /usr/local/bin/kwokctl
97+
98+
# Installs kind, kubectl, helm, kwok and kwokctl into /usr/local/bin. A tool is
# downloaded only when it is missing or (for kind/helm/kwok) reports a version
# other than the pinned one, so binaries restored by the cache step are reused.
# KIND_VERSION, HELM_VERSION and KWOK_VERSION come from the workflow-level env.
- name: Install testing tools
  run: |
    # Translate `uname -m` into the architecture suffix used by release assets.
    case "$(uname -m)" in
      x86_64) ARCH=amd64 ;;
      aarch64 | arm64) ARCH=arm64 ;;
      *) ARCH="$(uname -m)" ;;
    esac

    # Install Kind
    if ! command -v kind &> /dev/null || ! kind version | grep -qF "${KIND_VERSION}"; then
      curl -fsSL --retry 3 -o ./kind "https://github.com/kubernetes-sigs/kind/releases/download/v${KIND_VERSION}/kind-linux-${ARCH}"
      chmod +x ./kind
      sudo mv ./kind /usr/local/bin/kind
    fi

    # Install kubectl (latest stable release; only when not already present)
    if ! command -v kubectl &> /dev/null; then
      # Resolve the version separately so a failed lookup aborts the step
      # instead of producing a malformed download URL.
      KUBECTL_VERSION="$(curl -fsSL --retry 3 https://dl.k8s.io/release/stable.txt)"
      curl -fsSL --retry 3 -o ./kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${ARCH}/kubectl"
      chmod +x ./kubectl
      sudo mv ./kubectl /usr/local/bin/kubectl
    fi

    # Install Helm
    if ! command -v helm &> /dev/null || ! helm version --short | grep -qF "${HELM_VERSION}"; then
      # Download to a file first: piping curl straight into tar/tee would hide
      # a failed download (no pipefail) and leave an empty helm binary behind.
      curl -fsSL --retry 3 -o /tmp/helm.tar.gz "https://get.helm.sh/helm-${HELM_VERSION}-linux-${ARCH}.tar.gz"
      tar -xzf /tmp/helm.tar.gz -C /tmp "linux-${ARCH}/helm"
      sudo install -m 0755 "/tmp/linux-${ARCH}/helm" /usr/local/bin/helm
      rm -rf /tmp/helm.tar.gz "/tmp/linux-${ARCH}"
    fi

    # Install KWOK (kwok and kwokctl are always installed as a pair)
    if ! command -v kwok &> /dev/null || ! command -v kwokctl &> /dev/null || ! kwok --version | grep -qF "${KWOK_VERSION}"; then
      curl -fsSL --retry 3 -o /tmp/kwok "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwok-linux-${ARCH}"
      curl -fsSL --retry 3 -o /tmp/kwokctl "https://github.com/kubernetes-sigs/kwok/releases/download/${KWOK_VERSION}/kwokctl-linux-${ARCH}"
      chmod +x /tmp/kwok /tmp/kwokctl
      sudo mv /tmp/kwok /tmp/kwokctl /usr/local/bin/
    fi

    # Verify installations
    kind version
    kubectl version --client
    helm version --short
    kwok --version
    command -v kwokctl
133+
134+
# Builds every image with `make docker-all`; the env below points the build at
# the localhost/nvsentinel-e2e repository for linux/amd64 only.
- name: Build container images locally
  env:
    NVCR_CONTAINER_REPO: localhost
    NGC_ORG: nvsentinel-e2e
    PLATFORMS: linux/amd64
    DISABLE_REGISTRY_CACHE: 'true'
    SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
  run: |
    make docker-all
    # List the images tagged for this ref; otherwise list any nvsentinel image.
    docker images | grep "localhost/nvsentinel-e2e/nvsentinel-.*:${SAFE_REF_NAME}" || docker images | grep nvsentinel
144+
145+
# Writes /tmp/kind-config.yaml from a heredoc and creates the Kind cluster.
# The generated config declares one control-plane node (labelled
# ingress-ready=true, host ports 80 and 443 mapped) and two worker nodes.
# If /etc/docker/daemon.json lists a registry mirror, its first entry is
# emitted as a containerd mirror endpoint for docker.io; otherwise that
# section is omitted. Afterwards the cluster info and node list are printed.
# NOTE(review): the heredoc body is YAML, so its indentation is significant;
# whitespace appears collapsed in this view - confirm against the repository file.
- name: Create Kind cluster
146+
run: |
147+
REGISTRY_MIRROR=$(jq -r '.["registry-mirrors"][0]? // empty' /etc/docker/daemon.json 2>/dev/null || echo "")
148+
149+
cat > /tmp/kind-config.yaml <<EOF
150+
kind: Cluster
151+
apiVersion: kind.x-k8s.io/v1alpha4
152+
name: ${CLUSTER_NAME}
153+
$(if [[ -n "$REGISTRY_MIRROR" ]]; then
154+
echo "containerdConfigPatches:"
155+
echo "- |-"
156+
echo " [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"docker.io\"]"
157+
echo " endpoint = [\"${REGISTRY_MIRROR}\"]"
158+
fi)
159+
nodes:
160+
- role: control-plane
161+
kubeadmConfigPatches:
162+
- |
163+
kind: InitConfiguration
164+
nodeRegistration:
165+
kubeletExtraArgs:
166+
node-labels: "ingress-ready=true"
167+
extraPortMappings:
168+
- containerPort: 80
169+
hostPort: 80
170+
protocol: TCP
171+
- containerPort: 443
172+
hostPort: 443
173+
protocol: TCP
174+
- role: worker
175+
- role: worker
176+
EOF
177+
178+
kind create cluster --config=/tmp/kind-config.yaml
179+
kubectl cluster-info
180+
kubectl get nodes
181+
182+
# Loads every locally built nvsentinel image whose tag contains SAFE_REF_NAME
# into the Kind cluster; the step fails when no such image exists.
- name: Load images into Kind cluster
  env:
    SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
  run: |
    image_filter="localhost/nvsentinel-e2e/nvsentinel-.*:.*${SAFE_REF_NAME}"
    mapfile -t built_images < <(docker images --format "{{.Repository}}:{{.Tag}}" | grep "${image_filter}")

    if [ "${#built_images[@]}" -eq 0 ]; then
      echo "No images found"
      exit 1
    fi

    for image_ref in "${built_images[@]}"; do
      kind load docker-image "$image_ref" --name "${CLUSTER_NAME}"
    done
192+
193+
# Installs the cluster add-ons the chart is tested against, each with
# `helm upgrade --install --wait` and a 5 minute timeout:
#   - cert-manager v1.19.1 (with its CRDs) in namespace cert-manager
#   - kube-prometheus-stack in namespace monitoring, with alertmanager,
#     grafana, kube-state-metrics and node-exporter disabled
#   - the KWOK controller in kube-system
- name: Install infrastructure dependencies
  run: |
    helm repo add jetstack https://charts.jetstack.io --force-update
    # `crds.enabled` replaces the deprecated `installCRDs` chart value.
    helm upgrade --install cert-manager jetstack/cert-manager \
      --namespace cert-manager --create-namespace \
      --version v1.19.1 --set crds.enabled=true \
      --wait --timeout=5m

    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
    helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
      --namespace monitoring --create-namespace \
      --set prometheus.enabled=true \
      --set alertmanager.enabled=false \
      --set grafana.enabled=false \
      --set kubeStateMetrics.enabled=false \
      --set nodeExporter.enabled=false \
      --wait --timeout=5m

    helm repo add kwok https://raw.githubusercontent.com/kubernetes-sigs/kwok/refs/heads/main/site/static/charts/ --force-update
    helm upgrade --install kwok kwok/kwok \
      --namespace kube-system \
      --wait --timeout=5m
215+
216+
# Applies tilt/kwok-node-template.yaml once per fake node, substituting the
# placeholder name with kwok-gpu-node-<index>, then lists the KWOK nodes.
- name: Create fake GPU nodes
  run: |
    node_index=1
    while [ "$node_index" -le ${{ env.FAKE_GPU_NODE_COUNT }} ]; do
      sed "s/kwok-node-PLACEHOLDER/kwok-gpu-node-${node_index}/g" tilt/kwok-node-template.yaml | kubectl apply -f -
      node_index=$((node_index + 1))
    done
    kubectl get nodes -l type=kwok
222+
223+
# Prepares the chart and a values file for the locally built images:
#   1. rewrites ghcr.io/nvidia/nvsentinel- image repositories to
#      localhost/nvsentinel-e2e/nvsentinel- in the chart values files (in place);
#   2. copies tilt/release/values-release.yaml to /tmp/values-patched.yaml and
#      appends image.tag (SAFE_REF_NAME) and an empty imagePullSecrets list
#      after the `global:` line;
#   3. appends a mongodb block whose node affinity excludes nodes with
#      type=kwok and nodes carrying the control-plane role label, plus a
#      toleration with `operator: Exists`.
# NOTE(review): the sed append text and the heredoc are YAML fragments whose
# indentation is significant; whitespace appears collapsed in this view -
# confirm against the repository file.
- name: Patch values for E2E testing
224+
env:
225+
SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
226+
run: |
227+
# Replace repository URLs in chart files
228+
sed -i 's|ghcr\.io/nvidia/nvsentinel-|localhost/nvsentinel-e2e/nvsentinel-|g' \
229+
distros/kubernetes/nvsentinel/charts/*/values.yaml \
230+
distros/kubernetes/nvsentinel/values.yaml
231+
232+
# Create patched values file
233+
cp tilt/release/values-release.yaml /tmp/values-patched.yaml
234+
sed -i '/^global:/a\
235+
image:\
236+
tag: "'"${SAFE_REF_NAME}"'"\
237+
imagePullSecrets: []' /tmp/values-patched.yaml
238+
239+
# Add MongoDB scheduling for real nodes only
240+
cat >> /tmp/values-patched.yaml <<'EOF'
241+
mongodb:
242+
affinity:
243+
nodeAffinity:
244+
requiredDuringSchedulingIgnoredDuringExecution:
245+
nodeSelectorTerms:
246+
- matchExpressions:
247+
- key: type
248+
operator: NotIn
249+
values: ["kwok"]
250+
- key: node-role.kubernetes.io/control-plane
251+
operator: DoesNotExist
252+
tolerations:
253+
- operator: Exists
254+
EOF
255+
256+
# Installs (or upgrades) the nvsentinel release from the in-repo chart using
# the patched values file, then lists the pods in the release namespace.
# The ref name is passed through `env` (as the sibling steps do) rather than
# being expanded by the workflow engine directly into the script text.
- name: Install NVSentinel via Helm
  env:
    SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
  run: |
    helm upgrade --install nvsentinel ./distros/kubernetes/nvsentinel \
      --create-namespace \
      --namespace "${NVSENTINEL_NAMESPACE}" \
      --values /tmp/values-patched.yaml \
      --set global.image.tag="${SAFE_REF_NAME}" \
      --debug

    kubectl get pods -n "${NVSENTINEL_NAMESPACE}"
266+
267+
# Runs scripts/validate-nvsentinel.sh until it succeeds or the retry budget
# (timeout minutes * 60 / interval seconds attempts) is used up. On final
# failure the namespace's pods and events are printed and the step fails.
- name: Validate deployment with retry
  env:
    SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
  run: |
    chmod +x scripts/validate-nvsentinel.sh
    timeout_seconds=$((${{ env.VALIDATION_TIMEOUT_MINUTES }} * 60))
    max_attempts=$((timeout_seconds / ${{ env.VALIDATION_INTERVAL_SECONDS }}))

    attempt=1
    while [ "$attempt" -le "$max_attempts" ]; do
      if ./scripts/validate-nvsentinel.sh --version "${SAFE_REF_NAME}" --namespace "${{ env.NVSENTINEL_NAMESPACE }}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose; then
        echo "Validation passed on attempt $attempt"
        exit 0
      fi
      # No need to wait after the last attempt.
      if [ "$attempt" -lt "$max_attempts" ]; then
        sleep ${{ env.VALIDATION_INTERVAL_SECONDS }}
      fi
      attempt=$((attempt + 1))
    done

    echo "Validation failed after $max_attempts attempts"
    kubectl get pods -n ${{ env.NVSENTINEL_NAMESPACE }} -o wide
    kubectl get events -n ${{ env.NVSENTINEL_NAMESPACE }} --sort-by='.lastTimestamp'
    exit 1
286+
287+
# Always exports the Kind cluster logs (best effort) for the upload step.
- name: Export Kind logs
  if: always()
  run: |
    log_dir=/tmp/kind-logs
    mkdir -p "$log_dir"
    kind export logs "$log_dir" --name "${CLUSTER_NAME}" || true
292+
293+
# On failure, gathers cluster state into /tmp/debug-artifacts for upload.
# Every command is best effort so one failing collector never hides the rest.
- name: Collect debug artifacts
  if: failure()
  env:
    SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
  run: |
    out_dir=/tmp/debug-artifacts
    mkdir -p "$out_dir"

    kubectl get all --all-namespaces > "$out_dir/all-resources.yaml" || true
    kubectl get events --all-namespaces --sort-by='.lastTimestamp' > "$out_dir/all-events.yaml" || true
    kubectl get pods -n "${NVSENTINEL_NAMESPACE}" -o yaml > "$out_dir/nvsentinel-pods.yaml" || true

    # `kubectl logs` requires a pod (or a selector); calling it with only a
    # namespace always errors, so iterate over the pods explicitly.
    {
      kubectl get pods -n "${NVSENTINEL_NAMESPACE}" -o name 2>/dev/null | while read -r pod; do
        echo "=== ${pod} ==="
        kubectl logs -n "${NVSENTINEL_NAMESPACE}" "${pod}" --all-containers=true --prefix --tail=500 2>&1 || true
      done
    } > "$out_dir/nvsentinel-logs.txt" || true

    ./scripts/validate-nvsentinel.sh --version "${SAFE_REF_NAME}" --namespace "${NVSENTINEL_NAMESPACE}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose > "$out_dir/validation-output.txt" 2>&1 || true
    docker images > "$out_dir/docker-images.txt" || true
    df -h > "$out_dir/disk-usage.txt" || true
304+
305+
# Kind logs are uploaded on every run; both artifacts are kept for 7 days and
# are named after the run id.
- name: Upload Kind logs
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: helm-e2e-kind-logs-${{ github.run_id }}
    retention-days: 7
    path: /tmp/kind-logs/

# The debug bundle is uploaded only when an earlier step failed.
- name: Upload debugging artifacts
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: helm-e2e-debug-artifacts-${{ github.run_id }}
    retention-days: 7
    path: /tmp/debug-artifacts/
320+
321+
# Always tears down the Kind cluster and prunes Docker data; both commands are
# best effort so cleanup never fails the job.
- name: Cleanup
  if: always()
  run: |
    kind delete cluster --name "${CLUSTER_NAME}" || true
    docker system prune -f || true

scripts/validate-nvsentinel.sh

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,22 @@ set -uo pipefail
2121
NAMESPACE=""
2222
VERSION=""
2323
VERBOSE=false
24+
IMAGE_PATTERN=""
2425

2526
usage() {
26-
echo "Usage: $0 --version VERSION [--namespace NAMESPACE] [--verbose]"
27-
echo " --version Required. Expected image version (e.g., v0.0.3)"
28-
echo " --namespace Optional. Kubernetes namespace (default: nvsentinel)"
29-
echo " --verbose Optional. Print detailed image lists"
27+
echo "Usage: $0 --version VERSION [--namespace NAMESPACE] [--image-pattern PATTERN] [--verbose]"
28+
echo " --version Required. Expected image version (e.g., v0.0.3)"
29+
echo " --namespace Optional. Kubernetes namespace (default: nvsentinel)"
30+
echo " --image-pattern Optional. Image pattern to validate (default: ghcr.io/nvidia/nvsentinel)"
31+
echo " --verbose Optional. Print detailed image lists"
3032
exit 1
3133
}
3234

3335
while [[ $# -gt 0 ]]; do
3436
case $1 in
3537
--version) VERSION="$2"; shift 2 ;;
3638
--namespace) NAMESPACE="$2"; shift 2 ;;
39+
--image-pattern) IMAGE_PATTERN="$2"; shift 2 ;;
3740
--verbose) VERBOSE=true; shift ;;
3841
-h|--help) usage ;;
3942
*) echo "Unknown option: $1"; usage ;;
@@ -43,6 +46,7 @@ done
4346
# Validate required parameters
4447
[[ -z "$VERSION" ]] && { echo "Error: --version is required"; usage; }
4548
NAMESPACE="${NAMESPACE:-nvsentinel}"
49+
IMAGE_PATTERN="${IMAGE_PATTERN:-ghcr.io/nvidia/nvsentinel}"
4650

4751
ERRORS=0
4852

@@ -80,10 +84,10 @@ ok "cluster has $total_nodes total nodes ($gpu_nodes GPU nodes, $kwok_nodes KWOK
8084
echo "=== Image Versions ==="
8185
# shellcheck disable=SC2126 # wc -l is clearer than grep -c for pipeline
8286
wrong_versions=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].spec.containers[*].image}' 2>/dev/null | \
83-
tr ' ' '\n' | grep "ghcr.io/nvidia/nvsentinel" | grep -v ":$VERSION" | wc -l 2>/dev/null || echo "0")
87+
tr ' ' '\n' | grep "$IMAGE_PATTERN" | grep -v ":$VERSION" | wc -l 2>/dev/null || echo "0")
8488
wrong_versions=$(echo "$wrong_versions" | tr -d '\n' | tr -d ' ')
8589
# shellcheck disable=SC2015 # && || pattern is intentional for conditional execution
86-
[[ "$wrong_versions" -eq 0 ]] && ok "all nvsentinel images use $VERSION" || error "$wrong_versions pods use wrong image version"
90+
[[ "$wrong_versions" -eq 0 ]] && ok "all nvsentinel images use $VERSION (pattern: $IMAGE_PATTERN)" || error "$wrong_versions pods use wrong image version (pattern: $IMAGE_PATTERN)"
8791

8892
# Count images by registry across cluster
8993
all_images=$( (kubectl get deployments,daemonsets,statefulsets,jobs,cronjobs,replicasets --all-namespaces -o jsonpath='{range .items[*]}{range .spec.template.spec.containers[*]}{.image}{"\n"}{end}{range .spec.template.spec.initContainers[*]}{.image}{"\n"}{end}{end}' 2>/dev/null; kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}') | sort -u )
@@ -139,7 +143,7 @@ for secret in "${secrets[@]}"; do
139143
warn "$secret: present but wrong type ($secret_type)"
140144
fi
141145
else
142-
error "$secret not found"
146+
warn "$secret not found"
143147
fi
144148
done
145149

0 commit comments

Comments
 (0)