New e2e test for helm charts

dims · dims · commit 811948526428 · 2025-10-20T20:06:44.000-04:00
Signed-off-by: Davanum Srinivas <dsrinivas@nvidia.com>
diff --git a/.github/workflows/helm-e2e-test.yml b/.github/workflows/helm-e2e-test.yml
@@ -0,0 +1,325 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Helm Charts E2E Test
+
+# End-to-end test: deploys the Helm chart into a Kind cluster using locally built images and KWOK fake GPU nodes.
+
+on:
+  push:
+    branches:
+      - main
+      - "pull-request/[0-9]+"  # glob pattern: branches named pull-request/<digits>
+    paths:  # only pushes that touch one of these paths trigger the workflow
+      - 'distros/kubernetes/nvsentinel/**'
+      - 'tilt/release/**'
+      - '.github/workflows/helm-e2e-test.yml'
+      - 'scripts/validate-nvsentinel.sh'
+  workflow_dispatch:  # also runnable manually
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one concurrency group per workflow + ref
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # superseded runs are cancelled on every ref except main
+
+permissions:
+  contents: read
+  actions: read
+
+env:
+  KIND_VERSION: '0.30.0'  # no leading "v"; the download URL adds it
+  HELM_VERSION: 'v3.14.4'  # used verbatim in the get.helm.sh tarball name
+  KWOK_VERSION: 'v0.7.0'  # used verbatim in the kwok release URL
+  CLUSTER_NAME: 'helm-e2e-test'  # Kind cluster name (config, image load, log export, cleanup)
+  NVSENTINEL_NAMESPACE: 'nvsentinel'  # namespace the chart is installed into and validated in
+  VALIDATION_TIMEOUT_MINUTES: '10'  # with the interval below, determines the number of validation attempts
+  VALIDATION_INTERVAL_SECONDS: '30'  # sleep between validation attempts
+  FAKE_GPU_NODE_COUNT: '3'  # number of KWOK nodes created from the node template
+
+jobs:
+  prepare-environment:  # reusable workflow; its outputs (tool versions, safe_ref_name) are read by the job below
+    uses: ./.github/workflows/prepare-environment.yml
+
+  helm-e2e-test:
+    name: "Helm Charts E2E Test"
+    runs-on: linux-amd64-cpu32
+    timeout-minutes: 60  # upper bound for the whole job
+    needs: prepare-environment  # required so needs.prepare-environment.outputs.* resolve in the steps
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Workaround for freeing up more disk space  # deletes preinstalled tool directories (presumably unused by this job) and prunes Docker data
+        run: |
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force  # removes every image not used by a container
+          docker system prune -f  # removes stopped containers, unused networks, dangling images and build cache
+
+      - name: Setup build environment  # local composite action; every tool version comes from the prepare-environment job outputs
+        uses: ./.github/actions/setup-build-env
+        with:
+          go-version: ${{ needs.prepare-environment.outputs.go_version }}
+          python-version: ${{ needs.prepare-environment.outputs.python_version }}
+          poetry-version: ${{ needs.prepare-environment.outputs.poetry_version }}
+          golangci-lint-version: ${{ needs.prepare-environment.outputs.golangci_lint_version }}
+          protobuf-version: ${{ needs.prepare-environment.outputs.protobuf_version }}
+          protoc-gen-go-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_version }}
+          protoc-gen-go-grpc-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_grpc_version }}
+          shellcheck-version: ${{ needs.prepare-environment.outputs.shellcheck_version }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Cache Helm e2e testing tools  # caches the binaries that the install step below would otherwise download
+        uses: actions/cache@v4
+        with:
+          path: |
+            /usr/local/bin/kind
+            /usr/local/bin/kubectl
+            /usr/local/bin/helm
+            /usr/local/bin/kwok
+            /usr/local/bin/kwokctl
+          key: ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-${{ env.KIND_VERSION }}-${{ env.HELM_VERSION }}-${{ env.KWOK_VERSION }}  # NOTE(review): kubectl is not part of the key and is installed only when missing, so a restored kubectl is kept as-is
+          restore-keys: |
+            ${{ runner.os }}-${{ runner.arch }}-helm-e2e-tools-
+
+      - name: Install testing tools  # each tool is downloaded only when it is missing or (kind, helm, kwok) reports a different version
+        run: |
+          ARCH=$(case $(uname -m) in x86_64) echo amd64;; aarch64|arm64) echo arm64;; *) echo $(uname -m);; esac)
+
+          # Install Kind (grep -F: match the version literally, dots are not wildcards)
+          if ! command -v kind &> /dev/null || ! kind version | grep -qF "${{ env.KIND_VERSION }}"; then
+            curl -fsSL --retry 3 -o ./kind https://github.com/kubernetes-sigs/kind/releases/download/v${{ env.KIND_VERSION }}/kind-linux-${ARCH}
+            chmod +x ./kind && sudo mv ./kind /usr/local/bin/kind
+          fi
+
+          # Install kubectl (latest stable; the version lookup fails fast on HTTP errors instead of splicing an error page into the URL)
+          if ! command -v kubectl &> /dev/null; then
+            curl -fsSL --retry 3 -O "https://dl.k8s.io/release/$(curl -fsSL --retry 3 https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl"
+            chmod +x kubectl && sudo mv ./kubectl /usr/local/bin/kubectl
+          fi
+
+          # Install Helm (only the helm binary is extracted from the release tarball)
+          if ! command -v helm &> /dev/null || ! helm version --short | grep -qF "${{ env.HELM_VERSION }}"; then
+            curl -fsSL --retry 3 https://get.helm.sh/helm-${{ env.HELM_VERSION }}-linux-${ARCH}.tar.gz | tar -xzO linux-${ARCH}/helm | sudo tee /usr/local/bin/helm > /dev/null
+            sudo chmod +x /usr/local/bin/helm
+          fi
+
+          # Install KWOK (kwok and kwokctl are fetched together; only kwok's version is checked)
+          if ! command -v kwok &> /dev/null || ! kwok --version | grep -qF "${{ env.KWOK_VERSION }}"; then
+            curl -fsSL --retry 3 -o /tmp/kwok https://github.com/kubernetes-sigs/kwok/releases/download/${{ env.KWOK_VERSION }}/kwok-linux-${ARCH}
+            curl -fsSL --retry 3 -o /tmp/kwokctl https://github.com/kubernetes-sigs/kwok/releases/download/${{ env.KWOK_VERSION }}/kwokctl-linux-${ARCH}
+            chmod +x /tmp/kwok /tmp/kwokctl
+            sudo mv /tmp/kwok /tmp/kwokctl /usr/local/bin/
+          fi
+
+          # Verify installations (any missing binary fails the step here)
+          kind version
+          kubectl version --client
+          helm version --short
+          kwok --version
+
+      - name: Build container images locally  # runs "make docker-all" with the env below, then lists the resulting images
+        env:
+          NVCR_CONTAINER_REPO: 'localhost'
+          NGC_ORG: 'nvsentinel-e2e'
+          PLATFORMS: 'linux/amd64'
+          DISABLE_REGISTRY_CACHE: 'true'
+          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
+        run: |
+          make docker-all
+          docker images | grep "localhost/nvsentinel-e2e/nvsentinel-.*:${SAFE_REF_NAME}" || docker images | grep nvsentinel
+
+      - name: Create Kind cluster  # one control-plane node plus two workers, described by a generated config file
+        run: |
+          REGISTRY_MIRROR=$(jq -r '.["registry-mirrors"][0]? // empty' /etc/docker/daemon.json 2>/dev/null || echo "")  # first Docker registry mirror of the host, or empty when none is configured or readable
+
+          cat > /tmp/kind-config.yaml <<EOF
+          kind: Cluster
+          apiVersion: kind.x-k8s.io/v1alpha4
+          name: ${CLUSTER_NAME}
+          $(if [[ -n "$REGISTRY_MIRROR" ]]; then
+            echo "containerdConfigPatches:"
+            echo "- |-"
+            echo "  [plugins.\"io.containerd.grpc.v1.cri\".registry.mirrors.\"docker.io\"]"
+            echo "    endpoint = [\"${REGISTRY_MIRROR}\"]"
+          fi)
+          nodes:
+          - role: control-plane
+            kubeadmConfigPatches:
+            - |
+              kind: InitConfiguration
+              nodeRegistration:
+                kubeletExtraArgs:
+                  node-labels: "ingress-ready=true"
+            extraPortMappings:
+            - containerPort: 80
+              hostPort: 80
+              protocol: TCP
+            - containerPort: 443
+              hostPort: 443
+              protocol: TCP
+          - role: worker
+          - role: worker
+          EOF
+
+          kind create cluster --config=/tmp/kind-config.yaml  # the cluster name is taken from the "name" field written above
+          kubectl cluster-info
+          kubectl get nodes
+
+      - name: Load images into Kind cluster  # copies every locally built nvsentinel image for this ref into the cluster nodes
+        env:
+          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
+        run: |
+          mapfile -t built_images < <(docker images --format "{{.Repository}}:{{.Tag}}" | grep "localhost/nvsentinel-e2e/nvsentinel-.*:.*${SAFE_REF_NAME}")
+          if [ "${#built_images[@]}" -eq 0 ]; then echo "No images found"; exit 1; fi
+
+          for img in "${built_images[@]}"; do
+            kind load docker-image "$img" --name "${CLUSTER_NAME}"
+          done
+
+      - name: Install infrastructure dependencies  # cert-manager, kube-prometheus-stack (Prometheus only) and the KWOK controller
+        run: |
+          helm repo add jetstack https://charts.jetstack.io --force-update  # CRDs are installed through crds.enabled; installCRDs is the deprecated name of that value
+          helm upgrade --install cert-manager jetstack/cert-manager \
+            --namespace cert-manager --create-namespace \
+            --version v1.19.1 --set crds.enabled=true \
+            --wait --timeout=5m
+
+          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update  # chart version is not pinned
+          helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
+            --namespace monitoring --create-namespace \
+            --set prometheus.enabled=true \
+            --set alertmanager.enabled=false \
+            --set grafana.enabled=false \
+            --set kubeStateMetrics.enabled=false \
+            --set nodeExporter.enabled=false \
+            --wait --timeout=5m
+
+          helm repo add kwok https://raw.githubusercontent.com/kubernetes-sigs/kwok/refs/heads/main/site/static/charts/ --force-update  # chart index is read from the kwok main branch
+          helm upgrade --install kwok kwok/kwok \
+            --namespace kube-system \
+            --wait --timeout=5m
+
+      - name: Create fake GPU nodes  # renders the KWOK node template once per node and applies it
+        run: |
+          for ((idx = 1; idx <= ${{ env.FAKE_GPU_NODE_COUNT }}; idx++)); do
+            sed "s/kwok-node-PLACEHOLDER/kwok-gpu-node-${idx}/g" tilt/kwok-node-template.yaml | kubectl apply -f -
+          done
+          kubectl get nodes -l type=kwok
+
+      - name: Patch values for E2E testing  # rewrites image repositories in the chart and writes /tmp/values-patched.yaml
+        env:
+          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
+        run: |
+          # Point chart image repositories at the locally built images (ghcr.io/nvidia/nvsentinel-* -> localhost/nvsentinel-e2e/nvsentinel-*)
+          sed -i 's|ghcr\.io/nvidia/nvsentinel-|localhost/nvsentinel-e2e/nvsentinel-|g' \
+            distros/kubernetes/nvsentinel/charts/*/values.yaml \
+            distros/kubernetes/nvsentinel/values.yaml
+
+          # Copy the release values, then insert image.tag and an empty imagePullSecrets list right after the top-level "global:" line
+          cp tilt/release/values-release.yaml /tmp/values-patched.yaml
+          sed -i '/^global:/a\
+            image:\
+              tag: "'"${SAFE_REF_NAME}"'"\
+            imagePullSecrets: []' /tmp/values-patched.yaml
+
+          # Append a mongodb section: node affinity excludes nodes labelled type=kwok and control-plane nodes; the toleration matches every taint
+          cat >> /tmp/values-patched.yaml <<'EOF'
+          mongodb:
+            affinity:
+              nodeAffinity:
+                requiredDuringSchedulingIgnoredDuringExecution:
+                  nodeSelectorTerms:
+                  - matchExpressions:
+                    - key: type
+                      operator: NotIn
+                      values: ["kwok"]
+                    - key: node-role.kubernetes.io/control-plane
+                      operator: DoesNotExist
+            tolerations:
+            - operator: Exists
+          EOF
+
+      - name: Install NVSentinel via Helm  # installs without --wait; the next step retries validation until it passes or runs out of attempts
+        run: |
+          helm upgrade --install nvsentinel ./distros/kubernetes/nvsentinel \
+            --create-namespace \
+            --namespace ${{ env.NVSENTINEL_NAMESPACE }} \
+            --values /tmp/values-patched.yaml \
+            --set global.image.tag="${{ needs.prepare-environment.outputs.safe_ref_name }}" \
+            --debug
+          # NOTE(review): safe_ref_name is interpolated into the script text above; sibling steps pass it through the SAFE_REF_NAME env var instead
+          kubectl get pods -n ${{ env.NVSENTINEL_NAMESPACE }}
+
+      - name: Validate deployment with retry  # re-runs the validation script at a fixed interval until it succeeds or the attempts are used up
+        env:
+          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
+        run: |
+          chmod +x scripts/validate-nvsentinel.sh
+          total_tries=$((${{ env.VALIDATION_TIMEOUT_MINUTES }} * 60 / ${{ env.VALIDATION_INTERVAL_SECONDS }}))
+
+          for ((try = 1; try <= total_tries; try++)); do
+            if ./scripts/validate-nvsentinel.sh --version "${SAFE_REF_NAME}" --namespace "${{ env.NVSENTINEL_NAMESPACE }}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose; then
+              echo "Validation passed on attempt $try"
+              exit 0
+            fi
+            if ((try < total_tries)); then sleep ${{ env.VALIDATION_INTERVAL_SECONDS }}; fi
+          done
+
+          echo "Validation failed after $total_tries attempts"
+          kubectl get pods -n ${{ env.NVSENTINEL_NAMESPACE }} -o wide
+          kubectl get events -n ${{ env.NVSENTINEL_NAMESPACE }} --sort-by='.lastTimestamp'
+          exit 1
+
+      - name: Export Kind logs  # runs whether or not earlier steps succeeded
+        if: always()
+        run: |
+          mkdir -p /tmp/kind-logs
+          kind export logs /tmp/kind-logs --name "${CLUSTER_NAME}" || true  # best-effort: an export failure does not fail the step
+
+      - name: Collect debug artifacts  # best-effort diagnostics, gathered only when an earlier step failed
+        if: failure()
+        run: |
+          mkdir -p /tmp/debug-artifacts
+          kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
+          kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
+          kubectl get pods -n ${{ env.NVSENTINEL_NAMESPACE }} -o yaml > /tmp/debug-artifacts/nvsentinel-pods.yaml || true
+          kubectl get pods -n ${{ env.NVSENTINEL_NAMESPACE }} -o name | xargs -r -I{} kubectl logs -n ${{ env.NVSENTINEL_NAMESPACE }} {} --all-containers=true --prefix --tail=500 > /tmp/debug-artifacts/nvsentinel-logs.txt 2>&1 || true  # kubectl logs needs a pod or selector, so fetch logs pod by pod; --prefix labels each line with its source
+          ./scripts/validate-nvsentinel.sh --version "${{ needs.prepare-environment.outputs.safe_ref_name }}" --namespace "${{ env.NVSENTINEL_NAMESPACE }}" --image-pattern "localhost/nvsentinel-e2e/nvsentinel" --verbose > /tmp/debug-artifacts/validation-output.txt 2>&1 || true
+          docker images > /tmp/debug-artifacts/docker-images.txt || true
+          df -h > /tmp/debug-artifacts/disk-usage.txt || true
+
+      - name: Upload Kind logs  # uploaded on every run, pass or fail
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: helm-e2e-kind-logs-${{ github.run_id }}  # run id is part of the artifact name
+          path: /tmp/kind-logs/
+          retention-days: 7
+
+      - name: Upload debugging artifacts  # uploaded only when an earlier step failed
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: helm-e2e-debug-artifacts-${{ github.run_id }}  # run id is part of the artifact name
+          path: /tmp/debug-artifacts/
+          retention-days: 7
+
+      - name: Cleanup  # always runs; both commands are best-effort so cleanup cannot fail the job
+        if: always()
+        run: |
+          kind delete cluster --name "${CLUSTER_NAME}" || true
+          docker system prune -f || true
diff --git a/scripts/validate-nvsentinel.sh b/scripts/validate-nvsentinel.sh
@@ -21,19 +21,22 @@ set -uo pipefail
 NAMESPACE=""
 VERSION=""
 VERBOSE=false
+IMAGE_PATTERN=""
 
 usage() {
-    echo "Usage: $0 --version VERSION [--namespace NAMESPACE] [--verbose]"
-    echo "  --version   Required. Expected image version (e.g., v0.0.3)"
-    echo "  --namespace Optional. Kubernetes namespace (default: nvsentinel)"
-    echo "  --verbose   Optional. Print detailed image lists"
+    echo "Usage: $0 --version VERSION [--namespace NAMESPACE] [--image-pattern PATTERN] [--verbose]"
+    echo "  --version       Required. Expected image version (e.g., v0.0.3)"
+    echo "  --namespace     Optional. Kubernetes namespace (default: nvsentinel)"
+    echo "  --image-pattern Optional. Image pattern to validate (default: ghcr.io/nvidia/nvsentinel)"  # the default itself is assigned after argument parsing, not here
+    echo "  --verbose       Optional. Print detailed image lists"
     exit 1
 }
 
 while [[ $# -gt 0 ]]; do
     case $1 in
         --version) VERSION="$2"; shift 2 ;;
         --namespace) NAMESPACE="$2"; shift 2 ;;
+        --image-pattern) IMAGE_PATTERN="$2"; shift 2 ;;
         --verbose) VERBOSE=true; shift ;;
         -h|--help) usage ;;
         *) echo "Unknown option: $1"; usage ;;
@@ -43,6 +46,7 @@ done
 # Validate required parameters
 [[ -z "$VERSION" ]] && { echo "Error: --version is required"; usage; }
 NAMESPACE="${NAMESPACE:-nvsentinel}"
+IMAGE_PATTERN="${IMAGE_PATTERN:-ghcr.io/nvidia/nvsentinel}"  # fall back to the ghcr.io pattern when --image-pattern was not given (or was empty)
 
 ERRORS=0
 
@@ -80,10 +84,10 @@ ok "cluster has $total_nodes total nodes ($gpu_nodes GPU nodes, $kwok_nodes KWOK
 echo "=== Image Versions ==="
 # shellcheck disable=SC2126  # wc -l is clearer than grep -c for pipeline
 wrong_versions=$(kubectl get pods -n "$NAMESPACE" -o jsonpath='{.items[*].spec.containers[*].image}' 2>/dev/null | \
-    tr ' ' '\n' | grep "ghcr.io/nvidia/nvsentinel" | grep -v ":$VERSION" | wc -l 2>/dev/null || echo "0")
+    tr ' ' '\n' | grep "$IMAGE_PATTERN" | grep -v ":$VERSION" | wc -l 2>/dev/null || echo "0")  # counts container images that match IMAGE_PATTERN (used as a grep regex) but do not contain ":$VERSION"
 wrong_versions=$(echo "$wrong_versions" | tr -d '\n' | tr -d ' ')
 # shellcheck disable=SC2015  # && || pattern is intentional for conditional execution
-[[ "$wrong_versions" -eq 0 ]] && ok "all nvsentinel images use $VERSION" || error "$wrong_versions pods use wrong image version"
+[[ "$wrong_versions" -eq 0 ]] && ok "all nvsentinel images use $VERSION (pattern: $IMAGE_PATTERN)" || error "$wrong_versions pods use wrong image version (pattern: $IMAGE_PATTERN)"  # NOTE(review): the number is a count of container images, although the message says "pods"
 
 # Count images by registry across cluster
 all_images=$( (kubectl get deployments,daemonsets,statefulsets,jobs,cronjobs,replicasets --all-namespaces -o jsonpath='{range .items[*]}{range .spec.template.spec.containers[*]}{.image}{"\n"}{end}{range .spec.template.spec.initContainers[*]}{.image}{"\n"}{end}{end}' 2>/dev/null; kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}') | sort -u )
@@ -139,7 +143,7 @@ for secret in "${secrets[@]}"; do
             warn "$secret: present but wrong type ($secret_type)"
         fi
     else
-        error "$secret not found"
+        warn "$secret not found"
     fi
 done