make the GH actions and Dockerfile architecture agnostic #101
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: E2E Tests

# End-to-end test workflow. The suite is executed on AMD64 and ARM64 runners
# in parallel so that compatibility across hardware platforms is verified.
#
# Configuration:
#   - RUNNER_ARCH_LARGE_AMD64 (variable): overrides the default AMD64 runner
#   - RUNNER_ARCH_LARGE_ARM64 (variable): overrides the default ARM64 runner
#   - Every architecture gets its own isolated cluster and test artifacts

on:
  push:
    branches:
      - main
      - 'pull-request/[0-9]+'
  workflow_dispatch:

# One run per workflow/ref; superseded runs are cancelled everywhere except
# on the main branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
  contents: read  # Required for checking out code
  actions: read  # Required for artifact operations

env:
  # E2E-specific tool versions (not duplicated elsewhere). Quoted so they are
  # always treated as strings.
  KIND_VERSION: "0.30.0"
  CTLPTL_VERSION: "0.8.43"
  TILT_VERSION: "0.35.2"
jobs:
  # Reusable workflow; its outputs (tool versions, registry/ref settings) are
  # read below through `needs.prepare-environment.outputs.*`.
  prepare-environment:
    uses: ./.github/workflows/prepare-environment.yml

  # Runs the E2E suite once per architecture. Each matrix leg creates its own
  # cluster (distinguished by CLUSTER_NAME_SUFFIX) and uploads its own artifacts.
  e2e-test:
    # Run E2E tests on both AMD64 and ARM64 architectures in parallel
    strategy:
      fail-fast: false  # Allow both architectures to complete even if one fails
      matrix:
        include:
          - arch: amd64
            runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
            arch_name: "AMD64"
          - arch: arm64
            runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
            arch_name: "ARM64"
    name: "E2E Tests (${{ matrix.arch_name }})"
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 90
    needs: prepare-environment
    steps:
      - uses: actions/checkout@v4

      - name: Workaround for freeing up more disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force
          # Additional Docker cleanup as recommended by Kind
          docker system prune -f

      - name: Setup build environment
        uses: ./.github/actions/setup-build-env
        with:
          go-version: ${{ needs.prepare-environment.outputs.go_version }}
          python-version: ${{ needs.prepare-environment.outputs.python_version }}
          poetry-version: ${{ needs.prepare-environment.outputs.poetry_version }}
          golangci-lint-version: ${{ needs.prepare-environment.outputs.golangci_lint_version }}
          protobuf-version: ${{ needs.prepare-environment.outputs.protobuf_version }}
          protoc-gen-go-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_version }}
          protoc-gen-go-grpc-version: ${{ needs.prepare-environment.outputs.protoc_gen_go_grpc_version }}
          shellcheck-version: ${{ needs.prepare-environment.outputs.shellcheck_version }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Prep system for multi-node Kind cluster
        run: |
          # System configuration for Kind multi-node setup.
          # These three are best-effort: a failure is logged and ignored.
          sudo systemctl stop apparmor || echo "unable to stop apparmor"
          sudo systemctl disable apparmor || echo "unable to disable apparmor"
          sudo modprobe br_netfilter || echo "unable to run modprobe"
          # Network configuration
          sudo sysctl -w net.ipv6.conf.all.forwarding=1
          sudo sysctl -w net.ipv4.ip_forward=1
          sudo sysctl -w net.bridge.bridge-nf-call-ip6tables=1
          sudo sysctl -w net.bridge.bridge-nf-call-iptables=1
          # File system limits for Kind
          sudo sysctl -w fs.inotify.max_user_watches=524288
          sudo sysctl -w fs.inotify.max_user_instances=1024
          # IPTables cleanup and configuration: flush and delete the filter,
          # nat and mangle chains, then set the default policies to ACCEPT.
          # Kept as a single && chain so the commands and their short-circuit
          # behaviour are unchanged.
          sudo iptables -F \
            && sudo iptables -X \
            && sudo iptables -t nat -F \
            && sudo iptables -t nat -X \
            && sudo iptables -t mangle -F \
            && sudo iptables -t mangle -X \
            && sudo iptables -P INPUT ACCEPT \
            && sudo iptables -P FORWARD ACCEPT -w 5 \
            && sudo iptables -P OUTPUT ACCEPT -w 5
          sudo systemctl restart docker

      # The key is scoped by OS and architecture so AMD64 and ARM64 binaries
      # never share a cache entry.
      # NOTE(review): restoring into /usr/local/bin presumably needs the runner
      # user to have write access there - confirm on these runners.
      - name: Cache E2E testing tools
        uses: actions/cache@v4
        with:
          path: |
            /usr/local/bin/ctlptl
            /usr/local/bin/kind
            /usr/local/bin/kubectl
            /usr/local/bin/tilt
          key: ${{ runner.os }}-${{ runner.arch }}-e2e-tools-${{ env.KIND_VERSION }}-${{ env.CTLPTL_VERSION }}-${{ env.TILT_VERSION }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-e2e-tools-

      # Purely informational: prints what the matrix selected and what the
      # runner reports, to help debug architecture mismatches.
      - name: Detect runner architecture
        run: |
          echo "Matrix configuration: ${{ matrix.arch_name }} (${{ matrix.arch }})"
          echo "Runner: ${{ matrix.runner }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner architecture: ${{ runner.arch }}"
          echo "System architecture (uname -m): $(uname -m)"
          echo "Cache key will be: ${{ runner.os }}-${{ runner.arch }}-e2e-tools-${{ env.KIND_VERSION }}-${{ env.CTLPTL_VERSION }}-${{ env.TILT_VERSION }}"

      - name: Install E2E testing tools
        run: |
          # KIND_VERSION, CTLPTL_VERSION and TILT_VERSION are provided by the
          # workflow-level `env` block.

          # Map `uname -m` once onto the two naming schemes used below:
          #   GO_ARCH      - kind / kubectl downloads (amd64, arm64)
          #   RELEASE_ARCH - ctlptl / Tilt release archives (x86_64, arm64)
          # The tilt-dev archives name ARM64 "arm64", not "aarch64".
          case "$(uname -m)" in
            x86_64)
              GO_ARCH=amd64
              RELEASE_ARCH=x86_64
              ;;
            aarch64|arm64)
              GO_ARCH=arm64
              RELEASE_ARCH=arm64
              ;;
            *)
              GO_ARCH="$(uname -m)"
              RELEASE_ARCH="$(uname -m)"
              ;;
          esac

          # Install ctlptl (if not cached)
          # grep -F: match the version literally ('.' is not a wildcard).
          if command -v ctlptl &> /dev/null && ctlptl version | grep -qF "v${CTLPTL_VERSION}"; then
            echo "ctlptl v${CTLPTL_VERSION} already installed from cache"
          else
            echo "Installing ctlptl v${CTLPTL_VERSION}..."
            curl -fsSL "https://github.com/tilt-dev/ctlptl/releases/download/v${CTLPTL_VERSION}/ctlptl.${CTLPTL_VERSION}.linux.${RELEASE_ARCH}.tar.gz" \
              | sudo tar -xzv -C /usr/local/bin ctlptl
          fi

          # Install Kind (if not cached)
          if command -v kind &> /dev/null && kind version | grep -qF "${KIND_VERSION}"; then
            echo "Kind v${KIND_VERSION} already installed from cache"
          else
            echo "Installing Kind v${KIND_VERSION}..."
            # -f: fail on HTTP errors instead of saving the error page as the binary.
            curl -fsSLo ./kind "https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-${GO_ARCH}"
            chmod +x ./kind
            sudo mv ./kind /usr/local/bin/kind
          fi

          # Install kubectl (if not cached)
          # kubectl follows the upstream "stable" channel and is not part of the
          # cache key, so any cached binary is reused regardless of its version.
          if command -v kubectl &> /dev/null; then
            echo "kubectl already installed from cache"
          else
            echo "Installing kubectl..."
            KUBECTL_STABLE="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
            curl -fsSLO "https://dl.k8s.io/release/${KUBECTL_STABLE}/bin/linux/${GO_ARCH}/kubectl"
            chmod +x kubectl
            sudo mv ./kubectl /usr/local/bin/kubectl
          fi

          # Install Tilt (if not cached)
          if command -v tilt &> /dev/null && tilt version | grep -qF "${TILT_VERSION}"; then
            echo "Tilt v${TILT_VERSION} already installed from cache"
          else
            echo "Installing Tilt v${TILT_VERSION}..."
            TEMP_DIR=$(mktemp -d)
            curl -fsSL "https://github.com/tilt-dev/tilt/releases/download/v${TILT_VERSION}/tilt.${TILT_VERSION}.linux.${RELEASE_ARCH}.tar.gz" \
              | tar -xzv -C "$TEMP_DIR" tilt
            sudo mv "$TEMP_DIR/tilt" /usr/local/bin/
            rm -rf "$TEMP_DIR"
          fi

          # Verify installations
          echo "Verifying tool installations:"
          echo "ctlptl: $(ctlptl version)"
          echo "Kind: $(kind version)"
          # `--short` was removed from `kubectl version` in v1.28; plain
          # `--client` output is already the short form.
          echo "kubectl: $(kubectl version --client)"
          echo "Tilt: $(tilt version)"
          echo "Docker: $(docker version --format '{{.Client.Version}}')"

      - name: Configure Helm repositories
        run: |
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
          helm repo add bitnami https://charts.bitnami.com/bitnami
          helm repo add jetstack https://charts.jetstack.io
          helm repo update

      - name: Configure ctlptl registry authentication
        run: |
          chmod +x scripts/configure-ctlptl-registry.sh
          ./scripts/configure-ctlptl-registry.sh

      - name: Create cluster for E2E tests
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
          NVCR_CONTAINER_REPO: ${{ needs.prepare-environment.outputs.nvcr_container_repo }}
          NGC_ORG: ${{ needs.prepare-environment.outputs.container_org }}
          CTLPTL_YAML: ctlptl-config.yaml
          # Make cluster names unique per architecture to avoid conflicts in parallel runs
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make cluster-create

      - name: Run E2E tests
        env:
          SAFE_REF_NAME: ${{ needs.prepare-environment.outputs.safe_ref_name }}
          NVCR_CONTAINER_REPO: ${{ needs.prepare-environment.outputs.nvcr_container_repo }}
          NGC_ORG: ${{ needs.prepare-environment.outputs.container_org }}
          CTLPTL_YAML: ctlptl-config.yaml
          # Use same cluster name suffix for consistency
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make e2e-test-ci

      # A step without `if:` is skipped once an earlier step has failed, which
      # would drop the results and logs of failing test runs. Upload unless the
      # run was cancelled.
      - name: Upload test results
        if: ${{ !cancelled() }}
        uses: ./.github/actions/upload-test-artifacts
        with:
          component-name: e2e-test-${{ matrix.arch }}
          file-paths: |
            tests/results/
            tests/*.log
          retention-days: 14

      # Always runs. Every command is best-effort (`|| true`) so cleanup never
      # changes the job result.
      - name: Cleanup Docker resources
        if: always()
        run: |
          # Remove all containers (running and stopped)
          docker rm -f $(docker ps -a -q) || true
          # Remove all images
          docker rmi -f $(docker images -q -a) || true
          # Remove all volumes
          docker volume prune -f || true
          # Remove all networks (except default ones)
          docker network prune -f || true
          # Clean up build cache
          docker builder prune -f || true