Update docs/designs/012-health-events-exporter.md #1397
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
name: E2E Tests

# End-to-end tests, executed in parallel on AMD64 and ARM64 so that problems
# specific to one hardware platform are caught.
#
# Configuration:
#   - RUNNER_ARCH_LARGE_AMD64: optional variable replacing the default AMD64 runner
#   - RUNNER_ARCH_LARGE_ARM64: optional variable replacing the default ARM64 runner
#   - every architecture works with its own isolated cluster and test artifacts

on:
  workflow_dispatch:
  push:
    branches:
      - main
      - 'pull-request/[0-9]+'
# Runs are grouped per workflow and ref. A newer run cancels an in-progress
# run of the same group on every ref except main.
concurrency:
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  group: ${{ github.workflow }}-${{ github.ref }}

# Read-only token scopes.
permissions:
  actions: read  # Required for artifact operations
  contents: read  # Required for checking out code
jobs:
  # Runs the E2E suite once per matrix entry (one entry per CPU architecture).
  e2e-test:
    name: "E2E Tests (${{ matrix.arch_name }})"
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 90
    strategy:
      fail-fast: false  # Allow both architectures to complete even if one fails
      matrix:
        include:
          - arch: amd64
            runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
            arch_name: "AMD64"
          - arch: arm64
            runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
            arch_name: "ARM64"
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Workaround for freeing up more disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force
          # Additional Docker cleanup as recommended by Kind
          docker system prune -f

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1

      - name: Prep system for multi-node Kind cluster
        run: |
          # System configuration for Kind multi-node setup.
          # These three are best-effort: a failure is reported but does not stop the step.
          sudo systemctl stop apparmor || echo "unable to stop apparmor"
          sudo systemctl disable apparmor || echo "unable to disable apparmor"
          sudo modprobe br_netfilter || echo "unable to run modprobe"
          # Network configuration
          sudo sysctl -w net.ipv6.conf.all.forwarding=1
          sudo sysctl -w net.ipv4.ip_forward=1
          sudo sysctl -w net.bridge.bridge-nf-call-ip6tables=1
          sudo sysctl -w net.bridge.bridge-nf-call-iptables=1
          # File system limits for Kind
          sudo sysctl -w fs.inotify.max_user_watches=524288
          sudo sysctl -w fs.inotify.max_user_instances=1024
          # IPTables cleanup and configuration: flush and delete chains in the
          # filter, nat and mangle tables, then set the default policies to ACCEPT.
          # The chain stops at the first command that fails.
          sudo iptables -F \
            && sudo iptables -X \
            && sudo iptables -t nat -F \
            && sudo iptables -t nat -X \
            && sudo iptables -t mangle -F \
            && sudo iptables -t mangle -X \
            && sudo iptables -P INPUT ACCEPT \
            && sudo iptables -P FORWARD ACCEPT -w 5 \
            && sudo iptables -P OUTPUT ACCEPT -w 5
          sudo systemctl restart docker

      - name: Install E2E testing tools
        uses: ./.github/actions/install-e2e-tools

      - name: Configure Helm repositories
        run: |
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
          helm repo add bitnami https://charts.bitnami.com/bitnami
          helm repo add jetstack https://charts.jetstack.io
          helm repo update

      - name: Configure ctlptl registry authentication
        run: |
          ./scripts/configure-ctlptl-registry.sh

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # GitHub context values are handed over as environment variables rather
          # than being expanded into the script text, so a crafted ref name is
          # treated as data and cannot inject shell commands.
          COMMIT_SHA: ${{ github.sha }}
          REF_NAME: ${{ github.ref_name }}
        run: |
          # First seven characters of the commit SHA
          SHORT_SHA="${COMMIT_SHA:0:7}"
          SAFE_REF="${REF_NAME}-${SHORT_SHA}"
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "$GITHUB_OUTPUT"

      - name: Create cluster for E2E tests
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
          CTLPTL_YAML: .ctlptl.yaml
          # Make cluster names unique per architecture to avoid conflicts in parallel runs
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make cluster-create

      - name: Run E2E tests
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
          CTLPTL_YAML: .ctlptl.yaml
          # Use same cluster name suffix for consistency
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make e2e-test-ci

      - name: Upload test results
        # Without a condition this step is skipped as soon as any earlier step
        # fails, including the E2E run itself, which is when the results matter.
        if: always()
        uses: ./.github/actions/upload-test-artifacts
        with:
          component-name: e2e-test-${{ matrix.arch }}
          file-paths: |
            tests/results/
            tests/*.log
          retention-days: 14

      - name: Export Kind logs
        if: always()
        run: |
          mkdir -p /tmp/kind-logs
          # Only the first cluster reported by kind is exported.
          # NOTE(review): assumes the runner hosts a single Kind cluster - confirm.
          CLUSTER_NAME=$(kind get clusters | head -n1)
          if [ -n "$CLUSTER_NAME" ]; then
            kind export logs /tmp/kind-logs --name "$CLUSTER_NAME" || true
          fi

      - name: Collect debug artifacts
        if: failure()
        run: |
          # Every command is best-effort (`|| true`) so one failing collector
          # does not prevent the remaining ones from running.
          mkdir -p /tmp/debug-artifacts
          kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
          kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
          kubectl get pods --all-namespaces -o yaml > /tmp/debug-artifacts/all-pods.yaml || true
          # `kubectl logs` needs a pod (or selector) and has no all-namespaces flag,
          # so list every pod and fetch its logs one by one.
          {
            kubectl get pods --all-namespaces --no-headers \
              -o custom-columns='NS:.metadata.namespace,NAME:.metadata.name' |
              while read -r ns pod; do
                echo "===== ${ns}/${pod} ====="
                # stdin is detached so kubectl cannot consume the pod list being read
                kubectl logs --namespace "$ns" "$pod" --all-containers=true --prefix --tail=500 < /dev/null 2>&1 || true
              done
          } > /tmp/debug-artifacts/all-logs.txt || true
          docker images > /tmp/debug-artifacts/docker-images.txt || true
          df -h > /tmp/debug-artifacts/disk-usage.txt || true

      - name: Upload Kind logs
        if: always()
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: e2e-kind-logs-${{ matrix.arch }}-${{ github.run_id }}
          path: /tmp/kind-logs/
          retention-days: 7

      - name: Upload debug artifacts
        if: failure()
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: e2e-debug-artifacts-${{ matrix.arch }}-${{ github.run_id }}
          path: /tmp/debug-artifacts/
          retention-days: 7

      - name: Cleanup Docker resources
        if: always()
        run: |
          # Best-effort teardown: `|| true` also absorbs the error docker reports
          # when there are no containers or images to remove.
          docker rm -f $(docker ps -a -q) || true
          docker rmi -f $(docker images -q -a) || true
          docker volume prune -f || true
          docker network prune -f || true
          docker builder prune -f || true