# Captured from the GitHub Actions web UI ("Workflow file for this run").
# Run: Update docs/designs/012-health-events-exporter.md #1395

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: E2E Tests
# This workflow runs end-to-end tests on both AMD64 and ARM64 architectures in parallel
# to ensure compatibility across different hardware platforms.
#
# Configuration:
# - Set the RUNNER_ARCH_LARGE_AMD64 variable (read through the `vars` context) to
#   override the default AMD64 runner label, linux-amd64-cpu32
# - Set the RUNNER_ARCH_LARGE_ARM64 variable (read through the `vars` context) to
#   override the default ARM64 runner label, linux-arm64-cpu32
# - Each architecture gets its own isolated cluster and test artifacts
# Triggers: pushes to main or to branches named pull-request/<number>, plus
# manual runs started from the Actions UI or API.
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
  workflow_dispatch:
# One concurrency group per workflow and ref. A newer run cancels an
# in-progress run in the same group on every ref except main.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
# Token scopes granted to this workflow; both are read-only.
permissions:
  contents: read  # Required for checking out code
  actions: read  # Required for artifact operations
jobs:
  # Runs the end-to-end suite once per matrix entry (amd64 and arm64), each on
  # its own runner with its own cluster name suffix and artifact names.
  e2e-test:
    strategy:
      fail-fast: false  # Allow both architectures to complete even if one fails
      matrix:
        include:
          - arch: amd64
            runner: ${{ vars.RUNNER_ARCH_LARGE_AMD64 || 'linux-amd64-cpu32' }}
            arch_name: "AMD64"
          - arch: arm64
            runner: ${{ vars.RUNNER_ARCH_LARGE_ARM64 || 'linux-arm64-cpu32' }}
            arch_name: "ARM64"
    name: "E2E Tests (${{ matrix.arch_name }})"
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 90
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0

      - name: Workaround for freeing up more disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force
          # Additional Docker cleanup as recommended by Kind
          docker system prune -f

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435  # v3.11.1

      - name: Prep system for multi-node Kind cluster
        run: |
          # System configuration for Kind multi-node setup
          sudo systemctl stop apparmor || echo "unable to stop apparmor"
          sudo systemctl disable apparmor || echo "unable to disable apparmor"
          sudo modprobe br_netfilter || echo "unable to run modprobe"
          # Network configuration
          sudo sysctl -w net.ipv6.conf.all.forwarding=1
          sudo sysctl -w net.ipv4.ip_forward=1
          sudo sysctl -w net.bridge.bridge-nf-call-ip6tables=1
          sudo sysctl -w net.bridge.bridge-nf-call-iptables=1
          # File system limits for Kind
          sudo sysctl -w fs.inotify.max_user_watches=524288
          sudo sysctl -w fs.inotify.max_user_instances=1024
          # IPTables cleanup and configuration. Kept as a single && chain so
          # the first failing command short-circuits the remaining ones.
          sudo iptables -F &&
            sudo iptables -X &&
            sudo iptables -t nat -F &&
            sudo iptables -t nat -X &&
            sudo iptables -t mangle -F &&
            sudo iptables -t mangle -X &&
            sudo iptables -P INPUT ACCEPT &&
            sudo iptables -P FORWARD ACCEPT -w 5 &&
            sudo iptables -P OUTPUT ACCEPT -w 5
          sudo systemctl restart docker

      - name: Install E2E testing tools
        uses: ./.github/actions/install-e2e-tools

      - name: Configure Helm repositories
        run: |
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
          helm repo add bitnami https://charts.bitnami.com/bitnami
          helm repo add jetstack https://charts.jetstack.io
          helm repo update

      - name: Configure ctlptl registry authentication
        run: |
          ./scripts/configure-ctlptl-registry.sh

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # Passed through the environment instead of being expanded into the
          # script text, so an unusual ref name cannot be run as shell code.
          REF_NAME: ${{ github.ref_name }}
          COMMIT_SHA: ${{ github.sha }}
        run: |
          SHORT_SHA=$(printf '%s' "$COMMIT_SHA" | cut -c1-7)
          SAFE_REF="${REF_NAME}-${SHORT_SHA}"
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF=$(printf '%s' "$SAFE_REF" | sed 's/\//-/g')
          echo "value=$SAFE_REF" >> "$GITHUB_OUTPUT"

      - name: Create cluster for E2E tests
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
          CTLPTL_YAML: .ctlptl.yaml
          # Make cluster names unique per architecture to avoid conflicts in parallel runs
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make cluster-create

      - name: Run E2E tests
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
          CTLPTL_YAML: .ctlptl.yaml
          # Use same cluster name suffix for consistency
          CLUSTER_NAME_SUFFIX: "-${{ matrix.arch }}"
        run: |
          make e2e-test-ci

      - name: Upload test results
        # Without this condition the step is skipped whenever an earlier step
        # (including the tests themselves) has failed.
        if: always()
        uses: ./.github/actions/upload-test-artifacts
        with:
          component-name: e2e-test-${{ matrix.arch }}
          file-paths: |
            tests/results/
            tests/*.log
          retention-days: 14

      - name: Export Kind logs
        if: always()
        run: |
          mkdir -p /tmp/kind-logs
          # Only the first cluster listed by kind is exported.
          CLUSTER_NAME=$(kind get clusters | head -n1)
          if [ -n "$CLUSTER_NAME" ]; then
            kind export logs /tmp/kind-logs --name "$CLUSTER_NAME" || true
          fi

      - name: Collect debug artifacts
        if: failure()
        run: |
          # Best-effort collection: every command tolerates failure so that a
          # broken cluster does not hide the remaining diagnostics.
          mkdir -p /tmp/debug-artifacts
          kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
          kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
          kubectl get pods --all-namespaces -o yaml > /tmp/debug-artifacts/all-pods.yaml || true
          # `kubectl logs` needs a pod (or selector) and has no
          # --all-namespaces flag, so walk every pod explicitly.
          kubectl get pods --all-namespaces --no-headers \
            -o custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name |
            while read -r namespace pod; do
              echo "===== ${namespace}/${pod} ====="
              kubectl logs --namespace "$namespace" "$pod" \
                --all-containers=true --prefix --tail=500 2>&1 || true
            done > /tmp/debug-artifacts/all-logs.txt || true
          docker images > /tmp/debug-artifacts/docker-images.txt || true
          df -h > /tmp/debug-artifacts/disk-usage.txt || true

      - name: Upload Kind logs
        if: always()
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
        with:
          name: e2e-kind-logs-${{ matrix.arch }}-${{ github.run_id }}
          path: /tmp/kind-logs/
          retention-days: 7

      - name: Upload debug artifacts
        if: failure()
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
        with:
          name: e2e-debug-artifacts-${{ matrix.arch }}-${{ github.run_id }}
          path: /tmp/debug-artifacts/
          retention-days: 7

      - name: Cleanup Docker resources
        if: always()
        run: |
          # Each command is best-effort; an empty container or image list
          # makes docker exit non-zero, which is deliberately ignored.
          docker rm -f $(docker ps -a -q) || true
          docker rmi -f $(docker images -q -a) || true
          docker volume prune -f || true
          docker network prune -f || true
          docker builder prune -f || true