feat: Design doc for workflow implementation for XID 13 and 31 (#345) #220
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Publishes the project's container images and then runs an end-to-end test
# against the published images in a Kind cluster.
#
# Triggers:
#   - push of a `v*` tag
#   - push to `main`
#   - manual dispatch with an existing tag name
#
# Image tag convention (computed by the "Compute ref name" step in each job):
#   - workflow_dispatch -> the `tag` input
#   - tag push          -> the tag name
#   - branch push       -> `<branch>-<first 7 chars of the commit SHA>`
#   Slashes are replaced with hyphens in every case.
name: Publish Containers

on:
  push:
    tags:
      - 'v*'
    branches:
      - main
  workflow_dispatch:
    inputs:
      tag:
        description: 'Existing tag to build and publish (e.g., v1.2.3)'
        required: true
        type: string

# One concurrency group per workflow + ref; a publish that is already running
# is never cancelled by a newer run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: false

permissions:
  contents: read  # Required for checking out code
  packages: write  # Required for publishing containers to ghcr.io
  actions: read  # Required for artifact operations
  security-events: write
  id-token: write
  attestations: write

jobs:
  # Runs scripts/build-image-list.sh and uploads the resulting versions.txt.
  build-image-list:
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    outputs:
      # NOTE(review): presumably written to GITHUB_OUTPUT by
      # build-image-list.sh; the script is not visible here - confirm.
      versions: ${{ steps.build-list.outputs.versions }}
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag || github.ref }}

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # Passed through the environment rather than expanded inline so the
          # user-supplied value is never parsed as shell source.
          INPUT_TAG: ${{ github.event.inputs.tag }}
        run: |
          # GITHUB_EVENT_NAME, GITHUB_REF_TYPE, GITHUB_REF_NAME and GITHUB_SHA
          # are default variables provided by the runner.
          if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
            SAFE_REF="${INPUT_TAG}"
          elif [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
            SAFE_REF="${GITHUB_REF_NAME}"
          else
            SAFE_REF="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
          fi
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "${GITHUB_OUTPUT}"

      - name: Build image list
        id: build-list
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
        run: |
          chmod +x ./scripts/build-image-list.sh
          ./scripts/build-image-list.sh
          cat versions.txt

      - name: Upload versions.txt
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: versions
          path: versions.txt
          retention-days: 90

  # Builds and publishes the Docker-based images, one matrix entry per image,
  # by handing each entry's make target to the local publish-container action.
  build-images-docker:
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    permissions:
      contents: read
      packages: write
      id-token: write
      attestations: write
    strategy:
      matrix:
        # Entries without `tag_suffix` pass an empty value to the action.
        include:
          - component: gpu-health-monitor-dcgm3
            make_command: 'make -C health-monitors/gpu-health-monitor docker-publish-dcgm3'
            container_name: 'nvsentinel/gpu-health-monitor'
            tag_suffix: '-dcgm-3.x'
          - component: gpu-health-monitor-dcgm4
            make_command: 'make -C health-monitors/gpu-health-monitor docker-publish-dcgm4'
            container_name: 'nvsentinel/gpu-health-monitor'
            tag_suffix: '-dcgm-4.x'
          - component: syslog-health-monitor
            make_command: 'make -C health-monitors/syslog-health-monitor docker-publish'
            container_name: 'nvsentinel/syslog-health-monitor'
          - component: kubernetes-object-monitor
            make_command: 'make -C health-monitors/kubernetes-object-monitor docker-publish'
            container_name: 'nvsentinel/kubernetes-object-monitor'
          - component: metadata-collector
            make_command: 'make -C metadata-collector docker-publish'
            container_name: 'nvsentinel/metadata-collector'
          - component: log-collector
            make_command: 'make -C log-collector docker-publish-log-collector'
            container_name: 'nvsentinel/log-collector'
          - component: file-server-cleanup
            make_command: 'make -C log-collector docker-publish-file-server-cleanup'
            container_name: 'nvsentinel/file-server-cleanup'
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag || github.ref }}

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # Passed through the environment rather than expanded inline so the
          # user-supplied value is never parsed as shell source.
          INPUT_TAG: ${{ github.event.inputs.tag }}
        run: |
          # GITHUB_EVENT_NAME, GITHUB_REF_TYPE, GITHUB_REF_NAME and GITHUB_SHA
          # are default variables provided by the runner.
          if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
            SAFE_REF="${INPUT_TAG}"
          elif [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
            SAFE_REF="${GITHUB_REF_NAME}"
          else
            SAFE_REF="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
          fi
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "${GITHUB_OUTPUT}"

      - name: Publish container for ${{ matrix.component }}
        uses: ./.github/actions/publish-container
        env:
          CI_COMMIT_REF_NAME: ${{ steps.ref-name.outputs.value }}
        with:
          make_command: ${{ matrix.make_command }}
          registry_password: ${{ secrets.GITHUB_TOKEN }}
          container_name: ${{ matrix.container_name }}
          tag_suffix: ${{ matrix.tag_suffix }}

  # Build images using ko and attest provenance
  build-images-ko:
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    permissions:
      contents: read
      packages: write
      id-token: write
      attestations: write
    env:
      KO_DOCKER_REPO: ghcr.io/${{ github.repository }}
      # NOTE(review): on workflow_dispatch, github.sha is the commit of the ref
      # the run was dispatched from, which may differ from the checked-out
      # tag - confirm whether buildko.sh should derive it from the checkout.
      GIT_COMMIT: ${{ github.sha }}
      PLATFORMS: linux/amd64,linux/arm64
    outputs:
      # Consumed by attest-and-sbom-ko through fromJson(), so this must be a
      # JSON array of objects carrying `name` and `digest`.
      images: ${{ steps.ko-build.outputs.images }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag || github.ref }}

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # Passed through the environment rather than expanded inline so the
          # user-supplied value is never parsed as shell source.
          INPUT_TAG: ${{ github.event.inputs.tag }}
        run: |
          # GITHUB_EVENT_NAME, GITHUB_REF_TYPE, GITHUB_REF_NAME and GITHUB_SHA
          # are default variables provided by the runner.
          if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
            SAFE_REF="${INPUT_TAG}"
          elif [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
            SAFE_REF="${GITHUB_REF_NAME}"
          else
            SAFE_REF="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
          fi
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "${GITHUB_OUTPUT}"

      # ko is already installed in the setup-ci-env action
      - name: Authenticate to GHCR
        shell: bash
        env:
          # The token and user name reach the shell as environment variables,
          # so they are never expanded into the command line.
          REGISTRY_USER: ${{ github.actor }}
          REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
        run: echo "${REGISTRY_PASSWORD}" | ko login ghcr.io -u "${REGISTRY_USER}" --password-stdin

      - name: Build Using Ko
        id: ko-build
        env:
          VERSION: ${{ steps.ref-name.outputs.value }}
        run: scripts/buildko.sh

  # Runs the local sbom-and-attest action once per image reported by
  # build-images-ko.
  attest-and-sbom-ko:
    needs: build-images-ko
    runs-on: linux-amd64-cpu32
    # Same bound as every other job in this workflow.
    timeout-minutes: 60
    permissions:
      contents: read
      packages: write
      id-token: write
      attestations: write
    strategy:
      matrix:
        image: ${{ fromJson(needs.build-images-ko.outputs.images) }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag || github.ref }}

      - name: Generate SBOM and Attest
        uses: ./.github/actions/sbom-and-attest
        with:
          image_name: ${{ matrix.image.name }}
          image_digest: ${{ matrix.image.digest }}
          registry_password: ${{ secrets.GITHUB_TOKEN }}

  # Creates a Kind cluster, installs the just-published version through the
  # UAT install script, validates it, and always exports logs and cleans up.
  e2e-test:
    name: "E2E Test Published Images"
    runs-on: linux-amd64-cpu32
    timeout-minutes: 60
    needs:
      - build-images-docker
      - build-images-ko
      - attest-and-sbom-ko
    env:
      CLUSTER_NAME: 'nvsentinel-uat'
      FAKE_GPU_NODE_COUNT: '10'
    steps:
      - name: Checkout Code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.tag || github.ref }}

      - name: Workaround for freeing up more disk space
        run: |
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo docker image prune --all --force
          sudo docker system prune -f

      - name: Setup build environment
        uses: ./.github/actions/setup-ci-env

      - name: Install E2E testing tools
        uses: ./.github/actions/install-e2e-tools

      - name: Configure ctlptl registry authentication
        run: |
          ./scripts/configure-ctlptl-registry.sh

      - name: Compute ref name with short SHA
        id: ref-name
        env:
          # Passed through the environment rather than expanded inline so the
          # user-supplied value is never parsed as shell source.
          INPUT_TAG: ${{ github.event.inputs.tag }}
        run: |
          # GITHUB_EVENT_NAME, GITHUB_REF_TYPE, GITHUB_REF_NAME and GITHUB_SHA
          # are default variables provided by the runner.
          if [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
            SAFE_REF="${INPUT_TAG}"
          elif [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
            SAFE_REF="${GITHUB_REF_NAME}"
          else
            SAFE_REF="${GITHUB_REF_NAME}-${GITHUB_SHA:0:7}"
          fi
          # Sanitize ref name: replace slashes with hyphens for Docker tag compatibility
          SAFE_REF="${SAFE_REF//\//-}"
          echo "value=${SAFE_REF}" >> "${GITHUB_OUTPUT}"

      - name: Prep system for multi-node Kind cluster
        run: |
          # System configuration for Kind multi-node setup
          sudo systemctl stop apparmor || echo "unable to stop apparmor"
          sudo systemctl disable apparmor || echo "unable to disable apparmor"
          sudo modprobe br_netfilter || echo "unable to run modprobe"
          # Network configuration
          sudo sysctl -w net.ipv6.conf.all.forwarding=1
          sudo sysctl -w net.ipv4.ip_forward=1
          sudo sysctl -w net.bridge.bridge-nf-call-ip6tables=1
          sudo sysctl -w net.bridge.bridge-nf-call-iptables=1
          # File system limits for Kind
          sudo sysctl -w fs.inotify.max_user_watches=524288
          sudo sysctl -w fs.inotify.max_user_instances=1024
          # IPTables cleanup and configuration (a single && chain, split
          # across lines only for readability)
          sudo iptables -F \
            && sudo iptables -X \
            && sudo iptables -t nat -F \
            && sudo iptables -t nat -X \
            && sudo iptables -t mangle -F \
            && sudo iptables -t mangle -X \
            && sudo iptables -P INPUT ACCEPT \
            && sudo iptables -P FORWARD ACCEPT -w 5 \
            && sudo iptables -P OUTPUT ACCEPT -w 5
          sudo systemctl restart docker

      - name: Create Kind cluster using make
        env:
          CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
          CTLPTL_YAML: .ctlptl.yaml
        run: |
          make cluster-create

      - name: Install NVSentinel using UAT script
        env:
          NVSENTINEL_VERSION: ${{ steps.ref-name.outputs.value }}
          CSP: kind
          FAKE_GPU_NODE_COUNT: ${{ env.FAKE_GPU_NODE_COUNT }}
          CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
        run: |
          cd tests/uat
          ./install-apps.sh

      - name: Validate NVSentinel deployment
        env:
          VERSION: ${{ steps.ref-name.outputs.value }}
        run: |
          ./scripts/validate-nvsentinel.sh --version "${VERSION}"

      - name: Export Kind logs
        if: always()
        run: |
          mkdir -p /tmp/kind-logs
          kind export logs /tmp/kind-logs --name "${CLUSTER_NAME}" || true

      # Best effort: every command tolerates failure so that one missing tool
      # or resource does not prevent the remaining diagnostics from being
      # collected.
      - name: Collect debug artifacts
        if: failure()
        env:
          VERSION: ${{ steps.ref-name.outputs.value }}
        run: |
          mkdir -p /tmp/debug-artifacts
          kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
          kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
          kubectl get pods -n nvsentinel -o yaml > /tmp/debug-artifacts/nvsentinel-pods.yaml || true
          kubectl logs -n nvsentinel --all-containers=true --tail=500 > /tmp/debug-artifacts/nvsentinel-logs.txt || true
          docker images > /tmp/debug-artifacts/docker-images.txt || true
          df -h > /tmp/debug-artifacts/disk-usage.txt || true

      - name: Upload Kind logs
        if: always()
        # Pinned to the same commit as the versions.txt upload above.
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: e2e-test-kind-logs-${{ github.run_id }}
          path: /tmp/kind-logs/
          retention-days: 7

      - name: Upload debugging artifacts
        if: failure()
        # Pinned to the same commit as the versions.txt upload above.
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: e2e-test-debug-artifacts-${{ github.run_id }}
          path: /tmp/debug-artifacts/
          retention-days: 7

      - name: Cleanup
        if: always()
        run: |
          kind delete cluster --name "${CLUSTER_NAME}" || true
          docker system prune -f || true