Skip to content

Commit 2c49553

Browse files
Add weekly forward compatibility testing
Implement automated weekly tests validating GPU Operator against latest container-toolkit, device-plugin, and mig-manager images from GHCR. - Add forward-compatibility.yaml workflow with Slack alerts - Create get-latest-images.sh for fetching latest commit-based tags - Extend e2e-tests.yaml and install-operator.sh for component overrides - Add variables.yaml reusable workflow for shared CI variables Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 180e83b commit 2c49553

File tree

4 files changed

+292
-0
lines changed

4 files changed

+292
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/bin/bash
2+
# Copyright NVIDIA CORPORATION
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -euo pipefail

# Resolve the commit-tagged GHCR image that matches the newest commit on the
# main branch of one component's GitHub repository, and confirm the image
# exists in the registry.
#
# Usage:   get-latest-images.sh <toolkit|device-plugin|mig-manager>
# Env:     GITHUB_TOKEN (optional) - sent as a bearer token to the GitHub API
# Outputs: the full image reference on stdout (and nothing else, so callers
#          can capture it with $(...)); progress and errors go to stderr
# Exit:    0 when the image was found, 1 on any failure

COMPONENT=${1:-}

if [[ -z "${COMPONENT}" ]]; then
  echo "Usage: $0 <toolkit|device-plugin|mig-manager>" >&2
  exit 1
fi

# Verify every external tool this script invokes is available, so a missing
# binary is reported clearly instead of surfacing as an opaque pipeline error.
for tool in regctl curl jq; do
  if ! command -v "${tool}" &> /dev/null; then
    echo "Error: ${tool} not found. Please install ${tool} first." >&2
    exit 1
  fi
done

# Map component names to GHCR image repositories and GitHub source repositories
case "${COMPONENT}" in
  toolkit)
    IMAGE_REPO="ghcr.io/nvidia/container-toolkit"
    GITHUB_REPO="NVIDIA/container-toolkit"
    ;;
  device-plugin)
    IMAGE_REPO="ghcr.io/nvidia/k8s-device-plugin"
    GITHUB_REPO="NVIDIA/k8s-device-plugin"
    ;;
  mig-manager)
    IMAGE_REPO="ghcr.io/nvidia/k8s-mig-manager"
    GITHUB_REPO="NVIDIA/k8s-mig-manager"
    ;;
  *)
    echo "Error: Unknown component '${COMPONENT}'" >&2
    echo "Valid components: toolkit, device-plugin, mig-manager" >&2
    exit 1
    ;;
esac

echo "Fetching latest commit from ${GITHUB_REPO}..." >&2

# Get the latest commit SHA from the main branch using GitHub API
GITHUB_API_URL="https://api.github.com/repos/${GITHUB_REPO}/commits/main"

# Build the curl arguments once; --fail turns HTTP errors (rate limiting,
# 404, ...) into a non-zero exit status that is handled explicitly below.
CURL_ARGS=(--fail --silent --show-error --location
  -H "Accept: application/vnd.github.v3+json")

# Use GITHUB_TOKEN if available for authentication (higher rate limits)
if [[ -n "${GITHUB_TOKEN:-}" ]]; then
  CURL_ARGS+=(-H "Authorization: Bearer ${GITHUB_TOKEN}")
fi

# Run curl inside the condition so that `set -e` does not abort the script
# before the error message can be printed.
if ! API_RESPONSE=$(curl "${CURL_ARGS[@]}" "${GITHUB_API_URL}"); then
  echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
  exit 1
fi

# Image tags use the first 8 characters of the commit SHA. A missing .sha or
# an unparsable response yields an empty string, rejected by the check below.
LATEST_COMMIT=$(jq -r '(.sha // "")[0:8]' <<< "${API_RESPONSE}") || LATEST_COMMIT=""

if [[ ! "${LATEST_COMMIT}" =~ ^[0-9a-f]{8}$ ]]; then
  echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
  exit 1
fi

echo "Latest commit SHA: ${LATEST_COMMIT}" >&2

# Construct full image path with commit tag
FULL_IMAGE="${IMAGE_REPO}:${LATEST_COMMIT}"

echo "Verifying image exists: ${FULL_IMAGE}" >&2

# Verify the image exists using regctl. Its output is captured rather than
# discarded so that the underlying reason is visible when the lookup fails.
if ! REGCTL_OUTPUT=$(regctl manifest head "${FULL_IMAGE}" 2>&1); then
  echo "Error: Image ${FULL_IMAGE} does not exist or is not accessible" >&2
  echo "The image may not have been built yet for commit ${LATEST_COMMIT}" >&2
  if [[ -n "${REGCTL_OUTPUT}" ]]; then
    echo "regctl: ${REGCTL_OUTPUT}" >&2
  fi
  exit 1
fi

echo "Verified ${COMPONENT} image: ${FULL_IMAGE}" >&2
echo "${FULL_IMAGE}"

.github/workflows/e2e-tests.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,29 @@ on:
2828
operator_version:
2929
required: true
3030
type: string
31+
toolkit_image:
32+
required: false
33+
type: string
34+
description: 'Full container-toolkit image path (e.g., ghcr.io/nvidia/container-toolkit:v1.18.0)'
35+
device_plugin_image:
36+
required: false
37+
type: string
38+
description: 'Full device-plugin image path'
39+
mig_manager_image:
40+
required: false
41+
type: string
42+
description: 'Full mig-manager image path'
3143
secrets:
3244
AWS_ACCESS_KEY_ID:
3345
required: true
3446
AWS_SECRET_ACCESS_KEY:
3547
required: true
3648
AWS_SSH_KEY:
3749
required: true
50+
SLACK_BOT_TOKEN:
51+
required: false
52+
SLACK_CHANNEL_ID:
53+
required: false
3854
workflow_dispatch:
3955
inputs:
4056
operator_image:
@@ -45,13 +61,28 @@ on:
4561
description: 'Operator version to test (override)'
4662
required: false
4763
type: string
64+
toolkit_image:
65+
description: 'Override container-toolkit image'
66+
required: false
67+
type: string
68+
device_plugin_image:
69+
description: 'Override device-plugin image'
70+
required: false
71+
type: string
72+
mig_manager_image:
73+
description: 'Override mig-manager image'
74+
required: false
75+
type: string
4876

4977
jobs:
5078
variables:
5179
runs-on: ubuntu-latest
5280
outputs:
5381
operator_version: ${{ steps.vars.outputs.operator_version }}
5482
operator_image: ${{ steps.vars.outputs.operator_image }}
83+
toolkit_image: ${{ steps.vars.outputs.toolkit_image }}
84+
device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }}
85+
mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }}
5586
steps:
5687
- name: Checkout code
5788
if: ${{ github.event_name != 'workflow_call' }}
@@ -74,12 +105,29 @@ jobs:
74105
OPERATOR_IMAGE="ghcr.io/nvidia/gpu-operator"
75106
fi
76107
108+
# Component images (optional, use inputs if provided)
109+
TOOLKIT_IMAGE="${{ inputs.toolkit_image }}"
110+
DEVICE_PLUGIN_IMAGE="${{ inputs.device_plugin_image }}"
111+
MIG_MANAGER_IMAGE="${{ inputs.mig_manager_image }}"
112+
77113
# Output all variables
78114
echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT
79115
echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT
116+
echo "toolkit_image=${TOOLKIT_IMAGE}" >> $GITHUB_OUTPUT
117+
echo "device_plugin_image=${DEVICE_PLUGIN_IMAGE}" >> $GITHUB_OUTPUT
118+
echo "mig_manager_image=${MIG_MANAGER_IMAGE}" >> $GITHUB_OUTPUT
80119
81120
# Display for debugging
82121
echo "::notice::Testing operator: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}"
122+
if [[ -n "${TOOLKIT_IMAGE}" ]]; then
123+
echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}"
124+
fi
125+
if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
126+
echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}"
127+
fi
128+
if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
129+
echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}"
130+
fi
83131
84132
e2e-tests-containerd:
85133
needs: [variables]
@@ -110,6 +158,9 @@ jobs:
110158
env:
111159
OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
112160
OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
161+
TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
162+
DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
163+
MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
113164
GPU_PRODUCT_NAME: "Tesla-T4"
114165
SKIP_LAUNCH: "true"
115166
CONTAINER_RUNTIME: "containerd"
@@ -156,6 +207,9 @@ jobs:
156207
env:
157208
OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
158209
OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
210+
TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
211+
DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
212+
MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
159213
GPU_PRODUCT_NAME: "Tesla-T4"
160214
SKIP_LAUNCH: "true"
161215
CONTAINER_RUNTIME: "containerd"
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Copyright NVIDIA CORPORATION
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: Forward Compatibility

# Runs the e2e test workflow against the GPU Operator main-latest image
# combined with either manually supplied component images or the latest
# commit-tagged container-toolkit, device-plugin and mig-manager images.
on:
  schedule:
    - cron: '0 2 * * 1'  # Weekly on Monday at 2 AM UTC
  workflow_dispatch:
    inputs:
      toolkit_image:
        description: 'Override container-toolkit image'
        required: false
        type: string
      device_plugin_image:
        description: 'Override device-plugin image'
        required: false
        type: string
      mig_manager_image:
        description: 'Override mig-manager image'
        required: false
        type: string

# One active run per workflow and ref; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Resolves the three component image references and exposes them as outputs.
  fetch-latest-images:
    runs-on: ubuntu-latest
    outputs:
      toolkit_image: ${{ steps.images.outputs.toolkit_image }}
      device_plugin_image: ${{ steps.images.outputs.device_plugin_image }}
      mig_manager_image: ${{ steps.images.outputs.mig_manager_image }}
    steps:
      - uses: actions/checkout@v5

      - name: Install regctl
        run: |
          REGCTL_VERSION=v0.9.2
          mkdir -p bin
          # --fail: do not save an HTTP error page as the regctl binary
          curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"
          chmod +x bin/regctl
          echo "$(pwd)/bin" >> "${GITHUB_PATH}"

      - name: Get latest component images
        id: images
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Inputs are passed through the environment instead of being
          # expanded into the script text, so their content can never be
          # interpreted as shell code. They are empty on scheduled runs.
          INPUT_TOOLKIT_IMAGE: ${{ inputs.toolkit_image }}
          INPUT_DEVICE_PLUGIN_IMAGE: ${{ inputs.device_plugin_image }}
          INPUT_MIG_MANAGER_IMAGE: ${{ inputs.mig_manager_image }}
        run: |
          # Use workflow_dispatch inputs if provided, otherwise fetch latest
          if [[ -n "${INPUT_TOOLKIT_IMAGE}" ]]; then
            TOOLKIT="${INPUT_TOOLKIT_IMAGE}"
            echo "::notice::Using provided toolkit image: ${TOOLKIT}"
          else
            echo "::notice::Fetching latest container-toolkit image..."
            TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit)
          fi
          echo "toolkit_image=${TOOLKIT}" >> "${GITHUB_OUTPUT}"

          if [[ -n "${INPUT_DEVICE_PLUGIN_IMAGE}" ]]; then
            DEVICE_PLUGIN="${INPUT_DEVICE_PLUGIN_IMAGE}"
            echo "::notice::Using provided device-plugin image: ${DEVICE_PLUGIN}"
          else
            echo "::notice::Fetching latest device-plugin image..."
            DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin)
          fi
          echo "device_plugin_image=${DEVICE_PLUGIN}" >> "${GITHUB_OUTPUT}"

          if [[ -n "${INPUT_MIG_MANAGER_IMAGE}" ]]; then
            MIG_MANAGER="${INPUT_MIG_MANAGER_IMAGE}"
            echo "::notice::Using provided mig-manager image: ${MIG_MANAGER}"
          else
            echo "::notice::Fetching latest mig-manager image..."
            MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager)
          fi
          echo "mig_manager_image=${MIG_MANAGER}" >> "${GITHUB_OUTPUT}"

          echo "::notice::=== Forward Compatibility Test Configuration ==="
          echo "::notice::Container Toolkit: ${TOOLKIT}"
          echo "::notice::Device Plugin: ${DEVICE_PLUGIN}"
          echo "::notice::MIG Manager: ${MIG_MANAGER}"

  # Calls the reusable e2e workflow with the resolved component images.
  run-e2e-tests:
    needs: [fetch-latest-images]
    uses: ./.github/workflows/e2e-tests.yaml
    with:
      operator_image: ghcr.io/nvidia/gpu-operator
      operator_version: main-latest
      toolkit_image: ${{ needs.fetch-latest-images.outputs.toolkit_image }}
      device_plugin_image: ${{ needs.fetch-latest-images.outputs.device_plugin_image }}
      mig_manager_image: ${{ needs.fetch-latest-images.outputs.mig_manager_image }}
    secrets: inherit

  # Posts a Slack message when either of the jobs above fails.
  notify-failure:
    runs-on: ubuntu-latest
    needs: [fetch-latest-images, run-e2e-tests]
    if: ${{ failure() }}
    steps:
      - name: Send Slack alert notification
        # NOTE(review): the action reference was garbled in the source this
        # was recovered from ("slackapi/[email protected]"); the method/token/payload
        # inputs match the v2 interface, but confirm the exact pinned version.
        uses: slackapi/slack-github-action@v2.1.1
        with:
          method: chat.postMessage
          token: ${{ secrets.SLACK_BOT_TOKEN }}
          # NOTE(review): the mention IDs on the last line start with "D";
          # Slack user mentions normally use U/W-prefixed member IDs, so
          # verify these actually notify the intended people.
          payload: |
            channel: ${{ secrets.SLACK_CHANNEL_ID }}
            text: |
              :x: *Forward Compatibility Test Failed for GPU Operator*

              *Workflow:* ${{ github.workflow }}
              *Repository:* ${{ github.repository }}
              *Trigger:* ${{ github.event_name }}

              *Tested Components:*
              • Container Toolkit: `${{ needs.fetch-latest-images.outputs.toolkit_image }}`
              • Device Plugin: `${{ needs.fetch-latest-images.outputs.device_plugin_image }}`
              • MIG Manager: `${{ needs.fetch-latest-images.outputs.mig_manager_image }}`

              *Details:* <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Failed Run>
              <@D044YE2MBAR> | <@D051KR3TAQN> | <@D04D866RKLH> | <@D045R30QRPS>

tests/scripts/install-operator.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,20 @@ if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then
2626
TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\""
2727
fi
2828

29+
# We set up the options for the device plugin
30+
: ${DEVICE_PLUGIN_OPTIONS:=""}
31+
32+
if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
33+
DEVICE_PLUGIN_OPTIONS="${DEVICE_PLUGIN_OPTIONS} --set devicePlugin.repository=\"\" --set devicePlugin.version=\"\" --set devicePlugin.image=\"${DEVICE_PLUGIN_IMAGE}\""
34+
fi
35+
36+
# We set up the options for the MIG manager
37+
: ${MIG_MANAGER_OPTIONS:=""}
38+
39+
if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
40+
MIG_MANAGER_OPTIONS="${MIG_MANAGER_OPTIONS} --set migManager.repository=\"\" --set migManager.version=\"\" --set migManager.image=\"${MIG_MANAGER_IMAGE}\""
41+
fi
42+
2943
# Create the test namespace
3044
kubectl create namespace "${TEST_NAMESPACE}"
3145

@@ -48,4 +62,6 @@ ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \
4862
-n "${TEST_NAMESPACE}" \
4963
${OPERATOR_OPTIONS} \
5064
${TOOLKIT_CONTAINER_OPTIONS} \
65+
${DEVICE_PLUGIN_OPTIONS} \
66+
${MIG_MANAGER_OPTIONS} \
5167
--wait

0 commit comments

Comments
 (0)