2020 - cron : ' 00 09 * * *' # scheduled job
2121
2222jobs :
23- pre-compiled :
set-driver-version-matrix:
  # Publishes the driver branches and kernel flavors as compact JSON arrays so
  # that other jobs can read them through `needs.<job>.outputs` (for example
  # with `fromJson(...)` to build a matrix).
  runs-on: ubuntu-latest
  outputs:
    driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
    kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Read driver versions
      id: extract_driver_branch
      run: |
        # Print the given arguments as one compact JSON array of strings:
        # one argument per line -> JSON string per line -> slurped array.
        to_json_array() {
          printf '%s\n' "$@" | jq -R . | jq -cs .
        }

        # driver branches
        supported_branches=("535" "550")
        branches_json=$(to_json_array "${supported_branches[@]}")
        echo "driver_branch=$branches_json" >> "$GITHUB_OUTPUT"

        # kernel flavors
        supported_flavors=("aws" "azure" "generic" "nvidia" "oracle")
        flavors_json=$(to_json_array "${supported_flavors[@]}")
        echo "kernel_flavors=$flavors_json" >> "$GITHUB_OUTPUT"
43+
44+ precompiled-image :
45+ needs : set-driver-version-matrix
2446 runs-on : ubuntu-latest
2547 strategy :
2648 matrix :
27- driver :
28- - 535
29- - 550
30- flavor :
31- - aws
32- - azure
33- - generic
34- - nvidia
35- - oracle
49+ driver-branch : ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
50+ flavor : ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
3651 steps :
3752 - uses : actions/checkout@v4
3853 name : Check out code
@@ -64,10 +79,10 @@ jobs:
6479 VERSION : ${COMMIT_SHORT_SHA}
6580 BASE_TARGET : jammy
6681 run : |
67- make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
82+ make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
6883
6984 trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
70- docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
85+ docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
7186 # try 3 times every 10 seconds to get the file, if success exit the loop
7287 for i in {1..3}; do
7388 docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
@@ -81,4 +96,155 @@ jobs:
8196 DIST : signed_ubuntu22.04
8297 run : |
8398 source kernel_version.txt && \
84- make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
99+ make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
100+
determine-e2e-test-matrix:
  # Decides which kernel versions the e2e job should test.
  #
  # Outputs:
  #   matrix_values_not_empty: "1" when at least one kernel flavor has
  #                            something to test, otherwise "0".
  #   matrix_values:           compact JSON array of kernel version strings
  #                            ("[]" when nothing was selected).
  runs-on: ubuntu-latest
  needs:
    - precompiled-image
    - set-driver-version-matrix
  outputs:
    matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
    matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set kernel version
      id: set_kernel_version
      env:
        # No leading/trailing whitespace: these values are passed verbatim
        # as arguments to findkernelversion.sh and used in log messages.
        BASE_TARGET: "jammy"
        DIST: "ubuntu22.04"
      run: |
        # Default; a later write of the same key overrides this one.
        echo "matrix_values_not_empty=0" >> "$GITHUB_OUTPUT"

        kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
        kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
        driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')

        kernel_versions=()
        for kernel_flavor in $kernel_flavors; do
          # FIXME -- remove if condition, once azure kernel upgrade starts working
          if [[ "$kernel_flavor" == "azure" ]]; then
            echo "skipping azure kernel testing"
            continue
          fi
          for DRIVER_BRANCH in $driver_branch; do
            # Sourced (not executed) because the variables read below,
            # should_continue and KERNEL_VERSION, are expected to be set by
            # this script -- NOTE(review): confirm against the script.
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
            if [[ "$should_continue" == true ]]; then
              echo "matrix_values_not_empty=1" >> "$GITHUB_OUTPUT"
              # One matching driver branch is enough to test this flavor.
              break
            fi
          done
          if [[ "$should_continue" == false ]]; then
            echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          else
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
            echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          fi
        done

        # Convert the array to compact JSON. The empty case is handled
        # explicitly: `printf '%s\n'` with no arguments still prints one
        # empty line, which would serialize as [""] rather than [].
        matrix_values_file="$GITHUB_WORKSPACE/matrix_values.json"
        if [[ ${#kernel_versions[@]} -eq 0 ]]; then
          echo "[]" > "$matrix_values_file"
        else
          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -cs . > "$matrix_values_file"
        fi
        echo "matrix_values=$(cat "$matrix_values_file")" >> "$GITHUB_OUTPUT"
159+
160+ e2e-tests-nvidiadriver :
161+ runs-on : ubuntu-latest
162+ needs :
163+ - determine-e2e-test-matrix
164+ - set-driver-version-matrix
165+ if : ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
166+ strategy :
167+ matrix :
168+ kernel_version : ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
169+ steps :
170+ - name : Check out code
171+ uses : actions/checkout@v4
172+ - name : Set up Holodeck
173+ 174+ env :
175+ AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
176+ AWS_ACCESS_KEY_ID : ${{ secrets.AWS_ACCESS_KEY_ID }}
177+ AWS_SSH_KEY : ${{ secrets.AWS_SSH_KEY }}
178+ with :
179+ aws_access_key_id : ${{ secrets.AWS_ACCESS_KEY_ID }}
180+ aws_secret_access_key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
181+ aws_ssh_key : ${{ secrets.AWS_SSH_KEY }}
182+ holodeck_config : " tests/holodeck.yaml"
183+
# Extracts the value of the "public-dns-name" property from the Holodeck
# cache file; a later step reads it as
# `steps.get_public_dns_name.outputs.result`.
# NOTE(review): the cache file is presumably written by the Holodeck setup
# step -- confirm.
- name: Get public dns name
  id: get_public_dns_name
  uses: mikefarah/yq@master
  with:
    # Folded scalar: the two lines are joined by a single space into one
    # command string.
    cmd: >-
      yq '.status.properties[] | select(.name == "public-dns-name") | .value'
      /github/workspace/.cache/holodeck.yaml
# Exports the variables used by the following steps through GITHUB_ENV
# (instance_hostname, private_key, COMMIT_SHORT_SHA, PRIVATE_REGISTRY,
# KERNEL_VERSION) and writes the SSH private key to key.pem in the workspace.
- name: Set and Calculate test vars
  env:
    # Passed through the environment instead of being spliced into the
    # script text, so quotes, `$` or backticks inside the values cannot be
    # interpreted by the shell.
    AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
    PUBLIC_DNS_NAME: ${{ steps.get_public_dns_name.outputs.result }}
  run: |
    key_file="${{ github.workspace }}/key.pem"

    echo "instance_hostname=ubuntu@${PUBLIC_DNS_NAME}" >> "$GITHUB_ENV"
    echo "private_key=${key_file}" >> "$GITHUB_ENV"

    # Create the key file owner-only from the start (umask in a subshell so
    # it does not leak into the rest of the script), then make it read-only.
    (umask 077 && printf '%s\n' "$AWS_SSH_KEY" > "$key_file")
    chmod 400 "$key_file"

    echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"
    echo "PRIVATE_REGISTRY=ghcr.io" >> "$GITHUB_ENV"
    KERNEL_VERSION="${{ matrix.kernel_version }}"
    echo "KERNEL_VERSION=$KERNEL_VERSION" >> "$GITHUB_ENV"
198+
# Runs the kernel upgrade script on the remote instance, then waits until
# the instance is reachable again.
- name: Upgrade the kernel for Precompiled e2e test
  env:
    # No leading whitespace: the value is passed verbatim as the script path.
    UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
  run: |
    status=0
    ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
    # On the target system, all scripts/test-case exit with code 1 for error handling.
    # However, since reboot-related disconnections break the SSH connection
    # and can cause the entire job to exit, we should ignore all errors except
    # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
    # other errors as code 1 will ensure proper management of reboot scenarios
    if [ "$status" -eq 1 ]; then
      echo "Kernel version $KERNEL_VERSION upgrade failed"
      exit 1
    fi

    # Reset before the reconnect check: a tolerated non-1 exit code from the
    # upgrade above must not be reported as a connection failure when the
    # retry itself succeeds.
    status=0
    ./tests/scripts/remote_retry.sh || status=$?
    if [ "$status" -ne 0 ]; then
      echo "Failed to connect to remote instance"
      exit "$status"
    fi
219+
# Runs the driver e2e test case once per driver branch, always pulls the
# logs afterwards, and fails the step if any run failed.
- name: Precompiled e2e test gpu driver validation
  env:
    # No leading whitespace: TEST_CASE is passed verbatim as a script path.
    TEST_CASE: "./tests/cases/nvidia-driver.sh"
    GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
  run: |
    rc=0
    # for precompiled driver we are setting driver branch as driver version
    driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
    driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
    for DRIVER_VERSION in $driver_versions; do
      echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
      status=0
      OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
      # add escape character for space
      OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS")
      ./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$?
      # Any non-zero exit is a failure here: unlike the kernel upgrade step
      # there is no expected reboot disconnect to tolerate, and checking only
      # for exit code 1 would let other failures pass silently.
      if [ "$status" -ne 0 ]; then
        echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
        # Remember the failure but keep going so every version is exercised.
        rc=$status
      fi
    done
    # Collect logs before reporting the overall result.
    ./tests/scripts/pull.sh /tmp/logs logs
    exit "$rc"
243+
# Uploads ./logs/ as a build artifact when an earlier step in the job failed.
- name: Archive test logs
  if: ${{ failure() }}
  uses: actions/upload-artifact@v4
  with:
    # This job runs once per matrix.kernel_version, and upload-artifact@v4
    # does not allow two uploads with the same artifact name in one workflow
    # run, so the name is made unique per matrix leg.
    name: nvidiadriver-Precompiled-e2e-test-logs-${{ matrix.kernel_version }}
    path: ./logs/
    retention-days: 15
0 commit comments