Skip to content

Commit 8052564

Browse files
[no-relnote] Add E2E tests for systemd unit
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 50496c2 commit 8052564

30 files changed

+1665
-17
lines changed

tests/e2e/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ GINKGO_BIN := $(CURDIR)/bin/ginkgo
2424
# current available tests:
2525
# - nvidia-container-cli
2626
# - docker
27+
# - nvidia-cdi-refresh
2728
GINKGO_FOCUS ?=
2829

2930
test: $(GINKGO_BIN)

tests/e2e/e2e_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ var (
4343
sshUser string
4444
sshHost string
4545
sshPort string
46+
47+
testContainerName = "ctk-e2e-test-container"
4648
)
4749

4850
func TestMain(t *testing.T) {
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package e2e
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"strings"
23+
"text/template"
24+
25+
. "github.com/onsi/ginkgo/v2"
26+
. "github.com/onsi/gomega"
27+
)
28+
29+
const (
30+
nvidiaCdiRefreshInstallTemplate = `
31+
# Read the TMPDIR path saved in /tmp/ctk_e2e_temp_dir.txt
32+
TMPDIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
33+
export TMPDIR
34+
35+
# uninstall the nvidia-container-toolkit
36+
apt-get remove -y nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
37+
apt-get autoremove -y
38+
39+
# Remove the cdi file if it exists
40+
if [ -f /var/run/cdi/nvidia.yaml ]; then
41+
rm -f /var/run/cdi/nvidia.yaml
42+
fi
43+
44+
# Stop the nvidia-cdi-refresh.path and nvidia-cdi-refresh.service units
45+
systemctl stop nvidia-cdi-refresh.path
46+
systemctl stop nvidia-cdi-refresh.service
47+
48+
# Reload the systemd daemon
49+
systemctl daemon-reload
50+
51+
# Start the dummy service to force systemd to enter a degraded state
52+
cat <<EOF > /etc/systemd/system/dummy.service
53+
[Unit]
54+
Description=Dummy systemd service
55+
56+
[Service]
57+
Type=oneshot
58+
ExecStart=/usr/bin/sh -c "exit 0"
59+
EOF
60+
61+
# We know the dummy service will fail, so we can ignore the error
62+
systemctl start dummy.service 2>/dev/null || true
63+
64+
# Install the nvidia-container-toolkit
65+
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
66+
67+
if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
68+
echo "nvidia-cdi-refresh.path is not Active"
69+
exit 1
70+
fi
71+
if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
72+
echo "nvidia-cdi-refresh.service is not loaded"
73+
exit 1
74+
fi
75+
`
76+
77+
nvidiaCdiRefreshFileExistsTemplate = `
78+
# Check that /var/run/cdi/nvidia.yaml exists; exit with 1 if it does not
79+
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
80+
echo "nvidia.yaml file does not exist"
81+
exit 1
82+
fi
83+
84+
# generate the nvidia.yaml file
85+
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
86+
87+
# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
88+
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
89+
echo "nvidia.yaml file is different"
90+
exit 1
91+
fi
92+
`
93+
94+
nvidiaCdiRefreshUpgradeTemplate = `
95+
# remove the generated files
96+
rm /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml
97+
98+
# Touch the nvidia-ctk binary to change the mtime
99+
# This will trigger the nvidia-cdi-refresh.path unit to call the
100+
# nvidia-cdi-refresh.service unit, simulating a change (update/downgrade) in the nvidia-ctk binary.
101+
touch $(which nvidia-ctk)
102+
103+
# wait for 3 seconds
104+
sleep 3
105+
106+
# Check if the file /var/run/cdi/nvidia.yaml is created
107+
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
108+
echo "nvidia.yaml file is not created after updating the modules.dep file"
109+
exit 1
110+
fi
111+
112+
# generate the nvidia.yaml file
113+
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
114+
115+
# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
116+
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
117+
echo "nvidia.yaml file is different"
118+
exit 1
119+
fi
120+
`
121+
)
122+
123+
var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("systemd-unit"), func() {
124+
var (
125+
nestedContainerRunner Runner
126+
// TODO(@ArangoGutierrez): https://github.com/NVIDIA/nvidia-container-toolkit/pull/1235/files#r2302013660
127+
outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
128+
)
129+
130+
BeforeAll(func(ctx context.Context) {
131+
var err error
132+
nestedContainerRunner, err = NewNestedContainerRunner(runner, outerContainerImage, installCTK, imageName+":"+imageTag, testContainerName)
133+
Expect(err).ToNot(HaveOccurred())
134+
})
135+
136+
AfterAll(func(ctx context.Context) {
137+
// Cleanup: remove the container and the temporary script on the host.
138+
// Use || true to ensure cleanup doesn't fail the test
139+
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", testContainerName)) //nolint:errcheck
140+
})
141+
142+
When("installing nvidia-container-toolkit", Ordered, func() {
143+
It("should load the nvidia-cdi-refresh.path and nvidia-cdi-refresh.service units", func(ctx context.Context) {
144+
tmpl, err := template.New("nvidiaCdiRefreshInstall").Parse(nvidiaCdiRefreshInstallTemplate)
145+
Expect(err).ToNot(HaveOccurred())
146+
147+
var nvidiaCdiRefreshInstall strings.Builder
148+
err = tmpl.Execute(&nvidiaCdiRefreshInstall, struct {
149+
ToolkitImage string
150+
}{
151+
ToolkitImage: imageName + ":" + imageTag,
152+
})
153+
Expect(err).ToNot(HaveOccurred())
154+
155+
_, _, err = nestedContainerRunner.Run(nvidiaCdiRefreshInstall.String())
156+
Expect(err).ToNot(HaveOccurred())
157+
})
158+
159+
It("should generate the nvidia.yaml file", func(ctx context.Context) {
160+
_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
161+
Expect(err).ToNot(HaveOccurred())
162+
})
163+
164+
It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
165+
_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
166+
Expect(err).ToNot(HaveOccurred())
167+
})
168+
})
169+
})

tests/e2e/nvidia-container-cli_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,12 @@ IN_NS
7272
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
7373
var (
7474
nestedContainerRunner Runner
75-
containerName = "node-container-e2e"
7675
hostOutput string
7776
)
7877

7978
BeforeAll(func(ctx context.Context) {
8079
var err error
81-
nestedContainerRunner, err = NewNestedContainerRunner(runner, installCTK, imageName+":"+imageTag, containerName)
80+
nestedContainerRunner, err = NewNestedContainerRunner(runner, "ubuntu", installCTK, imageName+":"+imageTag, testContainerName)
8281
Expect(err).ToNot(HaveOccurred())
8382

8483
// Capture the host GPU list.
@@ -92,7 +91,7 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
9291
AfterAll(func(ctx context.Context) {
9392
// Cleanup: remove the container and the temporary script on the host.
9493
// Use || true to ensure cleanup doesn't fail the test
95-
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
94+
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", testContainerName)) //nolint:errcheck
9695
})
9796

9897
It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {

tests/e2e/nvidia-container-toolkit_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,9 +343,8 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
343343
_, _, err := runner.Run("docker pull ubuntu")
344344
Expect(err).ToNot(HaveOccurred())
345345

346-
var tmpDirPath string
347346
// Make the test runnable from macOS hosts.
348-
// On darwin, the temp dir is in /var/folders/.../T/
347+
// On darwin, the GinkgoT().TempDir() dir is in /var/folders/.../T/
349348
// We need to convert it to /tmp/...
350349
if runtime.GOOS == "darwin" {
351350
tmpDirPath = path.Join("/tmp", uuid.NewString())

tests/e2e/runner.go

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,35 @@ const (
3535
{{ range $i, $a := .AdditionalArguments -}}
3636
{{ $a }} \
3737
{{ end -}}
38-
ubuntu sleep infinity`
38+
{{.OuterContainerImage}} sleep infinity`
3939

4040
installDockerTemplate = `
4141
export DEBIAN_FRONTEND=noninteractive
4242
4343
# Add Docker official GPG key:
4444
apt-get update
45-
apt-get install -y ca-certificates curl apt-utils gnupg2
45+
apt-get install -y apt-utils ca-certificates curl gnupg2
4646
install -m 0755 -d /etc/apt/keyrings
47-
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
47+
48+
# Read OS information from /etc/os-release
49+
. /etc/os-release
50+
51+
if [ "${ID}" = "debian" ]; then
52+
curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc
53+
else
54+
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
55+
fi
4856
chmod a+r /etc/apt/keyrings/docker.asc
4957
5058
# Add the repository to Apt sources:
51-
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
59+
if [ "${ID}" = "debian" ]; then
60+
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian ${VERSION_CODENAME} stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
61+
else
62+
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu ${UBUNTU_CODENAME:-$VERSION_CODENAME} stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
63+
fi
5264
apt-get update
5365
54-
apt-get install -y docker-ce docker-ce-cli containerd.io
66+
apt-get install -y docker-ce docker-ce-cli
5567
5668
# start dockerd in the background
5769
dockerd &
@@ -73,10 +85,11 @@ const (
7385
installCTKTemplate = `
7486
# Create a temporary directory and rootfs path
7587
TMPDIR="$(mktemp -d)"
88+
echo "$TMPDIR" > /tmp/ctk_e2e_temp_dir.txt
7689
7790
# Expose TMPDIR for the child namespace
7891
export TMPDIR
79-
92+
8093
docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
8194
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
8295
@@ -98,7 +111,6 @@ type nestedContainerRunner struct {
98111
}
99112

100113
type runnerOption func(*remoteRunner)
101-
type nestedContainerOption func(*nestedContainerRunner)
102114

103115
type Runner interface {
104116
Run(script string) (string, string, error)
@@ -146,7 +158,7 @@ func NewRunner(opts ...runnerOption) Runner {
146158
// NewNestedContainerRunner creates a new nested container runner.
147159
// A nested container runs a container inside another container based on a
148160
// given runner (remote or local).
149-
func NewNestedContainerRunner(runner Runner, installCTK bool, image string, containerName string, opts ...nestedContainerOption) (Runner, error) {
161+
func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool, image string, containerName string) (Runner, error) {
150162
additionalContainerArguments := []string{}
151163

152164
// If a container with the same name exists from a previous test run, remove it first.
@@ -222,6 +234,9 @@ func NewNestedContainerRunner(runner Runner, installCTK bool, image string, cont
222234
}
223235
}
224236

237+
// Mount the /lib/modules directory as a volume to enable the nvidia-cdi-refresh service
238+
additionalContainerArguments = append(additionalContainerArguments, "-v /lib/modules:/lib/modules")
239+
225240
// Launch the container in detached mode.
226241
var outerContainerScriptBuilder strings.Builder
227242
outerContainerTemplate, err := template.New("outerContainer").Parse(outerContainerTemplate)
@@ -231,9 +246,11 @@ func NewNestedContainerRunner(runner Runner, installCTK bool, image string, cont
231246
err = outerContainerTemplate.Execute(&outerContainerScriptBuilder, struct {
232247
ContainerName string
233248
AdditionalArguments []string
249+
OuterContainerImage string
234250
}{
235251
ContainerName: containerName,
236252
AdditionalArguments: additionalContainerArguments,
253+
OuterContainerImage: baseImage,
237254
})
238255
if err != nil {
239256
return nil, fmt.Errorf("failed to execute start container template: %w", err)

tests/go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.23.2
55
toolchain go1.24.1
66

77
require (
8+
github.com/google/uuid v1.6.0
89
github.com/onsi/ginkgo/v2 v2.24.0
910
github.com/onsi/gomega v1.38.0
1011
golang.org/x/crypto v0.41.0
@@ -16,10 +17,12 @@ require (
1617
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
1718
github.com/google/go-cmp v0.7.0 // indirect
1819
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
20+
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect
1921
go.uber.org/automaxprocs v1.6.0 // indirect
2022
golang.org/x/net v0.43.0 // indirect
2123
golang.org/x/sys v0.35.0 // indirect
2224
golang.org/x/text v0.28.0 // indirect
2325
golang.org/x/tools v0.36.0 // indirect
26+
gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b // indirect
2427
gopkg.in/yaml.v3 v3.0.1 // indirect
2528
)

tests/go.sum

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@ github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
1010
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
1111
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
1212
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
13-
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
14-
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
13+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
14+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
15+
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
16+
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
1517
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
1618
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
19+
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
20+
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
1721
github.com/onsi/ginkgo/v2 v2.24.0 h1:obZz8LAnHicNdbBqvG3ytAFx8fgza+i1IDpBVcHT2YE=
1822
github.com/onsi/ginkgo/v2 v2.24.0/go.mod h1:ppTWQ1dh9KM/F1XgpeRqelR+zHVwV81DGRSDnFxK7Sk=
1923
github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY=
@@ -41,7 +45,7 @@ golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s
4145
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
4246
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
4347
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
44-
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
45-
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
48+
gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b h1:QRR6H1YWRnHb4Y/HeNFCTJLFVxaq6wH4YuVdsUOr75U=
49+
gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
4650
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
4751
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

0 commit comments

Comments
 (0)