Skip to content

Commit 30ad4de

Browse files
ArangoGutierrez authored and elezar committed
[no-relnote] Add E2E tests for systemd unit
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 81ab222 commit 30ad4de

File tree

4 files changed

+242
-4
lines changed

4 files changed

+242
-4
lines changed

tests/e2e/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ GINKGO_BIN := $(CURDIR)/bin/ginkgo
2424
# current available tests:
2525
# - nvidia-container-cli
2626
# - docker
27+
# - nvidia-cdi-refresh
2728
GINKGO_FOCUS ?=
2829

2930
test: $(GINKGO_BIN)
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package e2e
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"strings"
23+
"time"
24+
25+
. "github.com/onsi/ginkgo/v2"
26+
. "github.com/onsi/gomega"
27+
)
28+
29+
// Shell scripts that the nvidia-cdi-refresh e2e tests execute through a Runner.
// Each script signals failure through a non-zero exit status.
const (
	// getSystemStateScript queries the overall systemd state. Per systemctl(1),
	// `is-system-running` prints the state and exits non-zero for any state
	// other than "running".
	getSystemStateScript = `systemctl is-system-running 2>/dev/null`

	// setSystemdDegradedScript installs a oneshot unit whose command always
	// fails and starts it, leaving a failed unit behind.
	setSystemdDegradedScript = `#!/usr/bin/env bash
# Create and start a dummy service that always fails to force systemd to enter a degraded state
cat <<EOF > /etc/systemd/system/dummy.service
[Unit]
Description=Dummy systemd service

[Service]
Type=oneshot
ExecStart=/usr/bin/sh -c "exit 1"
EOF

# We know the dummy service will fail, so we can ignore the error
systemctl start --now dummy.service 2>/dev/null || true
`

	// fixSystemDegradedScript rewrites the dummy unit so that it succeeds,
	// starts it again, and then removes the unit file.
	fixSystemDegradedScript = `#!/usr/bin/env bash
# Replace the dummy service with one that succeeds so that it is no longer in a failed state
cat <<EOF > /etc/systemd/system/dummy.service
[Unit]
Description=Dummy systemd service

[Service]
Type=oneshot
ExecStart=/usr/bin/sh -c "exit 0"
EOF

systemctl daemon-reload

systemctl start --now dummy.service 2>/dev/null || true

rm -rf /etc/systemd/system/dummy.service
systemctl daemon-reload
`

	// nvidiaCdiRefreshPathActiveTemplate fails unless the
	// nvidia-cdi-refresh.path unit reports "Active: active".
	nvidiaCdiRefreshPathActiveTemplate = `
if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
	echo "nvidia-cdi-refresh.path is not Active"
	exit 1
fi
`
	// nvidiaCdiRefreshServiceLoadedTemplate fails unless the
	// nvidia-cdi-refresh.service unit reports "Loaded: loaded".
	nvidiaCdiRefreshServiceLoadedTemplate = `
if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
	echo "nvidia-cdi-refresh.service is not loaded"
	exit 1
fi
`

	// nvidiaCdiRefreshFileExistsTemplate checks that /var/run/cdi/nvidia.yaml
	// exists and matches a freshly generated specification.
	nvidiaCdiRefreshFileExistsTemplate = `
# fail with exit code 1 if /var/run/cdi/nvidia.yaml does not exist
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
	echo "nvidia.yaml file does not exist"
	exit 1
fi

# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml

# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 1 if they differ
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
	echo "nvidia.yaml file is different"
	exit 1
fi
`

	// nvidiaCdiRefreshUpgradeTemplate deletes the generated specification,
	// touches the nvidia-ctk binary, and checks that /var/run/cdi/nvidia.yaml
	// is recreated and matches a freshly generated specification.
	nvidiaCdiRefreshUpgradeTemplate = `
# remove the generated files (-f: they may be missing if an earlier check failed)
rm -f /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml

# Touch the nvidia-ctk binary to change the mtime
# This will trigger the nvidia-cdi-refresh.path unit to call the
# nvidia-cdi-refresh.service unit, simulating a change(update/downgrade) in the nvidia-ctk binary.
touch "$(command -v nvidia-ctk)"

# wait for 3 seconds
sleep 3

# Check if the file /var/run/cdi/nvidia.yaml is created
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
	echo "nvidia.yaml file is not created after updating the nvidia-ctk binary"
	exit 1
fi

# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml

# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 1 if they differ
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
	echo "nvidia.yaml file is different"
	exit 1
fi
`
)
124+
125+
var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("systemd-unit"), func() {
126+
var (
127+
containerName = "nvctk-e2e-nvidia-cdi-refresh-tests"
128+
systemdRunner Runner
129+
// TODO(@ArangoGutierrez): https://github.com/NVIDIA/nvidia-container-toolkit/pull/1235/files#r2302013660
130+
outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
131+
)
132+
133+
BeforeAll(func(ctx context.Context) {
134+
var err error
135+
// TODO: We set installCTK to true here to SKIP the mounting of the files from the host.
136+
// The test here does NOT require the host toolkit.
137+
systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, true, containerName, localCacheDir)
138+
Expect(err).ToNot(HaveOccurred())
139+
for range 10 {
140+
state, _, err := systemdRunner.Run(getSystemStateScript)
141+
if err == nil {
142+
GinkgoLogr.Info("systemd started", "state", state)
143+
break
144+
}
145+
GinkgoLogr.Error(err, "systemctl state")
146+
time.Sleep(1 * time.Second)
147+
}
148+
})
149+
150+
AfterAll(func(ctx context.Context) {
151+
// Cleanup: remove the container and the temporary script on the host.
152+
// Use || true to ensure cleanup doesn't fail the test
153+
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName))
154+
})
155+
156+
When("installing nvidia-container-toolkit", Ordered, func() {
157+
BeforeAll(func(ctx context.Context) {
158+
159+
_, _, err := toolkitInstaller.Install(systemdRunner)
160+
Expect(err).ToNot(HaveOccurred())
161+
162+
output, _, err := systemdRunner.Run("nvidia-ctk --version")
163+
Expect(err).ToNot(HaveOccurred())
164+
GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output))
165+
})
166+
167+
AfterAll(func(ctx context.Context) {
168+
_, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*")
169+
Expect(err).ToNot(HaveOccurred())
170+
})
171+
172+
It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
173+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
174+
Expect(err).ToNot(HaveOccurred())
175+
})
176+
177+
It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
178+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
179+
Expect(err).ToNot(HaveOccurred())
180+
})
181+
182+
It("should generate the nvidia.yaml file", func(ctx context.Context) {
183+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
184+
Expect(err).ToNot(HaveOccurred())
185+
})
186+
187+
It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
188+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
189+
Expect(err).ToNot(HaveOccurred())
190+
})
191+
})
192+
193+
When("installing nvidia-container-toolkit on a system with a degraded systemd", Ordered, func() {
194+
BeforeAll(func(ctx context.Context) {
195+
_, _, err := systemdRunner.Run(setSystemdDegradedScript)
196+
Expect(err).ToNot(HaveOccurred())
197+
198+
_, _, err = systemdRunner.Run(getSystemStateScript)
199+
Expect(err).To(HaveOccurred())
200+
Expect(err.Error()).To(ContainSubstring("degraded"))
201+
})
202+
203+
AfterAll(func(ctx context.Context) {
204+
_, _, err := systemdRunner.Run(fixSystemDegradedScript)
205+
Expect(err).ToNot(HaveOccurred())
206+
207+
state, _, err := systemdRunner.Run(getSystemStateScript)
208+
Expect(err).ToNot(HaveOccurred())
209+
Expect(strings.TrimSpace(state)).To(Equal("running"))
210+
})
211+
212+
It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
213+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
214+
Expect(err).ToNot(HaveOccurred())
215+
})
216+
217+
It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
218+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
219+
Expect(err).ToNot(HaveOccurred())
220+
})
221+
222+
It("should generate the nvidia.yaml file", func(ctx context.Context) {
223+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
224+
Expect(err).ToNot(HaveOccurred())
225+
})
226+
227+
It("should generate the nvidia.yaml file", func(ctx context.Context) {
228+
_, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
229+
Expect(err).ToNot(HaveOccurred())
230+
})
231+
})
232+
})

tests/e2e/nvidia-container-cli_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,14 @@ IN_NS
7171

7272
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
7373
var (
74+
containerName = "nvct-e2e-nvidia-container-cli-tests"
7475
nestedContainerRunner Runner
75-
containerName = "node-container-e2e"
7676
hostOutput string
7777
)
7878

7979
BeforeAll(func(ctx context.Context) {
8080
var err error
81-
nestedContainerRunner, err = NewNestedContainerRunner(runner, installCTK, containerName, localCacheDir)
81+
nestedContainerRunner, err = NewNestedContainerRunner(runner, "ubuntu", installCTK, containerName, localCacheDir)
8282
Expect(err).ToNot(HaveOccurred())
8383

8484
if installCTK {

tests/e2e/runner.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func NewRunner(opts ...runnerOption) Runner {
9696
// NewNestedContainerRunner creates a new nested container runner.
9797
// A nested container runs a container inside another container based on a
9898
// given runner (remote or local).
99-
func NewNestedContainerRunner(runner Runner, installCTK bool, containerName string, cacheDir string) (Runner, error) {
99+
func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool, containerName string, cacheDir string) (Runner, error) {
100100
// If a container with the same name exists from a previous test run, remove it first.
101101
// Ignore errors as container might not exist
102102
_, _, err := runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName))
@@ -165,9 +165,13 @@ func NewNestedContainerRunner(runner Runner, installCTK bool, containerName stri
165165
}
166166
}
167167

168+
// Mount the /lib/modules directory as a volume to enable the nvidia-cdi-refresh service
169+
additionalContainerArguments = append(additionalContainerArguments, "-v /lib/modules:/lib/modules")
170+
168171
// Launch the container in detached mode.
169172
container := outerContainer{
170173
Name: containerName,
174+
BaseImage: baseImage,
171175
AdditionalArguments: additionalContainerArguments,
172176
}
173177

@@ -298,6 +302,7 @@ func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
298302
// The template also allows for additional arguments to be specified.
299303
type outerContainer struct {
300304
Name string
305+
BaseImage string
301306
AdditionalArguments []string
302307
}
303308

@@ -308,7 +313,7 @@ func (o *outerContainer) Render() (string, error) {
308313
{{ range $i, $a := .AdditionalArguments -}}
309314
{{ $a }} \
310315
{{ end -}}
311-
ubuntu sleep infinity`)
316+
{{.BaseImage}} sleep infinity`)
312317

313318
if err != nil {
314319
return "", err

0 commit comments

Comments
 (0)