Skip to content

Commit 4507575

Browse files
authored
Merge pull request #1118 from ArangoGutierrez/e2e/nvidia-container-cli
[no-relnote] Add E2E for libnvidia-container
2 parents fd4ca8f + 718fe70 commit 4507575

File tree

4 files changed

+254
-9
lines changed

4 files changed

+254
-9
lines changed

tests/e2e/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
2020

2121
GINKGO_BIN := $(CURDIR)/bin/ginkgo
2222

23+
# If GINKGO_FOCUS is not set, run all tests
24+
# currently available tests:
25+
# - nvidia-container-cli
26+
# - docker
27+
GINKGO_FOCUS ?=
28+
2329
test: $(GINKGO_BIN)
24-
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
30+
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...
2531

2632
# test-preinstalled runs the test cases against the version of the toolkit that
2733
# is already installed (and configured for docker) on the host.

tests/e2e/e2e_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,7 @@ func getTestEnv() {
6464

6565
if installCTK {
6666
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
67-
6867
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
69-
7068
}
7169

7270
sshHost = getEnvVarOrDefault("E2E_SSH_HOST", "")

tests/e2e/installer.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,18 @@ var dockerInstallTemplate = `
2828
#! /usr/bin/env bash
2929
set -xe
3030
31-
: ${IMAGE:={{.Image}}}
32-
33-
# Create a temporary directory
34-
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
35-
mkdir -p "$TEMP_DIR"
31+
# if the TEMP_DIR is already set, use it
32+
if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
33+
TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
34+
else
35+
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
36+
echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
37+
fi
38+
39+
# if TEMP_DIR does not exist, create it
40+
if [ ! -d "$TEMP_DIR" ]; then
41+
mkdir -p "$TEMP_DIR"
42+
fi
3643
3744
# Given that docker has an init function that checks for the existence of the
3845
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
@@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \
4653
-v /var/run/docker.sock:/var/run/docker.sock \
4754
-v "$TEMP_DIR:$TEMP_DIR" \
4855
-v /etc/docker:/config-root \
49-
${IMAGE} \
56+
{{.Image}} \
5057
--root "$TEMP_DIR" \
5158
--runtime=docker \
5259
--config=/config-root/daemon.json \
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package e2e
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"strings"
23+
"text/template"
24+
25+
. "github.com/onsi/ginkgo/v2"
26+
. "github.com/onsi/gomega"
27+
)
28+
29+
const (
30+
// installDockerTemplate is a shell script that installs docker-ce from
// Docker's Ubuntu apt repository, starts dockerd in the background and waits
// up to 30 seconds for `docker info` to succeed. The test passes it to
// `bash -c '...'` inside the test container, so it must not contain
// unescaped single quotes.
installDockerTemplate = `
31+
export DEBIAN_FRONTEND=noninteractive
32+
33+
# Add Docker official GPG key:
34+
apt-get update
35+
apt-get install -y ca-certificates curl apt-utils gnupg2
36+
install -m 0755 -d /etc/apt/keyrings
37+
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
38+
chmod a+r /etc/apt/keyrings/docker.asc
39+
40+
# Add the repository to Apt sources:
41+
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
42+
apt-get update
43+
44+
apt-get install -y docker-ce docker-ce-cli containerd.io
45+
46+
# start dockerd in the background
47+
dockerd &
48+
49+
# wait for dockerd to be ready with timeout
50+
timeout=30
51+
elapsed=0
52+
while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
53+
echo "Waiting for dockerd to be ready..."
54+
sleep 1
55+
elapsed=$((elapsed + 1))
56+
done
57+
if [ $elapsed -ge $timeout ]; then
58+
echo "Docker failed to start within $timeout seconds"
59+
exit 1
60+
fi
61+
`
62+
// installCTKTemplate is a text/template (field: .ToolkitImage) producing a
// shell script that copies /artifacts out of the "<ToolkitImage>-packaging"
// image into a temporary directory, installs the libnvidia-container1,
// nvidia-container-toolkit-base and libnvidia-container-tools .deb packages
// from packages/ubuntu18.04/amd64, and prints the nvidia-container-cli
// version.
// NOTE(review): the distribution and architecture are hard-coded in the
// package paths.
installCTKTemplate = `
63+
# Create a temporary directory and rootfs path
64+
TMPDIR="$(mktemp -d)"
65+
66+
# Expose TMPDIR for the child namespace
67+
export TMPDIR
68+
69+
docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
70+
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
71+
72+
nvidia-container-cli --version
73+
`
74+
75+
// libnvidiaContainerCliTestTemplate is a shell script that unpacks an Ubuntu
// 22.04 base rootfs into a temporary directory, enters a new mount + PID
// namespace, mounts proc/sysfs/tmpfs inside the rootfs, runs
// `nvidia-container-cli configure` against it for device 0, pivots into the
// rootfs and lists the GPUs with `nvidia-smi -L`. Single quotes are written
// as '\'' because the test wraps the whole script in `bash -c '...'`.
libnvidiaContainerCliTestTemplate = `
76+
# Create a temporary directory and rootfs path
77+
TMPDIR="$(mktemp -d)"
78+
ROOTFS="${TMPDIR}/rootfs"
79+
mkdir -p "${ROOTFS}"
80+
81+
# Expose ROOTFS for the child namespace
82+
export ROOTFS TMPDIR
83+
84+
# Download Ubuntu base image with error handling
85+
curl -fsSL http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz | tar -C $ROOTFS -xz || {
86+
echo "Failed to download or extract Ubuntu base image"
87+
exit 1
88+
}
89+
90+
# Enter a new mount + PID namespace so we can pivot_root without touching the
91+
# container'\''s original filesystem.
92+
unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
93+
: "${ROOTFS:?}"
94+
95+
# 1 Bind-mount the new root and make the mount private
96+
mount --bind "$ROOTFS" "$ROOTFS"
97+
mount --make-private "$ROOTFS"
98+
cd "$ROOTFS"
99+
100+
# 2 Minimal virtual filesystems
101+
mount -t proc proc proc
102+
mount -t sysfs sys sys
103+
mount -t tmpfs tmp tmp
104+
mount -t tmpfs run run
105+
106+
# 3 Configure NVIDIA devices
107+
nvidia-container-cli --load-kmods configure --ldconfig=@/sbin/ldconfig.real --no-cgroups --utility --device 0 $(pwd)
108+
109+
# 4 Switch root into the prepared filesystem
110+
pivot_root . mnt
111+
umount -l mnt
112+
nvidia-smi -L
113+
114+
IN_NS
115+
`
116+
117+
// startTestContainerTemplate is a text/template (fields: .ContainerName,
// .AdditionalArguments) producing the `docker run` command that starts the
// detached, privileged test container with the nvidia runtime. Each entry of
// .AdditionalArguments is emitted on its own continuation line before the
// image name.
startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
118+
-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
119+
-e NVIDIA_DRIVER_CAPABILITIES=all \
120+
{{ range $i, $a := .AdditionalArguments -}}
121+
{{ $a }} \
122+
{{ end -}}
123+
ubuntu sleep infinity`
124+
)
125+
126+
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
127+
var (
128+
runner Runner
129+
containerName = "node-container-e2e"
130+
hostOutput string
131+
additionalContainerArguments []string
132+
)
133+
134+
BeforeAll(func(ctx context.Context) {
135+
runner = NewRunner(
136+
WithHost(sshHost),
137+
WithPort(sshPort),
138+
WithSshKey(sshKey),
139+
WithSshUser(sshUser),
140+
)
141+
142+
if installCTK {
143+
installer, err := NewToolkitInstaller(
144+
WithRunner(runner),
145+
WithImage(imageName+":"+imageTag),
146+
WithTemplate(dockerInstallTemplate),
147+
)
148+
Expect(err).ToNot(HaveOccurred())
149+
150+
err = installer.Install()
151+
Expect(err).ToNot(HaveOccurred())
152+
} else {
153+
// If installCTK is false, we use the preinstalled toolkit.
154+
// TODO: This should be updated for other distributions and other components of the toolkit.
155+
output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
156+
Expect(err).ToNot(HaveOccurred())
157+
158+
output = strings.TrimSpace(output)
159+
Expect(output).ToNot(BeEmpty())
160+
161+
for _, lib := range strings.Split(output, "\n") {
162+
additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
163+
}
164+
additionalContainerArguments = append(additionalContainerArguments,
165+
"-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli",
166+
)
167+
}
168+
169+
// Capture the host GPU list.
170+
var err error
171+
hostOutput, _, err = runner.Run("nvidia-smi -L")
172+
Expect(err).ToNot(HaveOccurred())
173+
174+
// Normalize the output once
175+
hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))
176+
177+
// If a container with the same name exists from a previous test run, remove it first.
178+
// Ignore errors as container might not exist
179+
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
180+
})
181+
182+
AfterAll(func(ctx context.Context) {
183+
// Cleanup: remove the container and the temporary script on the host.
184+
// Use || true to ensure cleanup doesn't fail the test
185+
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
186+
})
187+
188+
It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
189+
// Launch the container in detached mode.
190+
var startContainerScriptBuilder strings.Builder
191+
startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
192+
Expect(err).ToNot(HaveOccurred())
193+
err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
194+
ContainerName string
195+
AdditionalArguments []string
196+
}{
197+
ContainerName: containerName,
198+
AdditionalArguments: additionalContainerArguments,
199+
})
200+
Expect(err).ToNot(HaveOccurred())
201+
202+
startContainerScript := startContainerScriptBuilder.String()
203+
GinkgoLogr.Info("Starting test container", "script", startContainerScript)
204+
_, _, err = runner.Run(startContainerScript)
205+
Expect(err).ToNot(HaveOccurred())
206+
207+
// Install docker in the container.
208+
_, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate))
209+
Expect(err).ToNot(HaveOccurred())
210+
211+
if installCTK {
212+
// Install nvidia-container-cli in the container.
213+
tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate)
214+
Expect(err).ToNot(HaveOccurred())
215+
216+
var toolkitInstall strings.Builder
217+
err = tmpl.Execute(&toolkitInstall, struct {
218+
ToolkitImage string
219+
}{
220+
ToolkitImage: imageName + ":" + imageTag,
221+
})
222+
Expect(err).ToNot(HaveOccurred())
223+
224+
_, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String()))
225+
Expect(err).ToNot(HaveOccurred())
226+
}
227+
228+
// Run the test script in the container.
229+
output, _, err := runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", libnvidiaContainerCliTestTemplate))
230+
Expect(err).ToNot(HaveOccurred())
231+
Expect(strings.TrimSpace(output)).ToNot(BeEmpty())
232+
Expect(hostOutput).To(ContainSubstring(strings.TrimSpace(output)))
233+
})
234+
})

0 commit comments

Comments
 (0)