Skip to content

Commit b4cd062

Browse files
[no-relnote] Add E2E for libnvidia-container
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 0920018 commit b4cd062

File tree

4 files changed

+242
-13
lines changed

4 files changed

+242
-13
lines changed

tests/e2e/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
2020

2121
GINKGO_BIN := $(CURDIR)/bin/ginkgo
2222

23+
# If GINKGO_FOCUS is not set, run all tests
24+
# currently available tests:
25+
# - nvidia-container-cli
26+
# - docker
27+
GINKGO_FOCUS ?=
28+
2329
test: $(GINKGO_BIN)
24-
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
30+
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...
2531

2632
$(GINKGO_BIN):
2733
mkdir -p $(CURDIR)/bin

tests/e2e/e2e_test.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,8 @@ func getTestEnv() {
6262

6363
installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false)
6464

65-
if installCTK {
66-
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
67-
68-
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
69-
70-
}
65+
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
66+
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
7167

7268
sshKey = getRequiredEnvvar[string]("E2E_SSH_KEY")
7369
sshUser = getRequiredEnvvar[string]("E2E_SSH_USER")

tests/e2e/installer.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,18 @@ var dockerInstallTemplate = `
2828
#! /usr/bin/env bash
2929
set -xe
3030
31-
: ${IMAGE:={{.Image}}}
32-
33-
# Create a temporary directory
34-
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
35-
mkdir -p "$TEMP_DIR"
31+
# if the TEMP_DIR is already set, use it
32+
if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
33+
TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
34+
else
35+
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
36+
echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
37+
fi
38+
39+
# if TEMP_DIR does not exist, create it
40+
if [ ! -d "$TEMP_DIR" ]; then
41+
mkdir -p "$TEMP_DIR"
42+
fi
3643
3744
# Given that docker has an init function that checks for the existence of the
3845
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
@@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \
4653
-v /var/run/docker.sock:/var/run/docker.sock \
4754
-v "$TEMP_DIR:$TEMP_DIR" \
4855
-v /etc/docker:/config-root \
49-
${IMAGE} \
56+
{{.Image}} \
5057
--root "$TEMP_DIR" \
5158
--runtime=docker \
5259
--config=/config-root/daemon.json \
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package e2e
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"strings"
23+
"text/template"
24+
25+
. "github.com/onsi/ginkgo/v2"
26+
. "github.com/onsi/gomega"
27+
)
28+
29+
const (
	// libnvidiaContainerCliTestTemplate is the bash script that is bind-mounted
	// into the outer test container and executed as its command. It:
	//
	//  1. installs Docker inside the container and starts dockerd,
	//  2. copies the packages from the "<NVIDIA_TOOLKIT_IMAGE>-packaging" image
	//     into a temporary directory and installs them with dpkg,
	//  3. unpacks an Ubuntu base rootfs,
	//  4. enters a new mount + PID namespace, runs `nvidia-container-cli configure`
	//     against that rootfs, pivots into it and execs `nvidia-smi -L`.
	//
	// The script reads NVIDIA_TOOLKIT_IMAGE from the environment; with `set -u`
	// it aborts if the variable is unset.
	libnvidiaContainerCliTestTemplate = `#!/bin/bash
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive

# Add Docker's official GPG key:
apt-get update
apt-get install -y ca-certificates curl apt-utils gnupg2
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
  tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update

apt-get install -y docker-ce docker-ce-cli containerd.io

# start dockerd in the background
dockerd &

# wait for dockerd to be ready
while ! docker info > /dev/null 2>&1; do
    echo "Waiting for dockerd to be ready..."
    sleep 1
done

# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS

echo "Copying package files from ${NVIDIA_TOOLKIT_IMAGE} to ${ROOTFS}"
# Bind-mount the temporary directory itself: the copy destination must be
# visible outside the packaging container, otherwise the packages are written
# to the container's own filesystem and discarded together with it.
docker run --rm \
    -v "${TMPDIR}:${TMPDIR}" \
    -w "${TMPDIR}" \
    -u "$(id -u):$(id -g)" \
    --entrypoint="sh" \
    "${NVIDIA_TOOLKIT_IMAGE}-packaging" \
    -c "cp -p -R /artifacts/* ${TMPDIR}"

dpkg -i "${TMPDIR}"/amd64/libnvidia-container1_*_amd64.deb \
    "${TMPDIR}"/amd64/nvidia-container-toolkit-base_*_amd64.deb \
    "${TMPDIR}"/amd64/libnvidia-container-tools_*_amd64.deb

nvidia-container-cli --version

# Fetch the base rootfs over HTTPS; -f makes curl fail on HTTP errors instead
# of piping an error page into tar.
curl -fsSL https://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz | tar -C "${ROOTFS}" -xz

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container's original filesystem.
unshare --mount --pid --fork --propagation private -- sh -eux <<'IN_NS'
: "${ROOTFS:?}"

# 1 Bind-mount the new root and make the mount private
mount --bind "$ROOTFS" "$ROOTFS"
mount --make-private "$ROOTFS"
cd "$ROOTFS"

# 2 Minimal virtual filesystems
mount -t proc  proc  proc
mount -t sysfs sys   sys
mount -t tmpfs tmp   tmp
mount -t tmpfs run   run

# 3 GPU setup via nvidia-container-cli
# Add potential install locations of nvidia-container-cli to PATH.
# /artifacts/{rpm,deb}/usr/bin are where the binary ends up after the
# packages are extracted in the application image. /work is included for
# completeness since some images may copy the binary there.
export PATH="${PATH}:/artifacts/rpm/usr/bin:/artifacts/deb/usr/bin:/work"

nvidia-container-cli --load-kmods \
    configure \
    --no-cgroups --utility --device=0 "$(pwd)"

# 4 Switch root into the prepared filesystem
mkdir -p mnt
pivot_root . mnt
umount -l /mnt

exec nvidia-smi -L
IN_NS
`

	// dockerRunCmdTemplate is a text/template for the `docker run` invocation
	// that starts the outer test container. It expects the fields
	// ContainerName, ToolkitImage and ScriptPath. The script at ScriptPath is
	// mounted as /libnvidia-container-cli.sh and used as the container command.
	// Note that the command carries no -d flag, so `docker run` stays attached
	// to the container until the script exits.
	dockerRunCmdTemplate = `docker run --name {{.ContainerName}} --privileged --runtime=nvidia \
    -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
    -e NVIDIA_DRIVER_CAPABILITIES=all \
    -e NVIDIA_TOOLKIT_IMAGE={{.ToolkitImage}} \
    -v {{.ScriptPath}}:/libnvidia-container-cli.sh \
    ubuntu /libnvidia-container-cli.sh`
)
127+
128+
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() {
129+
var (
130+
runner Runner
131+
containerName = "nvidia-cli-e2e"
132+
hostOutput string
133+
testScriptPath = "/tmp/libnvidia-container-cli.sh"
134+
)
135+
136+
BeforeAll(func(ctx context.Context) {
137+
runner = NewRunner(
138+
WithHost(sshHost),
139+
WithPort(sshPort),
140+
WithSshKey(sshKey),
141+
WithSshUser(sshUser),
142+
)
143+
144+
if installCTK {
145+
installer, err := NewToolkitInstaller(
146+
WithRunner(runner),
147+
WithImage(imageName+":"+imageTag),
148+
WithTemplate(dockerInstallTemplate),
149+
)
150+
Expect(err).ToNot(HaveOccurred())
151+
152+
err = installer.Install()
153+
Expect(err).ToNot(HaveOccurred())
154+
}
155+
156+
// Capture the host GPU list.
157+
var err error
158+
hostOutput, _, err = runner.Run("nvidia-smi -L")
159+
Expect(err).ToNot(HaveOccurred())
160+
161+
// Normalize the output once
162+
hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))
163+
164+
// If a container with the same name exists from a previous test run, remove it first.
165+
_, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
166+
Expect(err).ToNot(HaveOccurred())
167+
})
168+
169+
AfterAll(func(ctx context.Context) {
170+
// Cleanup: remove the container and the temporary script on the host.
171+
runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
172+
173+
// Remove the script from the remote host.
174+
runner.Run(fmt.Sprintf("rm -f %s", testScriptPath))
175+
})
176+
177+
It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
178+
// Write the script to the remote host and make it executable.
179+
createScriptCmd := fmt.Sprintf(
180+
"cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s",
181+
testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath,
182+
)
183+
184+
_, _, err := runner.Run(createScriptCmd)
185+
Expect(err).ToNot(HaveOccurred())
186+
187+
// Build the docker run command (detached mode) from the template so it
188+
// stays readable while still resulting in a single-line invocation.
189+
tmpl, err := template.New("dockerRun").Parse(dockerRunCmdTemplate)
190+
Expect(err).ToNot(HaveOccurred())
191+
192+
var dockerRunCmd strings.Builder
193+
err = tmpl.Execute(&dockerRunCmd, struct {
194+
ContainerName string
195+
ToolkitImage string
196+
ScriptPath string
197+
}{
198+
ContainerName: containerName,
199+
ToolkitImage: imageName + ":" + imageTag,
200+
ScriptPath: testScriptPath,
201+
})
202+
Expect(err).ToNot(HaveOccurred())
203+
204+
// Launch the container in detached mode.
205+
_, _, err = runner.Run(dockerRunCmd.String())
206+
Expect(err).ToNot(HaveOccurred())
207+
208+
// Poll the logs of the already running container until we observe
209+
// the GPU list matching the host or until a 5-minute timeout elapses.
210+
Eventually(func() string {
211+
logs, _, err := runner.Run(fmt.Sprintf("docker logs --tail 1 %s", containerName))
212+
if err != nil {
213+
return fmt.Sprintf("error: %v", err)
214+
}
215+
216+
// Docker already returns only the last line due to --tail 1, so just normalize.
217+
return strings.TrimSpace(strings.ReplaceAll(logs, "\r", ""))
218+
}, "5m", "10s").Should(Equal(hostOutput))
219+
})
220+
})

0 commit comments

Comments
 (0)