Skip to content

Commit 0a77548

Browse files
ArangoGutierrez and elezar
authored and committed
[no-relnote] implement a nestedContainerRunner for E2E test suite
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 5ac402e commit 0a77548

File tree

2 files changed

+147
-119
lines changed

2 files changed

+147
-119
lines changed

tests/e2e/nvidia-container-cli_test.go

Lines changed: 22 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -20,58 +20,12 @@ import (
2020
"context"
2121
"fmt"
2222
"strings"
23-
"text/template"
2423

2524
. "github.com/onsi/ginkgo/v2"
2625
. "github.com/onsi/gomega"
2726
)
2827

2928
const (
30-
installDockerTemplate = `
31-
export DEBIAN_FRONTEND=noninteractive
32-
33-
# Add Docker official GPG key:
34-
apt-get update
35-
apt-get install -y ca-certificates curl apt-utils gnupg2
36-
install -m 0755 -d /etc/apt/keyrings
37-
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
38-
chmod a+r /etc/apt/keyrings/docker.asc
39-
40-
# Add the repository to Apt sources:
41-
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
42-
apt-get update
43-
44-
apt-get install -y docker-ce docker-ce-cli containerd.io
45-
46-
# start dockerd in the background
47-
dockerd &
48-
49-
# wait for dockerd to be ready with timeout
50-
timeout=30
51-
elapsed=0
52-
while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
53-
echo "Waiting for dockerd to be ready..."
54-
sleep 1
55-
elapsed=$((elapsed + 1))
56-
done
57-
if [ $elapsed -ge $timeout ]; then
58-
echo "Docker failed to start within $timeout seconds"
59-
exit 1
60-
fi
61-
`
62-
installCTKTemplate = `
63-
# Create a temporary directory and rootfs path
64-
TMPDIR="$(mktemp -d)"
65-
66-
# Expose TMPDIR for the child namespace
67-
export TMPDIR
68-
69-
docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
70-
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
71-
72-
nvidia-container-cli --version
73-
`
74-
7529
libnvidiaContainerCliTestTemplate = `
7630
# Create a temporary directory and rootfs path
7731
TMPDIR="$(mktemp -d)"
@@ -113,22 +67,14 @@ unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
11367
11468
IN_NS
11569
`
116-
117-
startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
118-
-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
119-
-e NVIDIA_DRIVER_CAPABILITIES=all \
120-
{{ range $i, $a := .AdditionalArguments -}}
121-
{{ $a }} \
122-
{{ end -}}
123-
ubuntu sleep infinity`
12470
)
12571

12672
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() {
12773
var (
128-
runner Runner
129-
containerName = "node-container-e2e"
130-
hostOutput string
131-
additionalContainerArguments []string
74+
runner Runner
75+
nestedContainerRunner Runner
76+
containerName = "node-container-e2e"
77+
hostOutput string
13278
)
13379

13480
BeforeAll(func(ctx context.Context) {
@@ -139,44 +85,40 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
13985
WithSshUser(sshUser),
14086
)
14187

88+
// If installCTK is true, install the toolkit on the host. before creating
89+
// the nested container.
14290
if installCTK {
14391
installer, err := NewToolkitInstaller(
144-
WithRunner(runner),
145-
WithImage(imageName+":"+imageTag),
146-
WithTemplate(dockerInstallTemplate),
92+
WithImage(nvidiaContainerToolkitImage),
93+
WithMode(InstallUsingNVIDIACTKInstaller),
14794
)
14895
Expect(err).ToNot(HaveOccurred())
14996

150-
err = installer.Install()
151-
Expect(err).ToNot(HaveOccurred())
152-
} else {
153-
// If installCTK is false, we use the preinstalled toolkit.
154-
// TODO: This should be updated for other distributions and other components of the toolkit.
155-
output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
97+
_, _, err = installer.Install(runner)
15698
Expect(err).ToNot(HaveOccurred())
99+
}
157100

158-
output = strings.TrimSpace(output)
159-
Expect(output).ToNot(BeEmpty())
101+
var err error
102+
nestedContainerRunner, err = NewNestedContainerRunner(runner, installCTK, containerName)
103+
Expect(err).ToNot(HaveOccurred())
160104

161-
for _, lib := range strings.Split(output, "\n") {
162-
additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
163-
}
164-
additionalContainerArguments = append(additionalContainerArguments,
165-
"-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli",
105+
// We also need to install the toolkit in the nested runner.
106+
if installCTK {
107+
installer, err := NewToolkitInstaller(
108+
WithImage(nvidiaContainerToolkitImage),
109+
WithMode(InstallUsingPackagingImage),
166110
)
111+
Expect(err).ToNot(HaveOccurred())
112+
_, _, err = installer.Install(nestedContainerRunner)
113+
Expect(err).ToNot(HaveOccurred())
167114
}
168115

169116
// Capture the host GPU list.
170-
var err error
171117
hostOutput, _, err = runner.Run("nvidia-smi -L")
172118
Expect(err).ToNot(HaveOccurred())
173119

174120
// Normalize the output once
175121
hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))
176-
177-
// If a container with the same name exists from a previous test run, remove it first.
178-
// Ignore errors as container might not exist
179-
runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
180122
})
181123

182124
AfterAll(func(ctx context.Context) {
@@ -186,47 +128,8 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
186128
})
187129

188130
It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
189-
// Launch the container in detached mode.
190-
var startContainerScriptBuilder strings.Builder
191-
startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
192-
Expect(err).ToNot(HaveOccurred())
193-
err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
194-
ContainerName string
195-
AdditionalArguments []string
196-
}{
197-
ContainerName: containerName,
198-
AdditionalArguments: additionalContainerArguments,
199-
})
200-
Expect(err).ToNot(HaveOccurred())
201-
202-
startContainerScript := startContainerScriptBuilder.String()
203-
GinkgoLogr.Info("Starting test container", "script", startContainerScript)
204-
_, _, err = runner.Run(startContainerScript)
205-
Expect(err).ToNot(HaveOccurred())
206-
207-
// Install docker in the container.
208-
_, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate))
209-
Expect(err).ToNot(HaveOccurred())
210-
211-
if installCTK {
212-
// Install nvidia-container-cli in the container.
213-
tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate)
214-
Expect(err).ToNot(HaveOccurred())
215-
216-
var toolkitInstall strings.Builder
217-
err = tmpl.Execute(&toolkitInstall, struct {
218-
ToolkitImage string
219-
}{
220-
ToolkitImage: imageName + ":" + imageTag,
221-
})
222-
Expect(err).ToNot(HaveOccurred())
223-
224-
_, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String()))
225-
Expect(err).ToNot(HaveOccurred())
226-
}
227-
228131
// Run the test script in the container.
229-
output, _, err := runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", libnvidiaContainerCliTestTemplate))
132+
output, _, err := nestedContainerRunner.Run(libnvidiaContainerCliTestTemplate)
230133
Expect(err).ToNot(HaveOccurred())
231134
Expect(strings.TrimSpace(output)).ToNot(BeEmpty())
232135
Expect(hostOutput).To(ContainSubstring(strings.TrimSpace(output)))

tests/e2e/runner.go

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,56 @@ import (
2121
"fmt"
2222
"os"
2323
"os/exec"
24+
"strings"
25+
"text/template"
2426
"time"
2527

2628
"golang.org/x/crypto/ssh"
2729
)
2830

31+
const (
	// startTestContainerTemplate starts a privileged, detached container using
	// the nvidia runtime. AdditionalArguments lets callers inject extra
	// `docker run` flags (e.g. bind-mounts of preinstalled toolkit libraries).
	startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
-e NVIDIA_DRIVER_CAPABILITIES=all \
{{ range $i, $a := .AdditionalArguments -}}
{{ $a }} \
{{ end -}}
ubuntu sleep infinity`

	// installDockerTemplate is a shell script that installs docker-ce from
	// Docker's apt repository inside an Ubuntu container, starts dockerd in
	// the background, and waits up to 30 seconds for it to become ready.
	// It exits non-zero if dockerd does not come up within the timeout.
	installDockerTemplate = `
export DEBIAN_FRONTEND=noninteractive

# Add Docker official GPG key:
apt-get update
apt-get install -y ca-certificates curl apt-utils gnupg2
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update

apt-get install -y docker-ce docker-ce-cli containerd.io

# start dockerd in the background
dockerd &

# wait for dockerd to be ready with timeout
timeout=30
elapsed=0
while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
echo "Waiting for dockerd to be ready..."
sleep 1
elapsed=$((elapsed + 1))
done
if [ $elapsed -ge $timeout ]; then
echo "Docker failed to start within $timeout seconds"
exit 1
fi
`
)
73+
2974
type localRunner struct{}
3075
type remoteRunner struct {
3176
sshKey string
@@ -34,6 +79,11 @@ type remoteRunner struct {
3479
port string
3580
}
3681

82+
// nestedContainerRunner is a Runner that executes scripts inside a named
// container which itself runs on top of an underlying (local or remote)
// Runner. Scripts are delivered via `docker exec` on the outer runner.
type nestedContainerRunner struct {
	runner        Runner // outer runner used to reach the docker host
	containerName string // container targeted by `docker exec`
}
86+
3787
type runnerOption func(*remoteRunner)
3888

3989
type Runner interface {
@@ -79,6 +129,77 @@ func NewRunner(opts ...runnerOption) Runner {
79129
return r
80130
}
81131

132+
// NewNestedContainerRunner creates a new nested container runner.
133+
// A nested container runs a container inside another container based on a
134+
// given runner (remote or local).
135+
func NewNestedContainerRunner(runner Runner, installCTK bool, containerName string) (Runner, error) {
136+
additionalContainerArguments := []string{}
137+
138+
// If a container with the same name exists from a previous test run, remove it first.
139+
// Ignore errors as container might not exist
140+
_, _, err := runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck
141+
if err != nil {
142+
return nil, fmt.Errorf("failed to remove container: %w", err)
143+
}
144+
145+
if !installCTK {
146+
// If installCTK is false, we use the preinstalled toolkit.
147+
// This means we need to add toolkit libraries and binaries from the "host"
148+
149+
// TODO: This should be updated for other distributions and other components of the toolkit.
150+
output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
151+
if err != nil {
152+
return nil, fmt.Errorf("failed to list toolkit libraries: %w", err)
153+
}
154+
155+
output = strings.TrimSpace(output)
156+
if output == "" {
157+
return nil, fmt.Errorf("no toolkit libraries found") //nolint:goerr113
158+
}
159+
160+
for _, lib := range strings.Split(output, "\n") {
161+
additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
162+
}
163+
additionalContainerArguments = append(additionalContainerArguments, "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli")
164+
}
165+
166+
// Launch the container in detached mode.
167+
var startContainerScriptBuilder strings.Builder
168+
startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate)
169+
if err != nil {
170+
return nil, fmt.Errorf("failed to parse start container template: %w", err)
171+
}
172+
err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct {
173+
ContainerName string
174+
AdditionalArguments []string
175+
}{
176+
ContainerName: containerName,
177+
AdditionalArguments: additionalContainerArguments,
178+
})
179+
if err != nil {
180+
return nil, fmt.Errorf("failed to execute start container template: %w", err)
181+
}
182+
183+
startContainerScript := startContainerScriptBuilder.String()
184+
_, _, err = runner.Run(startContainerScript)
185+
if err != nil {
186+
return nil, fmt.Errorf("failed to run start container script: %w", err)
187+
}
188+
189+
// install docker in the nested container
190+
_, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate))
191+
if err != nil {
192+
return nil, fmt.Errorf("failed to install docker: %w", err)
193+
}
194+
195+
nc := &nestedContainerRunner{
196+
runner: runner,
197+
containerName: containerName,
198+
}
199+
200+
return nc, nil
201+
}
202+
82203
func (l localRunner) Run(script string) (string, string, error) {
83204
// Create a command to run the script using bash
84205
cmd := exec.Command("bash", "-c", script)
@@ -131,6 +252,10 @@ func (r remoteRunner) Run(script string) (string, string, error) {
131252
return stdout.String(), "", nil
132253
}
133254

255+
func (r nestedContainerRunner) Run(script string) (string, string, error) {
256+
return r.runner.Run(fmt.Sprintf("docker exec -u root "+r.containerName+" bash -c '%s'", script))
257+
}
258+
134259
// createSshClient creates a ssh client, and retries if it fails to connect
135260
func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
136261
var client *ssh.Client

0 commit comments

Comments
 (0)