@@ -20,58 +20,12 @@ import (
2020 "context"
2121 "fmt"
2222 "strings"
23- "text/template"
2423
2524 . "github.com/onsi/ginkgo/v2"
2625 . "github.com/onsi/gomega"
2726)
2827
2928const (
30- installDockerTemplate = `
31- export DEBIAN_FRONTEND=noninteractive
32-
33- # Add Docker official GPG key:
34- apt-get update
35- apt-get install -y ca-certificates curl apt-utils gnupg2
36- install -m 0755 -d /etc/apt/keyrings
37- curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
38- chmod a+r /etc/apt/keyrings/docker.asc
39-
40- # Add the repository to Apt sources:
41- echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
42- apt-get update
43-
44- apt-get install -y docker-ce docker-ce-cli containerd.io
45-
46- # start dockerd in the background
47- dockerd &
48-
49- # wait for dockerd to be ready with timeout
50- timeout=30
51- elapsed=0
52- while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
53- echo "Waiting for dockerd to be ready..."
54- sleep 1
55- elapsed=$((elapsed + 1))
56- done
57- if [ $elapsed -ge $timeout ]; then
58- echo "Docker failed to start within $timeout seconds"
59- exit 1
60- fi
61- `
62- installCTKTemplate = `
63- # Create a temporary directory and rootfs path
64- TMPDIR="$(mktemp -d)"
65-
66- # Expose TMPDIR for the child namespace
67- export TMPDIR
68-
69- docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
70- dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
71-
72- nvidia-container-cli --version
73- `
74-
7529 libnvidiaContainerCliTestTemplate = `
7630# Create a temporary directory and rootfs path
7731TMPDIR="$(mktemp -d)"
@@ -113,22 +67,14 @@ unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
11367
11468IN_NS
11569`
116-
117- startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
118- -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
119- -e NVIDIA_DRIVER_CAPABILITIES=all \
120- {{ range $i, $a := .AdditionalArguments -}}
121- {{ $a }} \
122- {{ end -}}
123- ubuntu sleep infinity`
12470)
12571
12672var _ = Describe ("nvidia-container-cli" , Ordered , ContinueOnFailure , Label ("libnvidia-container" ), func () {
12773 var (
128- runner Runner
129- containerName = "node-container-e2e"
130- hostOutput string
131- additionalContainerArguments [] string
74+ runner Runner
75+ nestedContainerRunner Runner
76+ containerName = "node-container-e2e"
77+ hostOutput string
13278 )
13379
13480 BeforeAll (func (ctx context.Context ) {
@@ -139,44 +85,40 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
13985 WithSshUser (sshUser ),
14086 )
14187
88+ // If installCTK is true, install the toolkit on the host before creating
89+ // the nested container.
14290 if installCTK {
14391 installer , err := NewToolkitInstaller (
144- WithRunner (runner ),
145- WithImage (imageName + ":" + imageTag ),
146- WithTemplate (dockerInstallTemplate ),
92+ WithImage (nvidiaContainerToolkitImage ),
93+ WithMode (InstallUsingNVIDIACTKInstaller ),
14794 )
14895 Expect (err ).ToNot (HaveOccurred ())
14996
150- err = installer .Install ()
151- Expect (err ).ToNot (HaveOccurred ())
152- } else {
153- // If installCTK is false, we use the preinstalled toolkit.
154- // TODO: This should be updated for other distributions and other components of the toolkit.
155- output , _ , err := runner .Run ("ls /lib/**/libnvidia-container*.so.*.*" )
97+ _ , _ , err = installer .Install (runner )
15698 Expect (err ).ToNot (HaveOccurred ())
99+ }
157100
158- output = strings .TrimSpace (output )
159- Expect (output ).ToNot (BeEmpty ())
101+ var err error
102+ nestedContainerRunner , err = NewNestedContainerRunner (runner , installCTK , containerName )
103+ Expect (err ).ToNot (HaveOccurred ())
160104
161- for _ , lib := range strings . Split ( output , " \n " ) {
162- additionalContainerArguments = append ( additionalContainerArguments , "-v " + lib + ":" + lib )
163- }
164- additionalContainerArguments = append ( additionalContainerArguments ,
165- "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli" ,
105+ // We also need to install the toolkit in the nested runner.
106+ if installCTK {
107+ installer , err := NewToolkitInstaller (
108+ WithImage ( nvidiaContainerToolkitImage ) ,
109+ WithMode ( InstallUsingPackagingImage ) ,
166110 )
111+ Expect (err ).ToNot (HaveOccurred ())
112+ _ , _ , err = installer .Install (nestedContainerRunner )
113+ Expect (err ).ToNot (HaveOccurred ())
167114 }
168115
169116 // Capture the host GPU list.
170- var err error
171117 hostOutput , _ , err = runner .Run ("nvidia-smi -L" )
172118 Expect (err ).ToNot (HaveOccurred ())
173119
174120 // Normalize the output once
175121 hostOutput = strings .TrimSpace (strings .ReplaceAll (hostOutput , "\r " , "" ))
176-
177- // If a container with the same name exists from a previous test run, remove it first.
178- // Ignore errors as container might not exist
179- runner .Run (fmt .Sprintf ("docker rm -f %s 2>/dev/null || true" , containerName )) //nolint:errcheck
180122 })
181123
182124 AfterAll (func (ctx context.Context ) {
@@ -186,47 +128,8 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn
186128 })
187129
188130 It ("should report the same GPUs inside the container as on the host" , func (ctx context.Context ) {
189- // Launch the container in detached mode.
190- var startContainerScriptBuilder strings.Builder
191- startContainerTemplate , err := template .New ("startContainer" ).Parse (startTestContainerTemplate )
192- Expect (err ).ToNot (HaveOccurred ())
193- err = startContainerTemplate .Execute (& startContainerScriptBuilder , struct {
194- ContainerName string
195- AdditionalArguments []string
196- }{
197- ContainerName : containerName ,
198- AdditionalArguments : additionalContainerArguments ,
199- })
200- Expect (err ).ToNot (HaveOccurred ())
201-
202- startContainerScript := startContainerScriptBuilder .String ()
203- GinkgoLogr .Info ("Starting test container" , "script" , startContainerScript )
204- _ , _ , err = runner .Run (startContainerScript )
205- Expect (err ).ToNot (HaveOccurred ())
206-
207- // Install docker in the container.
208- _ , _ , err = runner .Run (fmt .Sprintf ("docker exec -u root " + containerName + " bash -c '%s'" , installDockerTemplate ))
209- Expect (err ).ToNot (HaveOccurred ())
210-
211- if installCTK {
212- // Install nvidia-container-cli in the container.
213- tmpl , err := template .New ("toolkitInstall" ).Parse (installCTKTemplate )
214- Expect (err ).ToNot (HaveOccurred ())
215-
216- var toolkitInstall strings.Builder
217- err = tmpl .Execute (& toolkitInstall , struct {
218- ToolkitImage string
219- }{
220- ToolkitImage : imageName + ":" + imageTag ,
221- })
222- Expect (err ).ToNot (HaveOccurred ())
223-
224- _ , _ , err = runner .Run (fmt .Sprintf ("docker exec -u root " + containerName + " bash -c '%s'" , toolkitInstall .String ()))
225- Expect (err ).ToNot (HaveOccurred ())
226- }
227-
228131 // Run the test script in the container.
229- output , _ , err := runner .Run (fmt . Sprintf ( "docker exec -u root " + containerName + " bash -c '%s'" , libnvidiaContainerCliTestTemplate ) )
132+ output , _ , err := nestedContainerRunner .Run (libnvidiaContainerCliTestTemplate )
230133 Expect (err ).ToNot (HaveOccurred ())
231134 Expect (strings .TrimSpace (output )).ToNot (BeEmpty ())
232135 Expect (hostOutput ).To (ContainSubstring (strings .TrimSpace (output )))
0 commit comments