|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package e2e |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "strings" |
| 23 | + "text/template" |
| 24 | + |
| 25 | + . "github.com/onsi/ginkgo/v2" |
| 26 | + . "github.com/onsi/gomega" |
| 27 | +) |
| 28 | + |
// Shell scripts and command templates used by the nvidia-container-cli e2e test.
//
// The scripts are passed to the test container as `bash -c '<script>'`, so they
// must not contain a bare single quote; where one is needed it is written as
// '\'' (close quote, escaped quote, reopen quote).
const (
	// installDockerTemplate installs Docker from the upstream Docker apt
	// repository, starts dockerd in the background and waits up to 30 seconds
	// for the daemon to answer `docker info`.
	installDockerTemplate = `
export DEBIAN_FRONTEND=noninteractive

# Add Docker official GPG key:
apt-get update
apt-get install -y ca-certificates curl apt-utils gnupg2
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update

apt-get install -y docker-ce docker-ce-cli containerd.io

# start dockerd in the background
dockerd &

# wait for dockerd to be ready with timeout
timeout=30
elapsed=0
while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do
	echo "Waiting for dockerd to be ready..."
	sleep 1
	elapsed=$((elapsed + 1))
done
# Check the daemon itself rather than the counter: dockerd may become ready on
# the final poll, which must not be reported as a failure.
if ! docker info > /dev/null 2>&1; then
	echo "Docker failed to start within $timeout seconds"
	exit 1
fi
`
	// installCTKTemplate is a text/template (field: .ToolkitImage) that copies
	// /artifacts out of the "<ToolkitImage>-packaging" image into a temporary
	// directory, installs the ubuntu18.04/amd64 .deb packages from it with dpkg
	// and prints the nvidia-container-cli version.
	installCTKTemplate = `
# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"

# Expose TMPDIR for the child namespace
export TMPDIR

docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/"
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb

nvidia-container-cli --version
`

	// libnvidiaContainerCliTestTemplate unpacks an Ubuntu 22.04 base rootfs,
	// enters a new mount + PID namespace, lets nvidia-container-cli configure
	// that rootfs for device 0, pivots into it and runs `nvidia-smi -L`.
	libnvidiaContainerCliTestTemplate = `
# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS TMPDIR

# Download Ubuntu base image with error handling
curl -fsSL http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz | tar -C $ROOTFS -xz || {
	echo "Failed to download or extract Ubuntu base image"
	exit 1
}

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container'\''s original filesystem.
unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\''
	: "${ROOTFS:?}"

	# 1 Bind-mount the new root and make the mount private
	mount --bind "$ROOTFS" "$ROOTFS"
	mount --make-private "$ROOTFS"
	cd "$ROOTFS"

	# 2 Minimal virtual filesystems
	mount -t proc proc proc
	mount -t sysfs sys sys
	mount -t tmpfs tmp tmp
	mount -t tmpfs run run

	# 3 Configure NVIDIA devices
	nvidia-container-cli --load-kmods configure --ldconfig=@/sbin/ldconfig.real --no-cgroups --utility --device 0 $(pwd)

	# 4 Switch root into the prepared filesystem
	pivot_root . mnt
	umount -l mnt
	nvidia-smi -L

IN_NS
`

	// startTestContainerTemplate is a text/template (fields: .ContainerName,
	// .AdditionalArguments) that renders the `docker run` command starting the
	// detached, privileged test container. Each additional argument is emitted
	// on its own continuation line ahead of the image name.
	startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \
	-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
	-e NVIDIA_DRIVER_CAPABILITIES=all \
	{{ range $i, $a := .AdditionalArguments -}}
	{{ $a }} \
	{{ end -}}
	ubuntu sleep infinity`
)
| 125 | + |
| 126 | +var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() { |
| 127 | + var ( |
| 128 | + runner Runner |
| 129 | + containerName = "node-container-e2e" |
| 130 | + hostOutput string |
| 131 | + additionalContainerArguments []string |
| 132 | + ) |
| 133 | + |
| 134 | + BeforeAll(func(ctx context.Context) { |
| 135 | + runner = NewRunner( |
| 136 | + WithHost(sshHost), |
| 137 | + WithPort(sshPort), |
| 138 | + WithSshKey(sshKey), |
| 139 | + WithSshUser(sshUser), |
| 140 | + ) |
| 141 | + |
| 142 | + if installCTK { |
| 143 | + installer, err := NewToolkitInstaller( |
| 144 | + WithRunner(runner), |
| 145 | + WithImage(imageName+":"+imageTag), |
| 146 | + WithTemplate(dockerInstallTemplate), |
| 147 | + ) |
| 148 | + Expect(err).ToNot(HaveOccurred()) |
| 149 | + |
| 150 | + err = installer.Install() |
| 151 | + Expect(err).ToNot(HaveOccurred()) |
| 152 | + } else { |
| 153 | + // If installCTK is false, we use the preinstalled toolkit. |
| 154 | + // TODO: This should be updated for other distributions and other components of the toolkit. |
| 155 | + output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*") |
| 156 | + Expect(err).ToNot(HaveOccurred()) |
| 157 | + |
| 158 | + output = strings.TrimSpace(output) |
| 159 | + Expect(output).ToNot(BeEmpty()) |
| 160 | + |
| 161 | + for _, lib := range strings.Split(output, "\n") { |
| 162 | + additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib) |
| 163 | + } |
| 164 | + additionalContainerArguments = append(additionalContainerArguments, |
| 165 | + "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli", |
| 166 | + ) |
| 167 | + } |
| 168 | + |
| 169 | + // Capture the host GPU list. |
| 170 | + var err error |
| 171 | + hostOutput, _, err = runner.Run("nvidia-smi -L") |
| 172 | + Expect(err).ToNot(HaveOccurred()) |
| 173 | + |
| 174 | + // Normalize the output once |
| 175 | + hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) |
| 176 | + |
| 177 | + // If a container with the same name exists from a previous test run, remove it first. |
| 178 | + // Ignore errors as container might not exist |
| 179 | + runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck |
| 180 | + }) |
| 181 | + |
| 182 | + AfterAll(func(ctx context.Context) { |
| 183 | + // Cleanup: remove the container and the temporary script on the host. |
| 184 | + // Use || true to ensure cleanup doesn't fail the test |
| 185 | + runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck |
| 186 | + }) |
| 187 | + |
| 188 | + It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { |
| 189 | + // Launch the container in detached mode. |
| 190 | + var startContainerScriptBuilder strings.Builder |
| 191 | + startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate) |
| 192 | + Expect(err).ToNot(HaveOccurred()) |
| 193 | + err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct { |
| 194 | + ContainerName string |
| 195 | + AdditionalArguments []string |
| 196 | + }{ |
| 197 | + ContainerName: containerName, |
| 198 | + AdditionalArguments: additionalContainerArguments, |
| 199 | + }) |
| 200 | + Expect(err).ToNot(HaveOccurred()) |
| 201 | + |
| 202 | + startContainerScript := startContainerScriptBuilder.String() |
| 203 | + GinkgoLogr.Info("Starting test container", "script", startContainerScript) |
| 204 | + _, _, err = runner.Run(startContainerScript) |
| 205 | + Expect(err).ToNot(HaveOccurred()) |
| 206 | + |
| 207 | + // Install docker in the container. |
| 208 | + _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate)) |
| 209 | + Expect(err).ToNot(HaveOccurred()) |
| 210 | + |
| 211 | + if installCTK { |
| 212 | + // Install nvidia-container-cli in the container. |
| 213 | + tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate) |
| 214 | + Expect(err).ToNot(HaveOccurred()) |
| 215 | + |
| 216 | + var toolkitInstall strings.Builder |
| 217 | + err = tmpl.Execute(&toolkitInstall, struct { |
| 218 | + ToolkitImage string |
| 219 | + }{ |
| 220 | + ToolkitImage: imageName + ":" + imageTag, |
| 221 | + }) |
| 222 | + Expect(err).ToNot(HaveOccurred()) |
| 223 | + |
| 224 | + _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String())) |
| 225 | + Expect(err).ToNot(HaveOccurred()) |
| 226 | + } |
| 227 | + |
| 228 | + // Run the test script in the container. |
| 229 | + output, _, err := runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", libnvidiaContainerCliTestTemplate)) |
| 230 | + Expect(err).ToNot(HaveOccurred()) |
| 231 | + Expect(strings.TrimSpace(output)).ToNot(BeEmpty()) |
| 232 | + Expect(hostOutput).To(ContainSubstring(strings.TrimSpace(output))) |
| 233 | + }) |
| 234 | +}) |
0 commit comments