diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index ab62193e9..ebafb36f1 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -24,6 +24,7 @@ GINKGO_BIN := $(CURDIR)/bin/ginkgo # current available tests: # - nvidia-container-cli # - docker +# - nvidia-cdi-refresh GINKGO_FOCUS ?= test: $(GINKGO_BIN) diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 11abe46e1..22f2cb9eb 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -22,6 +22,7 @@ import ( "errors" "os" "strconv" + "strings" "testing" . "github.com/onsi/ginkgo/v2" @@ -30,17 +31,21 @@ import ( // Test context var ( + runner Runner + ctx context.Context installCTK bool - imageName string - imageTag string + nvidiaContainerToolkitImage string sshKey string sshUser string sshHost string sshPort string + + localCacheDir string + toolkitInstaller *ToolkitInstaller ) func TestMain(t *testing.T) { @@ -49,23 +54,59 @@ func TestMain(t *testing.T) { RegisterFailHandler(Fail) ctx = context.Background() - getTestEnv() RunSpecs(t, suiteName, ) } +var _ = BeforeSuite(func() { + getTestEnv() + + runner = NewRunner( + WithHost(sshHost), + WithPort(sshPort), + WithSshKey(sshKey), + WithSshUser(sshUser), + ) + + // Create a tempdir on the runner. + tmpdir, _, err := runner.Run("mktemp -d --tmpdir=/tmp nvctk-e2e-test-cacheXXX") + Expect(err).ToNot(HaveOccurred()) + Expect(strings.TrimSpace(tmpdir)).ToNot(BeEmpty()) + + localCacheDir = strings.TrimSpace(tmpdir) + + toolkitInstaller, err = NewToolkitInstaller( + WithToolkitImage(nvidiaContainerToolkitImage), + WithCacheDir(localCacheDir), + ) + Expect(err).ToNot(HaveOccurred()) + + _, _, err = toolkitInstaller.PrepareCache(runner) + Expect(err).ToNot(HaveOccurred()) + + if installCTK { + _, _, err := toolkitInstaller.Install(runner) + Expect(err).ToNot(HaveOccurred()) + + _, _, err = runner.Run(`sudo nvidia-ctk runtime configure --runtime=docker`) + Expect(err).ToNot(HaveOccurred()) + + _, _, err = runner.Run(`sudo systemctl restart docker`) + Expect(err).ToNot(HaveOccurred()) + } +}) + // getTestEnv gets the test environment variables func getTestEnv() { defer GinkgoRecover() installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false) - if installCTK { - imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME") - imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG") - } + imageName := getRequiredEnvvar[string]("E2E_IMAGE_NAME") + imageTag := getRequiredEnvvar[string]("E2E_IMAGE_TAG") + nvidiaContainerToolkitImage = imageName + ":" + imageTag sshHost = getEnvVarOrDefault("E2E_SSH_HOST", "") if sshHost != "" { @@ -73,7 +114,6 @@ func getTestEnv() { sshUser = getRequiredEnvvar[string]("E2E_SSH_USER") sshPort = getEnvVarOrDefault("E2E_SSH_PORT", "22") } - } // getRequiredEnvvar returns the specified envvar if set or raises an error. diff --git a/tests/e2e/installer.go b/tests/e2e/installer.go index 1b92d3fd1..f3dde1aa3 100644 --- a/tests/e2e/installer.go +++ b/tests/e2e/installer.go @@ -19,107 +19,114 @@ package e2e import ( "bytes" "fmt" + "strings" "text/template" ) -// dockerInstallTemplate is a template for installing the NVIDIA Container Toolkit -// on a host using Docker. -var dockerInstallTemplate = ` -#! /usr/bin/env bash +var prepareInstallerCacheTemplate = ` set -xe -# if the TEMP_DIR is already set, use it -if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then - TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt) -else - TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM" - echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt -fi - -# if TEMP_DIR does not exist, create it -if [ ! 
-d "$TEMP_DIR" ]; then - mkdir -p "$TEMP_DIR" -fi - -# Given that docker has an init function that checks for the existence of the -# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook -# in the /usr/bin directory. -# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46 -sudo rm -f /usr/bin/nvidia-container-runtime-hook -sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook - -docker run --pid=host --rm -i --privileged \ - -v /:/host \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v "$TEMP_DIR:$TEMP_DIR" \ - -v /etc/docker:/config-root \ - {{.Image}} \ - --root "$TEMP_DIR" \ - --runtime=docker \ - --config=/config-root/daemon.json \ - --driver-root=/ \ - --no-daemon \ - --restart-mode=systemd +mkdir -p {{.CacheDir}} + +docker run --rm -v {{.CacheDir}}:/cache --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /cache/" ` -type ToolkitInstaller struct { - runner Runner - template string +var installFromImageTemplate = ` +set -xe - Image string -} +cd {{.CacheDir}}/packages/ubuntu18.04/amd64 -type installerOption func(*ToolkitInstaller) +{{if .WithSudo }}sudo {{end}}dpkg -i libnvidia-container1_*_amd64.deb \ + libnvidia-container-tools_*_amd64.deb \ + nvidia-container-toolkit-base_*_amd64.deb \ + nvidia-container-toolkit_*_amd64.deb -func WithRunner(r Runner) installerOption { - return func(i *ToolkitInstaller) { - i.runner = r - } +cd - + +nvidia-container-cli --version +` + +type ToolkitInstaller struct { + ToolkitImage string + CacheDir string } -func WithImage(image string) installerOption { +type installerOption func(*ToolkitInstaller) + +func WithToolkitImage(image string) installerOption { return func(i *ToolkitInstaller) { - i.Image = image + i.ToolkitImage = image } } -func WithTemplate(template string) installerOption { +func WithCacheDir(cacheDir string) installerOption { return func(i *ToolkitInstaller) { - i.template = template + i.CacheDir = cacheDir } } func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) { - i := &ToolkitInstaller{ - runner: localRunner{}, - template: dockerInstallTemplate, - } + i := &ToolkitInstaller{} for _, opt := range opts { opt(i) } - if i.Image == "" { + if i.ToolkitImage == "" { return nil, fmt.Errorf("image is required") } return i, nil } -func (i *ToolkitInstaller) Install() error { +// PrepareCache ensures that the installer (package) cache is created on the runner. +// The can be used to ensure that docker is not REQUIRED in an inner container. 
+func (i *ToolkitInstaller) PrepareCache(runner Runner) (string, string, error) {
+	renderedScript, err := i.renderScript(prepareInstallerCacheTemplate, false)
+	if err != nil {
+		return "", "", err
+	}
+
+	return runner.Run(renderedScript)
+}
+
+func (i *ToolkitInstaller) Install(runner Runner) (string, string, error) {
+	uid, _, err := runner.Run("id -u")
+	if err != nil {
+		return "", "", err
+	}
+	withSudo := false
+	if strings.TrimSpace(uid) != "0" {
+		withSudo = true
+	}
+	renderedScript, err := i.renderScript(installFromImageTemplate, withSudo)
+	if err != nil {
+		return "", "", err
+	}
+
+	return runner.Run(renderedScript)
+}
+
+func (i *ToolkitInstaller) renderScript(scriptTemplate string, withSudo bool) (string, error) {
 	// Parse the combined template
-	tmpl, err := template.New("installScript").Parse(i.template)
+	tmpl, err := template.New("template").Parse(scriptTemplate)
 	if err != nil {
-		return fmt.Errorf("error parsing template: %w", err)
+		return "", fmt.Errorf("error parsing template: %w", err)
 	}
 
+	templateInfo := struct {
+		*ToolkitInstaller
+		WithSudo bool
+	}{
+		ToolkitInstaller: i,
+		WithSudo:         withSudo,
+	}
 	// Execute the template
 	var renderedScript bytes.Buffer
-	err = tmpl.Execute(&renderedScript, i)
+	err = tmpl.Execute(&renderedScript, templateInfo)
 	if err != nil {
-		return fmt.Errorf("error executing template: %w", err)
+		return "", fmt.Errorf("error executing template: %w", err)
 	}
 
-	_, _, err = i.runner.Run(renderedScript.String())
-	return err
+	return renderedScript.String(), nil
 }
diff --git a/tests/e2e/nvidia-cdi-refresh_test.go b/tests/e2e/nvidia-cdi-refresh_test.go
new file mode 100644
index 000000000..7240bcf5c
--- /dev/null
+++ b/tests/e2e/nvidia-cdi-refresh_test.go
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	getSystemStateScript = `systemctl is-system-running 2>/dev/null`
+
+	setSystemdDegradedScript = `#!/usr/bin/env bash
+	# Create a dummy service that always fails to force systemd into a degraded state
+	cat <<EOF > /etc/systemd/system/dummy.service
+[Unit]
+Description=Dummy systemd service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/bin/sh -c "exit 1"
+EOF
+
+	# We know the dummy service will fail, so we can ignore the error
+	systemctl start --now dummy.service 2>/dev/null || true
+	`
+
+	fixSystemDegradedScript = `#!/usr/bin/env bash
+	# Replace the dummy service with one that succeeds so systemd returns to the running state
+	cat <<EOF > /etc/systemd/system/dummy.service
+[Unit]
+Description=Dummy systemd service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/bin/sh -c "exit 0"
+EOF
+
+	systemctl daemon-reload
+
+	systemctl start --now dummy.service 2>/dev/null || true
+
+	rm -rf /etc/systemd/system/dummy.service
+	systemctl daemon-reload
+`
+
+	nvidiaCdiRefreshPathActiveTemplate = `
+	if ! 
systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
+		echo "nvidia-cdi-refresh.path is not Active"
+		exit 1
+	fi
+	`
+	nvidiaCdiRefreshServiceLoadedTemplate = `
+	if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
+		echo "nvidia-cdi-refresh.service is not loaded"
+		exit 1
+	fi
+	`
+
+	nvidiaCdiRefreshFileExistsTemplate = `
+	# check that /var/run/cdi/nvidia.yaml exists; exit with 1 if it does not
+	if [ ! -f /var/run/cdi/nvidia.yaml ]; then
+		echo "nvidia.yaml file does not exist"
+		exit 1
+	fi
+
+	# generate a reference nvidia.yaml file
+	nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
+
+	# diff the generated file against /var/run/cdi/nvidia.yaml; exit with 0 only if they are the same
+	if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
+		echo "nvidia.yaml file is different"
+		exit 1
+	fi
+	`
+
+	nvidiaCdiRefreshUpgradeTemplate = `
+	# remove the generated files
+	rm /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml
+
+	# Touch the nvidia-ctk binary to change its mtime.
+	# This triggers the nvidia-cdi-refresh.path unit to call the
+	# nvidia-cdi-refresh.service unit, simulating a change (update/downgrade) of the nvidia-ctk binary.
+	touch $(which nvidia-ctk)
+
+	# wait for 3 seconds
+	sleep 3
+
+	# Check that /var/run/cdi/nvidia.yaml was recreated
+	if [ ! -f /var/run/cdi/nvidia.yaml ]; then
+		echo "nvidia.yaml file is not created after touching the nvidia-ctk binary"
+		exit 1
+	fi
+
+	# generate a reference nvidia.yaml file
+	nvidia-ctk cdi generate --output=/tmp/nvidia.yaml
+
+	# diff the generated file against /var/run/cdi/nvidia.yaml; exit with 0 only if they are the same
+	if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
+		echo "nvidia.yaml file is different"
+		exit 1
+	fi
+	`
+)
+
+var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("systemd-unit"), func() {
+	var (
+		containerName = "nvctk-e2e-nvidia-cdi-refresh-tests"
+		systemdRunner Runner
+		// TODO(@ArangoGutierrez): https://github.com/NVIDIA/nvidia-container-toolkit/pull/1235/files#r2302013660
+		outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
+	)
+
+	BeforeAll(func(ctx context.Context) {
+		var err error
+		// TODO: We set installCTK to true here to SKIP the mounting of the files from the host.
+		// The test here does NOT require the host toolkit.
+		systemdRunner, err = NewNestedContainerRunner(runner, outerContainerImage, true, containerName, localCacheDir)
+		Expect(err).ToNot(HaveOccurred())
+		for range 10 {
+			state, _, err := systemdRunner.Run(getSystemStateScript)
+			if err == nil {
+				GinkgoLogr.Info("systemd started", "state", state)
+				break
+			}
+			GinkgoLogr.Error(err, "systemctl state")
+			time.Sleep(1 * time.Second)
+		}
+	})
+
+	AfterAll(func(ctx context.Context) {
+		// Cleanup: remove the test container on the host.
+ // Use || true to ensure cleanup doesn't fail the test + runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) + }) + + When("installing nvidia-container-toolkit", Ordered, func() { + BeforeAll(func(ctx context.Context) { + + _, _, err := toolkitInstaller.Install(systemdRunner) + Expect(err).ToNot(HaveOccurred()) + + output, _, err := systemdRunner.Run("nvidia-ctk --version") + Expect(err).ToNot(HaveOccurred()) + GinkgoLogr.Info("using nvidia-ctk", "version", strings.TrimSpace(output)) + }) + + AfterAll(func(ctx context.Context) { + _, _, err := systemdRunner.Run("apt-get purge -y libnvidia-container* nvidia-container-toolkit*") + Expect(err).ToNot(HaveOccurred()) + }) + + It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should generate the nvidia.yaml file", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshUpgradeTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + }) + + When("installing nvidia-container-toolkit on a system with a degraded systemd", Ordered, func() { + BeforeAll(func(ctx context.Context) { + _, _, err := systemdRunner.Run(setSystemdDegradedScript) + Expect(err).ToNot(HaveOccurred()) + + _, _, err = systemdRunner.Run(getSystemStateScript) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("degraded")) + }) + + AfterAll(func(ctx context.Context) { + _, _, err := systemdRunner.Run(fixSystemDegradedScript) + Expect(err).ToNot(HaveOccurred()) + + state, _, err := systemdRunner.Run(getSystemStateScript) + Expect(err).ToNot(HaveOccurred()) + Expect(strings.TrimSpace(state)).To(Equal("running")) + }) + + It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshPathActiveTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should generate the nvidia.yaml file", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should generate the nvidia.yaml file", func(ctx context.Context) { + _, _, err := systemdRunner.Run(nvidiaCdiRefreshFileExistsTemplate) + Expect(err).ToNot(HaveOccurred()) + }) + }) +}) diff --git a/tests/e2e/nvidia-container-cli_test.go b/tests/e2e/nvidia-container-cli_test.go index bfde2c9c9..fcb26ec94 100644 --- a/tests/e2e/nvidia-container-cli_test.go +++ b/tests/e2e/nvidia-container-cli_test.go @@ -20,58 +20,12 @@ import ( "context" "fmt" "strings" - "text/template" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" ) const ( - installDockerTemplate = ` -export DEBIAN_FRONTEND=noninteractive - -# Add Docker official GPG key: -apt-get update -apt-get install -y ca-certificates curl apt-utils gnupg2 -install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc -chmod a+r /etc/apt/keyrings/docker.asc - -# Add the repository to Apt sources: -echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo \"${UBUNTU_CODENAME:-$VERSION_CODENAME}\") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null -apt-get update - -apt-get install -y docker-ce docker-ce-cli containerd.io - -# start dockerd in the background -dockerd & - -# wait for dockerd to be ready with timeout -timeout=30 -elapsed=0 -while ! docker info > /dev/null 2>&1 && [ $elapsed -lt $timeout ]; do - echo "Waiting for dockerd to be ready..." - sleep 1 - elapsed=$((elapsed + 1)) -done -if [ $elapsed -ge $timeout ]; then - echo "Docker failed to start within $timeout seconds" - exit 1 -fi -` - installCTKTemplate = ` -# Create a temporary directory and rootfs path -TMPDIR="$(mktemp -d)" - -# Expose TMPDIR for the child namespace -export TMPDIR - -docker run --rm -v ${TMPDIR}:/host-tmpdir --entrypoint="sh" {{.ToolkitImage}}-packaging -c "cp -p -R /artifacts/* /host-tmpdir/" -dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb - -nvidia-container-cli --version -` - libnvidiaContainerCliTestTemplate = ` # Create a temporary directory and rootfs path TMPDIR="$(mktemp -d)" @@ -113,70 +67,32 @@ unshare --mount --pid --fork --propagation private -- sh -eux <<'\''IN_NS'\'' IN_NS ` - - startTestContainerTemplate = `docker run -d --name {{.ContainerName}} --privileged --runtime=nvidia \ - -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \ - -e NVIDIA_DRIVER_CAPABILITIES=all \ - {{ range $i, $a := .AdditionalArguments -}} - {{ $a }} \ - {{ end -}} - ubuntu sleep infinity` ) var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libnvidia-container"), func() { var ( - runner Runner - containerName = "node-container-e2e" - hostOutput string - additionalContainerArguments []string + containerName = "nvct-e2e-nvidia-container-cli-tests" + nestedContainerRunner Runner + hostOutput string ) BeforeAll(func(ctx context.Context) { - runner = NewRunner( - WithHost(sshHost), - WithPort(sshPort), - WithSshKey(sshKey), - WithSshUser(sshUser), - ) + var err error + nestedContainerRunner, err = NewNestedContainerRunner(runner, "ubuntu", installCTK, containerName, localCacheDir) + Expect(err).ToNot(HaveOccurred()) if installCTK { - installer, err := NewToolkitInstaller( - WithRunner(runner), - WithImage(imageName+":"+imageTag), - WithTemplate(dockerInstallTemplate), - ) - Expect(err).ToNot(HaveOccurred()) - - err = installer.Install() + // We MAY also need to install the toolkit in the nested runner. + _, _, err = toolkitInstaller.Install(nestedContainerRunner) Expect(err).ToNot(HaveOccurred()) - } else { - // If installCTK is false, we use the preinstalled toolkit. - // TODO: This should be updated for other distributions and other components of the toolkit. 
- output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*") - Expect(err).ToNot(HaveOccurred()) - - output = strings.TrimSpace(output) - Expect(output).ToNot(BeEmpty()) - - for _, lib := range strings.Split(output, "\n") { - additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib) - } - additionalContainerArguments = append(additionalContainerArguments, - "-v /usr/bin/nvidia-container-cli:/usr/bin/nvidia-container-cli", - ) } // Capture the host GPU list. - var err error hostOutput, _, err = runner.Run("nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) // Normalize the output once hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) - - // If a container with the same name exists from a previous test run, remove it first. - // Ignore errors as container might not exist - runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName)) //nolint:errcheck }) AfterAll(func(ctx context.Context) { @@ -186,47 +102,8 @@ var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, Label("libn }) It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { - // Launch the container in detached mode. - var startContainerScriptBuilder strings.Builder - startContainerTemplate, err := template.New("startContainer").Parse(startTestContainerTemplate) - Expect(err).ToNot(HaveOccurred()) - err = startContainerTemplate.Execute(&startContainerScriptBuilder, struct { - ContainerName string - AdditionalArguments []string - }{ - ContainerName: containerName, - AdditionalArguments: additionalContainerArguments, - }) - Expect(err).ToNot(HaveOccurred()) - - startContainerScript := startContainerScriptBuilder.String() - GinkgoLogr.Info("Starting test container", "script", startContainerScript) - _, _, err = runner.Run(startContainerScript) - Expect(err).ToNot(HaveOccurred()) - - // Install docker in the container. - _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", installDockerTemplate)) - Expect(err).ToNot(HaveOccurred()) - - if installCTK { - // Install nvidia-container-cli in the container. - tmpl, err := template.New("toolkitInstall").Parse(installCTKTemplate) - Expect(err).ToNot(HaveOccurred()) - - var toolkitInstall strings.Builder - err = tmpl.Execute(&toolkitInstall, struct { - ToolkitImage string - }{ - ToolkitImage: imageName + ":" + imageTag, - }) - Expect(err).ToNot(HaveOccurred()) - - _, _, err = runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", toolkitInstall.String())) - Expect(err).ToNot(HaveOccurred()) - } - // Run the test script in the container. 
- output, _, err := runner.Run(fmt.Sprintf("docker exec -u root "+containerName+" bash -c '%s'", libnvidiaContainerCliTestTemplate)) + output, _, err := nestedContainerRunner.Run(libnvidiaContainerCliTestTemplate) Expect(err).ToNot(HaveOccurred()) Expect(strings.TrimSpace(output)).ToNot(BeEmpty()) Expect(hostOutput).To(ContainSubstring(strings.TrimSpace(output))) diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index ab4d762fe..9fcc505f8 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -28,31 +28,11 @@ import ( // Integration tests for Docker runtime var _ = Describe("docker", Ordered, ContinueOnFailure, func() { - var runner Runner var hostDriverVersion string var hostDriverMajor string // Install the NVIDIA Container Toolkit BeforeAll(func(ctx context.Context) { - runner = NewRunner( - WithHost(sshHost), - WithPort(sshPort), - WithSshKey(sshKey), - WithSshUser(sshUser), - ) - - if installCTK { - installer, err := NewToolkitInstaller( - WithRunner(runner), - WithImage(imageName+":"+imageTag), - WithTemplate(dockerInstallTemplate), - ) - Expect(err).ToNot(HaveOccurred()) - - err = installer.Install() - Expect(err).ToNot(HaveOccurred()) - } - driverOutput, _, err := runner.Run("nvidia-smi -q | grep \"Driver Version\"") Expect(err).ToNot(HaveOccurred()) parts := strings.SplitN(driverOutput, ":", 2) @@ -360,9 +340,11 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { _, _, err := runner.Run("docker pull ubuntu") Expect(err).ToNot(HaveOccurred()) - tmpDirPath = GinkgoT().TempDir() - _, _, err = runner.Run("mkdir -p " + tmpDirPath) + stdout, _, err := runner.Run("mktemp -d --tmpdir=/tmp") Expect(err).ToNot(HaveOccurred()) + Expect(stdout).ToNot(BeEmpty()) + Expect(stdout).To(HavePrefix("/tmp")) + tmpDirPath = strings.TrimSpace(stdout) output, _, err := runner.Run("mount | sort") Expect(err).ToNot(HaveOccurred()) @@ -441,7 +423,7 @@ EOF`) It("should fail when using the nvidia-container-runtime-hook", Label("legacy"), func(ctx context.Context) { _, stderr, err := runner.Run("docker run --rm --runtime=runc --gpus=all firmware-test") Expect(err).To(HaveOccurred()) - Expect(stderr).To(ContainSubstring("nvidia-container-cli.real: mount error: path error:")) + Expect(stderr).To(ContainSubstring(": mount error: path error: /lib/firmware/nvidia/")) }) }) diff --git a/tests/e2e/runner.go b/tests/e2e/runner.go index df7794105..c25a918e0 100644 --- a/tests/e2e/runner.go +++ b/tests/e2e/runner.go @@ -21,11 +21,20 @@ import ( "fmt" "os" "os/exec" + "strings" + "text/template" "time" "golang.org/x/crypto/ssh" ) +const ( + installPrerequisitesScript = ` + export DEBIAN_FRONTEND=noninteractive + apt-get update && apt-get install -y curl gnupg2 + ` +) + type localRunner struct{} type remoteRunner struct { sshKey string @@ -34,6 +43,11 @@ type remoteRunner struct { port string } +type nestedContainerRunner struct { + runner Runner + containerName string +} + type runnerOption func(*remoteRunner) type Runner interface { @@ -79,6 +93,110 @@ func NewRunner(opts ...runnerOption) Runner { return r } +// NewNestedContainerRunner creates a new nested container runner. +// A nested container runs a container inside another container based on a +// given runner (remote or local). 
+func NewNestedContainerRunner(runner Runner, baseImage string, installCTK bool, containerName string, cacheDir string) (Runner, error) {
+	// If a container with the same name exists from a previous test run, remove it first.
+	// Ignore errors as the container might not exist
+	_, _, err := runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", containerName))
+	if err != nil {
+		return nil, fmt.Errorf("failed to remove container: %w", err)
+	}
+
+	var additionalContainerArguments []string
+
+	if cacheDir != "" {
+		additionalContainerArguments = append(additionalContainerArguments,
+			"-v "+cacheDir+":"+cacheDir+":ro",
+		)
+	}
+
+	if !installCTK {
+		// If installCTK is false, we use the preinstalled toolkit.
+		// This means we need to add toolkit libraries and binaries from the "host"
+
+		// TODO: This should be updated for other distributions and other components of the toolkit.
+		output, _, err := runner.Run("ls /lib/**/libnvidia-container*.so.*.*")
+		if err != nil {
+			return nil, fmt.Errorf("failed to list toolkit libraries: %w", err)
+		}
+
+		output = strings.TrimSpace(output)
+		if output == "" {
+			return nil, fmt.Errorf("no toolkit libraries found")
+		}
+
+		for _, lib := range strings.Split(output, "\n") {
+			additionalContainerArguments = append(additionalContainerArguments, "-v "+lib+":"+lib)
+		}
+
+		// Look for NVIDIA binaries in standard locations and mount them as volumes
+		nvidiaBinaries := []string{
+			"nvidia-container-cli",
+			"nvidia-container-runtime",
+			"nvidia-container-runtime-hook",
+			"nvidia-ctk",
+			"nvidia-cdi-hook",
+			"nvidia-container-runtime.cdi",
+			"nvidia-container-runtime.legacy",
+		}
+
+		searchPaths := []string{
+			"/usr/bin",
+			"/usr/sbin",
+			"/usr/local/bin",
+			"/usr/local/sbin",
+		}
+
+		for _, binary := range nvidiaBinaries {
+			for _, searchPath := range searchPaths {
+				binaryPath := searchPath + "/" + binary
+				// Check if the binary exists at this path
+				checkCmd := fmt.Sprintf("test -f %s && echo 'exists'", binaryPath)
+				output, _, err := runner.Run(checkCmd)
+				if err == nil && strings.TrimSpace(output) == "exists" {
+					// Binary found, add it as a volume mount
+					additionalContainerArguments = append(additionalContainerArguments,
+						fmt.Sprintf("-v %s:%s", binaryPath, binaryPath))
+					break // Move to the next binary once found
+				}
+			}
+		}
+	}
+
+	// Mount the /lib/modules directory as a volume to enable the nvidia-cdi-refresh service
+	additionalContainerArguments = append(additionalContainerArguments, "-v /lib/modules:/lib/modules")
+
+	// Launch the container in detached mode.
+	container := outerContainer{
+		Name:                containerName,
+		BaseImage:           baseImage,
+		AdditionalArguments: additionalContainerArguments,
+	}
+
+	script, err := container.Render()
+	if err != nil {
+		return nil, err
+	}
+	_, _, err = runner.Run(script)
+	if err != nil {
+		return nil, fmt.Errorf("failed to run start container script: %w", err)
+	}
+
+	inContainer := &nestedContainerRunner{
+		runner:        runner,
+		containerName: containerName,
+	}
+
+	_, _, err = inContainer.Run(installPrerequisitesScript)
+	if err != nil {
+		return nil, fmt.Errorf("failed to install prerequisites: %w", err)
+	}
+
+	return inContainer, nil
+}
+
 func (l localRunner) Run(script string) (string, string, error) {
 	// Create a command to run the script using bash
 	cmd := exec.Command("bash", "-c", script)
@@ -131,6 +249,12 @@ func (r remoteRunner) Run(script string) (string, string, error) {
 	return stdout.String(), "", nil
 }
 
+// Run runs the specified script in the container using the docker exec command.
+// The script is run as the root user.
+func (r nestedContainerRunner) Run(script string) (string, string, error) {
	return r.runner.Run(`docker exec -u root "` + r.containerName + `" bash -c '` + script + `'`)
+}
+
 // createSshClient creates a ssh client, and retries if it fails to connect
 func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
 	var client *ssh.Client
@@ -169,3 +293,37 @@ func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
 
 	return client, nil
 }
+
+// outerContainer describes a container to start with the specified name.
+// The container is given access to all NVIDIA GPUs by explicitly using the
+// nvidia runtime and the `runtime.nvidia.com/gpu=all` device to trigger JIT
+// CDI spec generation.
+// The template also allows for additional arguments to be specified.
+type outerContainer struct {
+	Name                string
+	BaseImage           string
+	AdditionalArguments []string
+}
+
+func (o *outerContainer) Render() (string, error) {
+	tmpl, err := template.New("startContainer").Parse(`docker run -d --name {{.Name}} --privileged --runtime=nvidia \
+-e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all \
+-e NVIDIA_DRIVER_CAPABILITIES=all \
+{{ range $i, $a := .AdditionalArguments -}}
+{{ $a }} \
+{{ end -}}
+{{.BaseImage}} sleep infinity`)
+
+	if err != nil {
+		return "", err
+	}
+
+	var script strings.Builder
+	err = tmpl.Execute(&script, o)
+	if err != nil {
+		return "", fmt.Errorf("failed to execute template: %w", err)
+	}
+
+	return script.String(), nil
+}
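
A minimal sketch of how the refactored helpers compose end to end, mirroring the BeforeSuite and nested-runner usage introduced in this diff. It is intended to run inside a Ginkgo setup node in package e2e; the image reference, container name, and cache directory are placeholder values, not values taken from the change itself.

	// Sketch: wire up the host runner, stage the package cache once,
	// then install the toolkit inside a nested container from that cache.
	runner := NewRunner(
		WithHost(sshHost),
		WithPort(sshPort),
		WithSshKey(sshKey),
		WithSshUser(sshUser),
	)

	installer, err := NewToolkitInstaller(
		WithToolkitImage("example.com/nvidia/container-toolkit:example"), // placeholder image reference
		WithCacheDir("/tmp/nvctk-e2e-example-cache"),                     // placeholder cache directory
	)
	Expect(err).ToNot(HaveOccurred())

	// Copy the packaging artifacts from the toolkit image into the host-side cache.
	_, _, err = installer.PrepareCache(runner)
	Expect(err).ToNot(HaveOccurred())

	// Start a privileged nested container with the cache mounted read-only,
	// then install the .deb packages inside it from that cache.
	nested, err := NewNestedContainerRunner(runner, "ubuntu", true, "example-nested-container", "/tmp/nvctk-e2e-example-cache")
	Expect(err).ToNot(HaveOccurred())

	_, _, err = installer.Install(nested)
	Expect(err).ToNot(HaveOccurred())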