Add cuda-compat-mode flag to configure command

elezar · elezar · commit 8ed5824a077a · 2025-05-12T16:59:03.000+02:00
This change adds a --cuda-compat-mode flag to the configure
CLI. This allows more flexibility than the existing --no-cntlibs flag.

Possible values of the flag are:
* mount (default) - CUDA compat libraries are mounted from /usr/local/cuda/compat to
  the standard library path in the container.
* ldconfig - The folder containing the CUDA compat libraries is added as a command
  line argument to the ldconfig command executed in the container.
* disabled - This is equivalent to specifying the --no-cntlibs flag and skips
  the detection and injection of the container's compat libraries
  entirely.

Signed-off-by: Evan Lezar &lt;elezar@nvidia.com&gt;
diff --git a/Makefile b/Makefile
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
                 $(SRCS_DIR)/nvc_clt.c
 
 BIN_SRCS     := $(SRCS_DIR)/cli/common.c    \
+                $(SRCS_DIR)/cli/compat_mode.c \
                 $(SRCS_DIR)/cli/configure.c \
                 $(SRCS_DIR)/cli/dsl.c       \
                 $(SRCS_DIR)/cli/info.c      \
diff --git a/src/cli/compat_mode.c b/src/cli/compat_mode.c
@@ -0,0 +1,127 @@
+/**
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+#include <err.h>
+#include <libgen.h>
+#undef basename /* Use the GNU version of basename. */
+#include <stdlib.h>
+
+#include "cli.h"
+#include "compat_mode.h"
+
+static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
+static int get_compat_library_path(struct error *, const char * [], size_t, char **);
+
+/*
+ * update_compat_libraries narrows cnt->libs to the compat libraries usable with
+ * the driver described by info and, for cuda-compat-mode=ldconfig, stores their
+ * common directory in cnt->cuda_compat_dir.
+ *
+ * Nothing is done when the mode is disabled or cnt has no compat libraries.
+ * Returns 0 on success and -1 if the compat directory lookup fails.
+ */
+int
+update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
+        if (cnt->flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
+                return (0);
+        }
+        if (cnt->libs == NULL || cnt->nlibs == 0) {
+                return (0);
+        }
+
+        /* For cuda-compat-mode=mount, we also allow compat libraries with a LOWER major version. */
+        bool allow_lower_major_versions = (cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) != 0;
+
+        /* cnt is writable here, so its list is filtered in place without a temporary copy. */
+        filter_by_major_version(allow_lower_major_versions, info, cnt->libs, &cnt->nlibs);
+
+        if (!(cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                return (0);
+        }
+        /* For cuda-compat-mode=ldconfig we also ensure that cuda_compat_dir is set. */
+        if (get_compat_library_path(&ctx->err, (const char **)cnt->libs, cnt->nlibs, &cnt->cuda_compat_dir) < 0) {
+                return (-1);
+        }
+        return (0);
+}
+
+/*
+ * filter_by_major_version removes from paths every library whose ".so.<major>"
+ * suffix is equal to the RM major version or, unless allow_lower_major_versions
+ * is set, lower than it. This prevents us from running into unsupported
+ * configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
+ * Removed entries are freed and set to NULL before array_pack() is called.
+ */
+static void
+filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
+{
+        const unsigned long rm_major = strtoul(info->nvrm_version, NULL, 10);
+
+        for (size_t i = 0; i < *size; ++i) {
+                char *maj = strstr(basename(paths[i]), ".so.");
+                if (maj == NULL) {
+                        continue; /* No version suffix: always kept. */
+                }
+                maj += strlen(".so.");
+                /* Compare as numbers: a string compare misorders majors of different widths. */
+                unsigned long lib_major = strtoul(maj, NULL, 10);
+                bool exclude = allow_lower_major_versions ? (lib_major == rm_major) : (lib_major <= rm_major);
+                /* A suffix that does not start with a digit has no major version and is dropped. */
+                if (exclude || strspn(maj, "0123456789") == 0) {
+                        /* NOTE(review): assumes entries are heap strings owned by this list — confirm. */
+                        free(paths[i]);
+                        paths[i] = NULL;
+                }
+        }
+        array_pack(paths, size);
+}
+
+/*
+ * get_compat_library_path stores in *compat_dir_result a caller-owned copy of
+ * the directory shared by all paths. Returns 0 on success (nothing is stored
+ * when size is 0), -1 on allocation failure or if the directories differ.
+ * NOTE(review): dirname() may modify its argument; if array_copy() copies only
+ * the pointers, the caller's path strings are altered here — confirm.
+ */
+static int
+get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
+{
+        char *compat_dir = NULL;
+        if (size == 0) {
+                return 0;
+        }
+        char **dirnames = array_copy(err, (const char * const *)paths, size);
+        if (dirnames == NULL) {
+                return -1;
+        }
+        for (size_t i = 0; i < size; ++i) {
+                char *dir = dirname(dirnames[i]);
+                if (i == 0) {
+                        if ((compat_dir = strdup(dir)) == NULL) {
+                                goto fail;
+                        }
+                } else if (strcmp(dir, compat_dir) != 0) {
+                        goto fail;
+                }
+        }
+        *compat_dir_result = compat_dir;
+        free(dirnames);
+        return 0;
+fail:
+        free(dirnames);
+        free(compat_dir);
+        return -1;
+}
diff --git a/src/cli/compat_mode.h b/src/cli/compat_mode.h
@@ -0,0 +1,33 @@
+/**
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ **/
+
+#ifndef HEADER_COMPAT_MODE_H
+#define HEADER_COMPAT_MODE_H
+
+// TODO: These are duplicated from options.h to prevent conflicts with the CLI
+// options header.
+enum {
+    /* OPT_CUDA_COMPAT_MODE_DISABLED replaced OPT_NO_CNTLIBS. */
+    OPT_CUDA_COMPAT_MODE_DISABLED = 1 << 14, /* Skip compat library handling. */
+    OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15, /* Pass the compat directory to ldconfig. */
+    OPT_CUDA_COMPAT_MODE_MOUNT    = 1 << 16, /* Mount the compat libraries. */
+};
+
+/* Filters the container's compat libraries for the given driver; returns 0 on success, -1 on failure. */
+int update_compat_libraries(struct nvc_context *, struct nvc_container *, const struct nvc_driver_info *);
+
+#endif /* HEADER_COMPAT_MODE_H */
diff --git a/src/cli/configure.c b/src/cli/configure.c
@@ -7,6 +7,7 @@
 
 #include "cli.h"
 #include "dsl.h"
+#include "compat_mode.h"
 
 static error_t configure_parser(int, char *, struct argp_state *);
 static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -36,7 +37,8 @@ const struct argp configure_usage = {
                 {"no-persistenced", 0x86, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
                 {"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
                 {"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
-                {"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
+                {"no-cntlibs", 0x89, NULL, 0, "[Deprecated] Equivalent to --cuda-compat-mode=disabled", -1},
+                {"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
                 {0},
         },
         configure_parser,
@@ -167,7 +169,15 @@ configure_parser(int key, char *arg, struct argp_state *state)
                         goto fatal;
                 break;
         case 0x89:
-                if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
+                /* The --no-cntlibs command line flag is equivalent to --cuda-compat-mode=disabled. */
+                if (str_join(&err, &ctx->container_flags, "cuda-compat-mode=disabled", " ") < 0)
+                        goto fatal;
+                break;
+        case 0x90:
+                /* We add cuda-compat-mode=$arg to the container_flags. */
+                if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
+                        goto fatal;
+                if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
                         goto fatal;
                 break;
         case ARGP_KEY_ARG:
@@ -316,6 +326,15 @@ configure_command(const struct context *ctx)
                 goto fail;
         }
 
+        /*
+         * We now have the driver version and can update the list of compat
+         * libraries discovered above accordingly.
+         */
+        if (update_compat_libraries(nvc, cnt, drv) < 0) {
+                warn("updating compat library settings failed: %s", libnvc.error(nvc));
+                goto fail;
+        }
+
         /* Allocate space for selecting GPU devices and MIG devices */
         if (new_devices(&err, dev, &devices) < 0) {
                 warn("memory allocation failed: %s", err.msg);
diff --git a/src/nvc_container.c b/src/nvc_container.c
@@ -24,6 +24,7 @@ static char *find_namespace_path(struct error *, const struct nvc_container *, c
 static int  find_compat_library_paths(struct error *, struct nvc_container *);
 static int  lookup_owner(struct error *, struct nvc_container *);
 static int  copy_config(struct error *, struct nvc_container *, const struct nvc_container_config *);
+static int  validate_cuda_compat_mode_flags(struct error *, int32_t *);
 
 struct nvc_container_config *
 nvc_container_config_new(pid_t pid, const char *rootfs)
@@ -236,6 +237,9 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
                 error_setx(&ctx->err, "invalid mode of operation");
                 return (NULL);
         }
+        if (validate_cuda_compat_mode_flags(&ctx->err, &flags) < 0) {
+                return (NULL);
+        }
 
         log_infof("configuring container with '%s'", opts);
         if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -246,7 +250,7 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
                 goto fail;
         if (lookup_owner(&ctx->err, cnt) < 0)
                 goto fail;
-        if (!(flags & OPT_NO_CNTLIBS)) {
+        if (!(flags & OPT_CUDA_COMPAT_MODE_DISABLED)) {
                 if (find_compat_library_paths(&ctx->err, cnt) < 0)
                         goto fail;
         }
@@ -293,5 +297,41 @@ nvc_container_free(struct nvc_container *cnt)
         free(cnt->mnt_ns);
         free(cnt->dev_cg);
         array_free(cnt->libs, cnt->nlibs);
+        free(cnt->cuda_compat_dir);
         free(cnt);
 }
+
+/*
+ * validate_cuda_compat_mode_flags checks the options associated with the
+ * cuda-compat-mode flags and normalizes *flags in place.
+ * This function does the following:
+ * - Ensures that if OPT_CUDA_COMPAT_MODE_DISABLED is set, other modes are ignored.
+ * - Ensures that the mode is set to the default (OPT_CUDA_COMPAT_MODE_MOUNT) if unset.
+ * - Ensures that only a single mode is set; otherwise sets err and returns -1.
+ */
+static int
+validate_cuda_compat_mode_flags(struct error *err, int32_t *flags) {
+        if (*flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
+                /*
+                 * If the OPT_CUDA_COMPAT_MODE_DISABLED flag is specified, we
+                 * explicitly ignore other OPT_CUDA_COMPAT_MODE_* flags.
+                 */
+                *flags &= ~(OPT_CUDA_COMPAT_MODE_MOUNT | OPT_CUDA_COMPAT_MODE_LDCONFIG);
+                return (0);
+        }
+        if (!(*flags & (OPT_CUDA_COMPAT_MODE_LDCONFIG | OPT_CUDA_COMPAT_MODE_MOUNT))) {
+                /*
+                 * If no OPT_CUDA_COMPAT_MODE_* flags are specified, OR in
+                 * OPT_CUDA_COMPAT_MODE_MOUNT as the default to maintain backward
+                 * compatibility while preserving every other option in *flags.
+                 */
+                *flags |= OPT_CUDA_COMPAT_MODE_MOUNT;
+                return (0);
+        }
+
+        if ((*flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (*flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                error_setx(err, "only one cuda-compat-mode can be specified at a time");
+                return (-1);
+        }
+        return (0);
+}
diff --git a/src/nvc_internal.h b/src/nvc_internal.h
@@ -84,6 +84,7 @@ struct nvc_container {
         char *dev_cg;
         char **libs;
         size_t nlibs;
+        char *cuda_compat_dir;
 };
 
 enum {
diff --git a/src/nvc_ldcache.c b/src/nvc_ldcache.c
@@ -30,6 +30,7 @@
 #include "nvc_internal.h"
 
 #include "error.h"
+#include "options.h"
 #include "utils.h"
 #include "xfuncs.h"
 
@@ -471,7 +472,19 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
         if (validate_args(ctx, cnt != NULL) < 0)
                 return (-1);
 
-        argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG && cnt->cuda_compat_dir != NULL) {
+                /*
+                 * We include the cuda_compat_dir directory on the ldconfig
+                 * command line. This ensures that the CUDA Forward compat
+                 * libraries take precedence over the user-mode driver
+                 * libraries in the standard library paths (libs_dir and
+                 * libs32_dir).
+                 * */
+                argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        } else {
+                argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        }
+
         if (*argv[0] == '@') {
                 /*
                  * We treat this path specially to be relative to the host filesystem.
diff --git a/src/nvc_mount.c b/src/nvc_mount.c
@@ -40,7 +40,6 @@ static int  update_app_profile(struct error *, const struct nvc_container *, dev
 static void unmount(const char *);
 static int  symlink_library(struct error *, const char *, const char *, const char *, uid_t, gid_t);
 static int  symlink_libraries(struct error *, const struct nvc_container *, const char * const [], size_t);
-static void filter_libraries(const struct nvc_driver_info *, char * [], size_t *);
 static int  device_mount_dxcore(struct nvc_context *, const struct nvc_container *);
 static int  device_mount_native(struct nvc_context *, const struct nvc_container *, const struct nvc_device *);
 static int  cap_device_mount(struct nvc_context *, const struct nvc_container *, const char *);
@@ -562,27 +561,6 @@ symlink_libraries(struct error *err, const struct nvc_container *cnt, const char
         return (0);
 }
 
-static void
-filter_libraries(const struct nvc_driver_info *info, char * paths[], size_t *size)
-{
-        char *lib, *maj;
-
-        /*
-         * XXX Filter out any library that matches the major version of RM to prevent us from
-         * running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
-         */
-        for (size_t i = 0; i < *size; ++i) {
-                lib = basename(paths[i]);
-                if ((maj = strstr(lib, ".so.")) != NULL) {
-                        maj += strlen(".so.");
-                        if (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")))
-                                continue;
-                }
-                paths[i] = NULL;
-        }
-        array_pack(paths, size);
-}
-
 static int
 device_mount_dxcore(struct nvc_context *ctx, const struct nvc_container *cnt)
 {
@@ -769,20 +747,12 @@ nvc_driver_mount(struct nvc_context *ctx, const struct nvc_container *cnt, const
                 goto fail;
 
         /* Container library mounts */
-        if (cnt->libs != NULL && cnt->nlibs > 0) {
-                size_t nlibs = cnt->nlibs;
-                char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
-                if (libs == NULL)
-                        goto fail;
-
-                filter_libraries(info, libs, &nlibs);
-                if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, libs, nlibs)) == NULL) {
-                        free(libs);
+        if ((cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) && cnt->libs != NULL && cnt->nlibs > 0) {
+                if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, cnt->libs, cnt->nlibs)) == NULL) {
                         goto fail;
                 }
                 ptr = array_append(ptr, tmp, array_size(tmp));
                 free(tmp);
-                free(libs);
         }
 
         /* Firmware mounts */
diff --git a/src/options.h b/src/options.h