Skip to content

Commit 8ed5824

Browse files
committed
Add cuda-compat-mode flag to configure command
This change adds a --cuda-compat-mode flag to the configure CLI. This allows more flexibility than the existing --no-cntlibs flag. Possible values of the flag are: * mount (default) - CUDA compat libraries are mounted from /usr/local/cuda/compat to the standard library path in the container. * ldconfig - The folder containing the CUDA compat libraries is added as a command line argument to the ldconfig command executed in the container. * disabled - This is equivalent to specifying the --no-cntlibs flag and skips the detection and injection of compat libraries from the container entirely. Signed-off-by: Evan Lezar <[email protected]>
1 parent a198166 commit 8ed5824

File tree

9 files changed

+247
-38
lines changed

9 files changed

+247
-38
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
8686
$(SRCS_DIR)/nvc_clt.c
8787

8888
BIN_SRCS := $(SRCS_DIR)/cli/common.c \
89+
$(SRCS_DIR)/cli/compat_mode.c \
8990
$(SRCS_DIR)/cli/configure.c \
9091
$(SRCS_DIR)/cli/dsl.c \
9192
$(SRCS_DIR)/cli/info.c \

src/cli/compat_mode.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
#include <err.h>
18+
#include <libgen.h>
19+
#undef basename /* Use the GNU version of basename. */
20+
#include <stdlib.h>
21+
22+
#include "cli.h"
23+
#include "compat_mode.h"
24+
25+
static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
26+
static int get_compat_library_path(struct error *, const char * [], size_t, char **);
27+
28+
int
29+
update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
30+
if (cnt->flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
31+
return (0);
32+
}
33+
if (cnt->libs == NULL || cnt->nlibs == 0) {
34+
return (0);
35+
}
36+
size_t nlibs = cnt->nlibs;
37+
char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
38+
if (libs == NULL) {
39+
return (-1);
40+
}
41+
42+
/* For cuda-compat-mode=mount, we also allow compat libraries with a LOWER major versions. */
43+
bool allow_lower_major_versions = (cnt-> flags & OPT_CUDA_COMPAT_MODE_MOUNT);
44+
filter_by_major_version(allow_lower_major_versions, info, libs, &nlibs);
45+
46+
/* Use the filtered library list. */
47+
free(cnt->libs);
48+
cnt->libs = libs;
49+
cnt->nlibs = nlibs;
50+
51+
if (!(cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
52+
return (0);
53+
}
54+
/* For cuda-compat-mode=ldconfig we also ensure that cuda_compat_dir is set. */
55+
if (get_compat_library_path(&ctx->err, (const char **)libs, nlibs, &cnt->cuda_compat_dir) < 0) {
56+
return (-1);
57+
}
58+
return (0);
59+
}
60+
61+
static void
62+
filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
63+
{
64+
char *lib, *maj;
65+
bool exclude;
66+
/*
67+
* XXX Filter out any library that has a lower or equal major version than RM to prevent us from
68+
* running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
69+
*/
70+
for (size_t i = 0; i < *size; ++i) {
71+
lib = basename(paths[i]);
72+
if ((maj = strstr(lib, ".so.")) != NULL) {
73+
maj += strlen(".so.");
74+
exclude = false;
75+
if (allow_lower_major_versions) {
76+
// Only filter out EQUAL RM versions.
77+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) == 0);
78+
} else {
79+
// If the major version of RM is greater than or equal to the major version
80+
// of the library that we are considering, we remove the library from the
81+
// list.
82+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) >= 0);
83+
}
84+
if (exclude) {
85+
paths[i] = NULL;
86+
}
87+
}
88+
}
89+
array_pack(paths, size);
90+
}
91+
92+
/*
 * get_compat_library_path determines the single directory that contains all
 * of the library files listed in paths.
 *
 * err                is currently not written to by this function.
 * paths / size       the library paths to inspect; the strings are not modified.
 * compat_dir_result  on success with size > 0, receives a newly allocated
 *                    string holding the common directory; the caller owns it
 *                    and must free() it. It is left untouched when size == 0
 *                    and on failure.
 *
 * Returns 0 on success, or -1 if memory could not be allocated or the
 * libraries do not all live in the same directory.
 *
 * NOTE(review): no message is recorded in err on the failure paths, so a
 * caller that reports err will not see a reason - consider setting one.
 */
static int
get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
{
        char *compat_dir = NULL;

        (void)err;

        if (size == 0) {
                return 0;
        }

        for (size_t i = 0; i < size; ++i) {
                /* dirname() may modify its argument, so never hand it the caller's string. */
                char *scratch = strdup(paths[i]);
                if (scratch == NULL) {
                        goto fail;
                }
                const char *dir = dirname(scratch);

                int ok;
                if (i == 0) {
                        /* The first library defines the directory all others must share. */
                        compat_dir = strdup(dir);
                        ok = (compat_dir != NULL);
                } else {
                        ok = (strcmp(dir, compat_dir) == 0);
                }
                /* dir may point into scratch, so release it only after the last use of dir. */
                free(scratch);
                if (!ok) {
                        goto fail;
                }
        }

        *compat_dir_result = compat_dir;
        return 0;

 fail:
        free(compat_dir);
        return -1;
}

src/cli/compat_mode.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
18+
#ifndef HEADER_COMPAT_MODE_H
19+
#define HEADER_COMPAT_MODE_H
20+
21+
// TODO: These are duplicated from options.h to prevent conflicts with the CLI
22+
// options header.
23+
enum {
        /*
         * cuda-compat-mode=disabled: detection and injection of CUDA compat
         * libraries is skipped. OPT_CUDA_COMPAT_MODE_DISABLED replaced
         * OPT_NO_CNTLIBS.
         */
        OPT_CUDA_COMPAT_MODE_DISABLED = (1 << 14),
        /*
         * cuda-compat-mode=ldconfig: the directory containing the compat
         * libraries is passed to ldconfig on its command line.
         */
        OPT_CUDA_COMPAT_MODE_LDCONFIG = (1 << 15),
        /*
         * cuda-compat-mode=mount (default): the compat libraries are mounted
         * into the standard library path of the container.
         */
        OPT_CUDA_COMPAT_MODE_MOUNT    = (1 << 16),
};
29+
30+
int update_compat_libraries(struct nvc_context *, struct nvc_container *, const struct nvc_driver_info *);
31+
32+
33+
#endif /* HEADER_COMPAT_MODE_H */

src/cli/configure.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cli.h"
99
#include "dsl.h"
10+
#include "compat_mode.h"
1011

1112
static error_t configure_parser(int, char *, struct argp_state *);
1213
static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -36,7 +37,8 @@ const struct argp configure_usage = {
3637
{"no-persistenced", 0x86, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
3738
{"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
3839
{"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
39-
{"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
40+
{"no-cntlibs", 0x89, NULL, 0, "[Deprecated] Equivalent to --cuda-compat-mode=disabled", -1},
41+
{"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
4042
{0},
4143
},
4244
configure_parser,
@@ -167,7 +169,15 @@ configure_parser(int key, char *arg, struct argp_state *state)
167169
goto fatal;
168170
break;
169171
case 0x89:
170-
if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
172+
/* The --no-cntlibs command line flag is equivalent to --cuda-compat-mode=disabled. */
173+
if (str_join(&err, &ctx->container_flags, "cuda-compat-mode=disabled", " ") < 0)
174+
goto fatal;
175+
break;
176+
case 0x90:
177+
/* We add cuda-compat-mode=$arg to the container_flags. */
178+
if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
179+
goto fatal;
180+
if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
171181
goto fatal;
172182
break;
173183
case ARGP_KEY_ARG:
@@ -316,6 +326,15 @@ configure_command(const struct context *ctx)
316326
goto fail;
317327
}
318328

329+
/*
330+
* We now have the driver version and can update the list of compat
331+
* libraries discovered above accordingly.
332+
*/
333+
if (update_compat_libraries(nvc, cnt, drv) < 0) {
334+
warn("updating compat library settings failed: %s", libnvc.error(nvc));
335+
goto fail;
336+
}
337+
319338
/* Allocate space for selecting GPU devices and MIG devices */
320339
if (new_devices(&err, dev, &devices) < 0) {
321340
warn("memory allocation failed: %s", err.msg);

src/nvc_container.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ static char *find_namespace_path(struct error *, const struct nvc_container *, c
2424
static int find_compat_library_paths(struct error *, struct nvc_container *);
2525
static int lookup_owner(struct error *, struct nvc_container *);
2626
static int copy_config(struct error *, struct nvc_container *, const struct nvc_container_config *);
27+
static int validate_cuda_compat_mode_flags(struct error *, int32_t *);
2728

2829
struct nvc_container_config *
2930
nvc_container_config_new(pid_t pid, const char *rootfs)
@@ -236,6 +237,9 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
236237
error_setx(&ctx->err, "invalid mode of operation");
237238
return (NULL);
238239
}
240+
if (validate_cuda_compat_mode_flags(&ctx->err, &flags) < 0) {
241+
return (NULL);
242+
}
239243

240244
log_infof("configuring container with '%s'", opts);
241245
if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -246,7 +250,7 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
246250
goto fail;
247251
if (lookup_owner(&ctx->err, cnt) < 0)
248252
goto fail;
249-
if (!(flags & OPT_NO_CNTLIBS)) {
253+
if (!(flags & OPT_CUDA_COMPAT_MODE_DISABLED)) {
250254
if (find_compat_library_paths(&ctx->err, cnt) < 0)
251255
goto fail;
252256
}
@@ -293,5 +297,41 @@ nvc_container_free(struct nvc_container *cnt)
293297
free(cnt->mnt_ns);
294298
free(cnt->dev_cg);
295299
array_free(cnt->libs, cnt->nlibs);
300+
free(cnt->cuda_compat_dir);
296301
free(cnt);
297302
}
303+
304+
/*
305+
* validate_cuda_compat_mode_flags checks the options associated with the
306+
* cuda-compat-mode flags.
307+
* This function does the following:
308+
* - Ensures that if OPT_CUDA_COMPAT_MODE_DISABLED is set, other modes are ignored.
309+
* - Ensures that the mode is set to the default (OPT_CUDA_COMPAT_MODE_MOUNT) if unset.
310+
* - Ensures that only a single mode is set.
311+
*/
312+
static int
313+
validate_cuda_compat_mode_flags(struct error *err, int32_t *flags) {
314+
if (*flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
315+
/*
316+
* If the OPT_CUDA_COMPAT_MODE_DISABLED flag is specified, we
317+
* explicitly ignore other OP_CUDA_COMPAT_MODE_* flags.
318+
*/
319+
*flags &= ~(OPT_CUDA_COMPAT_MODE_MOUNT | OPT_CUDA_COMPAT_MODE_LDCONFIG);
320+
return (0);
321+
}
322+
if (!(*flags & (OPT_CUDA_COMPAT_MODE_LDCONFIG | OPT_CUDA_COMPAT_MODE_MOUNT))) {
323+
/*
324+
* If no OPT_CUDA_COMPAT_MODE_* flags are specified,
325+
* default to OPT_CUDA_COMPAT_MODE_MOUNT to maintain
326+
* backward compatibility.
327+
*/
328+
*flags &= OPT_CUDA_COMPAT_MODE_MOUNT;
329+
return (0);
330+
}
331+
332+
if ((*flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (*flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
333+
error_setx(err, "only one cuda-compat-mode can be specified at a time");
334+
return (-1);
335+
}
336+
return (0);
337+
}

src/nvc_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ struct nvc_container {
8484
char *dev_cg;
8585
char **libs;
8686
size_t nlibs;
87+
char *cuda_compat_dir;
8788
};
8889

8990
enum {

src/nvc_ldcache.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "nvc_internal.h"
3131

3232
#include "error.h"
33+
#include "options.h"
3334
#include "utils.h"
3435
#include "xfuncs.h"
3536

@@ -471,7 +472,19 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
471472
if (validate_args(ctx, cnt != NULL) < 0)
472473
return (-1);
473474

474-
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
475+
if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG && cnt->cuda_compat_dir != NULL) {
476+
/*
477+
* We include the cuda_compat_dir directory on the ldconfig
478+
* command line. This ensures that the CUDA Forward compat
479+
* libraries take precendence over the user-mode driver
480+
* libraries in the standard library paths (libs_dir and
481+
* libs32_dir).
482+
* */
483+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
484+
} else {
485+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
486+
}
487+
475488
if (*argv[0] == '@') {
476489
/*
477490
* We treat this path specially to be relative to the host filesystem.

src/nvc_mount.c

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ static int update_app_profile(struct error *, const struct nvc_container *, dev
4040
static void unmount(const char *);
4141
static int symlink_library(struct error *, const char *, const char *, const char *, uid_t, gid_t);
4242
static int symlink_libraries(struct error *, const struct nvc_container *, const char * const [], size_t);
43-
static void filter_libraries(const struct nvc_driver_info *, char * [], size_t *);
4443
static int device_mount_dxcore(struct nvc_context *, const struct nvc_container *);
4544
static int device_mount_native(struct nvc_context *, const struct nvc_container *, const struct nvc_device *);
4645
static int cap_device_mount(struct nvc_context *, const struct nvc_container *, const char *);
@@ -562,27 +561,6 @@ symlink_libraries(struct error *err, const struct nvc_container *cnt, const char
562561
return (0);
563562
}
564563

565-
static void
566-
filter_libraries(const struct nvc_driver_info *info, char * paths[], size_t *size)
567-
{
568-
char *lib, *maj;
569-
570-
/*
571-
* XXX Filter out any library that matches the major version of RM to prevent us from
572-
* running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
573-
*/
574-
for (size_t i = 0; i < *size; ++i) {
575-
lib = basename(paths[i]);
576-
if ((maj = strstr(lib, ".so.")) != NULL) {
577-
maj += strlen(".so.");
578-
if (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")))
579-
continue;
580-
}
581-
paths[i] = NULL;
582-
}
583-
array_pack(paths, size);
584-
}
585-
586564
static int
587565
device_mount_dxcore(struct nvc_context *ctx, const struct nvc_container *cnt)
588566
{
@@ -769,20 +747,12 @@ nvc_driver_mount(struct nvc_context *ctx, const struct nvc_container *cnt, const
769747
goto fail;
770748

771749
/* Container library mounts */
772-
if (cnt->libs != NULL && cnt->nlibs > 0) {
773-
size_t nlibs = cnt->nlibs;
774-
char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
775-
if (libs == NULL)
776-
goto fail;
777-
778-
filter_libraries(info, libs, &nlibs);
779-
if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, libs, nlibs)) == NULL) {
780-
free(libs);
750+
if ((cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) && cnt->libs != NULL && cnt->nlibs > 0) {
751+
if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, cnt->libs, cnt->nlibs)) == NULL) {
781752
goto fail;
782753
}
783754
ptr = array_append(ptr, tmp, array_size(tmp));
784755
free(tmp);
785-
free(libs);
786756
}
787757

788758
/* Firmware mounts */

0 commit comments

Comments
 (0)