From 872aa2fe1c3439da8457b9bc7467ef999839bb80 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 May 2025 16:45:20 +0200 Subject: [PATCH 1/2] Reenable nvsandboxutils for driver discovery This change reenables nvsandboxutils for driver discovery. This was disabled due to an error in a specific driver version (v565) so as to not block the release of the DRA driver for ComputeDomains. Signed-off-by: Evan Lezar --- pkg/nvcdi/lib.go | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 97a391682..4723f1e08 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -108,24 +108,7 @@ func New(opts ...Option) (Interface, error) { } l.nvmllib = nvml.New(nvmlOpts...) } - // TODO: Repeated calls to nvsandboxutils.Init and Shutdown are causing - // segmentation violations. Here we disabled nvsandbox utils unless explicitly - // specified. - // This will be reenabled as soon as we have more visibility into why this is - // happening and a mechanism to detect and disable this if required. - // if l.nvsandboxutilslib == nil { - // var nvsandboxutilsOpts []nvsandboxutils.LibraryOption - // // Set the library path for libnvidia-sandboxutils - // candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1") - // if err != nil { - // l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err) - // } else { - // libNvidiaSandboxutilsPath := candidates[0] - // l.logger.Infof("Using %v", libNvidiaSandboxutilsPath) - // nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath)) - // } - // l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...) 
- // } + l.nvsandboxutilslib = l.getNvsandboxUtilsLib() if l.devicelib == nil { l.devicelib = device.New(l.nvmllib) } @@ -231,3 +214,23 @@ func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) { } return version, nil } + +// getNvsandboxUtilsLib returns the nvsandboxutilslib to use for CDI spec +// generation. +func (l *nvcdilib) getNvsandboxUtilsLib() nvsandboxutils.Interface { + if l.nvsandboxutilslib != nil { + return l.nvsandboxutilslib + } + + var nvsandboxutilsOpts []nvsandboxutils.LibraryOption + // Set the library path for libnvidia-sandboxutils + candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1") + if err != nil { + l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err) + } else { + libNvidiaSandboxutilsPath := candidates[0] + l.logger.Infof("Using %v", libNvidiaSandboxutilsPath) + nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath)) + } + return nvsandboxutils.New(nvsandboxutilsOpts...) +} From 7bd65da91eb3dffaa057ebbbf83f4d6a9a0c4e27 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 May 2025 16:35:01 +0200 Subject: [PATCH 2/2] Add FeatureFlags to the nvcdi API This change adds support for feature flags to the nvcdi API. A feature flag to disable nvsandboxutils is also added to allow more flexibility in cases where this library causes issues. Signed-off-by: Evan Lezar --- pkg/nvcdi/api.go | 10 ++++++++++ pkg/nvcdi/lib.go | 6 ++++++ pkg/nvcdi/options.go | 11 +++++++++++ 3 files changed, 27 insertions(+) diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index 2988026f3..8c37f2778 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -45,3 +45,13 @@ const ( // This was added with v1.17.5 of the NVIDIA Container Toolkit. HookEnableCudaCompat = HookName("enable-cuda-compat") ) + +// A FeatureFlag refers to a specific feature that can be toggled in the CDI API. +// All features are off by default.
+type FeatureFlag string + +const ( + // FeatureDisableNvsandboxUtils disables the use of nvsandboxutils when + // querying devices. + FeatureDisableNvsandboxUtils = FeatureFlag("disable-nvsandbox-utils") +) diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 4723f1e08..165a71366 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -56,6 +56,8 @@ type nvcdilib struct { mergedDeviceOptions []transform.MergedDeviceOption + featureFlags map[FeatureFlag]bool + disabledHooks disabledHooks hookCreator discover.HookCreator } @@ -64,6 +66,7 @@ type nvcdilib struct { func New(opts ...Option) (Interface, error) { l := &nvcdilib{ disabledHooks: make(disabledHooks), + featureFlags: make(map[FeatureFlag]bool), } for _, opt := range opts { opt(l) @@ -218,6 +221,9 @@ func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) { // getNvsandboxUtilsLib returns the nvsandboxutilslib to use for CDI spec // generation. func (l *nvcdilib) getNvsandboxUtilsLib() nvsandboxutils.Interface { + if l.featureFlags[FeatureDisableNvsandboxUtils] { + return nil + } if l.nvsandboxutilslib != nil { return l.nvsandboxutilslib } diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index f38f2b4a9..7c76f7fc5 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -166,3 +166,14 @@ func WithDisabledHook(hook HookName) Option { o.disabledHooks[hook] = true } } + +// WithFeatureFlag allows specified features to be toggled on. +// This option can be specified multiple times for each feature flag. +func WithFeatureFlag(featureFlag FeatureFlag) Option { + return func(o *nvcdilib) { + if o.featureFlags == nil { + o.featureFlags = make(map[FeatureFlag]bool) + } + o.featureFlags[featureFlag] = true + } +}