Skip to content

Commit f537b96

Browse files
authored
Merge pull request #387 from klueska/bundle-nvidia-ctk
Remove external dependency on nvidia-container-toolkit (i.e. nvidia-cdi-hook)
2 parents f2cef6f + fe0d118 commit f537b96

File tree

20 files changed

+174
-76
lines changed

20 files changed

+174
-76
lines changed

cmd/compute-domain-kubelet-plugin/cdi.go

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,16 @@ const (
4747
)
4848

4949
type CDIHandler struct {
50-
logger *logrus.Logger
51-
nvml nvml.Interface
52-
nvdevice nvdevice.Interface
53-
nvcdiDevice nvcdi.Interface
54-
nvcdiClaim nvcdi.Interface
55-
cache *cdiapi.Cache
56-
driverRoot string
57-
devRoot string
58-
targetDriverRoot string
59-
nvidiaCTKPath string
50+
logger *logrus.Logger
51+
nvml nvml.Interface
52+
nvdevice nvdevice.Interface
53+
nvcdiDevice nvcdi.Interface
54+
nvcdiClaim nvcdi.Interface
55+
cache *cdiapi.Cache
56+
driverRoot string
57+
devRoot string
58+
targetDriverRoot string
59+
nvidiaCDIHookPath string
6060

6161
cdiRoot string
6262
vendor string
@@ -102,7 +102,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
102102
nvcdi.WithMode("management"),
103103
nvcdi.WithVendor(h.vendor),
104104
nvcdi.WithClass(h.deviceClass),
105-
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCTKPath),
105+
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
106106
)
107107
if err != nil {
108108
return nil, fmt.Errorf("unable to create CDI library for devices: %w", err)
@@ -119,9 +119,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
119119
nvcdi.WithMode("nvml"),
120120
nvcdi.WithVendor(h.vendor),
121121
nvcdi.WithClass(h.claimClass),
122-
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCTKPath),
123-
// TODO: This should be removed once the use of a NVIDIA Container Toolkit >= v1.17.5 is commonplace.
124-
nvcdi.WithDisabledHook(nvcdi.HookEnableCudaCompat),
122+
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
125123
)
126124
if err != nil {
127125
return nil, fmt.Errorf("unable to create CDI library for claims: %w", err)

cmd/compute-domain-kubelet-plugin/cdioptions.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ func WithCDIRoot(cdiRoot string) cdiOption {
5252
}
5353
}
5454

55-
// WithNvidiaCTKPath provides an cdiOption to set the nvidia-ctk path used by the 'cdi' interface.
56-
func WithNvidiaCTKPath(path string) cdiOption {
55+
// WithNVIDIACDIHookPath provides a cdiOption to set the nvidia-cdi-hook path used by the 'cdi' interface.
56+
func WithNVIDIACDIHookPath(path string) cdiOption {
5757
return func(c *CDIHandler) {
58-
c.nvidiaCTKPath = path
58+
c.nvidiaCDIHookPath = path
5959
}
6060
}
6161

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
7676
WithDriverRoot(string(containerDriverRoot)),
7777
WithDevRoot(devRoot),
7878
WithTargetDriverRoot(hostDriverRoot),
79-
WithNvidiaCTKPath(config.flags.nvidiaCTKPath),
79+
WithNVIDIACDIHookPath(config.flags.nvidiaCDIHookPath),
8080
WithCDIRoot(config.flags.cdiRoot),
8181
WithVendor(cdiVendor),
8282
)

cmd/compute-domain-kubelet-plugin/main.go

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"os"
2323
"os/signal"
24+
"path/filepath"
2425
"syscall"
2526

2627
"github.com/urfave/cli/v2"
@@ -46,7 +47,7 @@ type Flags struct {
4647
cdiRoot string
4748
containerDriverRoot string
4849
hostDriverRoot string
49-
nvidiaCTKPath string
50+
nvidiaCDIHookPath string
5051
}
5152

5253
type Config struct {
@@ -103,11 +104,10 @@ func newApp() *cli.App {
103104
EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
104105
},
105106
&cli.StringFlag{
106-
Name: "nvidia-ctk-path",
107-
Value: "/usr/bin/nvidia-ctk",
108-
Usage: "the path to use for the nvidia-ctk in the generated CDI specification. Note that this represents the path on the host.",
109-
Destination: &flags.nvidiaCTKPath,
110-
EnvVars: []string{"NVIDIA_CTK_PATH"},
107+
Name: "nvidia-cdi-hook-path",
108+
Usage: "Absolute path to the nvidia-cdi-hook executable in the host file system. Used in the generated CDI specification.",
109+
Destination: &flags.nvidiaCDIHookPath,
110+
EnvVars: []string{"NVIDIA_CDI_HOOK_PATH"},
111111
},
112112
}
113113
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
@@ -152,12 +152,20 @@ func newApp() *cli.App {
152152
return app
153153
}
154154

155+
// StartPlugin initializes and runs the compute domain kubelet plugin.
155156
func StartPlugin(ctx context.Context, config *Config) error {
157+
// Create the plugin directory
156158
err := os.MkdirAll(DriverPluginPath, 0750)
157159
if err != nil {
158160
return err
159161
}
160162

163+
// Setup nvidia-cdi-hook binary
164+
if err := config.flags.setNvidiaCDIHookPath(); err != nil {
165+
return fmt.Errorf("error setting up nvidia-cdi-hook: %w", err)
166+
}
167+
168+
// Initialize CDI root directory
161169
info, err := os.Stat(config.flags.cdiRoot)
162170
switch {
163171
case err != nil && os.IsNotExist(err):
@@ -171,9 +179,11 @@ func StartPlugin(ctx context.Context, config *Config) error {
171179
return fmt.Errorf("path for cdi file generation is not a directory: '%v'", config.flags.cdiRoot)
172180
}
173181

182+
// Setup signal handling for graceful shutdown
174183
sigs := make(chan os.Signal, 1)
175184
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
176185

186+
// Create a cancellable context for cleanup
177187
var driver *driver
178188
ctx, cancel := context.WithCancel(ctx)
179189
defer func() {
@@ -183,12 +193,43 @@ func StartPlugin(ctx context.Context, config *Config) error {
183193
}
184194
}()
185195

196+
// Create and start the driver
186197
driver, err = NewDriver(ctx, config)
187198
if err != nil {
188199
return fmt.Errorf("error creating driver: %w", err)
189200
}
190201

202+
// Wait for shutdown signal
191203
<-sigs
192204

193205
return nil
194206
}
207+
208+
// setNvidiaCDIHookPath ensures the proper flag is set with the host path for the nvidia-cdi-hook binary.
209+
// If 'f.nvidiaCDIHookPath' is already set (from the command line), do nothing.
210+
// If 'f.nvidiaCDIHookPath' is empty, it copies the nvidia-cdi-hook binary from
211+
// /usr/bin/nvidia-cdi-hook to DriverPluginPath and sets 'f.nvidiaCDIHookPath'
212+
// to this path. The /usr/bin/nvidia-cdi-hook is present in the current
213+
// container image because it is copied from the toolkit image into this
214+
// container at build time.
215+
func (f *Flags) setNvidiaCDIHookPath() error {
216+
if f.nvidiaCDIHookPath != "" {
217+
return nil
218+
}
219+
220+
sourcePath := "/usr/bin/nvidia-cdi-hook"
221+
targetPath := filepath.Join(DriverPluginPath, "nvidia-cdi-hook")
222+
223+
input, err := os.ReadFile(sourcePath)
224+
if err != nil {
225+
return fmt.Errorf("error reading nvidia-cdi-hook: %w", err)
226+
}
227+
228+
if err := os.WriteFile(targetPath, input, 0755); err != nil {
229+
return fmt.Errorf("error copying nvidia-cdi-hook: %w", err)
230+
}
231+
232+
f.nvidiaCDIHookPath = targetPath
233+
234+
return nil
235+
}

cmd/gpu-kubelet-plugin/cdi.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,16 +47,16 @@ const (
4747
)
4848

4949
type CDIHandler struct {
50-
logger *logrus.Logger
51-
nvml nvml.Interface
52-
nvdevice nvdevice.Interface
53-
nvcdiDevice nvcdi.Interface
54-
nvcdiClaim nvcdi.Interface
55-
cache *cdiapi.Cache
56-
driverRoot string
57-
devRoot string
58-
targetDriverRoot string
59-
nvidiaCTKPath string
50+
logger *logrus.Logger
51+
nvml nvml.Interface
52+
nvdevice nvdevice.Interface
53+
nvcdiDevice nvcdi.Interface
54+
nvcdiClaim nvcdi.Interface
55+
cache *cdiapi.Cache
56+
driverRoot string
57+
devRoot string
58+
targetDriverRoot string
59+
nvidiaCDIHookPath string
6060

6161
cdiRoot string
6262
vendor string
@@ -102,7 +102,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
102102
nvcdi.WithMode("nvml"),
103103
nvcdi.WithVendor(h.vendor),
104104
nvcdi.WithClass(h.deviceClass),
105-
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCTKPath),
105+
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
106106
)
107107
if err != nil {
108108
return nil, fmt.Errorf("unable to create CDI library for devices: %w", err)
@@ -119,7 +119,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
119119
nvcdi.WithMode("nvml"),
120120
nvcdi.WithVendor(h.vendor),
121121
nvcdi.WithClass(h.claimClass),
122-
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCTKPath),
122+
nvcdi.WithNVIDIACDIHookPath(h.nvidiaCDIHookPath),
123123
)
124124
if err != nil {
125125
return nil, fmt.Errorf("unable to create CDI library for claims: %w", err)

cmd/gpu-kubelet-plugin/cdioptions.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ func WithCDIRoot(cdiRoot string) cdiOption {
5252
}
5353
}
5454

55-
// WithNvidiaCTKPath provides an cdiOption to set the nvidia-ctk path used by the 'cdi' interface.
56-
func WithNvidiaCTKPath(path string) cdiOption {
55+
// WithNVIDIACDIHookPath provides a cdiOption to set the nvidia-cdi-hook path used by the 'cdi' interface.
56+
func WithNVIDIACDIHookPath(path string) cdiOption {
5757
return func(c *CDIHandler) {
58-
c.nvidiaCTKPath = path
58+
c.nvidiaCDIHookPath = path
5959
}
6060
}
6161

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
7676
WithDriverRoot(string(containerDriverRoot)),
7777
WithDevRoot(devRoot),
7878
WithTargetDriverRoot(hostDriverRoot),
79-
WithNvidiaCTKPath(config.flags.nvidiaCTKPath),
79+
WithNVIDIACDIHookPath(config.flags.nvidiaCDIHookPath),
8080
WithCDIRoot(config.flags.cdiRoot),
8181
WithVendor(cdiVendor),
8282
)

cmd/gpu-kubelet-plugin/main.go

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"os"
2323
"os/signal"
24+
"path/filepath"
2425
"syscall"
2526

2627
"github.com/urfave/cli/v2"
@@ -46,7 +47,7 @@ type Flags struct {
4647
cdiRoot string
4748
containerDriverRoot string
4849
hostDriverRoot string
49-
nvidiaCTKPath string
50+
nvidiaCDIHookPath string
5051
imageName string
5152
}
5253

@@ -104,11 +105,10 @@ func newApp() *cli.App {
104105
EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
105106
},
106107
&cli.StringFlag{
107-
Name: "nvidia-ctk-path",
108-
Value: "/usr/bin/nvidia-ctk",
109-
Usage: "the path to use for the nvidia-ctk in the generated CDI specification. Note that this represents the path on the host.",
110-
Destination: &flags.nvidiaCTKPath,
111-
EnvVars: []string{"NVIDIA_CTK_PATH"},
108+
Name: "nvidia-cdi-hook-path",
109+
Usage: "Absolute path to the nvidia-cdi-hook executable in the host file system. Used in the generated CDI specification.",
110+
Destination: &flags.nvidiaCDIHookPath,
111+
EnvVars: []string{"NVIDIA_CDI_HOOK_PATH"},
112112
},
113113
&cli.StringFlag{
114114
Name: "image-name",
@@ -160,12 +160,20 @@ func newApp() *cli.App {
160160
return app
161161
}
162162

163+
// StartPlugin initializes and runs the GPU kubelet plugin.
163164
func StartPlugin(ctx context.Context, config *Config) error {
165+
// Create the plugin directory
164166
err := os.MkdirAll(DriverPluginPath, 0750)
165167
if err != nil {
166168
return err
167169
}
168170

171+
// Setup nvidia-cdi-hook binary
172+
if err := config.flags.setNvidiaCDIHookPath(); err != nil {
173+
return fmt.Errorf("error setting up nvidia-cdi-hook: %w", err)
174+
}
175+
176+
// Initialize CDI root directory
169177
info, err := os.Stat(config.flags.cdiRoot)
170178
switch {
171179
case err != nil && os.IsNotExist(err):
@@ -179,9 +187,11 @@ func StartPlugin(ctx context.Context, config *Config) error {
179187
return fmt.Errorf("path for cdi file generation is not a directory: '%v'", config.flags.cdiRoot)
180188
}
181189

190+
// Setup signal handling for graceful shutdown
182191
sigs := make(chan os.Signal, 1)
183192
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
184193

194+
// Create a cancellable context for cleanup
185195
var driver *driver
186196
ctx, cancel := context.WithCancel(ctx)
187197
defer func() {
@@ -191,12 +201,42 @@ func StartPlugin(ctx context.Context, config *Config) error {
191201
}
192202
}()
193203

204+
// Create and start the driver
194205
driver, err = NewDriver(ctx, config)
195206
if err != nil {
196207
return fmt.Errorf("error creating driver: %w", err)
197208
}
198209

210+
// Wait for shutdown signal
199211
<-sigs
200212

201213
return nil
202214
}
215+
216+
// If 'f.nvidiaCDIHookPath' is already set (from the command line), do nothing.
217+
// If 'f.nvidiaCDIHookPath' is empty, it copies the nvidia-cdi-hook binary from
218+
// /usr/bin/nvidia-cdi-hook to DriverPluginPath and sets 'f.nvidiaCDIHookPath'
219+
// to this path. The /usr/bin/nvidia-cdi-hook is present in the current
220+
// container image because it is copied from the toolkit image into this
221+
// container at build time.
222+
func (f *Flags) setNvidiaCDIHookPath() error {
223+
if f.nvidiaCDIHookPath != "" {
224+
return nil
225+
}
226+
227+
sourcePath := "/usr/bin/nvidia-cdi-hook"
228+
targetPath := filepath.Join(DriverPluginPath, "nvidia-cdi-hook")
229+
230+
input, err := os.ReadFile(sourcePath)
231+
if err != nil {
232+
return fmt.Errorf("error reading nvidia-cdi-hook: %w", err)
233+
}
234+
235+
if err := os.WriteFile(targetPath, input, 0755); err != nil {
236+
return fmt.Errorf("error copying nvidia-cdi-hook: %w", err)
237+
}
238+
239+
f.nvidiaCDIHookPath = targetPath
240+
241+
return nil
242+
}

demo/clusters/kind/install-dra-driver-gpu.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ source "${CURRENT_DIR}/scripts/common.sh"
2525
kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/gpu.present=true
2626

2727
helm upgrade -i --create-namespace --namespace nvidia-dra-driver-gpu nvidia-dra-driver-gpu ${PROJECT_DIR}/deployments/helm/nvidia-dra-driver-gpu \
28-
${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \
2928
${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \
3029
${MASK_NVIDIA_DRIVER_PARAMS:+--set maskNvidiaDriverParams=${MASK_NVIDIA_DRIVER_PARAMS}} \
3130
--set gpuResourcesEnabledOverride=true \

demo/clusters/kind/scripts/kind-cluster-config.yaml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,6 @@ nodes:
5959
# in `/etc/nvidia-container-runtime/config.toml`
6060
- hostPath: /dev/null
6161
containerPath: /var/run/nvidia-container-devices/cdi/runtime.nvidia.com/gpu/all
62-
# The generated CDI specification assumes that `nvidia-ctk` is available on a
63-
# node -- specifically for the `nvidia-ctk hook` subcommand. As a workaround,
64-
# we mount it from the host.
65-
# TODO: Remove this once we have a more stable solution to make `nvidia-ctk`
66-
# available on the kind nodes.
67-
- hostPath: /usr/bin/nvidia-ctk
68-
containerPath: /usr/bin/nvidia-ctk
6962
# We need to inject the fabricmanager socket to support MIG with toolkit 1.16.2
7063
# TODO: Remove this once we have a version of the toolkit where this is not required
7164
- hostPath: /run/nvidia-fabricmanager/socket

0 commit comments

Comments
 (0)