Skip to content

Commit 2987c4d

Browse files
authored
Merge pull request #740 from elezar/imex-by-volume-mount
Allow IMEX channel requests by volume mount
2 parents b077e26 + 2e6712d commit 2987c4d

File tree

12 files changed

+411
-435
lines changed

12 files changed

+411
-435
lines changed

cmd/nvidia-container-runtime-hook/container_config.go

Lines changed: 43 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -6,40 +6,22 @@ import (
66
"log"
77
"os"
88
"path"
9-
"path/filepath"
10-
"strings"
119

1210
"github.com/opencontainers/runtime-spec/specs-go"
1311
"golang.org/x/mod/semver"
1412

1513
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
1614
)
1715

18-
const (
19-
envCUDAVersion = "CUDA_VERSION"
20-
envNVRequirePrefix = "NVIDIA_REQUIRE_"
21-
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
22-
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
23-
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
24-
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
25-
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
26-
envNVImexChannels = "NVIDIA_IMEX_CHANNELS"
27-
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
28-
)
29-
3016
const (
3117
capSysAdmin = "CAP_SYS_ADMIN"
3218
)
3319

34-
const (
35-
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
36-
)
37-
3820
type nvidiaConfig struct {
39-
Devices string
21+
Devices []string
4022
MigConfigDevices string
4123
MigMonitorDevices string
42-
ImexChannels string
24+
ImexChannels []string
4325
DriverCapabilities string
4426
// Requirements defines the requirements DSL for the container to run.
4527
// This is empty if no specific requirements are needed, or if requirements are
@@ -77,23 +59,14 @@ type LinuxCapabilities struct {
7759
Ambient []string `json:"ambient,omitempty" platform:"linux"`
7860
}
7961

80-
// Mount from OCI runtime spec
81-
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
82-
type Mount struct {
83-
Destination string `json:"destination"`
84-
Type string `json:"type,omitempty" platform:"linux,solaris"`
85-
Source string `json:"source,omitempty"`
86-
Options []string `json:"options,omitempty"`
87-
}
88-
8962
// Spec from OCI runtime spec
9063
// We use pointers to structs, similarly to the latest version of runtime-spec:
9164
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
9265
type Spec struct {
93-
Version *string `json:"ociVersion"`
94-
Process *Process `json:"process,omitempty"`
95-
Root *Root `json:"root,omitempty"`
96-
Mounts []Mount `json:"mounts,omitempty"`
66+
Version *string `json:"ociVersion"`
67+
Process *Process `json:"process,omitempty"`
68+
Root *Root `json:"root,omitempty"`
69+
Mounts []specs.Mount `json:"mounts,omitempty"`
9770
}
9871

9972
// HookState holds state information about the hook
@@ -172,82 +145,30 @@ func isPrivileged(s *Spec) bool {
172145
return image.IsPrivileged(&fullSpec)
173146
}
174147

175-
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
148+
func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []string) []string {
176149
// We check if the image has at least one of the Swarm resource envvars defined and use this
177150
// if specified.
178-
var hasSwarmEnvvar bool
179151
for _, envvar := range swarmResourceEnvvars {
180-
if image.HasEnvvar(envvar) {
181-
hasSwarmEnvvar = true
182-
break
152+
if containerImage.HasEnvvar(envvar) {
153+
return containerImage.DevicesFromEnvvars(swarmResourceEnvvars...).List()
183154
}
184155
}
185156

186-
var devices []string
187-
if hasSwarmEnvvar {
188-
devices = image.DevicesFromEnvvars(swarmResourceEnvvars...).List()
189-
} else {
190-
devices = image.DevicesFromEnvvars(envNVVisibleDevices).List()
191-
}
192-
193-
if len(devices) == 0 {
194-
return nil
195-
}
196-
197-
devicesString := strings.Join(devices, ",")
198-
199-
return &devicesString
157+
return containerImage.VisibleDevicesFromEnvVar()
200158
}
201159

202-
func getDevicesFromMounts(mounts []Mount) *string {
203-
var devices []string
204-
for _, m := range mounts {
205-
root := filepath.Clean(deviceListAsVolumeMountsRoot)
206-
source := filepath.Clean(m.Source)
207-
destination := filepath.Clean(m.Destination)
208-
209-
// Only consider mounts whose host volume is /dev/null
210-
if source != "/dev/null" {
211-
continue
212-
}
213-
// Only consider container mount points that begin with 'root'
214-
if len(destination) < len(root) {
215-
continue
216-
}
217-
if destination[:len(root)] != root {
218-
continue
219-
}
220-
// Grab the full path beyond 'root' and add it to the list of devices
221-
device := destination[len(root):]
222-
if len(device) > 0 && device[0] == '/' {
223-
device = device[1:]
224-
}
225-
if len(device) == 0 {
226-
continue
227-
}
228-
devices = append(devices, device)
229-
}
230-
231-
if devices == nil {
232-
return nil
233-
}
234-
235-
ret := strings.Join(devices, ",")
236-
return &ret
237-
}
238-
239-
func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string {
160+
func getDevices(hookConfig *HookConfig, image image.CUDA, privileged bool) []string {
240161
// If enabled, try and get the device list from volume mounts first
241162
if hookConfig.AcceptDeviceListAsVolumeMounts {
242-
devices := getDevicesFromMounts(mounts)
243-
if devices != nil {
163+
devices := image.VisibleDevicesFromMounts()
164+
if len(devices) > 0 {
244165
return devices
245166
}
246167
}
247168

248169
// Fall back to reading from the environment variable if privileges are correct
249170
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
250-
if devices == nil {
171+
if len(devices) == 0 {
251172
return nil
252173
}
253174
if privileged || hookConfig.AcceptEnvvarUnprivileged {
@@ -260,12 +181,12 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
260181
return nil
261182
}
262183

263-
func getMigConfigDevices(image image.CUDA) *string {
264-
return getMigDevices(image, envNVMigConfigDevices)
184+
func getMigConfigDevices(i image.CUDA) *string {
185+
return getMigDevices(i, image.EnvVarNvidiaMigConfigDevices)
265186
}
266187

267-
func getMigMonitorDevices(image image.CUDA) *string {
268-
return getMigDevices(image, envNVMigMonitorDevices)
188+
func getMigMonitorDevices(i image.CUDA) *string {
189+
return getMigDevices(i, image.EnvVarNvidiaMigMonitorDevices)
269190
}
270191

271192
func getMigDevices(image image.CUDA, envvar string) *string {
@@ -276,12 +197,24 @@ func getMigDevices(image image.CUDA, envvar string) *string {
276197
return &devices
277198
}
278199

279-
func getImexChannels(image image.CUDA) *string {
280-
if !image.HasEnvvar(envNVImexChannels) {
200+
func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) []string {
201+
// If enabled, try to get the IMEX channel list from volume mounts first
202+
if hookConfig.AcceptDeviceListAsVolumeMounts {
203+
devices := image.ImexChannelsFromMounts()
204+
if len(devices) > 0 {
205+
return devices
206+
}
207+
}
208+
devices := image.ImexChannelsFromEnvVar()
209+
if len(devices) == 0 {
281210
return nil
282211
}
283-
chans := image.Getenv(envNVImexChannels)
284-
return &chans
212+
213+
if privileged || hookConfig.AcceptEnvvarUnprivileged {
214+
return devices
215+
}
216+
217+
return nil
285218
}
286219

287220
func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities {
@@ -291,8 +224,8 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo
291224

292225
capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities)
293226

294-
capsEnvSpecified := cudaImage.HasEnvvar(envNVDriverCapabilities)
295-
capsEnv := cudaImage.Getenv(envNVDriverCapabilities)
227+
capsEnvSpecified := cudaImage.HasEnvvar(image.EnvVarNvidiaDriverCapabilities)
228+
capsEnv := cudaImage.Getenv(image.EnvVarNvidiaDriverCapabilities)
296229

297230
if !capsEnvSpecified && legacyImage {
298231
// Environment variable unset with legacy image: set all capabilities.
@@ -311,14 +244,12 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo
311244
return capabilities
312245
}
313246

314-
func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig {
247+
func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) *nvidiaConfig {
315248
legacyImage := image.IsLegacy()
316249

317-
var devices string
318-
if d := getDevices(hookConfig, image, mounts, privileged); d != nil {
319-
devices = *d
320-
} else {
321-
// 'nil' devices means this is not a GPU container.
250+
devices := getDevices(hookConfig, image, privileged)
251+
if len(devices) == 0 {
252+
// empty devices means this is not a GPU container.
322253
return nil
323254
}
324255

@@ -338,10 +269,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p
338269
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
339270
}
340271

341-
var imexChannels string
342-
if c := getImexChannels(image); c != nil {
343-
imexChannels = *c
344-
}
272+
imexChannels := getImexChannels(hookConfig, image, privileged)
345273

346274
driverCapabilities := hookConfig.getDriverCapabilities(image, legacyImage).String()
347275

@@ -376,6 +304,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
376304

377305
image, err := image.New(
378306
image.WithEnv(s.Process.Env),
307+
image.WithMounts(s.Mounts),
379308
image.WithDisableRequire(hook.DisableRequire),
380309
)
381310
if err != nil {
@@ -387,6 +316,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
387316
Pid: h.Pid,
388317
Rootfs: s.Root.Path,
389318
Image: image,
390-
Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged),
319+
Nvidia: getNvidiaConfig(&hook, image, privileged),
391320
}
392321
}

0 commit comments

Comments
 (0)