Commit ccdb3e6

Handle --gpus flag using CDI
This change switches to using CDI to handle the --gpus flag. It removes the custom implementation that invoked the nvidia-container-cli directly, a mechanism that does not align with existing implementations.

Signed-off-by: Evan Lezar <[email protected]>
1 parent 924e283 commit ccdb3e6

3 files changed (+62, -54 lines)

docs/gpu.md

Lines changed: 33 additions & 7 deletions
@@ -9,8 +9,8 @@ nerdctl provides docker-compatible NVIDIA GPU support.
 
 - NVIDIA Drivers
   - Same requirement as when you use GPUs on Docker. For details, please refer to [the doc by NVIDIA](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#pre-requisites).
-- `nvidia-container-cli`
-  - containerd relies on this CLI for setting up GPUs inside container. You can install this via [`libnvidia-container` package](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/arch-overview.html#libnvidia-container).
+- The NVIDIA Container Toolkit
+  - containerd relies on the NVIDIA Container Toolkit to make GPUs usable inside a container. You can install the NVIDIA Container Toolkit by following the [official installation instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
 
 ## Options for `nerdctl run --gpus`
 
@@ -27,23 +27,24 @@ You can also pass detailed configuration to `--gpus` option as a list of key-val
 
 - `count`: number of GPUs to use. `all` exposes all available GPUs.
 - `device`: IDs of GPUs to use. UUID or numbers of GPUs can be specified.
-- `capabilities`: [Driver capabilities](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#driver-capabilities). If unset, use default driver `utility`, `compute`.
 
 The following example exposes a specific GPU to the container.
 
 ```
-nerdctl run -it --rm --gpus '"capabilities=utility,compute",device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a' nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
+nerdctl run -it --rm --gpus 'device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a' nvidia/cuda:12.3.1-base-ubuntu20.04 nvidia-smi
 ```
 
+Note that although `capabilities` options may be provided, these are ignored when processing the GPU request.
+
 ## Fields for `nerdctl compose`
 
 `nerdctl compose` also supports GPUs following [compose-spec](https://github.com/compose-spec/compose-spec/blob/master/deploy.md#devices).
 
-You can use GPUs on compose when you specify some of the following `capabilities` in `services.demo.deploy.resources.reservations.devices`.
+You can use GPUs on compose when you specify the `driver` as `nvidia` or one or
+more of the following `capabilities` in `services.demo.deploy.resources.reservations.devices`.
 
 - `gpu`
 - `nvidia`
-- all allowed capabilities for `nerdctl run --gpus`
 
 Available fields are the same as `nerdctl run --gpus`.
 
@@ -59,12 +60,37 @@ services:
       resources:
         reservations:
           devices:
-          - capabilities: ["utility"]
+          - driver: nvidia
             count: all
 ```
 
 ## Trouble Shooting
 
+### `nerdctl run --gpus` fails due to an unresolvable CDI device
+
+If the required CDI specifications for NVIDIA devices are not available on the
+system, the `nerdctl run` command will fail with an error similar to: `CDI device injection failed: unresolvable CDI devices nvidia.com/gpu=all` (the
+exact error message will depend on the device(s) requested).
+
+This should be the same error message that is reported when the `--device` flag
+is used to request a CDI device:
+```
+nerdctl run --device=nvidia.com/gpu=all
+```
+
+Ensure that the NVIDIA Container Toolkit (>= v1.18.0 is recommended) is installed and the requested CDI devices are present in the output of `nvidia-ctk cdi list`:
+
+```
+$ nvidia-ctk cdi list
+INFO[0000] Found 3 CDI devices
+nvidia.com/gpu=0
+nvidia.com/gpu=GPU-3eb87630-93d5-b2b6-b8ff-9b359caf4ee2
+nvidia.com/gpu=all
+```
+
+See the NVIDIA Container Toolkit [CDI documentation](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html) for more information.
+
+
 ### `nerdctl run --gpus` fails when using the Nvidia gpu-operator
 
 If the Nvidia driver is installed by the [gpu-operator](https://github.com/NVIDIA/gpu-operator), `nerdctl run` will fail with the error message `(FATA[0000] exec: "nvidia-container-cli": executable file not found in $PATH)`.
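As an aside, the key-value form documented under "Options for `nerdctl run --gpus`" can be illustrated with a short standalone Go sketch. This is an approximation for illustration only: the `gpuRequest` type and `parseGPUOption` function are hypothetical and are not nerdctl's `ParseGPUOptCSV`; they merely assume the `count`, `device`, and `capabilities` keys described above.

```go
package main

import (
	"encoding/csv"
	"fmt"
	"strconv"
	"strings"
)

// gpuRequest is a hypothetical holder for the documented --gpus keys.
type gpuRequest struct {
	Count        int
	DeviceIDs    []string
	Capabilities []string // parsed but ignored when resolving CDI devices, per the note above
}

// parseGPUOption sketches how a value such as
// '"capabilities=utility,compute",device=GPU-xxxx' breaks down into
// count, device IDs, and capabilities.
func parseGPUOption(value string) (gpuRequest, error) {
	var req gpuRequest
	if value == "all" {
		req.Count = -1 // --gpus all
		return req, nil
	}
	fields, err := csv.NewReader(strings.NewReader(value)).Read()
	if err != nil {
		return req, err
	}
	for _, field := range fields {
		key, val, _ := strings.Cut(field, "=")
		switch key {
		case "count":
			if val == "all" {
				req.Count = -1
			} else if req.Count, err = strconv.Atoi(val); err != nil {
				return req, err
			}
		case "device":
			req.DeviceIDs = append(req.DeviceIDs, strings.Split(val, ",")...)
		case "capabilities":
			req.Capabilities = append(req.Capabilities, strings.Split(val, ",")...)
		default:
			return req, fmt.Errorf("unexpected key %q in --gpus option", key)
		}
	}
	return req, nil
}

func main() {
	req, err := parseGPUOption(`"capabilities=utility,compute",device=GPU-3a23c669-1f69-c64e-cf85-44e9b07e7a2a`)
	fmt.Printf("%+v %v\n", req, err)
}
```

With CDI handling, only `count` and `device` influence which devices are requested; `capabilities` is accepted for compatibility but ignored, as the updated documentation notes.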

pkg/cmd/container/run_linux.go

Lines changed: 18 additions & 43 deletions
@@ -25,7 +25,6 @@ import (
     "github.com/opencontainers/runtime-spec/specs-go"
 
     containerd "github.com/containerd/containerd/v2/client"
-    "github.com/containerd/containerd/v2/contrib/nvidia"
     "github.com/containerd/containerd/v2/core/containers"
     "github.com/containerd/containerd/v2/pkg/oci"
     "github.com/containerd/log"
@@ -99,7 +98,7 @@ func setPlatformOptions(ctx context.Context, client *containerd.Client, id, uts
     if options.Sysctl != nil {
         opts = append(opts, WithSysctls(strutil.ConvertKVStringsToMap(options.Sysctl)))
     }
-    gpuOpt, err := parseGPUOpts(options.GPUs)
+    gpuOpt, err := parseGPUOpts(options.GOptions.CDISpecDirs, options.GPUs)
     if err != nil {
         return nil, err
     }
@@ -262,60 +261,36 @@ func withOOMScoreAdj(score int) oci.SpecOpts {
     }
 }
 
-func parseGPUOpts(value []string) (res []oci.SpecOpts, _ error) {
+func parseGPUOpts(cdiSpecDirs []string, value []string) (res []oci.SpecOpts, _ error) {
     for _, gpu := range value {
-        gpuOpt, err := parseGPUOpt(gpu)
+        req, err := ParseGPUOptCSV(gpu)
         if err != nil {
             return nil, err
         }
-        res = append(res, gpuOpt)
+        res = append(res, withCDIDevices(cdiSpecDirs, req.toCDIDeviceIDS()...))
     }
     return res, nil
 }
 
-func parseGPUOpt(value string) (oci.SpecOpts, error) {
-    req, err := ParseGPUOptCSV(value)
-    if err != nil {
-        return nil, err
+func (req *GPUReq) toCDIDeviceIDS() []string {
+    var cdiDeviceIDs []string
+    for _, id := range req.normalizeDeviceIDs() {
+        cdiDeviceIDs = append(cdiDeviceIDs, "nvidia.com/gpu="+id)
     }
+    return cdiDeviceIDs
+}
 
-    var gpuOpts []nvidia.Opts
-
+func (req *GPUReq) normalizeDeviceIDs() []string {
     if len(req.DeviceIDs) > 0 {
-        gpuOpts = append(gpuOpts, nvidia.WithDeviceUUIDs(req.DeviceIDs...))
-    } else if req.Count > 0 {
-        var devices []int
-        for i := 0; i < req.Count; i++ {
-            devices = append(devices, i)
-        }
-        gpuOpts = append(gpuOpts, nvidia.WithDevices(devices...))
-    } else if req.Count < 0 {
-        gpuOpts = append(gpuOpts, nvidia.WithAllDevices)
+        return req.DeviceIDs
    }
-
-    str2cap := make(map[string]nvidia.Capability)
-    for _, c := range nvidia.AllCaps() {
-        str2cap[string(c)] = c
-    }
-    var nvidiaCaps []nvidia.Capability
-    for _, c := range req.Capabilities {
-        if cp, isNvidiaCap := str2cap[c]; isNvidiaCap {
-            nvidiaCaps = append(nvidiaCaps, cp)
-        }
+    if req.Count < 0 {
+        return []string{"all"}
     }
-    if len(nvidiaCaps) != 0 {
-        gpuOpts = append(gpuOpts, nvidia.WithCapabilities(nvidiaCaps...))
-    } else {
-        // Add "utility", "compute" capability if unset.
-        // Please see also: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#driver-capabilities
-        gpuOpts = append(gpuOpts, nvidia.WithCapabilities(nvidia.Utility, nvidia.Compute))
-    }
-
-    if rootlessutil.IsRootless() {
-        // "--no-cgroups" option is needed to nvidia-container-cli in rootless environment
-        // Please see also: https://github.com/moby/moby/issues/38729#issuecomment-463493866
-        gpuOpts = append(gpuOpts, nvidia.WithNoCgroups)
+    var ids []string
+    for i := 0; i < req.Count; i++ {
+        ids = append(ids, fmt.Sprintf("%d", i))
     }
 
-    return nvidia.WithGPUs(gpuOpts...), nil
+    return ids
 }
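To make the new mapping concrete, here is a self-contained sketch that copies the two helpers introduced above into a small program (the local `GPUReq` reproduces only the `Count` and `DeviceIDs` fields used in the diff, not nerdctl's full type) and prints the CDI device IDs produced for representative requests.

```go
package main

import "fmt"

// GPUReq is a local stand-in for nerdctl's GPU request type; only the
// fields exercised by the helpers above are reproduced here.
type GPUReq struct {
	Count     int
	DeviceIDs []string
}

// normalizeDeviceIDs copies the logic added in this commit: explicit device
// IDs win, a negative count means "all", and otherwise the first Count
// device indices are used.
func (req *GPUReq) normalizeDeviceIDs() []string {
	if len(req.DeviceIDs) > 0 {
		return req.DeviceIDs
	}
	if req.Count < 0 {
		return []string{"all"}
	}
	var ids []string
	for i := 0; i < req.Count; i++ {
		ids = append(ids, fmt.Sprintf("%d", i))
	}
	return ids
}

// toCDIDeviceIDS prefixes each normalized ID with the nvidia.com/gpu CDI kind.
func (req *GPUReq) toCDIDeviceIDS() []string {
	var cdiDeviceIDs []string
	for _, id := range req.normalizeDeviceIDs() {
		cdiDeviceIDs = append(cdiDeviceIDs, "nvidia.com/gpu="+id)
	}
	return cdiDeviceIDs
}

func main() {
	fmt.Println((&GPUReq{Count: -1}).toCDIDeviceIDS())                           // [nvidia.com/gpu=all]
	fmt.Println((&GPUReq{Count: 2}).toCDIDeviceIDS())                            // [nvidia.com/gpu=0 nvidia.com/gpu=1]
	fmt.Println((&GPUReq{DeviceIDs: []string{"GPU-xxxx"}}).toCDIDeviceIDS())     // [nvidia.com/gpu=GPU-xxxx]
}
```

Each resulting name is handed to `withCDIDevices` together with the configured CDI spec directories, so an unresolvable name is expected to surface as the `unresolvable CDI devices` error described in docs/gpu.md.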

pkg/composer/serviceparser/serviceparser.go

Lines changed: 11 additions & 4 deletions
@@ -30,7 +30,6 @@ import (
 
     "github.com/compose-spec/compose-go/v2/types"
 
-    "github.com/containerd/containerd/v2/contrib/nvidia"
     "github.com/containerd/log"
 
     "github.com/containerd/nerdctl/v2/pkg/identifiers"
@@ -262,9 +261,17 @@ func getMemLimit(svc types.ServiceConfig) (types.UnitBytes, error) {
 func getGPUs(svc types.ServiceConfig) (reqs []string, _ error) {
     // "gpu" and "nvidia" are also allowed capabilities (but not used as nvidia driver capabilities)
     // https://github.com/moby/moby/blob/v20.10.7/daemon/nvidia_linux.go#L37
-    capset := map[string]struct{}{"gpu": {}, "nvidia": {}}
-    for _, c := range nvidia.AllCaps() {
-        capset[string(c)] = struct{}{}
+    capset := map[string]struct{}{
+        "gpu": {}, "nvidia": {},
+        // Allow the list of capabilities here (excluding "all" and "none")
+        // https://github.com/NVIDIA/nvidia-container-toolkit/blob/ff7c2d4866a7d46d1bf2a83590b263e10ec99cb5/internal/config/image/capabilities.go#L28-L38
+        "compat32": {},
+        "compute":  {},
+        "display":  {},
+        "graphics": {},
+        "ngx":      {},
+        "utility":  {},
+        "video":    {},
     }
     if svc.Deploy != nil && svc.Deploy.Resources.Reservations != nil {
         for _, dev := range svc.Deploy.Resources.Reservations.Devices {
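For illustration, the check this hunk feeds into can be expressed as a standalone sketch. `isGPURequest` is a hypothetical helper, not nerdctl's code, but the allowed set mirrors the map above and the rule matches the updated docs/gpu.md: a device reservation selects GPUs when its driver is `nvidia` or when at least one of its capabilities is in the allowed set.

```go
package main

import "fmt"

// allowedGPUCapabilities mirrors the capset from the hunk above: "gpu" and
// "nvidia" plus the NVIDIA driver capabilities (excluding "all" and "none").
var allowedGPUCapabilities = map[string]struct{}{
	"gpu": {}, "nvidia": {},
	"compat32": {}, "compute": {}, "display": {}, "graphics": {},
	"ngx": {}, "utility": {}, "video": {},
}

// isGPURequest is a hypothetical helper expressing the documented rule.
func isGPURequest(driver string, capabilities []string) bool {
	if driver == "nvidia" {
		return true
	}
	for _, c := range capabilities {
		if _, ok := allowedGPUCapabilities[c]; ok {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(isGPURequest("nvidia", nil))                  // true
	fmt.Println(isGPURequest("", []string{"utility"}))        // true
	fmt.Println(isGPURequest("", []string{"something-else"})) // false
}
```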
