Skip to content

Commit 868963b

Browse files
committed
Generate separate specs for coherent and noncoherent devices
With this change the nvidia-ctk cdi generate command generates CDI specs based on whether a device supports coherent access to system memory or not. In this case "regular" nvidia.com/gpu CDI specs are generated for all devices as well as nvidia.com/gpu.coherent and nvidia.com/gpu.noncoherent for devices that are either coherent or non-coherent. Adding the --feature-flag=disable-coherent-annotations command line argument to the nvidia-ctk cdi generate command will disable this. The "disable-coherent-annotations" feature flag can also be set in the nvcdi API in which case the generated CDI device specification will not include annotations indicating coherence. Signed-off-by: Evan Lezar <[email protected]>
1 parent dc0f804 commit 868963b

File tree

7 files changed

+190
-40
lines changed

7 files changed

+190
-40
lines changed

cmd/nvidia-ctk/cdi/generate/generate.go

Lines changed: 114 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package generate
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"os"
2324
"path/filepath"
@@ -26,6 +27,7 @@ import (
2627
"github.com/urfave/cli/v3"
2728

2829
cdi "tags.cncf.io/container-device-interface/pkg/parser"
30+
"tags.cncf.io/container-device-interface/specs-go"
2931

3032
"github.com/NVIDIA/go-nvml/pkg/nvml"
3133

@@ -64,6 +66,8 @@ type options struct {
6466
disabledHooks []string
6567
enabledHooks []string
6668

69+
featureFlags []string
70+
6771
csv struct {
6872
files []string
6973
ignorePatterns []string
@@ -222,6 +226,13 @@ func (m command) build() *cli.Command {
222226
Destination: &opts.enabledHooks,
223227
Sources: cli.EnvVars("NVIDIA_CTK_CDI_GENERATE_ENABLED_HOOKS"),
224228
},
229+
&cli.StringSliceFlag{
230+
Name: "feature-flag",
231+
Aliases: []string{"feature-flags"},
232+
Usage: "specify feature flags for CDI spec generation",
233+
Destination: &opts.featureFlags,
234+
Sources: cli.EnvVars("NVIDIA_CTK_CDI_GENERATE_FEATURE_FLAGS"),
235+
},
225236
},
226237
}
227238

@@ -276,21 +287,18 @@ func (m command) validateFlags(c *cli.Command, opts *options) error {
276287
}
277288

278289
func (m command) run(opts *options) error {
279-
spec, err := m.generateSpec(opts)
290+
specs, err := m.generateSpecs(opts)
280291
if err != nil {
281292
return fmt.Errorf("failed to generate CDI spec: %v", err)
282293
}
283-
m.logger.Infof("Generated CDI spec with version %v", spec.Raw().Version)
284294

285-
if opts.output == "" {
286-
_, err := spec.WriteTo(os.Stdout)
287-
if err != nil {
288-
return fmt.Errorf("failed to write CDI spec to STDOUT: %v", err)
289-
}
290-
return nil
291-
}
295+
var errs error
296+
for _, spec := range specs {
297+
m.logger.Infof("Generated CDI spec with version %v", spec.Raw().Version)
292298

293-
return spec.Save(opts.output)
299+
errs = errors.Join(errs, spec.Save(opts.output))
300+
}
301+
return errs
294302
}
295303

296304
func formatFromFilename(filename string) string {
@@ -305,7 +313,34 @@ func formatFromFilename(filename string) string {
305313
return ""
306314
}
307315

308-
func (m command) generateSpec(opts *options) (spec.Interface, error) {
316+
type generatedSpecs struct {
317+
spec.Interface
318+
filenameInfix string
319+
}
320+
321+
func (g *generatedSpecs) Save(filename string) error {
322+
filename = g.updateFilename(filename)
323+
324+
if filename == "" {
325+
_, err := g.WriteTo(os.Stdout)
326+
if err != nil {
327+
return fmt.Errorf("failed to write CDI spec to STDOUT: %v", err)
328+
}
329+
return nil
330+
}
331+
332+
return g.Interface.Save(filename)
333+
}
334+
335+
func (g generatedSpecs) updateFilename(filename string) string {
336+
if g.filenameInfix == "" || filename == "" {
337+
return filename
338+
}
339+
ext := filepath.Ext(filepath.Base(filename))
340+
return strings.TrimSuffix(filename, ext) + g.filenameInfix + ext
341+
}
342+
343+
func (m command) generateSpecs(opts *options) ([]generatedSpecs, error) {
309344
var deviceNamers []nvcdi.DeviceNamer
310345
for _, strategy := range opts.deviceNameStrategies {
311346
deviceNamer, err := nvcdi.NewDeviceNamer(strategy)
@@ -329,6 +364,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
329364
nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns),
330365
nvcdi.WithDisabledHooks(opts.disabledHooks...),
331366
nvcdi.WithEnabledHooks(opts.enabledHooks...),
367+
nvcdi.WithFeatureFlags(opts.featureFlags...),
332368
// We set the following to allow for dependency injection:
333369
nvcdi.WithNvmlLib(opts.nvmllib),
334370
}
@@ -338,7 +374,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
338374
return nil, fmt.Errorf("failed to create CDI library: %v", err)
339375
}
340376

341-
deviceSpecs, err := cdilib.GetDeviceSpecsByID("all")
377+
allDeviceSpecs, err := cdilib.GetDeviceSpecsByID("all")
342378
if err != nil {
343379
return nil, fmt.Errorf("failed to create device CDI specs: %v", err)
344380
}
@@ -348,16 +384,79 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
348384
return nil, fmt.Errorf("failed to create edits common for entities: %v", err)
349385
}
350386

351-
return spec.New(
387+
commonSpecOptions := []spec.Option{
352388
spec.WithVendor(opts.vendor),
353-
spec.WithClass(opts.class),
354-
spec.WithDeviceSpecs(deviceSpecs),
355389
spec.WithEdits(*commonEdits.ContainerEdits),
356390
spec.WithFormat(opts.format),
357391
spec.WithMergedDeviceOptions(
358392
transform.WithName(allDeviceName),
359393
transform.WithSkipIfExists(true),
360394
),
361395
spec.WithPermissions(0644),
396+
}
397+
398+
fullSpec, err := spec.New(
399+
append(commonSpecOptions,
400+
spec.WithClass(opts.class),
401+
spec.WithDeviceSpecs(allDeviceSpecs),
402+
)...,
362403
)
404+
if err != nil {
405+
return nil, err
406+
}
407+
var allSpecs []generatedSpecs
408+
409+
allSpecs = append(allSpecs, generatedSpecs{Interface: fullSpec, filenameInfix: ""})
410+
411+
deviceSpecsByDeviceCoherence := (deviceSpecs)(allDeviceSpecs).splitOnAnnotation("gpu.nvidia.com/coherent")
412+
413+
if coherentDeviceSpecs := deviceSpecsByDeviceCoherence["gpu.nvidia.com/coherent=true"]; len(coherentDeviceSpecs) > 0 {
414+
infix := ".coherent"
415+
coherentSpecs, err := spec.New(
416+
append(commonSpecOptions,
417+
spec.WithClass(opts.class+infix),
418+
spec.WithDeviceSpecs(coherentDeviceSpecs),
419+
)...,
420+
)
421+
if err != nil {
422+
return nil, err
423+
}
424+
allSpecs = append(allSpecs, generatedSpecs{Interface: coherentSpecs, filenameInfix: infix})
425+
}
426+
427+
if noncoherentDeviceSpecs := deviceSpecsByDeviceCoherence["gpu.nvidia.com/coherent=false"]; len(noncoherentDeviceSpecs) > 0 {
428+
infix := ".noncoherent"
429+
noncoherentSpecs, err := spec.New(
430+
append(commonSpecOptions,
431+
spec.WithClass(opts.class+infix),
432+
spec.WithDeviceSpecs(noncoherentDeviceSpecs),
433+
)...,
434+
)
435+
436+
if err != nil {
437+
return nil, err
438+
}
439+
allSpecs = append(allSpecs, generatedSpecs{Interface: noncoherentSpecs, filenameInfix: infix})
440+
}
441+
442+
return allSpecs, nil
443+
}
444+
445+
type deviceSpecs []specs.Device
446+
447+
func (d deviceSpecs) splitOnAnnotation(key string) map[string][]specs.Device {
448+
splitSpecs := make(map[string][]specs.Device)
449+
450+
for _, deviceSpec := range d {
451+
if len(deviceSpec.Annotations) == 0 {
452+
continue
453+
}
454+
value, ok := deviceSpec.Annotations[key]
455+
if !ok {
456+
continue
457+
}
458+
splitSpecs[key+"="+value] = append(splitSpecs[key+"="+value], deviceSpec)
459+
}
460+
461+
return splitSpecs
363462
}

cmd/nvidia-ctk/cdi/generate/generate_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/stretchr/testify/require"
2929

3030
"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
31+
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
3132
)
3233

3334
func TestGenerateSpec(t *testing.T) {
@@ -476,12 +477,15 @@ containerEdits:
476477
}
477478
tc.options.nvmllib = server
478479

479-
spec, err := c.generateSpec(&tc.options)
480+
tc.options.featureFlags = []string{string(nvcdi.FeatureDisableCoherentAnnotations)}
481+
specs, err := c.generateSpecs(&tc.options)
480482
require.ErrorIs(t, err, tc.expectedError)
481483

482484
var buf bytes.Buffer
483-
_, err = spec.WriteTo(&buf)
484-
require.NoError(t, err)
485+
for _, spec := range specs {
486+
_, err = spec.WriteTo(&buf)
487+
require.NoError(t, err)
488+
}
485489

486490
require.Equal(t, strings.ReplaceAll(tc.expectedSpec, "{{ .driverRoot }}", driverRoot), buf.String())
487491
})

pkg/nvcdi/api.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,7 @@ const (
8080
// FeatureDisableNvsandboxUtils disables the use of nvsandboxutils when
8181
// querying devices.
8282
FeatureDisableNvsandboxUtils = FeatureFlag("disable-nvsandbox-utils")
83+
// FeatureDisableCoherentAnnotations disables the addition of annotations
84+
// coherent or non-coherent devices.
85+
FeatureDisableCoherentAnnotations = FeatureFlag("disable-coherent-annotations")
8386
)

pkg/nvcdi/full-gpu-nvml.go

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ type fullGPUDeviceSpecGenerator struct {
3636
*nvmllib
3737
uuid string
3838
index int
39+
40+
featureFlags map[FeatureFlag]bool
3941
}
4042

4143
var _ DeviceSpecGenerator = (*fullGPUDeviceSpecGenerator)(nil)
@@ -44,7 +46,7 @@ func (l *fullGPUDeviceSpecGenerator) GetUUID() (string, error) {
4446
return l.uuid, nil
4547
}
4648

47-
func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromDevice(index int, d device.Device) (*fullGPUDeviceSpecGenerator, error) {
49+
func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromDevice(index int, d device.Device, featureFlags map[FeatureFlag]bool) (*fullGPUDeviceSpecGenerator, error) {
4850
uuid, ret := d.GetUUID()
4951
if ret != nvml.SUCCESS {
5052
return nil, fmt.Errorf("failed to get device UUID: %v", ret)
@@ -53,12 +55,14 @@ func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromDevice(index int, d device.De
5355
nvmllib: l,
5456
uuid: uuid,
5557
index: index,
58+
59+
featureFlags: featureFlags,
5660
}
5761

5862
return e, nil
5963
}
6064

61-
func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromNVMLDevice(uuid string, nvmlDevice nvml.Device) (DeviceSpecGenerator, error) {
65+
func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromNVMLDevice(uuid string, nvmlDevice nvml.Device, featureFlags map[FeatureFlag]bool) (DeviceSpecGenerator, error) {
6266
index, ret := nvmlDevice.GetIndex()
6367
if ret != nvml.SUCCESS {
6468
return nil, fmt.Errorf("failed to get device index: %v", ret)
@@ -68,6 +72,8 @@ func (l *nvmllib) newFullGPUDeviceSpecGeneratorFromNVMLDevice(uuid string, nvmlD
6872
nvmllib: l,
6973
uuid: uuid,
7074
index: index,
75+
76+
featureFlags: featureFlags,
7177
}
7278
return e, nil
7379
}
@@ -83,11 +89,17 @@ func (l *fullGPUDeviceSpecGenerator) GetDeviceSpecs() ([]specs.Device, error) {
8389
return nil, fmt.Errorf("failed to get device names: %w", err)
8490
}
8591

92+
annotations, err := l.getDeviceAnnotations()
93+
if err != nil {
94+
l.logger.Warning("Ignoring error getting device annotations for device(s) %v: %v", names, err)
95+
annotations = nil
96+
}
8697
var deviceSpecs []specs.Device
8798
for _, name := range names {
8899
deviceSpec := specs.Device{
89100
Name: name,
90101
ContainerEdits: *deviceEdits.ContainerEdits,
102+
Annotations: annotations,
91103
}
92104
deviceSpecs = append(deviceSpecs, deviceSpec)
93105
}
@@ -99,6 +111,29 @@ func (l *fullGPUDeviceSpecGenerator) device() (device.Device, error) {
99111
return l.devicelib.NewDeviceByUUID(l.uuid)
100112
}
101113

114+
func (l *fullGPUDeviceSpecGenerator) getDeviceAnnotations() (map[string]string, error) {
115+
if l.featureFlags[FeatureDisableCoherentAnnotations] {
116+
return nil, nil
117+
}
118+
119+
device, err := l.device()
120+
if err != nil {
121+
return nil, err
122+
}
123+
124+
// TODO: Should we distinguish between not-supported and disabled?
125+
isCoherent, err := device.IsCoherent()
126+
if err != nil {
127+
return nil, fmt.Errorf("failed to check device coherence: %w", err)
128+
}
129+
130+
annotations := map[string]string{
131+
"gpu.nvidia.com/coherent": fmt.Sprintf("%v", isCoherent),
132+
}
133+
134+
return annotations, nil
135+
}
136+
102137
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
103138
func (l *fullGPUDeviceSpecGenerator) getDeviceEdits() (*cdi.ContainerEdits, error) {
104139
device, err := l.device()

pkg/nvcdi/lib-nvml.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ func (l *nvmllib) newDeviceSpecGeneratorFromNVMLDevice(id string, nvmlDevice nvm
102102
return l.newMIGDeviceSpecGeneratorFromNVMLDevice(id, nvmlDevice)
103103
}
104104

105-
return l.newFullGPUDeviceSpecGeneratorFromNVMLDevice(id, nvmlDevice)
105+
return l.newFullGPUDeviceSpecGeneratorFromNVMLDevice(id, nvmlDevice, l.featureFlags)
106106
}
107107

108108
// getDeviceSpecGeneratorsForAllDevices returns the CDI device spec generators
@@ -118,7 +118,7 @@ func (l *nvmllib) getDeviceSpecGeneratorsForAllDevices() (DeviceSpecGenerator, e
118118
if isMigEnabled {
119119
return nil
120120
}
121-
fullGPU, err := l.newFullGPUDeviceSpecGeneratorFromDevice(i, d)
121+
fullGPU, err := l.newFullGPUDeviceSpecGeneratorFromDevice(i, d, l.featureFlags)
122122
if err != nil {
123123
return err
124124
}

pkg/nvcdi/mig-device-nvml.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func (l *migDeviceSpecGenerator) GetUUID() (string, error) {
4141
}
4242

4343
func (l *nvmllib) newMIGDeviceSpecGeneratorFromDevice(i int, d device.Device, j int, m device.MigDevice) (*migDeviceSpecGenerator, error) {
44-
parent, err := l.newFullGPUDeviceSpecGeneratorFromDevice(i, d)
44+
parent, err := l.newFullGPUDeviceSpecGeneratorFromDevice(i, d, map[FeatureFlag]bool{FeatureDisableCoherentAnnotations: true})
4545
if err != nil {
4646
return nil, err
4747
}

0 commit comments

Comments
 (0)