Skip to content

Commit 975f57a

Browse files
Mock NVIDIA userspace + CDI integration
- Add mock driver tree generation (pkg/gpu/mockdriver)
- Integrate nvidia-container-toolkit nvcdi for CDI spec generation
- Extend go-nvml dgxa100 mock with MIG stubs for nvcdi compatibility
- Add DaemonSet to deploy CDI mock on all nodes
- Add comprehensive smoke tests for CDI verification
- Update gpu-mockctl CLI with fs/cdi/all modes
- Use __NVCT_TESTING_DEVICES_ARE_FILES from nvidia-container-toolkit tests
- Generate empty files with versioned naming

Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent bd92cb4 commit 975f57a

File tree

276 files changed

+34033
-44
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

276 files changed

+34033
-44
lines changed

cmd/gpu-mockctl/main.go

Lines changed: 158 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
// Licensed under the Apache License, Version 2.0 (the "License");
33
// you may not use this file except in compliance with the License.
44
// You may obtain a copy of the License at
@@ -19,9 +19,12 @@ import (
1919
"log"
2020
"os"
2121
"path/filepath"
22+
"strings"
2223

2324
"github.com/urfave/cli/v3"
2425

26+
"github.com/NVIDIA/k8s-test-infra/pkg/gpu/cdi"
27+
"github.com/NVIDIA/k8s-test-infra/pkg/gpu/mockdriver"
2528
mockfs "github.com/NVIDIA/k8s-test-infra/pkg/gpu/mockfs"
2629
mocktopo "github.com/NVIDIA/k8s-test-infra/pkg/gpu/mocktopo"
2730
)
@@ -31,10 +34,25 @@ func main() {
3134
Name: "gpu-mockctl",
3235
Usage: "Generate mock NVIDIA driver filesystem for testing",
3336
Flags: []cli.Flag{
37+
&cli.StringFlag{
38+
Name: "mode",
39+
Value: "all",
40+
Usage: "operation mode: fs, cdi, or all",
41+
},
3442
&cli.StringFlag{
3543
Name: "base",
3644
Value: "/run/nvidia/driver",
37-
Usage: "mock driver root directory",
45+
Usage: "mock driver root directory (for fs mode)",
46+
},
47+
&cli.StringFlag{
48+
Name: "driver-root",
49+
Value: "/var/lib/nvidia-mock/driver",
50+
Usage: "host mock driver tree root (for cdi mode)",
51+
},
52+
&cli.StringFlag{
53+
Name: "cdi-output",
54+
Value: cdi.DefaultSpecPath,
55+
Usage: "CDI spec output path",
3856
},
3957
&cli.StringFlag{
4058
Name: "machine",
@@ -46,12 +64,31 @@ func main() {
4664
}(),
4765
Usage: "machine type (only dgxa100 supported)",
4866
},
67+
&cli.BoolFlag{
68+
Name: "with-dri",
69+
Usage: "include DRI render node",
70+
},
71+
&cli.BoolFlag{
72+
Name: "with-hook",
73+
Usage: "include CDI hook references",
74+
},
75+
&cli.StringFlag{
76+
Name: "toolkit-root",
77+
Value: "/usr/local/nvidia-container-toolkit",
78+
Usage: "toolkit root for hook paths",
79+
},
4980
},
5081
Action: func(ctx context.Context, cmd *cli.Command) error {
51-
return run(
52-
cmd.String("base"),
53-
cmd.String("machine"),
54-
)
82+
return run(&config{
83+
mode: cmd.String("mode"),
84+
base: cmd.String("base"),
85+
driverRoot: cmd.String("driver-root"),
86+
cdiOutput: cmd.String("cdi-output"),
87+
machine: cmd.String("machine"),
88+
withDRI: cmd.Bool("with-dri"),
89+
withHook: cmd.Bool("with-hook"),
90+
toolkitRoot: cmd.String("toolkit-root"),
91+
})
5592
},
5693
}
5794

@@ -60,17 +97,51 @@ func main() {
6097
}
6198
}
6299

63-
func run(base, machine string) error {
64-
topo, err := mocktopo.New(machine)
100+
// config carries the parsed command-line options from main into run.
type config struct {
	mode        string // comma-separated operation modes: "fs", "cdi", or "all"
	base        string // mock driver root directory (fs mode)
	driverRoot  string // host mock driver tree root (cdi mode)
	cdiOutput   string // path the generated CDI spec is written to
	machine     string // machine type handed to mocktopo.New
	withDRI     bool   // include the DRI render node among the device nodes
	withHook    bool   // include CDI hook references
	toolkitRoot string // toolkit root used to build the CDI hook path
}
110+
111+
func run(cfg *config) error {
112+
// Get topology (A100-only for now)
113+
topo, err := mocktopo.New(cfg.machine)
65114
if err != nil {
66115
if os.Getenv("ALLOW_UNSUPPORTED") == "true" {
67-
log.Printf("unsupported machine %q, using fallback", machine)
116+
log.Printf("unsupported machine %q, using fallback",
117+
cfg.machine)
68118
topo = mocktopo.NewFallback(8, "NVIDIA A100-SXM4-40GB")
69119
} else {
70120
return fmt.Errorf("failed to create topology: %w", err)
71121
}
72122
}
73123

124+
gpuCount := len(topo.GPUs)
125+
modes := strings.Split(cfg.mode, ",")
126+
127+
// Mode: fs (Step 1 behavior - proc/dev mock under -base)
128+
if contains(modes, "fs") || contains(modes, "all") {
129+
if err := runFS(cfg.base, topo); err != nil {
130+
return err
131+
}
132+
}
133+
134+
// Mode: cdi (Step 2 - mock driver tree + CDI spec)
135+
if contains(modes, "cdi") || contains(modes, "all") {
136+
if err := runCDI(cfg, topo, gpuCount); err != nil {
137+
return err
138+
}
139+
}
140+
141+
return nil
142+
}
143+
144+
func runFS(base string, topo *mocktopo.Topology) error {
74145
layout := mockfs.Layout{Base: filepath.Clean(base)}
75146
for _, g := range topo.GPUs {
76147
layout.GPUs = append(layout.GPUs, mockfs.GPU{
@@ -84,10 +155,83 @@ func run(base, machine string) error {
84155
return fmt.Errorf("failed to write mock filesystem: %w", err)
85156
}
86157

87-
log.Printf(
88-
"mock filesystem written under %s (%d GPUs)\n",
89-
layout.Base,
90-
len(layout.GPUs),
91-
)
158+
log.Printf("mock filesystem (fs mode) written under %s (%d GPUs)\n",
159+
layout.Base, len(layout.GPUs))
160+
return nil
161+
}
162+
163+
func runCDI(cfg *config, topo *mocktopo.Topology, gpuCount int) error {
164+
// Create mock driver tree
165+
files := mockdriver.DefaultFiles(cfg.driverRoot)
166+
if err := mockdriver.WriteAll(files); err != nil {
167+
return fmt.Errorf("failed to write driver files: %w", err)
168+
}
169+
log.Printf("mock driver tree written to %s\n", cfg.driverRoot)
170+
171+
// Create device nodes (both host /dev and under driverRoot/dev)
172+
// Host /dev nodes for CDI runtime compatibility
173+
hostDevNodes := mockdriver.DeviceNodes("/dev", gpuCount, cfg.withDRI)
174+
if err := mockdriver.WriteAll(hostDevNodes); err != nil {
175+
log.Printf("warning: failed to create host /dev nodes: %v", err)
176+
}
177+
178+
// Also create under driverRoot/dev for completeness
179+
driverDevNodes := mockdriver.DeviceNodes(cfg.driverRoot, gpuCount,
180+
cfg.withDRI)
181+
if err := mockdriver.WriteAll(driverDevNodes); err != nil {
182+
log.Printf("warning: failed to create %s/dev nodes: %v",
183+
cfg.driverRoot, err)
184+
}
185+
186+
// Get the mock NVML library for CDI generation
187+
// This uses the same mock topology we're already using
188+
mockNVML, err := mocktopo.New(cfg.machine)
189+
if err != nil {
190+
if os.Getenv("ALLOW_UNSUPPORTED") == "true" {
191+
log.Printf("warning: using fallback mock for CDI generation")
192+
mockNVML = mocktopo.NewFallback(8, "NVIDIA A100-SXM4-40GB")
193+
} else {
194+
return fmt.Errorf("failed to create mock NVML: %w", err)
195+
}
196+
}
197+
198+
// Generate CDI spec using nvidia-container-toolkit nvcdi library
199+
cdiOpts := cdi.Options{
200+
NVMLLib: mockNVML.NVMLInterface(),
201+
DriverRoot: cfg.driverRoot,
202+
DevRoot: "/host/dev", // DevRoot is already prefixed by the DaemonSet mount
203+
NVIDIACDIHookPath: cfg.toolkitRoot + "/bin/nvidia-cdi-hook",
204+
}
205+
206+
specYAML, err := cdi.Generate(cdiOpts)
207+
if err != nil {
208+
return fmt.Errorf("failed to generate CDI spec: %w", err)
209+
}
210+
211+
// Validate before writing
212+
if err := cdi.Validate(specYAML); err != nil {
213+
return fmt.Errorf("CDI spec validation failed: %w", err)
214+
}
215+
216+
// Write CDI spec
217+
if err := os.MkdirAll(filepath.Dir(cfg.cdiOutput), 0o755); err != nil {
218+
return fmt.Errorf("failed to create CDI directory: %w", err)
219+
}
220+
221+
if err := os.WriteFile(cfg.cdiOutput, specYAML, 0o644); err != nil {
222+
return fmt.Errorf("failed to write CDI spec: %w", err)
223+
}
224+
225+
log.Printf("CDI spec written to %s (generated via nvidia-container-toolkit)\n",
226+
cfg.cdiOutput)
92227
return nil
93228
}
229+
230+
func contains(slice []string, item string) bool {
231+
for _, s := range slice {
232+
if s == item {
233+
return true
234+
}
235+
}
236+
return false
237+
}

deployments/devel/gpu-mock/00-namespace.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at

deployments/devel/gpu-mock/10-configmap-verify-script.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at

deployments/devel/gpu-mock/20-job-gpu-mock-verify.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -43,7 +43,7 @@ spec:
4343
volumeMounts:
4444
- name: mock-driver
4545
mountPath: /run/nvidia/driver
46-
command: ["/usr/local/bin/gpu-mockctl","-base","/run/nvidia/driver"]
46+
command: ["/usr/local/bin/gpu-mockctl","-mode","fs","-base","/run/nvidia/driver"]
4747
containers:
4848
- name: verify
4949
image: alpine:3.20
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# DaemonSet that runs gpu-mockctl in cdi mode as an init container on each
# node, writing the mock driver tree and CDI spec through hostPath mounts,
# then keeps the pod alive with an idle container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-cdi-mock
  namespace: gpu-mock
spec:
  selector:
    matchLabels:
      app: nvidia-cdi-mock
  template:
    metadata:
      labels:
        app: nvidia-cdi-mock
    spec:
      hostPID: true
      # Tolerate every taint so the pod can be scheduled on all nodes.
      tolerations:
        - operator: Exists
      restartPolicy: Always
      volumes:
        - name: host-cdi
          hostPath:
            path: /etc/cdi
            type: DirectoryOrCreate
        - name: host-driver
          hostPath:
            path: /var/lib/nvidia-mock/driver
            type: DirectoryOrCreate
        - name: host-dev
          hostPath:
            path: /dev
      initContainers:
        - name: setup
          image: local/gpu-mockctl:dev
          env:
            # Testing switch taken from the nvidia-container-toolkit tests;
            # presumably lets regular files stand in for device nodes — confirm.
            - name: __NVCT_TESTING_DEVICES_ARE_FILES
              value: "true"
          securityContext:
            privileged: true
            runAsUser: 0
          # Host paths are exposed under /host inside the container.
          volumeMounts:
            - name: host-cdi
              mountPath: /host/etc/cdi
            - name: host-driver
              mountPath: /host/var/lib/nvidia-mock/driver
            - name: host-dev
              mountPath: /host/dev
          command: ["/usr/local/bin/gpu-mockctl"]
          args:
            - "-mode"
            - "cdi"
            - "-driver-root"
            - "/host/var/lib/nvidia-mock/driver"
            - "-cdi-output"
            - "/host/etc/cdi/nvidia.yaml"
      containers:
        # Idle container that keeps the pod running after setup completes.
        - name: hold
          image: debian:bookworm-slim
          command: ["/bin/sh", "-c", "sleep infinity"]
72+

0 commit comments

Comments
 (0)