Skip to content

Commit bd92cb4

Browse files
add GPU mock infrastructure
- Implement pkg/gpu/mockfs for NVIDIA driver filesystem mocking - Implement pkg/gpu/mocktopo with dgxa100 support via go-nvml - Add cmd/gpu-mockctl CLI tool for mock generation - Add Kubernetes Job for verification on kind - Support extensibility for future H100/H200/B200 flavors Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 5d77a9d commit bd92cb4

File tree

103 files changed

+68233
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+68233
-1
lines changed

cmd/gpu-mockctl/main.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package main
15+
16+
import (
17+
"context"
18+
"fmt"
19+
"log"
20+
"os"
21+
"path/filepath"
22+
23+
"github.com/urfave/cli/v3"
24+
25+
mockfs "github.com/NVIDIA/k8s-test-infra/pkg/gpu/mockfs"
26+
mocktopo "github.com/NVIDIA/k8s-test-infra/pkg/gpu/mocktopo"
27+
)
28+
29+
func main() {
30+
cmd := &cli.Command{
31+
Name: "gpu-mockctl",
32+
Usage: "Generate mock NVIDIA driver filesystem for testing",
33+
Flags: []cli.Flag{
34+
&cli.StringFlag{
35+
Name: "base",
36+
Value: "/run/nvidia/driver",
37+
Usage: "mock driver root directory",
38+
},
39+
&cli.StringFlag{
40+
Name: "machine",
41+
Value: func() string {
42+
if v := os.Getenv("MACHINE_TYPE"); v != "" {
43+
return v
44+
}
45+
return "dgxa100"
46+
}(),
47+
Usage: "machine type (only dgxa100 supported)",
48+
},
49+
},
50+
Action: func(ctx context.Context, cmd *cli.Command) error {
51+
return run(
52+
cmd.String("base"),
53+
cmd.String("machine"),
54+
)
55+
},
56+
}
57+
58+
if err := cmd.Run(context.Background(), os.Args); err != nil {
59+
log.Fatal(err)
60+
}
61+
}
62+
63+
func run(base, machine string) error {
64+
topo, err := mocktopo.New(machine)
65+
if err != nil {
66+
if os.Getenv("ALLOW_UNSUPPORTED") == "true" {
67+
log.Printf("unsupported machine %q, using fallback", machine)
68+
topo = mocktopo.NewFallback(8, "NVIDIA A100-SXM4-40GB")
69+
} else {
70+
return fmt.Errorf("failed to create topology: %w", err)
71+
}
72+
}
73+
74+
layout := mockfs.Layout{Base: filepath.Clean(base)}
75+
for _, g := range topo.GPUs {
76+
layout.GPUs = append(layout.GPUs, mockfs.GPU{
77+
PCI: mockfs.NormPCI(g.PCI),
78+
UUID: g.UUID,
79+
Model: g.Model,
80+
})
81+
}
82+
83+
if err := layout.Write(); err != nil {
84+
return fmt.Errorf("failed to write mock filesystem: %w", err)
85+
}
86+
87+
log.Printf(
88+
"mock filesystem written under %s (%d GPUs)\n",
89+
layout.Base,
90+
len(layout.GPUs),
91+
)
92+
return nil
93+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# Namespace that holds the GPU mock verification resources.
kind: Namespace
apiVersion: v1
metadata:
  name: gpu-mock
18+
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# ConfigMap carrying the shell script that verifies a generated mock driver
# tree. Because the ConfigMap is immutable, changing the script requires
# deleting and recreating the object rather than updating it in place.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-mock-scripts
  namespace: gpu-mock
immutable: true
data:
  verify_gpu_mock.sh: |
    #!/bin/sh
    # Usage: verify_gpu_mock.sh [BASE]
    # Checks the mock driver tree under BASE (default /run/nvidia/driver):
    # device nodes under BASE/dev and proc files under BASE/proc/driver/nvidia.
    # NOTE(review): "pipefail" must be supported by the /bin/sh that runs this
    # script - confirm for any image other than the one used by the Job.
    set -euo pipefail
    BASE="${1:-/run/nvidia/driver}"
    DEV="$BASE/dev"
    PROC="$BASE/proc/driver/nvidia"

    # require PATH: fail unless PATH exists.
    require() { test -e "$1" || { echo "MISSING: $1" >&2; exit 1; }; }
    # require_char PATH: fail unless PATH is a character device.
    require_char() { [ -c "$1" ] || { echo "NOT-CHAR: $1" >&2; exit 1; }; }

    # The number of GPUs is the number of per-GPU "information" files.
    COUNT=$(find "$PROC/gpus" -maxdepth 2 -type f -name information | wc -l | tr -d ' ')
    [ "$COUNT" -gt 0 ] || { echo "No GPUs found in proc mock" >&2; exit 1; }

    # One nvidiaN device node is expected per GPU, plus the control/UVM nodes.
    for i in $(seq 0 $((COUNT-1))); do require_char "$DEV/nvidia$i"; done
    require_char "$DEV/nvidiactl"
    require_char "$DEV/nvidia-uvm"
    require_char "$DEV/nvidia-uvm-tools"

    # Report a broken version file explicitly instead of letting "set -e"
    # terminate the script without any diagnostic.
    require "$PROC/version"
    grep -q "NVRM version:" "$PROC/version" || { echo "Bad version file: $PROC/version" >&2; exit 1; }

    # Every information file must carry the model, UUID and bus location.
    for f in $(find "$PROC/gpus" -maxdepth 2 -type f -name information | sort); do
      grep -q "Model:" "$f" && grep -q "GPU UUID:" "$f" && grep -q "Bus Location:" "$f" || { echo "Bad info in $f" >&2; exit 1; }
    done

    # Print the generated tree (up to depth 4) so it shows up in the pod logs.
    (cd "$BASE" && find . -maxdepth 4 -print | sort)
    echo "ALL CHECKS PASSED"
41+
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# Job that generates the mock driver tree in an init container and then
# verifies it from a second container sharing the same emptyDir volume.
kind: Job
apiVersion: batch/v1
metadata:
  namespace: gpu-mock
  name: gpu-mock-verify
spec:
  # A failed verification is not retried.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: gpu-mock-verify
    spec:
      restartPolicy: Never
      initContainers:
        # Writes the mock tree into the shared volume via gpu-mockctl.
        - name: setup-mock
          image: local/gpu-mockctl:dev
          command:
            - "/usr/local/bin/gpu-mockctl"
            - "-base"
            - "/run/nvidia/driver"
          env:
            - name: MACHINE_TYPE
              value: "dgxa100"
          securityContext:
            runAsUser: 0
            privileged: true
          volumeMounts:
            - mountPath: /run/nvidia/driver
              name: mock-driver
      containers:
        # Runs the verification script mounted from the ConfigMap.
        - name: verify
          image: alpine:3.20
          command:
            - "/bin/sh"
            - "-c"
            - "/opt/mock/bin/verify_gpu_mock.sh /run/nvidia/driver"
          securityContext:
            runAsUser: 0
          volumeMounts:
            - mountPath: /run/nvidia/driver
              name: mock-driver
            - mountPath: /opt/mock/bin
              name: scripts
      volumes:
        # Scratch space shared between the init and verify containers.
        - name: mock-driver
          emptyDir: {}
        # Verification script, mounted with mode 0755.
        - name: scripts
          configMap:
            name: gpu-mock-scripts
            defaultMode: 0755
58+
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# builder
# The binary is built with CGO_ENABLED=1 and is therefore linked against the
# builder's C library. Pin the builder to the bookworm variant so it matches
# the Debian release used by the runtime stage below.
FROM golang:1.25-bookworm AS builder
WORKDIR /src
COPY . .
RUN --mount=type=cache,target=/go/pkg/mod \
    CGO_ENABLED=1 go build -trimpath -ldflags='-s -w' \
    -o /out/gpu-mockctl ./cmd/gpu-mockctl

# runtime
# Keep this base on the same Debian release as the builder stage.
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
    bash coreutils util-linux && \
    rm -rf /var/lib/apt/lists/*
COPY --from=builder /out/gpu-mockctl /usr/local/bin/gpu-mockctl
ENTRYPOINT ["/usr/local/bin/gpu-mockctl"]
29+
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# Developer workflow for the GPU mock: create a kind cluster, build and load
# the gpu-mockctl image, apply the manifests, wait for the verification Job
# and print its logs.
SHELL := /bin/bash
APP := gpu-mockctl
IMAGE ?= local/$(APP):dev
CLUSTER ?= gpu-mock
KIND_IMAGE ?= kindest/node:v1.30.0
KUBECTL ?= kubectl
REPO_ROOT := $(shell cd ../../.. && pwd)

.PHONY: all kind-up image load apply wait logs clean re vendor

all: kind-up image load apply wait logs

# Refresh go.mod/go.sum and the vendor directory at the repository root.
vendor:
	cd $(REPO_ROOT) && go mod tidy && go mod vendor

# Build the image from the repository root so the build context holds the
# whole module.
image: vendor
	cd $(REPO_ROOT) && docker build -t $(IMAGE) -f deployments/devel/gpu-mock/Dockerfile .

load:
	kind load docker-image $(IMAGE) --name $(CLUSTER)

# Create the cluster only when it does not exist yet, so that a genuine
# creation failure is reported instead of being masked.
kind-up:
	@if kind get clusters 2>/dev/null | grep -qx '$(CLUSTER)'; then \
		echo "kind cluster $(CLUSTER) already exists"; \
	else \
		kind create cluster --name $(CLUSTER) --image $(KIND_IMAGE); \
	fi

# Delete any previous Job before applying it again: re-applying an unchanged,
# already finished Job does not run it again, which would let "wait" succeed
# on a stale result.
apply:
	$(KUBECTL) apply -f 00-namespace.yaml
	$(KUBECTL) -n gpu-mock apply -f 10-configmap-verify-script.yaml
	$(KUBECTL) -n gpu-mock delete job gpu-mock-verify --ignore-not-found
	$(KUBECTL) -n gpu-mock apply -f 20-job-gpu-mock-verify.yaml

wait:
	$(KUBECTL) -n gpu-mock wait --for=condition=complete job/gpu-mock-verify --timeout=180s

# Print the logs of the first pod created for the verification Job.
logs:
	@pod=$$($(KUBECTL) -n gpu-mock get pods -l job-name=gpu-mock-verify -o jsonpath='{.items[0].metadata.name}'); \
	$(KUBECTL) -n gpu-mock logs $$pod

clean:
	kind delete cluster --name $(CLUSTER) || true

re: clean all
54+

0 commit comments

Comments
 (0)