Skip to content

Commit 6ffb425

Browse files
authored
feat(admission): Explicitly apply 'nvidia' runtimeClass to GPU pods (v0.9) (#625)
Signed-off-by: Omer Yahud <[email protected]>
1 parent 57b7c03 commit 6ffb425

File tree

19 files changed

+337
-2
lines changed

19 files changed

+337
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1313

1414
### Added
1515
- Support DRA in Kubernetes 1.34
16+
- Added enforcement of the `nvidia` runtime class for GPU pods, with the option to enforce a custom runtime class, or disable enforcement entirely.
1617

1718
### Fixed
1819
- Fixed a bug where the scheduler would not retry updating podgroup status after a failure

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ helm upgrade -i kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler -
4949
#### Build from Source
5050
Follow the instructions [here](docs/developer/building-from-source.md)
5151

52+
## Flavor Specific Instructions
53+
### OpenShift
54+
When a `gpu-operator` version earlier than v25.10.0 is installed, add the following flag to the installation command:
55+
```
56+
--set admission.gpuPodRuntimeClassName=null
57+
```
58+
5259
## Quick Start
5360
To start scheduling workloads with KAI Scheduler, please continue to [Quick Start example](docs/quickstart/README.md)
5461

cmd/admission/app/app.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ var (
4141

4242
func init() {
4343
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
44-
4544
utilruntime.Must(schedulingv1alpha2.AddToScheme(scheme))
4645
// +kubebuilder:scaffold:scheme
4746
}
@@ -174,5 +173,6 @@ func (app *App) Run() error {
174173
setupLog.Error(err, "problem running manager")
175174
return err
176175
}
176+
177177
return nil
178178
}

cmd/admission/app/options.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
package app
55

66
import (
7+
"fmt"
8+
79
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
810
"github.com/spf13/pflag"
911

@@ -22,6 +24,7 @@ type Options struct {
2224
WebhookPort int
2325
FakeGPUNodes bool
2426
GPUSharingEnabled bool
27+
GPUPodRuntimeClassName string
2528
}
2629

2730
func InitOptions() *Options {
@@ -63,6 +66,9 @@ func InitOptions() *Options {
6366
fs.BoolVar(&options.GPUSharingEnabled,
6467
"gpu-sharing-enabled", false,
6568
"Specifies if the GPU sharing is enabled")
69+
fs.StringVar(&options.GPUPodRuntimeClassName,
70+
"gpu-pod-runtime-class-name", constants.DefaultRuntimeClassName,
71+
fmt.Sprintf("Runtime class to be set for GPU pods (defaults to %s) Set to empty string to disable", constants.DefaultRuntimeClassName))
6672

6773
utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
6874

cmd/admission/main.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
"github.com/NVIDIA/KAI-scheduler/pkg/admission/plugins"
1414
"github.com/NVIDIA/KAI-scheduler/pkg/admission/webhook/v1alpha2/gpusharing"
15+
"github.com/NVIDIA/KAI-scheduler/pkg/admission/webhook/v1alpha2/runtimeenforcement"
1516
)
1617

1718
var (
@@ -42,8 +43,13 @@ func registerPlugins(app *app.App) error {
4243
admissionPlugins := plugins.New()
4344

4445
admissionGpuSharingPlugin := gpusharing.New(app.Client, app.Options.GPUSharingEnabled)
45-
4646
admissionPlugins.RegisterPlugin(admissionGpuSharingPlugin)
47+
48+
if app.Options.GPUPodRuntimeClassName != "" {
49+
admissionRuntimeEnforcementPlugin := runtimeenforcement.New(app.Options.GPUPodRuntimeClassName)
50+
admissionPlugins.RegisterPlugin(admissionRuntimeEnforcementPlugin)
51+
}
52+
4753
app.RegisterPlugins(admissionPlugins)
4854
return nil
4955
}

deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ spec:
4444
admission:
4545
description: Admission holds KAI admission webhooks
4646
properties:
47+
gpuPodRuntimeClassName:
48+
description: |-
49+
GPUPodRuntimeClassName specifies the runtime class to be set for GPU pods
50+
set to empty string to disable
51+
type: string
4752
gpuSharing:
4853
description: GPUSharing enables GPU sharing functionality for
4954
the admission service

deployments/kai-scheduler/templates/kai-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ spec:
8585
targetPort: {{ .Values.admission.ports.webhookPort | default 9443 }}
8686
probePort: {{ .Values.admission.ports.probePort | default 8081 }}
8787
metricsPort: {{ .Values.admission.ports.metricsPort | default 8080 }}
88+
gpuPodRuntimeClassName: {{ .Values.admission.gpuPodRuntimeClassName | default "nvidia" }}
8889

8990
nodeScaleAdjuster:
9091
service:

deployments/kai-scheduler/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ admission:
7777
metricsPort: 8080
7878
probePort: 8081
7979
cdi: false
80+
gpuPodRuntimeClassName: nvidia
8081

8182
nodescaleadjuster:
8283
image:

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ require (
1616
github.com/kubeflow/training-operator v1.9.3
1717
github.com/onsi/ginkgo/v2 v2.25.3
1818
github.com/onsi/gomega v1.38.2
19+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7
1920
github.com/pkg/errors v0.9.1
2021
github.com/prometheus/client_golang v1.23.2
2122
github.com/prometheus/common v0.66.1

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
226226
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
227227
github.com/opencontainers/selinux v1.11.1 h1:nHFvthhM0qY8/m+vfhJylliSshm8G1jJ2jDMcgULaH8=
228228
github.com/opencontainers/selinux v1.11.1/go.mod h1:E5dMC3VPuVvVHDYmi78qvhJp8+M586T4DlDRYpFkyec=
229+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 h1:dZ9uBd0Cw3+l1RGpYRkWdrRjM9yvfxrjW/uPHKUwtIQ=
230+
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7/go.mod h1:yk60tHAmHhtVpJQo3TwVYq2zpuP70iJIFDCmeKMIzPw=
229231
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
230232
github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
231233
github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=

0 commit comments

Comments
 (0)