Skip to content

Commit 89ba8dd

Browse files
authored
Siormeir/feat-create-admission-webhooks-service-phase-3 (#350)
* feat: switch kai to use separate admission webhooks * fix: switch gpusharing admissionhooks * fix: remove namespace call * feat: split gpusharing, remove hooks from binder package * fix: remove mutate from mock plugin in binder * feat: set up new admission service * feat: add service and add to creation flow * fix: remove references from binder * fix: remove rbac as it is not necessary * fix: apply CR comments * fix: conflict issues * fix: merge conflicts * fix: fix rebasing * fix: alias import * fix: clean struct names
1 parent ed3f9e8 commit 89ba8dd

File tree

15 files changed

+463
-39
lines changed

15 files changed

+463
-39
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ KUSTOMIZE ?= $(LOCALBIN)/kustomize
1616

1717
# Space separated list of services to build by default
1818
# SERVICE_NAMES := service1 service2 service3
19-
SERVICE_NAMES := podgrouper scheduler binder webhookmanager resourcereservation snapshot-tool scalingpod nodescaleadjuster podgroupcontroller queuecontroller fairshare-simulator
19+
SERVICE_NAMES := podgrouper scheduler binder webhookmanager resourcereservation snapshot-tool scalingpod nodescaleadjuster podgroupcontroller queuecontroller fairshare-simulator admission
2020

2121

2222
lint: fmt-go vet-go lint-go
@@ -66,6 +66,7 @@ manifests: controller-gen kustomize ## Generate ClusterRole and CustomResourceDe
6666
$(CONTROLLER_GEN) rbac:roleName=kai-node-scale-adjuster,headerFile="./hack/boilerplate.yaml.txt" paths="./pkg/nodescaleadjuster/..." paths="./cmd/nodescaleadjuster/..." output:stdout > deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml
6767
$(CONTROLLER_GEN) rbac:roleName=kai-podgroup-controller,headerFile="./hack/boilerplate.yaml.txt" paths="./pkg/podgroupcontroller/..." paths="./cmd/podgroupcontroller/..." output:stdout > deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml
6868
$(CONTROLLER_GEN) rbac:roleName=queuecontroller,headerFile="./hack/boilerplate.yaml.txt" paths="./pkg/queuecontroller/..." paths="./cmd/queuecontroller/..." output:stdout > deployments/kai-scheduler/templates/rbac/queuecontroller.yaml
69+
$(CONTROLLER_GEN) rbac:roleName=kai-admission,headerFile="./hack/boilerplate.yaml.txt" paths="./pkg/admission/..." paths="./cmd/admission/..." output:stdout > deployments/kai-scheduler/templates/rbac/admission.yaml
6970

7071
$(CONTROLLER_GEN) rbac:roleName=kai-webhookmanager,headerFile="./hack/boilerplate.yaml.txt" paths="./pkg/webhookmanager/..." paths="./cmd/webhookmanager/..." output:stdout > deployments/kustomization/webhookmanager-clusterrole/resource.yaml
7172
$(KUSTOMIZE) build deployments/kustomization/webhookmanager-clusterrole > deployments/kai-scheduler/templates/rbac/webhookmanager.yaml

cmd/admission/app/app.go

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package app
5+
6+
import (
7+
"context"
8+
"flag"
9+
10+
admissionhooks "github.com/NVIDIA/KAI-scheduler/pkg/admission/webhook/v1alpha2/podhooks"
11+
12+
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
13+
// to ensure that exec-entrypoint and run can make use of them.
14+
_ "k8s.io/client-go/plugin/pkg/client/auth"
15+
16+
"github.com/spf13/pflag"
17+
"go.uber.org/zap/zapcore"
18+
corev1 "k8s.io/api/core/v1"
19+
"k8s.io/apimachinery/pkg/runtime"
20+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
21+
"k8s.io/client-go/informers"
22+
"k8s.io/client-go/kubernetes"
23+
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
24+
ctrl "sigs.k8s.io/controller-runtime"
25+
"sigs.k8s.io/controller-runtime/pkg/client"
26+
"sigs.k8s.io/controller-runtime/pkg/log/zap"
27+
"sigs.k8s.io/controller-runtime/pkg/manager"
28+
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
29+
"sigs.k8s.io/controller-runtime/pkg/webhook"
30+
31+
schedulingv1alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v1alpha2"
32+
33+
admissionplugins "github.com/NVIDIA/KAI-scheduler/pkg/admission/plugins"
34+
"github.com/NVIDIA/KAI-scheduler/pkg/binder/controllers"
35+
)
36+
37+
var (
38+
scheme = runtime.NewScheme()
39+
setupLog = ctrl.Log.WithName("setup")
40+
)
41+
42+
func init() {
43+
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
44+
45+
utilruntime.Must(schedulingv1alpha2.AddToScheme(scheme))
46+
// +kubebuilder:scaffold:scheme
47+
}
48+
49+
type App struct {
50+
K8sInterface kubernetes.Interface
51+
Client client.WithWatch
52+
InformerFactory informers.SharedInformerFactory
53+
Options *Options
54+
manager manager.Manager
55+
reconcilerParams *controllers.ReconcilerParams
56+
admissionPlugins *admissionplugins.KaiAdmissionPlugins
57+
}
58+
59+
// +kubebuilder:webhook:path=/mutate--v1-pod,mutating=true,failurePolicy=fail,sideEffects=None,resources=pods,verbs=create,groups=core,versions=v1,name=admission.run.ai,admissionReviewVersions=v1,reinvocationPolicy=IfNeeded
60+
// +kubebuilder:webhook:path=/validate--v1-pod,mutating=false,failurePolicy=fail,sideEffects=None,resources=pods,verbs=create;update,groups=core,versions=v1,name=admission.run.ai,admissionReviewVersions=v1
61+
62+
func New() (*App, error) {
63+
options := InitOptions()
64+
opts := zap.Options{
65+
Development: true,
66+
TimeEncoder: zapcore.ISO8601TimeEncoder,
67+
}
68+
opts.BindFlags(flag.CommandLine)
69+
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
70+
71+
pflag.Parse()
72+
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
73+
74+
config := ctrl.GetConfigOrDie()
75+
config.QPS = float32(options.QPS)
76+
config.Burst = options.Burst
77+
78+
mgr, err := ctrl.NewManager(config, ctrl.Options{
79+
Scheme: scheme,
80+
Metrics: server.Options{
81+
BindAddress: options.MetricsAddr,
82+
},
83+
WebhookServer: webhook.NewServer(webhook.Options{
84+
Port: options.WebhookPort,
85+
}),
86+
HealthProbeBindAddress: options.ProbeAddr,
87+
LeaderElection: options.EnableLeaderElection,
88+
LeaderElectionID: "2ad35f9c.kai.scheduler",
89+
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
90+
// when the Manager ends. This requires the binary to immediately end when the
91+
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
92+
// speeds up voluntary leader transitions as the new leader don't have to wait
93+
// LeaseDuration time first.
94+
//
95+
// In the default scaffold provided, the program ends immediately after
96+
// the manager stops, so would be fine to enable this option. However,
97+
// if you are doing or is intended to do any operation such as perform cleanups
98+
// after the manager stops then its usage might be unsafe.
99+
// LeaderElectionReleaseOnCancel: true,
100+
})
101+
if err != nil {
102+
setupLog.Error(err, "unable to start manager")
103+
return nil, err
104+
}
105+
106+
clientWithWatch, err := client.NewWithWatch(mgr.GetConfig(), client.Options{
107+
Scheme: scheme,
108+
Cache: &client.CacheOptions{
109+
Reader: mgr.GetCache(),
110+
},
111+
})
112+
if err != nil {
113+
setupLog.Error(err, "unable to create client with watch")
114+
return nil, err
115+
}
116+
117+
kubeClient := kubernetes.NewForConfigOrDie(config)
118+
informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
119+
120+
reconcilerParams := &controllers.ReconcilerParams{
121+
RateLimiterBaseDelaySeconds: options.RateLimiterBaseDelaySeconds,
122+
RateLimiterMaxDelaySeconds: options.RateLimiterMaxDelaySeconds,
123+
}
124+
125+
app := &App{
126+
K8sInterface: kubeClient,
127+
Client: clientWithWatch,
128+
InformerFactory: informerFactory,
129+
Options: options,
130+
manager: mgr,
131+
reconcilerParams: reconcilerParams,
132+
}
133+
return app, nil
134+
}
135+
136+
func (app *App) RegisterPlugins(admissionPlugins *admissionplugins.KaiAdmissionPlugins) {
137+
app.admissionPlugins = admissionPlugins
138+
}
139+
140+
func (app *App) Run() error {
141+
var err error
142+
go func() {
143+
app.manager.GetCache().WaitForCacheSync(context.Background())
144+
}()
145+
146+
// +kubebuilder:scaffold:builder
147+
148+
if err = ctrl.NewWebhookManagedBy(app.manager).For(&corev1.Pod{}).
149+
WithDefaulter(admissionhooks.NewPodMutator(app.manager.GetClient(), app.admissionPlugins, app.Options.SchedulerName)).
150+
WithValidator(admissionhooks.NewPodValidator(app.manager.GetClient(), app.admissionPlugins, app.Options.SchedulerName)).Complete(); err != nil {
151+
setupLog.Error(err, "unable to create pod webhooks", "webhook", "Pod")
152+
return err
153+
}
154+
155+
stopCh := make(chan struct{})
156+
app.InformerFactory.Start(stopCh)
157+
app.InformerFactory.WaitForCacheSync(stopCh)
158+
159+
if err = app.manager.AddHealthzCheck("healthz", app.manager.GetWebhookServer().StartedChecker()); err != nil {
160+
setupLog.Error(err, "unable to set up health check")
161+
return err
162+
}
163+
if err = app.manager.AddReadyzCheck("readyz", app.manager.GetWebhookServer().StartedChecker()); err != nil {
164+
setupLog.Error(err, "unable to set up ready check")
165+
return err
166+
}
167+
168+
setupLog.Info("starting manager")
169+
if err = app.manager.Start(ctrl.SetupSignalHandler()); err != nil {
170+
setupLog.Error(err, "problem running manager")
171+
return err
172+
}
173+
return nil
174+
}

cmd/admission/app/options.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Copyright 2025 NVIDIA CORPORATION
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package app
5+
6+
import (
7+
"github.com/spf13/pflag"
8+
9+
utilfeature "k8s.io/apiserver/pkg/util/feature"
10+
)
11+
12+
// Options holds the command-line configuration of the admission service.
// InitOptions binds every field to a flag on pflag.CommandLine.
type Options struct {
	// SchedulerName is the scheduler name the workloads are scheduled with.
	SchedulerName string
	// QPS is the queries-per-second limit towards the K8s API server.
	QPS float64
	// Burst is the burst limit towards the K8s API server.
	Burst int
	// RateLimiterBaseDelaySeconds is the base delay, in seconds, for the
	// exponential failure rate limiter.
	RateLimiterBaseDelaySeconds int
	// RateLimiterMaxDelaySeconds is the maximum delay, in seconds, for the
	// exponential failure rate limiter.
	RateLimiterMaxDelaySeconds int
	// EnableLeaderElection turns on leader election in the manager.
	EnableLeaderElection bool
	// MetricsAddr is the address the metrics endpoint binds to.
	MetricsAddr string
	// ProbeAddr is the address the health probe endpoint binds to.
	ProbeAddr string
	// WebhookPort is the port the webhook server binds to. Note that the
	// flag bound to it is named "webhook-addr".
	WebhookPort int
	// FakeGPUNodes enables running fractions on fake GPU nodes for testing.
	FakeGPUNodes bool
	// GpuCdiEnabled tells whether the GPU device plugin uses the CDI devices
	// API to set GPU devices on pods.
	GpuCdiEnabled bool
	// VolumeBindingTimeoutSeconds is the volume binding timeout in seconds.
	VolumeBindingTimeoutSeconds int
	// GPUSharingEnabled tells whether GPU sharing is enabled.
	GPUSharingEnabled bool
}
27+
28+
func InitOptions() *Options {
29+
options := &Options{}
30+
31+
fs := pflag.CommandLine
32+
33+
fs.StringVar(&options.SchedulerName,
34+
"scheduler-name", "kai-scheduler",
35+
"The scheduler name the workloads are scheduled with")
36+
fs.Float64Var(&options.QPS,
37+
"qps", 50,
38+
"Queries per second to the K8s API server")
39+
fs.IntVar(&options.Burst,
40+
"burst", 300,
41+
"Burst to the K8s API server")
42+
fs.IntVar(&options.RateLimiterBaseDelaySeconds,
43+
"rate-limiter-base-delay", 1,
44+
"Base delay in seconds for the ExponentialFailureRateLimiter")
45+
fs.IntVar(&options.RateLimiterMaxDelaySeconds,
46+
"rate-limiter-max-delay", 60,
47+
"Max delay in seconds for the ExponentialFailureRateLimiter")
48+
fs.BoolVar(&options.EnableLeaderElection,
49+
"leader-elect", false,
50+
"Enable leader election for controller manager. "+
51+
"Enabling this will ensure there is only one active controller manager.")
52+
fs.StringVar(&options.MetricsAddr,
53+
"metrics-bind-address", ":8080",
54+
"The address the metric endpoint binds to.")
55+
fs.StringVar(&options.ProbeAddr,
56+
"health-probe-bind-address", ":8081",
57+
"The address the probe endpoint binds to.")
58+
fs.IntVar(&options.WebhookPort,
59+
"webhook-addr", 9443,
60+
"The port the webhook binds to.")
61+
fs.BoolVar(&options.FakeGPUNodes,
62+
"fake-gpu-nodes", false,
63+
"Enables running fractions on fake gpu nodes for testing")
64+
fs.BoolVar(&options.GpuCdiEnabled,
65+
"cdi-enabled", false,
66+
"Specifies if the gpu device plugin uses the cdi devices api to set gpu devices to the pods")
67+
fs.IntVar(&options.VolumeBindingTimeoutSeconds,
68+
"volume-binding-timeout-seconds", 120,
69+
"Volume binding timeout in seconds")
70+
fs.BoolVar(&options.GPUSharingEnabled,
71+
"gpu-sharing-enabled", false,
72+
"Specifies if the GPU sharing is enabled")
73+
74+
utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
75+
76+
return options
77+
}

cmd/admission/main.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package main
2+
3+
// Copyright 2025 NVIDIA CORPORATION
4+
// SPDX-License-Identifier: Apache-2.0
5+
6+
import (
7+
"os"
8+
9+
ctrl "sigs.k8s.io/controller-runtime"
10+
11+
"github.com/NVIDIA/KAI-scheduler/cmd/admission/app"
12+
13+
"github.com/NVIDIA/KAI-scheduler/pkg/admission/plugins"
14+
"github.com/NVIDIA/KAI-scheduler/pkg/admission/webhook/v1alpha2/gpusharing"
15+
)
16+
17+
var (
18+
setupLog = ctrl.Log.WithName("admission-setup")
19+
)
20+
21+
func main() {
22+
app, err := app.New()
23+
if err != nil {
24+
setupLog.Error(err, "failed to create app")
25+
os.Exit(1)
26+
}
27+
28+
err = registerPlugins(app)
29+
if err != nil {
30+
setupLog.Error(err, "failed to register plugins")
31+
os.Exit(1)
32+
}
33+
34+
err = app.Run()
35+
if err != nil {
36+
setupLog.Error(err, "failed to run app")
37+
os.Exit(1)
38+
}
39+
}
40+
41+
func registerPlugins(app *app.App) error {
42+
admissionPlugins := plugins.New()
43+
44+
admissionGpuSharingPlugin := gpusharing.New(app.Client,
45+
app.Options.GpuCdiEnabled, app.Options.GPUSharingEnabled)
46+
47+
admissionPlugins.RegisterPlugin(admissionGpuSharingPlugin)
48+
app.RegisterPlugins(admissionPlugins)
49+
return nil
50+
}

cmd/binder/app/app.go

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
"fmt"
1010
"time"
1111

12-
admissionhooks "github.com/NVIDIA/KAI-scheduler/pkg/admission/webhook/v1alpha2/podhooks"
1312
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
1413

1514
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -33,11 +32,10 @@ import (
3332

3433
schedulingv1alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v1alpha2"
3534

36-
admissionplugins "github.com/NVIDIA/KAI-scheduler/pkg/admission/plugins"
3735
"github.com/NVIDIA/KAI-scheduler/pkg/binder/binding"
3836
"github.com/NVIDIA/KAI-scheduler/pkg/binder/binding/resourcereservation"
3937
"github.com/NVIDIA/KAI-scheduler/pkg/binder/controllers"
40-
bindingplugins "github.com/NVIDIA/KAI-scheduler/pkg/binder/plugins"
38+
"github.com/NVIDIA/KAI-scheduler/pkg/binder/plugins"
4139
)
4240

4341
var (
@@ -60,13 +58,9 @@ type App struct {
6058
manager manager.Manager
6159
rrs resourcereservation.Interface
6260
reconcilerParams *controllers.ReconcilerParams
63-
admissionPlugins *admissionplugins.KaiAdmissionPlugins
64-
bindingPlugins *bindingplugins.BinderPlugins
61+
plugins *plugins.BinderPlugins
6562
}
6663

67-
// +kubebuilder:webhook:path=/mutate--v1-pod,mutating=true,failurePolicy=fail,sideEffects=None,resources=pods,verbs=create,groups=core,versions=v1,name=binder.run.ai,admissionReviewVersions=v1,reinvocationPolicy=IfNeeded
68-
// +kubebuilder:webhook:path=/validate--v1-pod,mutating=false,failurePolicy=fail,sideEffects=None,resources=pods,verbs=create;update,groups=core,versions=v1,name=binder.run.ai,admissionReviewVersions=v1
69-
7064
func New() (*App, error) {
7165
options := InitOptions()
7266
opts := zap.Options{
@@ -152,9 +146,8 @@ func New() (*App, error) {
152146
return app, nil
153147
}
154148

155-
func (app *App) RegisterPlugins(admissionPlugins *admissionplugins.KaiAdmissionPlugins, bindingPlugins *bindingplugins.BinderPlugins) {
156-
app.admissionPlugins = admissionPlugins
157-
app.bindingPlugins = bindingPlugins
149+
func (app *App) RegisterPlugins(plugins *plugins.BinderPlugins) {
150+
app.plugins = plugins
158151
}
159152

160153
func (app *App) Run() error {
@@ -179,14 +172,7 @@ func (app *App) Run() error {
179172
return err
180173
}
181174

182-
if err = ctrl.NewWebhookManagedBy(app.manager).For(&corev1.Pod{}).
183-
WithDefaulter(admissionhooks.NewPodMutator(app.manager.GetClient(), app.admissionPlugins, app.Options.SchedulerName)).
184-
WithValidator(admissionhooks.NewPodValidator(app.manager.GetClient(), app.admissionPlugins, app.Options.SchedulerName)).Complete(); err != nil {
185-
setupLog.Error(err, "unable to create pod webhooks", "webhook", "Pod")
186-
return err
187-
}
188-
189-
binder := binding.NewBinder(app.Client, app.rrs, app.bindingPlugins)
175+
binder := binding.NewBinder(app.Client, app.rrs, app.plugins)
190176

191177
stopCh := make(chan struct{})
192178
app.InformerFactory.Start(stopCh)

0 commit comments

Comments
 (0)