Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions cmd/binder/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"

schedulingv1alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v1alpha2"

Expand Down Expand Up @@ -82,9 +81,6 @@ func New() (*App, error) {
Metrics: server.Options{
BindAddress: options.MetricsAddr,
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: options.WebhookPort,
}),
HealthProbeBindAddress: options.ProbeAddr,
LeaderElection: options.EnableLeaderElection,
LeaderElectionID: "2ad35f9c.kai.scheduler",
Expand Down Expand Up @@ -190,15 +186,6 @@ func (app *App) Run() error {
}
// +kubebuilder:scaffold:builder

if err = app.manager.AddHealthzCheck("healthz", app.manager.GetWebhookServer().StartedChecker()); err != nil {
setupLog.Error(err, "unable to set up health check")
return err
}
if err = app.manager.AddReadyzCheck("readyz", app.manager.GetWebhookServer().StartedChecker()); err != nil {
setupLog.Error(err, "unable to set up ready check")
return err
}

setupLog.Info("starting manager")
if err = app.manager.Start(ctrl.SetupSignalHandler()); err != nil {
setupLog.Error(err, "problem running manager")
Expand Down
8 changes: 0 additions & 8 deletions cmd/binder/app/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,9 @@ type Options struct {
EnableLeaderElection bool
MetricsAddr string
ProbeAddr string
WebhookPort int
FakeGPUNodes bool
GpuCdiEnabled bool
VolumeBindingTimeoutSeconds int
GPUSharingEnabled bool
}

func InitOptions() *Options {
Expand Down Expand Up @@ -83,9 +81,6 @@ func InitOptions() *Options {
fs.StringVar(&options.ProbeAddr,
"health-probe-bind-address", ":8081",
"The address the probe endpoint binds to.")
fs.IntVar(&options.WebhookPort,
"webhook-addr", 9443,
"The port the webhook binds to.")
fs.BoolVar(&options.FakeGPUNodes,
"fake-gpu-nodes", false,
"Enables running fractions on fake gpu nodes for testing")
Expand All @@ -95,9 +90,6 @@ func InitOptions() *Options {
fs.IntVar(&options.VolumeBindingTimeoutSeconds,
"volume-binding-timeout-seconds", 120,
"Volume binding timeout in seconds")
fs.BoolVar(&options.GPUSharingEnabled,
"gpu-sharing-enabled", false,
"Specifies if the GPU sharing is enabled")

utilfeature.DefaultMutableFeatureGate.AddFlag(fs)

Expand Down
3 changes: 1 addition & 2 deletions cmd/binder/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ func registerPlugins(app *app.App) error {
}
binderPlugins.RegisterPlugin(k8sPlugins)

bindingGpuSharingPlugin := gpusharing.New(app.Client,
app.Options.GpuCdiEnabled, app.Options.GPUSharingEnabled)
bindingGpuSharingPlugin := gpusharing.New(app.Client, app.Options.GpuCdiEnabled)

binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)
app.RegisterPlugins(binderPlugins)
Expand Down
18 changes: 0 additions & 18 deletions deployments/kai-scheduler/templates/services/binder.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ spec:
imagePullPolicy: {{ .Values.binder.image.pullPolicy }}
args:
- "--leader-elect={{ .Values.global.leaderElection }}"
- "--webhook-addr={{ .Values.binder.ports.webhookPort }}"
- "--health-probe-bind-address=:{{ .Values.binder.ports.probePort }}"
- "--resource-reservation-pod-image={{ .Values.global.registry }}/{{ .Values.binder.resourceReservationImage.name }}:{{ .Chart.Version }}"
- "--metrics-bind-address=:{{ .Values.binder.ports.metricsPort }}"
- "--gpu-sharing-enabled={{ .Values.global.gpuSharing }}"
- "--cdi-enabled={{ .Values.binder.cdi }}"
- "--resource-reservation-namespace={{ .Values.global.resourceReservation.namespace }}"
- "--resource-reservation-service-account={{ .Values.global.resourceReservation.serviceAccount }}"
Expand All @@ -44,26 +42,10 @@ spec:
securityContext:
{{- toYaml .Values.global.securityContext | nindent 12 }}
{{- end }}
ports:
- name: webhook
containerPort: {{ .Values.binder.ports.webhookPort }}
readinessProbe:
httpGet:
path: "/readyz"
port: {{ .Values.binder.ports.probePort }}
volumeMounts:
- name: certs
readOnly: true
mountPath: "/tmp/k8s-webhook-server/serving-certs"
{{- if .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml .Values.global.imagePullSecrets | nindent 8 }}
{{- end }}
volumes:
- name: certs
secret:
secretName: {{ .Values.binder.certSecretName }}
defaultMode: 420
{{- if .Values.global.nodeSelector }}
nodeSelector:
{{- toYaml .Values.global.nodeSelector | nindent 8 }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "binder-pre-install-job" "serviceName" "binder" "secretName" .Values.binder.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }}

---
{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "queuecontroller-pre-install-job" "serviceName" "queuecontroller" "secretName" .Values.queuecontroller.certSecretName "weight" "1" "Values" .Values "Chart" .Chart "Release" .Release) }}

---
{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "kai-admission-pre-install-job" "serviceName" "kai-admission" "secretName" .Values.admission.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }}
{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "kai-admission-pre-install-job" "serviceName" "kai-admission" "secretName" .Values.admission.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }}
2 changes: 0 additions & 2 deletions deployments/kai-scheduler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,8 @@ binder:
pullPolicy: IfNotPresent
resourceReservationImage:
name: resourcereservation
certSecretName: binder-webhook-tls-secret
additionalArgs: []
ports:
webhookPort: 9443
metricsPort: 8080
probePort: 8081
resources:
Expand Down
6 changes: 3 additions & 3 deletions pkg/binder/binding/default_binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestBind(t *testing.T) {
kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build()

binderPlugins := plugins.New()
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true)
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false)
binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)

binder := NewBinder(kubeClient, rrs, binderPlugins)
Expand Down Expand Up @@ -175,7 +175,7 @@ func TestBindApplyResourceReceivedType(t *testing.T) {
kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build()

binderPlugins := plugins.New()
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true)
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false)
binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)

binder := NewBinder(kubeClient, rrs, binderPlugins)
Expand Down Expand Up @@ -222,7 +222,7 @@ func TestBindFail(t *testing.T) {
kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build()

binderPlugins := plugins.New()
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true)
bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false)
binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)

binder := NewBinder(kubeClient, rrs, binderPlugins)
Expand Down
4 changes: 2 additions & 2 deletions pkg/binder/binding/fraction_binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ var _ = Describe("FractionBinder", func() {
testData.kubeObjects...).WithInterceptorFuncs(testData.clientInterceptFuncs).Build()

binderPlugins := plugins.New()
bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false, true)
bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false)
binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)

testedBinder := NewBinder(fakeClient, rrs, binderPlugins)
Expand Down Expand Up @@ -237,7 +237,7 @@ var _ = Describe("FractionBinder", func() {
happyFlowObjects...).WithInterceptorFuncs(clientInterceptFuncs).Build()

binderPlugins := plugins.New()
bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false, true)
bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false)
binderPlugins.RegisterPlugin(bindingGpuSharingPlugin)

testedBinder := NewBinder(fakeClient, rrs, binderPlugins)
Expand Down
8 changes: 4 additions & 4 deletions pkg/binder/controllers/integration_tests/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ var _ = BeforeSuite(func() {

By("bootstrapping test environment")
testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "..", "deployments", "crds", "internal")},
CRDDirectoryPaths: []string{
filepath.Join("..", "..", "..", "..", "deployments", "crds", "internal"),
filepath.Join("..", "..", "..", "..", "deployments", "crds", "external"),
},
ErrorIfCRDPathMissing: true,
}

// Add the kueue crd to the test environment
testEnv.CRDDirectoryPaths = append(testEnv.CRDDirectoryPaths, filepath.Join("..", "..", "..", "..", "deployments", "crds", "external"))

var err error
// cfg is defined in this file globally.
cfg, err = testEnv.Start()
Expand Down
4 changes: 1 addition & 3 deletions pkg/binder/plugins/gpusharing/gpu_sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,12 @@ const (
type GPUSharing struct {
kubeClient client.Client
gpuDevicePluginUsesCdi bool
gpuSharingEnabled bool
}

func New(kubeClient client.Client, gpuDevicePluginUsesCdi bool, gpuSharingEnabled bool) *GPUSharing {
func New(kubeClient client.Client, gpuDevicePluginUsesCdi bool) *GPUSharing {
return &GPUSharing{
kubeClient: kubeClient,
gpuDevicePluginUsesCdi: gpuDevicePluginUsesCdi,
gpuSharingEnabled: gpuSharingEnabled,
}
}

Expand Down
Loading