diff --git a/cmd/binder/app/app.go b/cmd/binder/app/app.go index 9c47b03db..c0348f86a 100644 --- a/cmd/binder/app/app.go +++ b/cmd/binder/app/app.go @@ -28,7 +28,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/server" - "sigs.k8s.io/controller-runtime/pkg/webhook" schedulingv1alpha2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v1alpha2" @@ -82,9 +81,6 @@ func New() (*App, error) { Metrics: server.Options{ BindAddress: options.MetricsAddr, }, - WebhookServer: webhook.NewServer(webhook.Options{ - Port: options.WebhookPort, - }), HealthProbeBindAddress: options.ProbeAddr, LeaderElection: options.EnableLeaderElection, LeaderElectionID: "2ad35f9c.kai.scheduler", @@ -190,15 +186,6 @@ func (app *App) Run() error { } // +kubebuilder:scaffold:builder - if err = app.manager.AddHealthzCheck("healthz", app.manager.GetWebhookServer().StartedChecker()); err != nil { - setupLog.Error(err, "unable to set up health check") - return err - } - if err = app.manager.AddReadyzCheck("readyz", app.manager.GetWebhookServer().StartedChecker()); err != nil { - setupLog.Error(err, "unable to set up ready check") - return err - } - setupLog.Info("starting manager") if err = app.manager.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/cmd/binder/app/options.go b/cmd/binder/app/options.go index 5b7ab7ce2..758eccee2 100644 --- a/cmd/binder/app/options.go +++ b/cmd/binder/app/options.go @@ -25,11 +25,9 @@ type Options struct { EnableLeaderElection bool MetricsAddr string ProbeAddr string - WebhookPort int FakeGPUNodes bool GpuCdiEnabled bool VolumeBindingTimeoutSeconds int - GPUSharingEnabled bool } func InitOptions() *Options { @@ -83,9 +81,6 @@ func InitOptions() *Options { fs.StringVar(&options.ProbeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") - fs.IntVar(&options.WebhookPort, - "webhook-addr", 9443, - "The port the webhook binds to.") fs.BoolVar(&options.FakeGPUNodes, "fake-gpu-nodes", false, "Enables running fractions on fake gpu nodes for testing") @@ -95,9 +90,6 @@ func InitOptions() *Options { fs.IntVar(&options.VolumeBindingTimeoutSeconds, "volume-binding-timeout-seconds", 120, "Volume binding timeout in seconds") - fs.BoolVar(&options.GPUSharingEnabled, - "gpu-sharing-enabled", false, - "Specifies if the GPU sharing is enabled") utilfeature.DefaultMutableFeatureGate.AddFlag(fs) diff --git a/cmd/binder/main.go b/cmd/binder/main.go index 7093963d3..202d708ce 100644 --- a/cmd/binder/main.go +++ b/cmd/binder/main.go @@ -48,8 +48,7 @@ func registerPlugins(app *app.App) error { } binderPlugins.RegisterPlugin(k8sPlugins) - bindingGpuSharingPlugin := gpusharing.New(app.Client, - app.Options.GpuCdiEnabled, app.Options.GPUSharingEnabled) + bindingGpuSharingPlugin := gpusharing.New(app.Client, app.Options.GpuCdiEnabled) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) app.RegisterPlugins(binderPlugins) diff --git a/deployments/kai-scheduler/templates/services/binder.yaml b/deployments/kai-scheduler/templates/services/binder.yaml index 2916b165e..9337d62d6 100644 --- a/deployments/kai-scheduler/templates/services/binder.yaml +++ b/deployments/kai-scheduler/templates/services/binder.yaml @@ -24,11 +24,9 @@ spec: imagePullPolicy: {{ .Values.binder.image.pullPolicy }} args: - "--leader-elect={{ .Values.global.leaderElection }}" - - "--webhook-addr={{ .Values.binder.ports.webhookPort }}" - "--health-probe-bind-address=:{{ .Values.binder.ports.probePort }}" - "--resource-reservation-pod-image={{ .Values.global.registry }}/{{ .Values.binder.resourceReservationImage.name }}:{{ .Chart.Version }}" - "--metrics-bind-address=:{{ .Values.binder.ports.metricsPort }}" - - "--gpu-sharing-enabled={{ .Values.global.gpuSharing }}" - "--cdi-enabled={{ .Values.binder.cdi }}" - "--resource-reservation-namespace={{ .Values.global.resourceReservation.namespace }}" - "--resource-reservation-service-account={{ .Values.global.resourceReservation.serviceAccount }}" @@ -44,26 +42,10 @@ spec: securityContext: {{- toYaml .Values.global.securityContext | nindent 12 }} {{- end }} - ports: - - name: webhook - containerPort: {{ .Values.binder.ports.webhookPort }} - readinessProbe: - httpGet: - path: "/readyz" - port: {{ .Values.binder.ports.probePort }} - volumeMounts: - - name: certs - readOnly: true - mountPath: "/tmp/k8s-webhook-server/serving-certs" {{- if .Values.global.imagePullSecrets }} imagePullSecrets: {{- toYaml .Values.global.imagePullSecrets | nindent 8 }} {{- end }} - volumes: - - name: certs - secret: - secretName: {{ .Values.binder.certSecretName }} - defaultMode: 420 {{- if .Values.global.nodeSelector }} nodeSelector: {{- toYaml .Values.global.nodeSelector | nindent 8 }} diff --git a/deployments/kai-scheduler/templates/services/pre-install-hook.yaml b/deployments/kai-scheduler/templates/services/pre-install-hook.yaml index f325c90f6..16825f59d 100644 --- a/deployments/kai-scheduler/templates/services/pre-install-hook.yaml +++ b/deployments/kai-scheduler/templates/services/pre-install-hook.yaml @@ -1,10 +1,7 @@ # Copyright 2025 NVIDIA CORPORATION # SPDX-License-Identifier: Apache-2.0 -{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "binder-pre-install-job" "serviceName" "binder" "secretName" .Values.binder.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }} - ---- {{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "queuecontroller-pre-install-job" "serviceName" "queuecontroller" "secretName" .Values.queuecontroller.certSecretName "weight" "1" "Values" .Values "Chart" .Chart "Release" .Release) }} --- -{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "kai-admission-pre-install-job" "serviceName" "kai-admission" "secretName" .Values.admission.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }} \ No newline at end of file +{{ include "kai-scheduler.preInstallWebhookJob" (dict "jobName" "kai-admission-pre-install-job" "serviceName" "kai-admission" "secretName" .Values.admission.certSecretName "weight" "0" "Values" .Values "Chart" .Chart "Release" .Release) }} diff --git a/deployments/kai-scheduler/values.yaml b/deployments/kai-scheduler/values.yaml index cd0e073f1..1cb43cdcc 100644 --- a/deployments/kai-scheduler/values.yaml +++ b/deployments/kai-scheduler/values.yaml @@ -53,10 +53,8 @@ binder: pullPolicy: IfNotPresent resourceReservationImage: name: resourcereservation - certSecretName: binder-webhook-tls-secret additionalArgs: [] ports: - webhookPort: 9443 metricsPort: 8080 probePort: 8081 resources: diff --git a/pkg/binder/binding/default_binder_test.go b/pkg/binder/binding/default_binder_test.go index 3b8795af3..7f43b4b55 100644 --- a/pkg/binder/binding/default_binder_test.go +++ b/pkg/binder/binding/default_binder_test.go @@ -71,7 +71,7 @@ func TestBind(t *testing.T) { kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build() binderPlugins := plugins.New() - bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true) + bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) binder := NewBinder(kubeClient, rrs, binderPlugins) @@ -175,7 +175,7 @@ func TestBindApplyResourceReceivedType(t *testing.T) { kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build() binderPlugins := plugins.New() - bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true) + bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) binder := NewBinder(kubeClient, rrs, binderPlugins) @@ -222,7 +222,7 @@ func TestBindFail(t *testing.T) { kubeClient := fake.NewClientBuilder().WithRuntimeObjects(kubeObjects...).WithInterceptorFuncs(test_utils.EmptyBind).Build() binderPlugins := plugins.New() - bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false, true) + bindingGpuSharingPlugin := bindinggpusharing.New(kubeClient, false) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) binder := NewBinder(kubeClient, rrs, binderPlugins) diff --git a/pkg/binder/binding/fraction_binder_test.go b/pkg/binder/binding/fraction_binder_test.go index fcf5aef3a..d7215dc0d 100644 --- a/pkg/binder/binding/fraction_binder_test.go +++ b/pkg/binder/binding/fraction_binder_test.go @@ -175,7 +175,7 @@ var _ = Describe("FractionBinder", func() { testData.kubeObjects...).WithInterceptorFuncs(testData.clientInterceptFuncs).Build() binderPlugins := plugins.New() - bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false, true) + bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) testedBinder := NewBinder(fakeClient, rrs, binderPlugins) @@ -237,7 +237,7 @@ var _ = Describe("FractionBinder", func() { happyFlowObjects...).WithInterceptorFuncs(clientInterceptFuncs).Build() binderPlugins := plugins.New() - bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false, true) + bindingGpuSharingPlugin := bindinggpusharing.New(fakeClient, false) binderPlugins.RegisterPlugin(bindingGpuSharingPlugin) testedBinder := NewBinder(fakeClient, rrs, binderPlugins) diff --git a/pkg/binder/controllers/integration_tests/suite_test.go b/pkg/binder/controllers/integration_tests/suite_test.go index 2188313b3..2a9c05768 100644 --- a/pkg/binder/controllers/integration_tests/suite_test.go +++ b/pkg/binder/controllers/integration_tests/suite_test.go @@ -65,13 +65,13 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "..", "deployments", "crds", "internal")}, + CRDDirectoryPaths: []string{ + filepath.Join("..", "..", "..", "..", "deployments", "crds", "internal"), + filepath.Join("..", "..", "..", "..", "deployments", "crds", "external"), + }, ErrorIfCRDPathMissing: true, } - // Add the kueue crd to the test environment - testEnv.CRDDirectoryPaths = append(testEnv.CRDDirectoryPaths, filepath.Join("..", "..", "..", "..", "deployments", "crds", "external")) - var err error // cfg is defined in this file globally. cfg, err = testEnv.Start() diff --git a/pkg/binder/plugins/gpusharing/gpu_sharing.go b/pkg/binder/plugins/gpusharing/gpu_sharing.go index 7a497e672..eccbfee8a 100644 --- a/pkg/binder/plugins/gpusharing/gpu_sharing.go +++ b/pkg/binder/plugins/gpusharing/gpu_sharing.go @@ -27,14 +27,12 @@ const ( type GPUSharing struct { kubeClient client.Client gpuDevicePluginUsesCdi bool - gpuSharingEnabled bool } -func New(kubeClient client.Client, gpuDevicePluginUsesCdi bool, gpuSharingEnabled bool) *GPUSharing { +func New(kubeClient client.Client, gpuDevicePluginUsesCdi bool) *GPUSharing { return &GPUSharing{ kubeClient: kubeClient, gpuDevicePluginUsesCdi: gpuDevicePluginUsesCdi, - gpuSharingEnabled: gpuSharingEnabled, } }