// Copyright 2025 NVIDIA CORPORATION
// SPDX-License-Identifier: Apache-2.0

package app

import (
	"flag"

	"go.uber.org/zap/zapcore"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/healthz"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"

	"github.com/NVIDIA/KAI-scheduler/pkg/nodescaleadjuster/consts"
	"github.com/NVIDIA/KAI-scheduler/pkg/nodescaleadjuster/controller"
	"github.com/NVIDIA/KAI-scheduler/pkg/nodescaleadjuster/scale_adjuster"
	"github.com/NVIDIA/KAI-scheduler/pkg/nodescaleadjuster/scaler"
)

var (
	scheme   = runtime.NewScheme()
	setupLog = ctrl.Log.WithName("setup")
)

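// init registers the built-in Kubernetes types with the scheme used by the
// controller manager.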
func init() {
	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
	utilruntime.Must(v1.AddToScheme(scheme))
	// +kubebuilder:scaffold:scheme
}

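// Run wires up the node scale adjuster: it builds a controller manager,
// registers the pod reconciler and health checks, and blocks until the
// manager stops.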
func Run() error {
	options := NewOptions()
	options.AddFlags()

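	// Configure the controller-runtime zap logger from command-line flags.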
	opts := zap.Options{
		Development: true,
		TimeEncoder: zapcore.ISO8601TimeEncoder,
	}
	opts.BindFlags(flag.CommandLine)
	flag.Parse()
	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

	setupLog.Info("node scale adjuster started")

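	// Create the controller manager from the local kubeconfig or in-cluster
	// configuration.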
	clientConfig := ctrl.GetConfigOrDie()
	mgr, err := ctrl.NewManager(clientConfig, ctrl.Options{
		Scheme: scheme,
	})
	if err != nil {
		setupLog.Error(err, "unable to start manager")
		return err
	}

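	// The scaler manages scaling pods using the configured image, namespace,
	// app label and service account.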
	nodeScaler := scaler.NewScaler(mgr.GetClient(), options.ScalingPodImage, options.ScalingPodNamespace,
		options.ScalingPodAppLabel, options.ScalingPodServiceAccount)

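	// The scale adjuster drives the scaler, using the default cool-down period
	// and the configured GPU memory-to-fraction ratio.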
	scaleAdjuster := scale_adjuster.NewScaleAdjuster(
		mgr.GetClient(),
		nodeScaler,
		options.ScalingPodNamespace,
		consts.DefaultCoolDownSeconds,
		options.GPUMemoryToFractionRatio,
		options.SchedulerName)

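	// The pod reconciler watches pods handled by the configured scheduler and
	// delegates scaling decisions to the scale adjuster.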
	podReconciler := &controller.PodReconciler{
		ScaleAdjuster:      scaleAdjuster,
		SchedulerName:      options.SchedulerName,
		NodeScaleNamespace: options.ScalingPodNamespace,
		Client:             mgr.GetClient(),
		Scheme:             mgr.GetScheme(),
	}

	if err = podReconciler.SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "unable to create controller", "controller", "Pod")
		return err
	}

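	// Expose liveness and readiness endpoints for the manager.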
	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up health check")
		return err
	}
	if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
		setupLog.Error(err, "unable to set up ready check")
		return err
	}

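	// Start the manager; this call blocks until the context returned by the
	// signal handler is cancelled.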
	setupLog.Info("starting manager")
	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		setupLog.Error(err, "problem running manager")
		return err
	}

	return nil
}