Commit f3cf776

Made resource reservation parameters configurable (#106)
* Made resource reservation namespace, serviceAccount and app label configurable
* Changed fake-gpu-operator deployment to support configurable resource reservation namespace
* Added scheduler global config
* Changed kai-resource-reservation -> runai-reservation defaults
1 parent 21dac19 · commit f3cf776
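
Taken together, the change replaces hard-coded reservation pod identity (namespace, service account, app label) with flag-driven values. A sketch of the knobs and their new defaults, using an illustrative struct rather than the actual KAI types:

package main

import "fmt"

// reservationConfig mirrors the binder options introduced by this commit;
// the struct itself is illustrative, not a type from the KAI codebase.
type reservationConfig struct {
    Namespace      string // --resource-reservation-namespace
    ServiceAccount string // --resource-reservation-service-account
    AppLabel       string // --resource-reservation-app-label
    PodImage       string // --resource-reservation-pod-image
}

func main() {
    // Defaults as set in cmd/binder/app/options.go below.
    cfg := reservationConfig{
        Namespace:      "runai-reservation",
        ServiceAccount: "runai-reservation",
        AppLabel:       "runai-reservation",
        PodImage:       "registry/local/kai-scheduler/resource-reservation",
    }
    fmt.Printf("reservation pods: ns=%s sa=%s app=%s\n",
        cfg.Namespace, cfg.ServiceAccount, cfg.AppLabel)
}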

File tree

21 files changed: +202 −88 lines

.github/workflows/on-pr.yaml

Lines changed: 2 additions & 4 deletions

@@ -106,10 +106,8 @@ jobs:
 
       - name: Deploy fake gpu operator
        run: |
-          helm repo add fake-gpu-operator https://fake-gpu-operator.storage.googleapis.com
-          helm repo update
-          helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
-            --version 0.0.58 --values ./hack/fake-gpu-operator-values.yaml --wait
+          helm upgrade -i gpu-operator oci://ghcr.io/run-ai/fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
+            --version 0.0.62 --values ./hack/fake-gpu-operator-values.yaml --wait
 
      - name: install KAI-scheduler
        env:

cmd/binder/app/app.go

Lines changed: 4 additions & 2 deletions

@@ -119,8 +119,10 @@ func New() (*App, error) {
     kubeClient := kubernetes.NewForConfigOrDie(config)
     informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
 
-    rrs := resourcereservation.NewService(options.FakeGPUNodes, clientWithWatch, options.ResourceReservePodImage,
-        time.Duration(options.ResourceReservationAllocationTimeout)*time.Second)
+    rrs := resourcereservation.NewService(options.FakeGPUNodes, clientWithWatch, options.ResourceReservationPodImage,
+        time.Duration(options.ResourceReservationAllocationTimeout)*time.Second,
+        options.ResourceReservationNamespace, options.ResourceReservationServiceAccount,
+        options.ResourceReservationAppLabel)
 
     reconcilerParams := &controllers.ReconcilerParams{
         MaxConcurrentReconciles: options.MaxConcurrentReconciles,
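
Note that --resource-reservation-allocation-timeout stays a plain int of seconds and is converted to a time.Duration at this call site; the conversion in isolation, for reference:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Value of --resource-reservation-allocation-timeout (seconds).
    timeoutSeconds := 40
    // Multiplying by time.Second turns the count into a Duration.
    timeout := time.Duration(timeoutSeconds) * time.Second
    fmt.Println(timeout) // 40s
}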

cmd/binder/app/options.go

Lines changed: 42 additions & 16 deletions

@@ -10,7 +10,11 @@ import (
 )
 
 type Options struct {
-    ResourceReservePodImage              string
+    SchedulerName                        string
+    ResourceReservationNamespace         string
+    ResourceReservationServiceAccount    string
+    ResourceReservationPodImage          string
+    ResourceReservationAppLabel          string
     ResourceReservationAllocationTimeout int
     QPS                                  float64
     Burst                                int
@@ -25,37 +29,61 @@ type Options struct {
     GpuCdiEnabled               bool
     VolumeBindingTimeoutSeconds int
     GPUSharingEnabled           bool
-    SchedulerName               string
 }
 
 func InitOptions() *Options {
     options := &Options{}
 
     fs := pflag.CommandLine
 
-    fs.StringVar(&options.ResourceReservePodImage,
-        "resource-reservation-pod-image", "registry/local/kai-scheduler/resource-reservation", "Container image for the resource reservation pod")
+    fs.StringVar(&options.SchedulerName,
+        "scheduler-name", "kai-scheduler",
+        "The scheduler name the workloads are scheduled with")
+    fs.StringVar(&options.ResourceReservationNamespace,
+        "resource-reservation-namespace", "runai-reservation",
+        "Namespace for resource reservation pods")
+    fs.StringVar(&options.ResourceReservationServiceAccount,
+        "resource-reservation-service-account", "runai-reservation",
+        "Service account name for resource reservation pods")
+    fs.StringVar(&options.ResourceReservationPodImage,
+        "resource-reservation-pod-image", "registry/local/kai-scheduler/resource-reservation",
+        "Container image for the resource reservation pod")
+    fs.StringVar(&options.ResourceReservationAppLabel,
+        "resource-reservation-app-label", "runai-reservation",
+        "App label value of resource reservation pods")
     fs.IntVar(&options.ResourceReservationAllocationTimeout,
        "resource-reservation-allocation-timeout", 40,
        "Resource reservation allocation timeout in seconds")
-    fs.Float64Var(&options.QPS, "qps", 50, "Queries per second to the K8s API server")
-    fs.IntVar(&options.Burst, "burst", 300, "Burst to the K8s API server")
+    fs.Float64Var(&options.QPS,
+        "qps", 50,
+        "Queries per second to the K8s API server")
+    fs.IntVar(&options.Burst,
+        "burst", 300,
+        "Burst to the K8s API server")
     fs.IntVar(&options.MaxConcurrentReconciles,
-        "max-concurrent-reconciles", 10, "Max concurrent reconciles")
-    fs.IntVar(&options.RateLimiterBaseDelaySeconds, "rate-limiter-base-delay", 1,
+        "max-concurrent-reconciles", 10,
+        "Max concurrent reconciles")
+    fs.IntVar(&options.RateLimiterBaseDelaySeconds,
+        "rate-limiter-base-delay", 1,
        "Base delay in seconds for the ExponentialFailureRateLimiter")
-    fs.IntVar(&options.RateLimiterMaxDelaySeconds, "rate-limiter-max-delay", 60,
+    fs.IntVar(&options.RateLimiterMaxDelaySeconds,
+        "rate-limiter-max-delay", 60,
        "Max delay in seconds for the ExponentialFailureRateLimiter")
-    fs.BoolVar(&options.EnableLeaderElection, "leader-elect", false,
+    fs.BoolVar(&options.EnableLeaderElection,
+        "leader-elect", false,
        "Enable leader election for controller manager. "+
            "Enabling this will ensure there is only one active controller manager.")
-    fs.StringVar(&options.MetricsAddr, "metrics-bind-address", ":8080",
+    fs.StringVar(&options.MetricsAddr,
+        "metrics-bind-address", ":8080",
        "The address the metric endpoint binds to.")
-    fs.StringVar(&options.ProbeAddr, "health-probe-bind-address", ":8081",
+    fs.StringVar(&options.ProbeAddr,
+        "health-probe-bind-address", ":8081",
        "The address the probe endpoint binds to.")
-    fs.IntVar(&options.WebhookPort, "webhook-addr", 9443,
+    fs.IntVar(&options.WebhookPort,
+        "webhook-addr", 9443,
        "The port the webhook binds to.")
-    fs.BoolVar(&options.FakeGPUNodes, "fake-gpu-nodes", false,
+    fs.BoolVar(&options.FakeGPUNodes,
+        "fake-gpu-nodes", false,
        "Enables running fractions on fake gpu nodes for testing")
     fs.BoolVar(&options.GpuCdiEnabled,
        "cdi-enabled", false,
@@ -66,8 +94,6 @@ func InitOptions() *Options {
     fs.BoolVar(&options.GPUSharingEnabled,
        "gpu-sharing-enabled", false,
        "Specifies if the GPU sharing is enabled")
-    fs.StringVar(&options.SchedulerName, "scheduler-name", "kai-scheduler",
-        "The scheduler name that will be used to schedule the jobs")
 
     utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
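
For readers unfamiliar with the flag style used throughout this file, here is the spf13/pflag registration pattern in a self-contained form (the flag subset and the binary are illustrative, not the actual binder):

package main

import (
    "fmt"

    "github.com/spf13/pflag"
)

func main() {
    var namespace string
    var timeout int

    // Same StringVar/IntVar registration style as InitOptions above.
    pflag.StringVar(&namespace,
        "resource-reservation-namespace", "runai-reservation",
        "Namespace for resource reservation pods")
    pflag.IntVar(&timeout,
        "resource-reservation-allocation-timeout", 40,
        "Resource reservation allocation timeout in seconds")
    pflag.Parse()

    // e.g. go run . --resource-reservation-namespace=team-a-reservation
    fmt.Println(namespace, timeout)
}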

cmd/scheduler/app/options/options.go

Lines changed: 3 additions & 0 deletions

@@ -14,6 +14,7 @@ import (
 
 const (
     defaultSchedulerName               = "kai-scheduler"
+    defaultResourceReservationAppLabel = "runai-reservation"
     defaultMetricsNamespace            = "kai"
     defaultSchedulerPeriod             = time.Second
     defaultStalenessGracePeriod        = 60 * time.Second
@@ -31,6 +32,7 @@ const (
 // ServerOption is the main context object for the controller manager.
 type ServerOption struct {
     SchedulerName               string
+    ResourceReservationAppLabel string
     SchedulerConf               string
     SchedulePeriod              time.Duration
     EnableLeaderElection        bool
@@ -72,6 +74,7 @@ func NewServerOption() *ServerOption {
 func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
     // kai-scheduler will ignore pods with scheduler names other than specified with the option
     fs.StringVar(&s.SchedulerName, "scheduler-name", defaultSchedulerName, "The scheduler name in pod spec that handled by this scheduler")
+    fs.StringVar(&s.ResourceReservationAppLabel, "resource-reservation-app-label", defaultResourceReservationAppLabel, "App label value of resource reservation pods")
     fs.BoolVar(&s.RestrictSchedulingNodes, "restrict-node-scheduling", false, "kai-scheduler will allocate jobs only to restricted nodes")
     fs.StringVar(&s.NodePoolLabelKey, "nodepool-label-key", defaultNodePoolLabelKey, "The label key by which to filter scheduling nodepool")
     fs.StringVar(&s.NodePoolLabelValue, "partition-label-value", "", "The label value by which to filter scheduling partition")

cmd/scheduler/app/options/options_test.go

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ func TestAddFlags(t *testing.T) {
     // This is a snapshot of expected options parsed by args.
     expected := &ServerOption{
         SchedulerName:               defaultSchedulerName,
+        ResourceReservationAppLabel: defaultResourceReservationAppLabel,
         SchedulePeriod:              5 * time.Minute,
         PrintVersion:                true,
         MetricsNamespace:            defaultMetricsNamespace,
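
The test pins parsed flags against a literal snapshot, so every new option needs a matching field here. The same snapshot pattern in miniature, with a trimmed, hypothetical stand-in struct:

package main

import (
    "fmt"
    "reflect"

    "github.com/spf13/pflag"
)

// opts is a stand-in for ServerOption, trimmed to one field.
type opts struct {
    ResourceReservationAppLabel string
}

func main() {
    fs := pflag.NewFlagSet("scheduler", pflag.ContinueOnError)
    var got opts
    fs.StringVar(&got.ResourceReservationAppLabel,
        "resource-reservation-app-label", "runai-reservation", "")

    _ = fs.Parse([]string{"--resource-reservation-app-label=custom-label"})

    want := opts{ResourceReservationAppLabel: "custom-label"}
    fmt.Println(reflect.DeepEqual(got, want)) // true
}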

cmd/scheduler/app/server.go

Lines changed: 6 additions & 0 deletions

@@ -97,6 +97,7 @@ func RunApp() error {
     } else {
         defer flushLogs()
     }
+    setConfig(so)
 
     config := clientconfig.GetConfigOrDie()
     config.QPS = float32(so.QPS)
@@ -128,6 +129,11 @@ func setupLogging(so *options.ServerOption) error {
     return nil
 }
 
+func setConfig(so *options.ServerOption) {
+    config := conf.GetConfig()
+    config.ResourceReservationAppLabelValue = so.ResourceReservationAppLabel
+}
+
 func Run(opt *options.ServerOption, config *restclient.Config, mux *http.ServeMux) error {
     if opt.PrintVersion {
         version.PrintVersion()
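
setConfig publishes the flag value into a process-wide scheduler config before Run starts. The conf package itself is not shown in this commit; a plausible sketch of what such a GetConfig singleton might look like, offered as an assumption about its shape rather than the actual KAI code:

package conf

import "sync"

// Config holds process-wide scheduler settings.
type Config struct {
    ResourceReservationAppLabelValue string
}

var (
    once     sync.Once
    instance *Config
)

// GetConfig returns the shared Config, creating it on first use.
// setConfig above writes into it once at startup; later readers
// (e.g. scheduler plugins) see the flag-provided app label.
func GetConfig() *Config {
    once.Do(func() { instance = &Config{} })
    return instance
}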

hack/fake-gpu-operator-values.yaml

Lines changed: 3 additions & 0 deletions

@@ -13,3 +13,6 @@ topology:
     gpuCount: 8
     gpuMemory: 11441
   nodePoolLabelKey: run.ai/simulated-gpu-node-pool
+
+environment:
+  resourceReservationNamespace: runai-reservation

pkg/binder/binding/resourcereservation/resource_reservation.go

Lines changed: 30 additions & 27 deletions

@@ -34,18 +34,14 @@ type Interface interface {
 }
 
 const (
-    resourceReservation                  = "runai-reservation"
-    namespace                            = "runai-reservation"
-    serviceAccountName                   = "runai-reservation"
-    scalingPodsNamespace                 = "runai-scale-adjust"
-    gpuIndexAnnotationName               = "run.ai/reserve_for_gpu_index"
-    numberOfGPUsToReserve                = 1
-    appLabelValue                        = resourceReservation
-    gpuReservationPodPrefix              = resourceReservation + "-gpu"
-    runaiResourceReservationAppLabelName = "app.runai.resource.reservation"
-    reservationPodRandomCharacters       = 5
-    unknownGpuIndicator                  = "-1"
-    nodeIndex                            = "runai-node"
+    resourceReservation            = "resource-reservation"
+    gpuReservationPodPrefix        = "gpu-reservation"
+    scalingPodsNamespace           = "runai-scale-adjust"
+    gpuIndexAnnotationName         = "run.ai/reserve_for_gpu_index"
+    numberOfGPUsToReserve          = 1
+    reservationPodRandomCharacters = 5
+    unknownGpuIndicator            = "-1"
+    nodeIndex                      = "runai-node"
 )
 
 type service struct {
@@ -54,20 +50,29 @@ type service struct {
     reservationPodImage string
     allocationTimeout   time.Duration
     gpuGroupMutex       *group_mutex.GroupMutex
+    namespace           string
+    serviceAccountName  string
+    appLabelValue       string
 }
 
 func NewService(
     fakeGPuNodes bool,
     kubeClient client.WithWatch,
     reservationPodImage string,
     allocationTimeout time.Duration,
+    namespace string,
+    serviceAccountName string,
+    appLabelValue string,
 ) *service {
     return &service{
         fakeGPuNodes:        fakeGPuNodes,
         kubeClient:          kubeClient,
         reservationPodImage: reservationPodImage,
         allocationTimeout:   allocationTimeout,
         gpuGroupMutex:       group_mutex.NewGroupMutex(),
+        namespace:           namespace,
+        serviceAccountName:  serviceAccountName,
+        appLabelValue:       appLabelValue,
     }
 }
 
@@ -157,7 +162,7 @@ func (rsc *service) syncForPods(ctx context.Context, pods []*v1.Pod, gpuGroupToS
     fractionPods := map[string][]*v1.Pod{}
 
     for _, pod := range pods {
-        if pod.Namespace == namespace {
+        if pod.Namespace == rsc.namespace {
             reservationPods[gpuGroupToSync] = pod
             continue
         }
@@ -301,7 +306,7 @@ func (rsc *service) findGPUIndexByGroup(gpuGroup string) (
 ) {
     pods := &v1.PodList{}
     err = rsc.kubeClient.List(context.Background(), pods,
-        client.InNamespace(namespace),
+        client.InNamespace(rsc.namespace),
        client.MatchingLabels{constants.GPUGroup: gpuGroup})
     if err != nil {
         return "", err
@@ -334,7 +339,7 @@ func (rsc *service) createGPUReservationPodAndGetIndex(ctx context.Context, node
             logger.Error(deleteErr, "failed to delete reservation pod", "name", pod.Name)
         }
         return unknownGpuIndicator, fmt.Errorf(
-            "failed waiting for GPU reservation pod to allocate: %v/%v", nodeName, pod.Name)
+            "failed waiting for GPU reservation pod to allocate: %v/%v", rsc.namespace, pod.Name)
     }
 
     return gpuIndex, err
@@ -385,16 +390,16 @@ func (rsc *service) createGPUReservationPod(ctx context.Context, nodeName, gpuGr
         },
     }
 
-    pod, err := rsc.createResourceReservationPod(nodeName, gpuGroup, podName, gpuReservationPodPrefix, resources)
+    pod, err := rsc.createResourceReservationPod(nodeName, gpuGroup, podName, resources)
     if err != nil {
-        logger.Error(err, "Failed to created GPU reservation pod on node",
-            "nodeName", nodeName, "namespace", namespace, "name", podName)
+        logger.Error(err, "Failed to create GPU reservation pod on node",
+            "nodeName", nodeName, "namespace", rsc.namespace, "name", podName)
         return nil, err
     }
 
     logger.Info(
         "Successfully created GPU resource reservation pod",
-        "nodeName", nodeName, "namespace", namespace, "name", podName)
+        "nodeName", nodeName, "namespace", rsc.namespace, "name", podName)
     return pod, nil
 }
 
@@ -405,7 +410,7 @@ func (rsc *service) waitForGPUReservationPodAllocation(
     pods := &v1.PodList{}
     watcher, err := rsc.kubeClient.Watch(
         ctx, pods,
-        client.InNamespace(namespace),
+        client.InNamespace(rsc.namespace),
        client.MatchingFields{"metadata.name": gpuReservationPodName},
     )
     if err != nil {
@@ -432,25 +437,23 @@ func (rsc *service) waitForGPUReservationPodAllocation(
 }
 
 func (rsc *service) createResourceReservationPod(
-    nodeName, gpuGroup, podName, appName string,
-    resources v1.ResourceRequirements,
+    nodeName, gpuGroup, podName string, resources v1.ResourceRequirements,
 ) (*v1.Pod, error) {
     podSpec := &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name:      podName,
-            Namespace: namespace,
+            Namespace: rsc.namespace,
             Labels: map[string]string{
-                constants.AppLabelName:               appLabelValue,
-                constants.GPUGroup:                   gpuGroup,
-                runaiResourceReservationAppLabelName: appName,
+                constants.AppLabelName: rsc.appLabelValue,
+                constants.GPUGroup:     gpuGroup,
             },
             Annotations: map[string]string{
                 karpenterv1.DoNotDisruptAnnotationKey: "true",
             },
         },
         Spec: v1.PodSpec{
             NodeName:           nodeName,
-            ServiceAccountName: serviceAccountName,
+            ServiceAccountName: rsc.serviceAccountName,
             Containers: []v1.Container{
                 {
                     Name: resourceReservation,