Commit f3cf776

Made resource reservation parameters configurable (#106)
* Made resource reservation namespace, serviceAccount and app label configurable
* Changed fake-gpu-operator deployment to support configurable resource reservation namespace
* Added scheduler global config
* Changed kai-resource-reservation -> runai-reservation defaults
1 parent 21dac19 · commit f3cf776
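
Taken together, the change replaces hard-coded reservation pod identity (namespace, service account, app label) with flag-driven values. A sketch of the knobs and their new defaults, using an illustrative struct rather than the actual KAI types:

package main

import "fmt"

// reservationConfig mirrors the binder options introduced by this commit;
// the struct itself is illustrative, not a type from the KAI codebase.
type reservationConfig struct {
    Namespace      string // --resource-reservation-namespace
    ServiceAccount string // --resource-reservation-service-account
    AppLabel       string // --resource-reservation-app-label
    PodImage       string // --resource-reservation-pod-image
}

func main() {
    // Defaults as set in cmd/binder/app/options.go below.
    cfg := reservationConfig{
        Namespace:      "runai-reservation",
        ServiceAccount: "runai-reservation",
        AppLabel:       "runai-reservation",
        PodImage:       "registry/local/kai-scheduler/resource-reservation",
    }
    fmt.Printf("reservation pods: ns=%s sa=%s app=%s\n",
        cfg.Namespace, cfg.ServiceAccount, cfg.AppLabel)
}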

File tree

21 files changed: +202 −88 lines

.github/workflows/on-pr.yaml

Lines changed: 2 additions & 4 deletions

@@ -106,10 +106,8 @@ jobs:
 
       - name: Deploy fake gpu operator
        run: |
-          helm repo add fake-gpu-operator https://fake-gpu-operator.storage.googleapis.com
-          helm repo update
-          helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
-            --version 0.0.58 --values ./hack/fake-gpu-operator-values.yaml --wait
+          helm upgrade -i gpu-operator oci://ghcr.io/run-ai/fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
+            --version 0.0.62 --values ./hack/fake-gpu-operator-values.yaml --wait
 
      - name: install KAI-scheduler
        env:

cmd/binder/app/app.go

Lines changed: 4 additions & 2 deletions

@@ -119,8 +119,10 @@ func New() (*App, error) {
     kubeClient := kubernetes.NewForConfigOrDie(config)
     informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
 
-    rrs := resourcereservation.NewService(options.FakeGPUNodes, clientWithWatch, options.ResourceReservePodImage,
-        time.Duration(options.ResourceReservationAllocationTimeout)*time.Second)
+    rrs := resourcereservation.NewService(options.FakeGPUNodes, clientWithWatch, options.ResourceReservationPodImage,
+        time.Duration(options.ResourceReservationAllocationTimeout)*time.Second,
+        options.ResourceReservationNamespace, options.ResourceReservationServiceAccount,
+        options.ResourceReservationAppLabel)
 
     reconcilerParams := &controllers.ReconcilerParams{
         MaxConcurrentReconciles: options.MaxConcurrentReconciles,
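
Note that --resource-reservation-allocation-timeout stays a plain int of seconds and is converted to a time.Duration at this call site; the conversion in isolation, for reference:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Value of --resource-reservation-allocation-timeout (seconds).
    timeoutSeconds := 40
    // Multiplying by time.Second turns the count into a Duration.
    timeout := time.Duration(timeoutSeconds) * time.Second
    fmt.Println(timeout) // 40s
}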

cmd/binder/app/options.go

Lines changed: 42 additions & 16 deletions

@@ -10,7 +10,11 @@ import (
 )
 
 type Options struct {
-    ResourceReservePodImage              string
+    SchedulerName                        string
+    ResourceReservationNamespace         string
+    ResourceReservationServiceAccount    string
+    ResourceReservationPodImage          string
+    ResourceReservationAppLabel          string
     ResourceReservationAllocationTimeout int
     QPS                                  float64
     Burst                                int
@@ -25,37 +29,61 @@ type Options struct {
     GpuCdiEnabled               bool
     VolumeBindingTimeoutSeconds int
     GPUSharingEnabled           bool
-    SchedulerName               string
 }
 
 func InitOptions() *Options {
     options := &Options{}
 
     fs := pflag.CommandLine
 
-    fs.StringVar(&options.ResourceReservePodImage,
-        "resource-reservation-pod-image", "registry/local/kai-scheduler/resource-reservation", "Container image for the resource reservation pod")
+    fs.StringVar(&options.SchedulerName,
+        "scheduler-name", "kai-scheduler",
+        "The scheduler name the workloads are scheduled with")
+    fs.StringVar(&options.ResourceReservationNamespace,
+        "resource-reservation-namespace", "runai-reservation",
+        "Namespace for resource reservation pods")
+    fs.StringVar(&options.ResourceReservationServiceAccount,
+        "resource-reservation-service-account", "runai-reservation",
+        "Service account name for resource reservation pods")
+    fs.StringVar(&options.ResourceReservationPodImage,
+        "resource-reservation-pod-image", "registry/local/kai-scheduler/resource-reservation",
+        "Container image for the resource reservation pod")
+    fs.StringVar(&options.ResourceReservationAppLabel,
+        "resource-reservation-app-label", "runai-reservation",
+        "App label value of resource reservation pods")
     fs.IntVar(&options.ResourceReservationAllocationTimeout,
        "resource-reservation-allocation-timeout", 40,
        "Resource reservation allocation timeout in seconds")
-    fs.Float64Var(&options.QPS, "qps", 50, "Queries per second to the K8s API server")
-    fs.IntVar(&options.Burst, "burst", 300, "Burst to the K8s API server")
+    fs.Float64Var(&options.QPS,
+        "qps", 50,
+        "Queries per second to the K8s API server")
+    fs.IntVar(&options.Burst,
+        "burst", 300,
+        "Burst to the K8s API server")
     fs.IntVar(&options.MaxConcurrentReconciles,
-        "max-concurrent-reconciles", 10, "Max concurrent reconciles")
-    fs.IntVar(&options.RateLimiterBaseDelaySeconds, "rate-limiter-base-delay", 1,
+        "max-concurrent-reconciles", 10,
+        "Max concurrent reconciles")
+    fs.IntVar(&options.RateLimiterBaseDelaySeconds,
+        "rate-limiter-base-delay", 1,
        "Base delay in seconds for the ExponentialFailureRateLimiter")
-    fs.IntVar(&options.RateLimiterMaxDelaySeconds, "rate-limiter-max-delay", 60,
+    fs.IntVar(&options.RateLimiterMaxDelaySeconds,
+        "rate-limiter-max-delay", 60,
        "Max delay in seconds for the ExponentialFailureRateLimiter")
-    fs.BoolVar(&options.EnableLeaderElection, "leader-elect", false,
+    fs.BoolVar(&options.EnableLeaderElection,
+        "leader-elect", false,
        "Enable leader election for controller manager. "+
            "Enabling this will ensure there is only one active controller manager.")
-    fs.StringVar(&options.MetricsAddr, "metrics-bind-address", ":8080",
+    fs.StringVar(&options.MetricsAddr,
+        "metrics-bind-address", ":8080",
        "The address the metric endpoint binds to.")
-    fs.StringVar(&options.ProbeAddr, "health-probe-bind-address", ":8081",
+    fs.StringVar(&options.ProbeAddr,
+        "health-probe-bind-address", ":8081",
        "The address the probe endpoint binds to.")
-    fs.IntVar(&options.WebhookPort, "webhook-addr", 9443,
+    fs.IntVar(&options.WebhookPort,
+        "webhook-addr", 9443,
        "The port the webhook binds to.")
-    fs.BoolVar(&options.FakeGPUNodes, "fake-gpu-nodes", false,
+    fs.BoolVar(&options.FakeGPUNodes,
+        "fake-gpu-nodes", false,
        "Enables running fractions on fake gpu nodes for testing")
     fs.BoolVar(&options.GpuCdiEnabled,
        "cdi-enabled", false,
@@ -66,8 +94,6 @@ func InitOptions() *Options {
     fs.BoolVar(&options.GPUSharingEnabled,
        "gpu-sharing-enabled", false,
        "Specifies if the GPU sharing is enabled")
-    fs.StringVar(&options.SchedulerName, "scheduler-name", "kai-scheduler",
-        "The scheduler name that will be used to schedule the jobs")
 
     utilfeature.DefaultMutableFeatureGate.AddFlag(fs)
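
For readers unfamiliar with the flag style used throughout this file, here is the spf13/pflag registration pattern in a self-contained form (the flag subset and the binary are illustrative, not the actual binder):

package main

import (
    "fmt"

    "github.com/spf13/pflag"
)

func main() {
    var namespace string
    var timeout int

    // Same StringVar/IntVar registration style as InitOptions above.
    pflag.StringVar(&namespace,
        "resource-reservation-namespace", "runai-reservation",
        "Namespace for resource reservation pods")
    pflag.IntVar(&timeout,
        "resource-reservation-allocation-timeout", 40,
        "Resource reservation allocation timeout in seconds")
    pflag.Parse()

    // e.g. go run . --resource-reservation-namespace=team-a-reservation
    fmt.Println(namespace, timeout)
}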

cmd/scheduler/app/options/options.go

Lines changed: 3 additions & 0 deletions

@@ -14,6 +14,7 @@ import (
 
 const (
     defaultSchedulerName               = "kai-scheduler"
+    defaultResourceReservationAppLabel = "runai-reservation"
     defaultMetricsNamespace            = "kai"
     defaultSchedulerPeriod             = time.Second
     defaultStalenessGracePeriod        = 60 * time.Second
@@ -31,6 +32,7 @@ const (
 // ServerOption is the main context object for the controller manager.
 type ServerOption struct {
     SchedulerName               string
+    ResourceReservationAppLabel string
     SchedulerConf               string
     SchedulePeriod              time.Duration
     EnableLeaderElection        bool
@@ -72,6 +74,7 @@ func NewServerOption() *ServerOption {
 func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
     // kai-scheduler will ignore pods with scheduler names other than specified with the option
     fs.StringVar(&s.SchedulerName, "scheduler-name", defaultSchedulerName, "The scheduler name in pod spec that handled by this scheduler")
+    fs.StringVar(&s.ResourceReservationAppLabel, "resource-reservation-app-label", defaultResourceReservationAppLabel, "App label value of resource reservation pods")
     fs.BoolVar(&s.RestrictSchedulingNodes, "restrict-node-scheduling", false, "kai-scheduler will allocate jobs only to restricted nodes")
     fs.StringVar(&s.NodePoolLabelKey, "nodepool-label-key", defaultNodePoolLabelKey, "The label key by which to filter scheduling nodepool")
     fs.StringVar(&s.NodePoolLabelValue, "partition-label-value", "", "The label value by which to filter scheduling partition")

cmd/scheduler/app/options/options_test.go

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ func TestAddFlags(t *testing.T) {
     // This is a snapshot of expected options parsed by args.
     expected := &ServerOption{
         SchedulerName:               defaultSchedulerName,
+        ResourceReservationAppLabel: defaultResourceReservationAppLabel,
         SchedulePeriod:              5 * time.Minute,
         PrintVersion:                true,
         MetricsNamespace:            defaultMetricsNamespace,
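
The test pins parsed flags against a literal snapshot, so every new option needs a matching field here. The same snapshot pattern in miniature, with a trimmed, hypothetical stand-in struct:

package main

import (
    "fmt"
    "reflect"

    "github.com/spf13/pflag"
)

// opts is a stand-in for ServerOption, trimmed to one field.
type opts struct {
    ResourceReservationAppLabel string
}

func main() {
    fs := pflag.NewFlagSet("scheduler", pflag.ContinueOnError)
    var got opts
    fs.StringVar(&got.ResourceReservationAppLabel,
        "resource-reservation-app-label", "runai-reservation", "")

    _ = fs.Parse([]string{"--resource-reservation-app-label=custom-label"})

    want := opts{ResourceReservationAppLabel: "custom-label"}
    fmt.Println(reflect.DeepEqual(got, want)) // true
}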

cmd/scheduler/app/server.go

Lines changed: 6 additions & 0 deletions

@@ -97,6 +97,7 @@ func RunApp() error {
     } else {
         defer flushLogs()
     }
+    setConfig(so)
 
     config := clientconfig.GetConfigOrDie()
     config.QPS = float32(so.QPS)
@@ -128,6 +129,11 @@ func setupLogging(so *options.ServerOption) error {
     return nil
 }
 
+func setConfig(so *options.ServerOption) {
+    config := conf.GetConfig()
+    config.ResourceReservationAppLabelValue = so.ResourceReservationAppLabel
+}
+
 func Run(opt *options.ServerOption, config *restclient.Config, mux *http.ServeMux) error {
     if opt.PrintVersion {
         version.PrintVersion()
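
setConfig publishes the flag value into a process-wide scheduler config before Run starts. The conf package itself is not shown in this commit; a plausible sketch of what such a GetConfig singleton might look like, offered as an assumption about its shape rather than the actual KAI code:

package conf

import "sync"

// Config holds process-wide scheduler settings.
type Config struct {
    ResourceReservationAppLabelValue string
}

var (
    once     sync.Once
    instance *Config
)

// GetConfig returns the shared Config, creating it on first use.
// setConfig above writes into it once at startup; later readers
// (e.g. scheduler plugins) see the flag-provided app label.
func GetConfig() *Config {
    once.Do(func() { instance = &Config{} })
    return instance
}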

hack/fake-gpu-operator-values.yaml

Lines changed: 3 additions & 0 deletions

@@ -13,3 +13,6 @@ topology:
     gpuCount: 8
     gpuMemory: 11441
   nodePoolLabelKey: run.ai/simulated-gpu-node-pool
+
+environment:
+  resourceReservationNamespace: runai-reservation

pkg/binder/binding/resourcereservation/resource_reservation.go

Lines changed: 30 additions & 27 deletions

@@ -34,18 +34,14 @@ type Interface interface {
 }
 
 const (
-    resourceReservation                  = "runai-reservation"
-    namespace                            = "runai-reservation"
-    serviceAccountName                   = "runai-reservation"
-    scalingPodsNamespace                 = "runai-scale-adjust"
-    gpuIndexAnnotationName               = "run.ai/reserve_for_gpu_index"
-    numberOfGPUsToReserve                = 1
-    appLabelValue                        = resourceReservation
-    gpuReservationPodPrefix              = resourceReservation + "-gpu"
-    runaiResourceReservationAppLabelName = "app.runai.resource.reservation"
-    reservationPodRandomCharacters       = 5
-    unknownGpuIndicator                  = "-1"
-    nodeIndex                            = "runai-node"
+    resourceReservation            = "resource-reservation"
+    gpuReservationPodPrefix        = "gpu-reservation"
+    scalingPodsNamespace           = "runai-scale-adjust"
+    gpuIndexAnnotationName         = "run.ai/reserve_for_gpu_index"
+    numberOfGPUsToReserve          = 1
+    reservationPodRandomCharacters = 5
+    unknownGpuIndicator            = "-1"
+    nodeIndex                      = "runai-node"
 )
 
 type service struct {
@@ -54,20 +50,29 @@ type service struct {
     reservationPodImage string
     allocationTimeout   time.Duration
     gpuGroupMutex       *group_mutex.GroupMutex
+    namespace           string
+    serviceAccountName  string
+    appLabelValue       string
 }
 
 func NewService(
     fakeGPuNodes bool,
     kubeClient client.WithWatch,
     reservationPodImage string,
     allocationTimeout time.Duration,
+    namespace string,
+    serviceAccountName string,
+    appLabelValue string,
 ) *service {
     return &service{
         fakeGPuNodes:        fakeGPuNodes,
         kubeClient:          kubeClient,
         reservationPodImage: reservationPodImage,
         allocationTimeout:   allocationTimeout,
         gpuGroupMutex:       group_mutex.NewGroupMutex(),
+        namespace:           namespace,
+        serviceAccountName:  serviceAccountName,
+        appLabelValue:       appLabelValue,
     }
 }
 
@@ -157,7 +162,7 @@ func (rsc *service) syncForPods(ctx context.Context, pods []*v1.Pod, gpuGroupToS
     fractionPods := map[string][]*v1.Pod{}
 
     for _, pod := range pods {
-        if pod.Namespace == namespace {
+        if pod.Namespace == rsc.namespace {
             reservationPods[gpuGroupToSync] = pod
             continue
         }
@@ -301,7 +306,7 @@ func (rsc *service) findGPUIndexByGroup(gpuGroup string) (
 ) {
     pods := &v1.PodList{}
     err = rsc.kubeClient.List(context.Background(), pods,
-        client.InNamespace(namespace),
+        client.InNamespace(rsc.namespace),
        client.MatchingLabels{constants.GPUGroup: gpuGroup})
     if err != nil {
         return "", err
@@ -334,7 +339,7 @@ func (rsc *service) createGPUReservationPodAndGetIndex(ctx context.Context, node
             logger.Error(deleteErr, "failed to delete reservation pod", "name", pod.Name)
         }
         return unknownGpuIndicator, fmt.Errorf(
-            "failed waiting for GPU reservation pod to allocate: %v/%v", nodeName, pod.Name)
+            "failed waiting for GPU reservation pod to allocate: %v/%v", rsc.namespace, pod.Name)
     }
 
     return gpuIndex, err
@@ -385,16 +390,16 @@ func (rsc *service) createGPUReservationPod(ctx context.Context, nodeName, gpuGr
         },
     }
 
-    pod, err := rsc.createResourceReservationPod(nodeName, gpuGroup, podName, gpuReservationPodPrefix, resources)
+    pod, err := rsc.createResourceReservationPod(nodeName, gpuGroup, podName, resources)
     if err != nil {
-        logger.Error(err, "Failed to created GPU reservation pod on node",
-            "nodeName", nodeName, "namespace", namespace, "name", podName)
+        logger.Error(err, "Failed to create GPU reservation pod on node",
+            "nodeName", nodeName, "namespace", rsc.namespace, "name", podName)
         return nil, err
     }
 
     logger.Info(
         "Successfully created GPU resource reservation pod",
-        "nodeName", nodeName, "namespace", namespace, "name", podName)
+        "nodeName", nodeName, "namespace", rsc.namespace, "name", podName)
     return pod, nil
 }
 
@@ -405,7 +410,7 @@ func (rsc *service) waitForGPUReservationPodAllocation(
     pods := &v1.PodList{}
     watcher, err := rsc.kubeClient.Watch(
         ctx, pods,
-        client.InNamespace(namespace),
+        client.InNamespace(rsc.namespace),
        client.MatchingFields{"metadata.name": gpuReservationPodName},
     )
     if err != nil {
@@ -432,25 +437,23 @@ func (rsc *service) waitForGPUReservationPodAllocation(
 }
 
 func (rsc *service) createResourceReservationPod(
-    nodeName, gpuGroup, podName, appName string,
-    resources v1.ResourceRequirements,
+    nodeName, gpuGroup, podName string, resources v1.ResourceRequirements,
 ) (*v1.Pod, error) {
     podSpec := &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name:      podName,
-            Namespace: namespace,
+            Namespace: rsc.namespace,
             Labels: map[string]string{
-                constants.AppLabelName:               appLabelValue,
-                constants.GPUGroup:                   gpuGroup,
-                runaiResourceReservationAppLabelName: appName,
+                constants.AppLabelName: rsc.appLabelValue,
+                constants.GPUGroup:     gpuGroup,
             },
             Annotations: map[string]string{
                 karpenterv1.DoNotDisruptAnnotationKey: "true",
             },
         },
         Spec: v1.PodSpec{
             NodeName:           nodeName,
-            ServiceAccountName: serviceAccountName,
+            ServiceAccountName: rsc.serviceAccountName,
             Containers: []v1.Container{
                 {
                     Name: resourceReservation,