Skip to content

Commit b78f935

Browse files
committed
add RayJob and MPIJob workflows with Kueue and KAI
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 1cca6ce commit b78f935

File tree

18 files changed

+431
-91
lines changed

18 files changed

+431
-91
lines changed

docs/examples/kueue/kueue.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,17 @@ kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases
99
kubectl apply -f charts/overrides/kueue/priority.yaml
1010
```
1111

12-
Run a kueue job:
12+
Run a Job with Kueue:
1313
```bash
1414
./bin/knavigator -workflow resources/workflows/kueue/test-job.yaml -cleanup
1515
```
1616

17-
Run a preemption workflow with kueue:
17+
Run a preemption workflow with Kueue:
1818
```bash
1919
./bin/knavigator -workflow resources/workflows/kueue/test-preemption.yaml -cleanup
2020
```
21+
22+
Run a RayJob with Kueue:
23+
```bash
24+
./bin/knavigator -workflow resources/workflows/kueue/test-rayjob.yaml -cleanup
25+
```

resources/benchmarks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ This directory contains benchmark tests for the following workload managers and
55
- Kueue
66
- Volcano
77
- Yunikorn
8+
- KAI
89
- Run:ai
910

1011
The benchmark tests involve submitting workloads intended to evaluate the scheduler's performance under specific scenarios.

resources/benchmarks/gang-scheduling/workflows/config-kai.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ tasks:
2626
nameFormat: "job{{._ENUM_}}"
2727
podNameFormat: "{{._NAME_}}-[a-z0-9]+"
2828
podCount: "{{.replicas}}"
29+
- id: register-lw
30+
type: RegisterObj
31+
params:
32+
template: "resources/benchmarks/templates/kai/mpijob.yaml"
33+
nameFormat: "job{{._ENUM_}}"
34+
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
35+
podCount: "{{.workers}} + 1"
2936
- id: default-queue
3037
type: SubmitObj
3138
params:

resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,20 @@ tasks:
3434
nameFormat: "job{{._ENUM_}}"
3535
podNameFormat: "{{._NAME_}}-[0-9]-.*"
3636
podCount: "{{.replicas}}"
37+
#- id: register-lw
38+
# type: RegisterObj
39+
# params:
40+
# template: "resources/benchmarks/templates/kueue/rayjob.yaml"
41+
# nameFormat: "job{{._ENUM_}}"
42+
# podNameFormat: "{{._NAME_}}-raycluster-.*"
43+
# podCount: "{{.workers}} + 1"
44+
- id: register-lw
45+
type: RegisterObj
46+
params:
47+
template: "resources/benchmarks/templates/kueue/mpijob.yaml"
48+
nameFormat: "job{{._ENUM_}}"
49+
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
50+
podCount: "{{.workers}} + 1"
3751
- id: create-resource-flavor
3852
type: SubmitObj
3953
params:
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: config-runai
16+
tasks:
17+
- id: register
18+
type: RegisterObj
19+
params:
20+
template: "resources/benchmarks/templates/runai/trainingworkload.yaml"
21+
nameFormat: "twl{{._ENUM_}}"
22+
podNameFormat: "{{._NAME_}}-0-0"
23+
podCount: 1
24+
- id: register-lw
25+
type: RegisterObj
26+
params:
27+
template: "resources/benchmarks/templates/runai/distributedworkload.yaml"
28+
nameFormat: "dwl{{._ENUM_}}"
29+
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
30+
podCount: "{{.workers}} + 1"

resources/benchmarks/gang-scheduling/workflows/runai-test.yaml renamed to resources/benchmarks/gang-scheduling/workflows/run-test-lw.yaml

Lines changed: 18 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,139 +12,124 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
name: test-gang-scheduling-runai
15+
name: test-gang-scheduling-lw
1616
tasks:
17-
- id: register-trainingworkload
18-
type: RegisterObj
19-
params:
20-
template: "resources/benchmarks/templates/runai/trainingworkload.yaml"
21-
nameFormat: "twl{{._ENUM_}}"
22-
podNameFormat: "{{._NAME_}}-0-0"
23-
podCount: 1
24-
- id: register-distributedworkload
25-
type: RegisterObj
26-
params:
27-
template: "resources/benchmarks/templates/runai/distributedworkload.yaml"
28-
nameFormat: "dwl{{._ENUM_}}"
29-
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
30-
podCount: "{{.workers}} + 1"
31-
#
32-
### Benchmark test
33-
#
3417
- id: job1
3518
type: SubmitObj
3619
params:
37-
refTaskId: register-distributedworkload
20+
refTaskId: register-lw
3821
count: 1
3922
params:
4023
workers: 31
4124
ttl: 2m
4225
- id: job2
4326
type: SubmitObj
4427
params:
45-
refTaskId: register-distributedworkload
28+
refTaskId: register-lw
4629
count: 2
4730
params:
4831
workers: 15
4932
ttl: 2m
5033
- id: job3
5134
type: SubmitObj
5235
params:
53-
refTaskId: register-distributedworkload
36+
refTaskId: register-lw
5437
count: 3
5538
params:
5639
workers: 9
5740
ttl: 2m
5841
- id: job3.1
5942
type: SubmitObj
6043
params:
61-
refTaskId: register-distributedworkload
44+
refTaskId: register-lw
6245
count: 1
6346
params:
6447
workers: 1
6548
ttl: 2m
6649
- id: job4
6750
type: SubmitObj
6851
params:
69-
refTaskId: register-distributedworkload
52+
refTaskId: register-lw
7053
count: 4
7154
params:
7255
workers: 7
7356
ttl: 2m
7457
- id: job5
7558
type: SubmitObj
7659
params:
77-
refTaskId: register-distributedworkload
60+
refTaskId: register-lw
7861
count: 5
7962
params:
8063
workers: 5
8164
ttl: 2m
8265
- id: job5.1
8366
type: SubmitObj
8467
params:
85-
refTaskId: register-trainingworkload
68+
refTaskId: register
8669
count: 2
8770
params:
71+
replicas: 1
8872
ttl: 2m
8973
- id: job6
9074
type: SubmitObj
9175
params:
92-
refTaskId: register-distributedworkload
76+
refTaskId: register-lw
9377
count: 6
9478
params:
9579
workers: 4
9680
ttl: 2m
9781
- id: job6.1
9882
type: SubmitObj
9983
params:
100-
refTaskId: register-distributedworkload
84+
refTaskId: register-lw
10185
count: 1
10286
params:
10387
workers: 1
10488
ttl: 2m
10589
- id: job7
10690
type: SubmitObj
10791
params:
108-
refTaskId: register-distributedworkload
92+
refTaskId: register-lw
10993
count: 7
11094
params:
11195
workers: 3
11296
ttl: 2m
11397
- id: job7.1
11498
type: SubmitObj
11599
params:
116-
refTaskId: register-distributedworkload
100+
refTaskId: register-lw
117101
count: 1
118102
params:
119103
workers: 1
120104
ttl: 2m
121105
- id: job7.2
122106
type: SubmitObj
123107
params:
124-
refTaskId: register-trainingworkload
108+
refTaskId: register
125109
count: 2
126110
params:
111+
replicas: 1
127112
ttl: 2m
128113
- id: job8
129114
type: SubmitObj
130115
params:
131-
refTaskId: register-distributedworkload
116+
refTaskId: register-lw
132117
count: 8
133118
params:
134119
workers: 3
135120
ttl: 2m
136121
- id: job9
137122
type: SubmitObj
138123
params:
139-
refTaskId: register-distributedworkload
124+
refTaskId: register-lw
140125
count: 9
141126
params:
142127
workers: 2
143128
ttl: 2m
144129
- id: job9.1
145130
type: SubmitObj
146131
params:
147-
refTaskId: register-distributedworkload
132+
refTaskId: register-lw
148133
count: 1
149134
params:
150135
workers: 4

resources/benchmarks/templates/kai/job.yaml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,3 @@
1-
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2-
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
6-
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
14-
151
apiVersion: batch/v1
162
kind: Job
173
metadata:
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
apiVersion: kubeflow.org/v2beta1
2+
kind: MPIJob
3+
metadata:
4+
name: "{{._NAME_}}"
5+
namespace: "default"
6+
labels:
7+
runai/queue: "test"
8+
spec:
9+
slotsPerWorker: 1
10+
runPolicy:
11+
cleanPodPolicy: Running
12+
mpiReplicaSpecs:
13+
Launcher:
14+
replicas: 1
15+
template:
16+
metadata:
17+
annotations:
18+
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
19+
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
20+
spec:
21+
schedulerName: kai-scheduler
22+
containers:
23+
- image: busybox
24+
name: mpi-launcher
25+
resources:
26+
limits:
27+
cpu: 100m
28+
memory: 250M
29+
nvidia.com/gpu: "8"
30+
Worker:
31+
replicas: {{.workers}}
32+
template:
33+
metadata:
34+
annotations:
35+
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
36+
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
37+
labels:
38+
app: {{._NAME_}}
39+
spec:
40+
schedulerName: kai-scheduler
41+
containers:
42+
- image: busybox
43+
name: mpi-worker
44+
resources:
45+
limits:
46+
cpu: 100m
47+
memory: 250M
48+
nvidia.com/gpu: "8"

resources/benchmarks/templates/kai/queue.yaml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,3 @@
1-
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2-
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
6-
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
14-
151
apiVersion: scheduling.run.ai/v2
162
kind: Queue
173
metadata:

resources/benchmarks/templates/kueue/job.yaml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,3 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2-
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
6-
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
14-
151
apiVersion: batch/v1
162
kind: Job
173
metadata:

0 commit comments

Comments
 (0)