File tree Expand file tree Collapse file tree 19 files changed +353
-97
lines changed
gang-scheduling/workflows Expand file tree Collapse file tree 19 files changed +353
-97
lines changed Original file line number Diff line number Diff line change 1010
1111Install [JobSet API](https://github.com/kubernetes-sigs/jobset) in your cluster:
1212``` shell
13- kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml
13+ JOBSET_VERSION=v0.8.1
14+ kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
1415```
1516
1617Run a jobset with workers:
Original file line number Diff line number Diff line change 11## Example of running `KAI` with `knavigator`
22
3- ### Running workflows with ` MPI job `
3+ ### Running workflows with `MPI job` and `Job`
44
55Install [KAI scheduler](https://github.com/NVIDIA/KAI-Scheduler/blob/main/README.md) in your cluster.
66
7- Run an MPI job:
7+ Run an MPI job:
88``` shell
99./bin/knavigator -workflow resources/workflows/kai/test-mpijob.yaml
1010```
11+
12+ Run a multi-replica Job:
13+ ``` shell
14+ ./bin/knavigator -workflow resources/workflows/kai/test-job.yaml
15+ ```
Original file line number Diff line number Diff line change 33Install `kueue` by following these [instructions](https://kueue.sigs.k8s.io/docs/installation/):
44
55``` bash
6- KUEUE_VERSION=v0.9.0
6+ KUEUE_VERSION=v0.11.4
77kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
88
99kubectl apply -f charts/overrides/kueue/priority.yaml
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ name : test-kai-job
16+ description : register, deploy and configure kai custom resources
17+ tasks :
18+ - id : register-queue
19+ type : RegisterObj
20+ params :
21+ template : " resources/templates/kai/queue.yaml"
22+ - id : register
23+ type : RegisterObj
24+ params :
25+ template : " resources/benchmarks/templates/kai/job.yaml"
26+ nameFormat : " job{{._ENUM_}}"
27+ podNameFormat : " {{._NAME_}}-[a-z0-9]+"
28+ podCount : " {{.replicas}}"
29+ - id : default-queue
30+ type : SubmitObj
31+ params :
32+ refTaskId : register-queue
33+ canExist : true
34+ params :
35+ name : default
36+ - id : test-queue
37+ type : SubmitObj
38+ params :
39+ refTaskId : register-queue
40+ canExist : true
41+ params :
42+ name : test
43+ parentQueue : default
Original file line number Diff line number Diff line change @@ -118,7 +118,6 @@ tasks:
118118 - "ray.io/rayjob"
119119 - "ray.io/raycluster"
120120 - "jobset.x-k8s.io/jobset"
121- - "kubeflow.org/mxjob"
122121 - "kubeflow.org/paddlejob"
123122 - "kubeflow.org/pytorchjob"
124123 - "kubeflow.org/tfjob"
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ name : test-kai-job
16+ description : register, deploy and configure kai custom resources
17+ tasks :
18+ - id : register-queue
19+ type : RegisterObj
20+ params :
21+ template : " resources/templates/kai/queue.yaml"
22+ - id : register
23+ type : RegisterObj
24+ params :
25+ template : " resources/benchmarks/templates/kai/job.yaml"
26+ nameFormat : " job{{._ENUM_}}"
27+ podNameFormat : " {{._NAME_}}-[a-z0-9]+"
28+ podCount : " {{.replicas}}"
29+ - id : default-queue
30+ type : SubmitObj
31+ params :
32+ refTaskId : register-queue
33+ canExist : true
34+ params :
35+ name : default
36+ - id : test-queue
37+ type : SubmitObj
38+ params :
39+ refTaskId : register-queue
40+ canExist : true
41+ params :
42+ name : test
43+ parentQueue : default
Original file line number Diff line number Diff line change @@ -118,7 +118,6 @@ tasks:
118118 - "ray.io/rayjob"
119119 - "ray.io/raycluster"
120120 - "jobset.x-k8s.io/jobset"
121- - "kubeflow.org/mxjob"
122121 - "kubeflow.org/paddlejob"
123122 - "kubeflow.org/pytorchjob"
124123 - "kubeflow.org/tfjob"
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ apiVersion : batch/v1
16+ kind : Job
17+ metadata :
18+ name : " {{._NAME_}}"
19+ namespace : " default"
20+ spec :
21+ completions : {{.replicas}}
22+ parallelism : {{.replicas}}
23+ template :
24+ metadata :
25+ labels :
26+ runai/queue : " test"
27+ annotations :
28+ pod-complete.stage.kwok.x-k8s.io/delay : {{.ttl}}
29+ pod-complete.stage.kwok.x-k8s.io/jitter-delay : {{.ttl}}
30+ spec :
31+ schedulerName : kai-scheduler
32+ containers :
33+ - name : test
34+ image : busybox
35+ imagePullPolicy : IfNotPresent
36+ resources :
37+ limits :
38+ cpu : 100m
39+ memory : 250M
40+ nvidia.com/gpu : " 8"
41+ requests :
42+ cpu : 100m
43+ memory : 250M
44+ nvidia.com/gpu : " 8"
45+ restartPolicy : Never
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ apiVersion : scheduling.run.ai/v2
16+ kind : Queue
17+ metadata :
18+ name : " {{.name}}"
19+ spec :
20+ {{- if .parentQueue }}
21+ parentQueue : " {{.parentQueue}}"
22+ {{- end }}
23+ resources :
24+ cpu :
25+ quota : -1
26+ limit : -1
27+ overQuotaWeight : 1
28+ gpu :
29+ quota : -1
30+ limit : -1
31+ overQuotaWeight : 1
32+ memory :
33+ quota : -1
34+ limit : -1
35+ overQuotaWeight : 1
Original file line number Diff line number Diff line change 1+ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+
15+ apiVersion : batch/v1
16+ kind : Job
17+ metadata :
18+ name : " {{._NAME_}}"
19+ namespace : " {{.namespace}}"
20+ spec :
21+ completions : {{.replicas}}
22+ parallelism : {{.replicas}}
23+ template :
24+ metadata :
25+ labels :
26+ runai/queue : " {{.queue}}"
27+ annotations :
28+ pod-complete.stage.kwok.x-k8s.io/delay : {{.ttl}}
29+ pod-complete.stage.kwok.x-k8s.io/jitter-delay : {{.ttl}}
30+ spec :
31+ schedulerName : kai-scheduler
32+ containers :
33+ - name : test
34+ image : {{.image}}
35+ imagePullPolicy : IfNotPresent
36+ resources :
37+ limits :
38+ cpu : " {{.cpu}}"
39+ memory : {{.memory}}
40+ nvidia.com/gpu : " {{.gpu}}"
41+ requests :
42+ cpu : " {{.cpu}}"
43+ memory : {{.memory}}
44+ nvidia.com/gpu : " {{.gpu}}"
45+ restartPolicy : Never
You can’t perform that action at this time.
0 commit comments