Skip to content

Commit e6e0fe8

Browse files
committed
upgrade 3rd-party components
Signed-off-by: Dmitry Shmulevich <[email protected]>
1 parent 9003e4c commit e6e0fe8

File tree

6 files changed

+136
-37
lines changed

6 files changed

+136
-37
lines changed

docs/examples/jobset/jobset.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010

1111
Install [JobSet API](https://github.com/kubernetes-sigs/jobset) in your cluster:
1212
```shell
13-
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml
13+
JOBSET_VERSION=v0.8.1
14+
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
1415
```
1516

1617
Run a jobset with workers:

docs/examples/kueue/kueue.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Install `kueue` by following these [instructions](https://kueue.sigs.k8s.io/docs/installation/):
44

55
```bash
6-
KUEUE_VERSION=v0.9.0
6+
KUEUE_VERSION=v0.11.4
77
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
88

99
kubectl apply -f charts/overrides/kueue/priority.yaml

resources/templates/kai/job.yaml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: batch/v1
16+
kind: Job
17+
metadata:
18+
name: "{{._NAME_}}"
19+
namespace: "{{.namespace}}"
20+
spec:
21+
completions: {{.replicas}}
22+
parallelism: {{.replicas}}
23+
template:
24+
metadata:
25+
labels:
26+
runai/queue: "{{.queue}}"
27+
annotations:
28+
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
29+
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
30+
spec:
31+
schedulerName: kai-scheduler
32+
containers:
33+
- name: test
34+
image: {{.image}}
35+
imagePullPolicy: IfNotPresent
36+
resources:
37+
limits:
38+
cpu: "{{.cpu}}"
39+
memory: {{.memory}}
40+
nvidia.com/gpu: "{{.gpu}}"
41+
requests:
42+
cpu: "{{.cpu}}"
43+
memory: {{.memory}}
44+
nvidia.com/gpu: "{{.gpu}}"
45+
restartPolicy: Never
resources/workflows/kai/test-job.yaml

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
name: test-kai-job
16+
description: register, deploy and configure kai
17+
tasks:
18+
- id: register-queue
19+
type: RegisterObj
20+
params:
21+
template: "resources/templates/kai/queue.yaml"
22+
- id: register-job
23+
type: RegisterObj
24+
params:
25+
template: "resources/templates/kai/job.yaml"
26+
nameFormat: "job{{._ENUM_}}"
27+
podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
28+
podCount: "{{.replicas}}"
29+
- id: configure
30+
type: Configure
31+
params:
32+
nodes:
33+
- type: dgxa100.80g
34+
count: 3
35+
labels:
36+
nvidia.com/gpu.count: "8"
37+
timeout: 1m
38+
- id: default-queue
39+
type: SubmitObj
40+
params:
41+
refTaskId: register-queue
42+
params:
43+
name: default
44+
- id: test-queue
45+
type: SubmitObj
46+
params:
47+
refTaskId: register-queue
48+
params:
49+
name: test
50+
parentQueue: default
51+
- id: job
52+
type: SubmitObj
53+
params:
54+
refTaskId: register-job
55+
count: 1
56+
params:
57+
namespace: default
58+
queue: test
59+
replicas: 3
60+
image: ubuntu
61+
cpu: 100m
62+
memory: 250M
63+
gpu: 8
64+
ttl: "20s"
65+
- id: status
66+
type: CheckPod
67+
params:
68+
refTaskId: job
69+
status: Running
70+
timeout: 10s

resources/workflows/kai/test-mpijob.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
name: test-kai-mpijob
16-
description: register, deploy and configure run:ai custom resources
16+
description: register, deploy and configure kai
1717
tasks:
1818
- id: register-queue
1919
type: RegisterObj

scripts/env.sh

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,10 @@ function wait_for_pods() {
6969

7070
# KWOK
7171
#
72-
73-
KWOK_REPO=kubernetes-sigs/kwok
74-
KWOK_RELEASE="v0.6.1"
75-
7672
function deploy_kwok() {
7773
printGreen Deploying KWOK
74+
KWOK_REPO=kubernetes-sigs/kwok
75+
KWOK_RELEASE="v0.6.1"
7876

7977
# Deploy KWOK controller
8078
kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/kwok.yaml
@@ -88,11 +86,9 @@ function deploy_kwok() {
8886

8987
# Prometheus
9088
#
91-
92-
PROMETHEUS_STACK_VERSION=61.5.0
93-
9489
function deploy_prometheus() {
9590
printGreen Deploying Prometheus
91+
PROMETHEUS_STACK_VERSION=61.5.0
9692

9793
helm repo add --force-update prometheus-community https://prometheus-community.github.io/helm-charts
9894

@@ -121,10 +117,9 @@ function deploy_prometheus() {
121117
#
122118

123119
# https://github.com/kubernetes-sigs/jobset
124-
JOBSET_VERSION=v0.7.0
125-
126120
function deploy_jobset() {
127121
printGreen Deploying jobset
122+
JOBSET_VERSION=v0.8.1
128123

129124
kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
130125

@@ -137,10 +132,9 @@ function deploy_jobset() {
137132
}
138133

139134
# https://github.com/kubernetes-sigs/kueue
140-
KUEUE_VERSION=v0.9.0
141-
142135
function deploy_kueue() {
143136
printGreen Deploying kueue
137+
KUEUE_VERSION=v0.11.4
144138

145139
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
146140

@@ -153,10 +147,9 @@ function deploy_kueue() {
153147
}
154148

155149
# https://github.com/volcano-sh/volcano
156-
VOLCANO_VERSION=v1.10.0
157-
158150
function deploy_volcano() {
159151
printGreen Deploying volcano
152+
VOLCANO_VERSION=v1.11.2
160153

161154
helm repo add --force-update volcano-sh https://volcano-sh.github.io/helm-charts
162155

@@ -174,10 +167,9 @@ function deploy_volcano() {
174167
}
175168

176169
# https://github.com/apache/yunikorn-core
177-
YUNIKORN_VERSION=v1.6.0
178-
179170
function deploy_yunikorn() {
180171
printGreen Deploying yunikorn
172+
YUNIKORN_VERSION=v1.6.2
181173

182174
helm repo add --force-update yunikorn https://apache.github.io/yunikorn-release
183175

@@ -189,12 +181,11 @@ function deploy_yunikorn() {
189181
}
190182

191183
# https://www.run.ai/
192-
TRAINING_OPERATOR_VERSION=v1.8.0
193-
MPI_OPERATOR_VERSION=v0.4.0
194-
RUNAI_VERSION=2.18.49
195-
196184
function deploy_runai() {
197185
printGreen Deploying run:ai
186+
TRAINING_OPERATOR_VERSION=v1.8.0
187+
MPI_OPERATOR_VERSION=v0.4.0
188+
RUNAI_VERSION=2.18.49
198189

199190
if [[ -z "$RUNAI_CONTROL_PLANE_URL" ]] || [[ -z "$RUNAI_CLIENT_SECRET" ]] || [[ -z "$RUNAI_CLUSTER_ID" ]]; then
200191
printRed "
@@ -232,29 +223,21 @@ Run:ai deployment requires environment variables:
232223
}
233224

234225
# https://github.com/NVIDIA/KAI-Scheduler/
235-
TRAINING_OPERATOR_VERSION=v1.8.0
236-
MPI_OPERATOR_VERSION=v0.4.0
237226
function deploy_kai() {
238227
printGreen Deploying kai
228+
MPI_OPERATOR_VERSION=v0.6.0
229+
KAI_VERSION=v0.4.7
239230

240-
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION"
231+
kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml
241232

242-
kubectl patch deployment training-operator -n kubeflow --type='json' \
243-
-p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'
244-
245-
kubectl delete crd mpijobs.kubeflow.org
246-
247-
kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml
248-
249-
helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
250-
helm repo update
251-
helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler -n kai-scheduler \
252-
--create-namespace --wait --set "global.registry=nvcr.io/nvidia/k8s"
233+
helm upgrade --install kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler -n kai-scheduler \
234+
--version="$KAI_VERSION" --create-namespace --wait
253235
}
254236

255-
SCHEDULER_PLUGINS_VERSION=v0.29.7
237+
256238
function deploy_scheduler_plugins() {
257239
printGreen Deploying scheduler-plugins
240+
SCHEDULER_PLUGINS_VERSION=v0.29.7
258241

259242
helm upgrade --install --repo https://scheduler-plugins.sigs.k8s.io scheduler-plugins scheduler-plugins \
260243
-n scheduler-plugins --create-namespace --version $SCHEDULER_PLUGINS_VERSION \

0 commit comments

Comments
 (0)