@@ -69,12 +69,10 @@ function wait_for_pods() {
6969
7070# KWOK
7171#
72-
73- KWOK_REPO=kubernetes-sigs/kwok
74- KWOK_RELEASE=" v0.6.1"
75-
7672function deploy_kwok() {
7773 printGreen Deploying KWOK
74+ KWOK_REPO=kubernetes-sigs/kwok
75+ KWOK_RELEASE=" v0.6.1"
7876
7977 # Deploy KWOK controller
8078 kubectl apply -f https://github.com/${KWOK_REPO} /releases/download/${KWOK_RELEASE} /kwok.yaml
@@ -88,11 +86,9 @@ function deploy_kwok() {
8886
8987# Prometheus
9088#
91-
92- PROMETHEUS_STACK_VERSION=61.5.0
93-
9489function deploy_prometheus() {
9590 printGreen Deploying Prometheus
91+ PROMETHEUS_STACK_VERSION=61.5.0
9692
9793 helm repo add --force-update prometheus-community https://prometheus-community.github.io/helm-charts
9894
@@ -121,10 +117,9 @@ function deploy_prometheus() {
121117#
122118
123119# https://github.com/kubernetes-sigs/jobset
124- JOBSET_VERSION=v0.7.0
125-
126120function deploy_jobset() {
127121 printGreen Deploying jobset
122+ JOBSET_VERSION=v0.8.1
128123
129124 kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION} /manifests.yaml
130125
@@ -137,10 +132,9 @@ function deploy_jobset() {
137132}
138133
139134# https://github.com/kubernetes-sigs/kueue
140- KUEUE_VERSION=v0.9.0
141-
142135function deploy_kueue() {
143136 printGreen Deploying kueue
137+ KUEUE_VERSION=v0.11.4
144138
145139 kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION} /manifests.yaml
146140
@@ -153,10 +147,9 @@ function deploy_kueue() {
153147}
154148
155149# https://github.com/volcano-sh/volcano
156- VOLCANO_VERSION=v1.10.0
157-
158150function deploy_volcano() {
159151 printGreen Deploying volcano
152+ VOLCANO_VERSION=v1.11.2
160153
161154 helm repo add --force-update volcano-sh https://volcano-sh.github.io/helm-charts
162155
@@ -174,10 +167,9 @@ function deploy_volcano() {
174167}
175168
176169# https://github.com/apache/yunikorn-core
177- YUNIKORN_VERSION=v1.6.0
178-
179170function deploy_yunikorn() {
180171 printGreen Deploying yunikorn
172+ YUNIKORN_VERSION=v1.6.2
181173
182174 helm repo add --force-update yunikorn https://apache.github.io/yunikorn-release
183175
@@ -189,12 +181,11 @@ function deploy_yunikorn() {
189181}
190182
191183# https://www.run.ai/
192- TRAINING_OPERATOR_VERSION=v1.8.0
193- MPI_OPERATOR_VERSION=v0.4.0
194- RUNAI_VERSION=2.18.49
195-
196184function deploy_runai() {
197185 printGreen Deploying run:ai
186+ TRAINING_OPERATOR_VERSION=v1.8.0
187+ MPI_OPERATOR_VERSION=v0.4.0
188+ RUNAI_VERSION=2.18.49
198189
199190 if [[ -z " $RUNAI_CONTROL_PLANE_URL " ]] || [[ -z " $RUNAI_CLIENT_SECRET " ]] || [[ -z " $RUNAI_CLUSTER_ID " ]]; then
200191 printRed "
@@ -232,29 +223,21 @@ Run:ai deployment requires environment variables:
232223}
233224
234225# https://github.com/NVIDIA/KAI-Scheduler/
235- TRAINING_OPERATOR_VERSION=v1.8.0
236- MPI_OPERATOR_VERSION=v0.4.0
237226function deploy_kai() {
238227 printGreen Deploying kai
228+ MPI_OPERATOR_VERSION=v0.6.0
229+ KAI_VERSION=v0.4.7
239230
240- kubectl apply -k " github. com/kubeflow/training -operator/manifests/overlays/standalone?ref= $TRAINING_OPERATOR_VERSION "
231+ kubectl apply --server-side -f https://raw.githubusercontent. com/kubeflow/mpi -operator/$MPI_OPERATOR_VERSION /deploy/v2beta1/mpi-operator.yaml
241232
242- kubectl patch deployment training-operator -n kubeflow --type=' json' \
243- -p=' [{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'
244-
245- kubectl delete crd mpijobs.kubeflow.org
246-
247- kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION /deploy/v2beta1/mpi-operator.yaml
248-
249- helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
250- helm repo update
251- helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler -n kai-scheduler \
252- --create-namespace --wait --set " global.registry=nvcr.io/nvidia/k8s"
233+ helm upgrade --install kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler -n kai-scheduler \
234+ --version=" $KAI_VERSION " --create-namespace --wait
253235}
254236
255- SCHEDULER_PLUGINS_VERSION=v0.29.7
237+
256238function deploy_scheduler_plugins() {
257239 printGreen Deploying scheduler-plugins
240+ SCHEDULER_PLUGINS_VERSION=v0.29.7
258241
259242 helm upgrade --install --repo https://scheduler-plugins.sigs.k8s.io scheduler-plugins scheduler-plugins \
260243 -n scheduler-plugins --create-namespace --version $SCHEDULER_PLUGINS_VERSION \
0 commit comments