upgrade 3rd-party components

dmitsh · dmitsh · commit e6e0fe8f76c4 · 2025-05-01T14:06:26.000-07:00
Signed-off-by: Dmitry Shmulevich <dshmulevich@nvidia.com>
diff --git a/docs/examples/jobset/jobset.md b/docs/examples/jobset/jobset.md
@@ -10,7 +10,8 @@
 
 Install [JobSet API](https://github.com/kubernetes-sigs/jobset) in your cluster:
 ```shell
-kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/v0.5.2/manifests.yaml
+JOBSET_VERSION=v0.8.1
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
 ```
 
 Run a jobset with workers: 
diff --git a/docs/examples/kueue/kueue.md b/docs/examples/kueue/kueue.md
@@ -3,7 +3,7 @@
 Install `kueue` by following these [instructions](https://kueue.sigs.k8s.io/docs/installation/):
 
 ```bash
-KUEUE_VERSION=v0.9.0
+KUEUE_VERSION=v0.11.4
 kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
 
 kubectl apply -f charts/overrides/kueue/priority.yaml
diff --git a/resources/templates/kai/job.yaml b/resources/templates/kai/job.yaml
@@ -0,0 +1,45 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: batch/v1  # Go-templated Job manifest; every {{...}} value is supplied by the submitting workflow
+kind: Job
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+spec:
+  completions: {{.replicas}}  # completions == parallelism: all replicas are expected to run at once
+  parallelism: {{.replicas}}
+  template:
+    metadata:
+      labels:
+        runai/queue: "{{.queue}}"
+      annotations:
+        pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"  # quoted: annotation values must be strings even if ttl renders as a bare number
+        pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
+    spec:
+      schedulerName: kai-scheduler
+      containers:
+      - name: test
+        image: {{.image}}
+        imagePullPolicy: IfNotPresent
+        resources:
+          limits:  # limits and requests are kept identical
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"  # quoted for consistency with cpu and gpu
+            nvidia.com/gpu: "{{.gpu}}"
+          requests:
+            cpu: "{{.cpu}}"
+            memory: "{{.memory}}"
+            nvidia.com/gpu: "{{.gpu}}"
+      restartPolicy: Never
diff --git a/resources/workflows/kai/test-job.yaml b/resources/workflows/kai/test-job.yaml
@@ -0,0 +1,70 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: test-kai-job
+description: register, deploy and configure kai
+tasks:
+- id: register-queue  # registers the queue template used by the two queue submissions below
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/queue.yaml"
+- id: register-job  # registers the batch/v1 Job template
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/job.yaml"
+    nameFormat: "job{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-[a-z0-9]+"  # batch/v1 Job pods are named <job-name>-<random suffix>, not launcher-*/worker-* as for MPIJob
+    podCount: "{{.replicas}}"
+- id: configure
+  type: Configure
+  params:
+    nodes:
+    - type: dgxa100.80g
+      count: 3  # 3 nodes labelled with 8 GPUs each; the job below asks for 3 replicas x 8 GPUs
+      labels:
+        nvidia.com/gpu.count: "8"
+    timeout: 1m
+- id: default-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: default
+- id: test-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: test
+      parentQueue: default  # child of the queue submitted by the default-queue task
+- id: job
+  type: SubmitObj
+  params:
+    refTaskId: register-job
+    count: 1
+    params:
+      namespace: default
+      queue: test  # name of the queue submitted by the test-queue task
+      replicas: 3
+      image: ubuntu
+      cpu: 100m
+      memory: 250M
+      gpu: 8
+      ttl: "20s"
+- id: status
+  type: CheckPod
+  params:
+    refTaskId: job
+    status: Running
+    timeout: 10s
diff --git a/resources/workflows/kai/test-mpijob.yaml b/resources/workflows/kai/test-mpijob.yaml
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 name: test-kai-mpijob
-description: register, deploy and configure run:ai custom resources
+description: register, deploy and configure kai
 tasks:
 - id: register-queue
   type: RegisterObj
diff --git a/scripts/env.sh b/scripts/env.sh
@@ -69,12 +69,10 @@ function wait_for_pods() {
 
 # KWOK
 #
-
-KWOK_REPO=kubernetes-sigs/kwok
-KWOK_RELEASE="v0.6.1"
-
 function deploy_kwok() {
   printGreen Deploying KWOK
+  KWOK_REPO=kubernetes-sigs/kwok
+  KWOK_RELEASE="v0.6.1"
 
   # Deploy KWOK controller
   kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/kwok.yaml
@@ -88,11 +86,9 @@ function deploy_kwok() {
 
 # Prometheus
 #
-
-PROMETHEUS_STACK_VERSION=61.5.0
-
 function deploy_prometheus() {
   printGreen Deploying Prometheus
+  PROMETHEUS_STACK_VERSION=61.5.0
 
   helm repo add --force-update prometheus-community https://prometheus-community.github.io/helm-charts
 
@@ -121,10 +117,9 @@ function deploy_prometheus() {
 #
 
 # https://github.com/kubernetes-sigs/jobset
-JOBSET_VERSION=v0.7.0
-
 function deploy_jobset() {
   printGreen Deploying jobset
+  JOBSET_VERSION=v0.8.1
 
   kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml
 
@@ -137,10 +132,9 @@ function deploy_jobset() {
 }
 
 # https://github.com/kubernetes-sigs/kueue
-KUEUE_VERSION=v0.9.0
-
 function deploy_kueue() {
   printGreen Deploying kueue
+  KUEUE_VERSION=v0.11.4
 
   kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
 
@@ -153,10 +147,9 @@ function deploy_kueue() {
 }
 
 # https://github.com/volcano-sh/volcano
-VOLCANO_VERSION=v1.10.0
-
 function deploy_volcano() {
   printGreen Deploying volcano
+  VOLCANO_VERSION=v1.11.2
 
   helm repo add --force-update volcano-sh https://volcano-sh.github.io/helm-charts
 
@@ -174,10 +167,9 @@ function deploy_volcano() {
 }
 
 # https://github.com/apache/yunikorn-core
-YUNIKORN_VERSION=v1.6.0
-
 function deploy_yunikorn() {
   printGreen Deploying yunikorn
+  YUNIKORN_VERSION=v1.6.2
 
   helm repo add --force-update yunikorn https://apache.github.io/yunikorn-release
 
@@ -189,12 +181,11 @@ function deploy_yunikorn() {
 }
 
 # https://www.run.ai/
-TRAINING_OPERATOR_VERSION=v1.8.0
-MPI_OPERATOR_VERSION=v0.4.0
-RUNAI_VERSION=2.18.49
-
 function deploy_runai() {
   printGreen Deploying run:ai
+  TRAINING_OPERATOR_VERSION=v1.8.0
+  MPI_OPERATOR_VERSION=v0.4.0
+  RUNAI_VERSION=2.18.49
 
   if [[ -z "$RUNAI_CONTROL_PLANE_URL" ]] || [[ -z "$RUNAI_CLIENT_SECRET" ]] || [[ -z "$RUNAI_CLUSTER_ID" ]]; then
     printRed "
@@ -232,29 +223,21 @@ Run:ai deployment requires environment variables:
 }
 
 # https://github.com/NVIDIA/KAI-Scheduler/
-TRAINING_OPERATOR_VERSION=v1.8.0
-MPI_OPERATOR_VERSION=v0.4.0
 function deploy_kai() {
   printGreen Deploying kai
+  MPI_OPERATOR_VERSION=v0.6.0  # mpi-operator ref used in the manifest URL applied below
+  KAI_VERSION=v0.4.7  # chart version passed to helm via --version
 
-  kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION"
+  kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml
 
-  kubectl patch deployment training-operator -n kubeflow --type='json' \
-    -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'
-
-  kubectl delete crd mpijobs.kubeflow.org
-
-  kubectl apply -f https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml
-
-  helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
-  helm repo update
-  helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler -n kai-scheduler \
-    --create-namespace --wait --set "global.registry=nvcr.io/nvidia/k8s"
+  helm upgrade --install kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler -n kai-scheduler \
+    --version="$KAI_VERSION" --create-namespace --wait
 }
 
-SCHEDULER_PLUGINS_VERSION=v0.29.7
+
 function deploy_scheduler_plugins() {
   printGreen Deploying scheduler-plugins
+  SCHEDULER_PLUGINS_VERSION=v0.29.7
 
   helm upgrade --install --repo https://scheduler-plugins.sigs.k8s.io scheduler-plugins scheduler-plugins \
     -n scheduler-plugins --create-namespace --version $SCHEDULER_PLUGINS_VERSION \