add support for kai-scheduler (#120)

dmitsh · web-flow · commit 44de5bb4e304 · 2025-03-26T09:01:56.000-07:00
Signed-off-by: Dmitry Shmulevich &lt;dshmulevich@nvidia.com&gt;
diff --git a/docs/examples/kai/kai.md b/docs/examples/kai/kai.md
@@ -0,0 +1,10 @@
+## Example of running `KAI` with `knavigator`
+
+### Running workflows with `MPI job`
+
+Install [KAI scheduler](https://github.com/NVIDIA/KAI-Scheduler/blob/main/README.md) in your cluster.
+
+Run an MPI job:
+```shell
+./bin/knavigator -workflow resources/workflows/kai/test-mpijob.yaml
+```
diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -27,3 +27,4 @@ We have tested several of these and offer templates and workflows to support the
 * [Kueue](./examples/kueue/kueue.md)
 * [YuniKorn](./examples/yunikorn/yunikorn.md)
 * [Run:ai](./examples/runai/runai.md)
+* [KAI](./examples/kai/kai.md)
diff --git a/resources/templates/kai/mpijob.yaml b/resources/templates/kai/mpijob.yaml
@@ -0,0 +1,62 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MPIJob template (kubeflow.org/v2beta1) whose launcher and worker pods are
+# assigned to kai-scheduler via schedulerName.
+# Template parameters: _NAME_, namespace, queue, ttl, image, cpu, memory, gpu, workers.
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "{{.namespace}}"
+  labels:
+    # Queue the job belongs to.
+    # NOTE(review): presumably this label is what kai-scheduler reads to pick the queue - confirm against KAI docs.
+    runai/queue: "{{.queue}}"
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    # Exactly one launcher pod per job.
+    Launcher:
+      replicas: 1
+      template:
+        metadata:
+          annotations:
+            # KWOK pod-complete stage annotations; delay and jitter-delay are both set to the ttl parameter.
+            # NOTE(review): presumably this controls when the simulated pod completes - confirm against KWOK stage docs.
+            pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: {{.image}}
+            name: mpi-launcher
+            resources:
+              limits:
+                cpu: "{{.cpu}}"
+                memory: {{.memory}}
+                nvidia.com/gpu: "{{.gpu}}"
+    # Worker pod count comes from the workers parameter; resource limits match the launcher's.
+    Worker:
+      replicas: {{.workers}}
+      template:
+        metadata:
+          annotations:
+            # Same KWOK pod-complete settings as the launcher.
+            pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
+          labels:
+            # Only worker pods carry the app label (set to the job name).
+            app: {{._NAME_}}
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: {{.image}}
+            name: mpi-worker
+            resources:
+              limits:
+                cpu: "{{.cpu}}"
+                memory: {{.memory}}
+                nvidia.com/gpu: "{{.gpu}}"
diff --git a/resources/templates/kai/queue.yaml b/resources/templates/kai/queue.yaml
@@ -0,0 +1,35 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Queue template (scheduling.run.ai/v2).
+# Template parameters: name (required) and parentQueue (optional; the
+# parentQueue field is emitted only when the parameter is set).
+apiVersion: scheduling.run.ai/v2
+kind: Queue
+metadata:
+  name: "{{.name}}"
+spec:
+  {{- if .parentQueue }}
+  parentQueue: "{{.parentQueue}}"
+  {{- end }}
+  # cpu, gpu and memory all use identical quota/limit/overQuotaWeight values.
+  # NOTE(review): quota/limit of -1 presumably means "unlimited" - confirm against the KAI Queue CRD docs.
+  resources:
+    cpu:
+      quota: -1
+      limit: -1
+      overQuotaWeight: 1
+    gpu:
+      quota: -1
+      limit: -1
+      overQuotaWeight: 1
+    memory:
+      quota: -1
+      limit: -1
+      overQuotaWeight: 1
diff --git a/resources/workflows/kai/test-mpijob.yaml b/resources/workflows/kai/test-mpijob.yaml
@@ -0,0 +1,70 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: test-kai-mpijob
+description: register, deploy and configure KAI scheduler custom resources
+tasks:
+# Register the Queue template; referenced by the default-queue and test-queue tasks.
+- id: register-queue
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/queue.yaml"
+# Register the MPIJob template; referenced by the mpijob task.
+# podCount is workers + 1 because the template defines one launcher replica
+# plus `workers` worker replicas; podNameFormat matches both pod name shapes.
+- id: register-mpijob
+  type: RegisterObj
+  params:
+    template: "resources/templates/kai/mpijob.yaml"
+    nameFormat: "mpijob{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
+    podCount: "{{.workers}} + 1"
+# Node setup: 3 nodes of type dgxa100.80g, each labelled nvidia.com/gpu.count=8.
+- id: configure
+  type: Configure
+  params:
+    nodes:
+    - type: dgxa100.80g
+      count: 3
+      labels:
+        nvidia.com/gpu.count: "8"
+    timeout: 1m
+# Submit a queue named "default" with no parent queue.
+- id: default-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: default
+# Submit a queue named "test" whose parent is "default".
+- id: test-queue
+  type: SubmitObj
+  params:
+    refTaskId: register-queue
+    params:
+      name: test
+      parentQueue: default
+# Submit one MPIJob to queue "test" in namespace "default".
+# The template applies the gpu limit to the launcher and to every worker, so
+# with workers: 2 and gpu: 8 the job asks for 3 x 8 GPUs in total.
+- id: mpijob
+  type: SubmitObj
+  params:
+    refTaskId: register-mpijob
+    count: 1
+    params:
+      namespace: default
+      queue: test
+      workers: 2
+      image: ubuntu
+      cpu: 100m
+      memory: 250M
+      gpu: 8
+      ttl: "20s"
+# Expect the pods of the mpijob task to have status Running (timeout 10s).
+- id: status
+  type: CheckPod
+  params:
+    refTaskId: mpijob
+    status: Running
+    timeout: 10s
diff --git a/scripts/create-test-cluster.sh b/scripts/create-test-cluster.sh
@@ -33,10 +33,12 @@ if kind get clusters > /dev/null 2>&1; then
   read -p "> " choice
   if [[ "$choice" == "y" ]]; then
     kind delete cluster
-    kind create cluster --image=kindest/node:v1.29.7
+    kind create cluster
+    # --image=kindest/node:v1.29.7
   fi
 else
-  kind create cluster --image=kindest/node:v1.29.7
+  kind create cluster
+  # --image=kindest/node:v1.29.7
 fi
 
 deploy_prometheus
@@ -52,7 +54,8 @@ cat << EOF
   3: volcano (https://github.com/volcano-sh/volcano)
   4: yunikorn (https://github.com/apache/yunikorn-core)
   5: run:ai (https://www.run.ai)
-  6: combined: coscheduler plugin + jobset + kueue
+  6: kai (https://github.com/NVIDIA/KAI-Scheduler)
+  7: combined: coscheduler plugin + jobset + kueue
 EOF
 read -p "> " choice
 
@@ -73,6 +76,9 @@ case "$choice" in
     deploy_runai
     ;;
   6)
+    deploy_kai
+    ;;
+  7)
     deploy_scheduler_plugins
     deploy_jobset
     deploy_kueue
diff --git a/scripts/env.sh b/scripts/env.sh
@@ -231,6 +231,27 @@ Run:ai deployment requires environment variables:
     --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}'
 }
 
+# https://github.com/NVIDIA/KAI-Scheduler/
+TRAINING_OPERATOR_VERSION=v1.8.0
+MPI_OPERATOR_VERSION=v0.4.0
+# Deploys the KAI scheduler together with the operators needed to run MPIJobs:
+# the Kubeflow training operator ($TRAINING_OPERATOR_VERSION), the standalone
+# MPI operator ($MPI_OPERATOR_VERSION) and the kai-scheduler Helm chart.
+# Takes no arguments; requires kubectl and helm with access to the target cluster.
+function deploy_kai() {
+  printGreen Deploying kai
+
+  # Install the Kubeflow training operator (standalone overlay) at the pinned version.
+  kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION"
+
+  # Enable only the TFJob, PyTorchJob and XGBoostJob schemes in the training
+  # operator; MPIJob is not in this list.
+  kubectl patch deployment training-operator -n kubeflow --type='json' \
+    -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'
+
+  # Remove the existing MPIJob CRD before applying the MPI operator manifest
+  # below. --ignore-not-found keeps this step from failing when the CRD is
+  # already absent.
+  kubectl delete crd mpijobs.kubeflow.org --ignore-not-found
+
+  # Install the standalone MPI operator (v2beta1 manifest) at the pinned version.
+  kubectl apply -f "https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml"
+
+  # Install or upgrade the kai-scheduler chart from the NVIDIA Helm repo into
+  # its own namespace; --wait blocks until the release resources are ready.
+  helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
+  helm repo update
+  helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler -n kai-scheduler \
+    --create-namespace --wait --set "global.registry=nvcr.io/nvidia/k8s"
+}
+
 SCHEDULER_PLUGINS_VERSION=v0.29.7
 function deploy_scheduler_plugins() {
   printGreen Deploying scheduler-plugins