Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/examples/kai/kai.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## Example of running `KAI` with `knavigator`

### Running workflows with `MPI job`

Install the [KAI Scheduler](https://github.com/NVIDIA/KAI-Scheduler/blob/main/README.md) in your cluster.

Run an MPI job:
```shell
./bin/knavigator -workflow resources/workflows/kai/test-mpijob.yaml
```
1 change: 1 addition & 0 deletions docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ We have tested several of these and offer templates and workflows to support the
* [Kueue](./examples/kueue/kueue.md)
* [YuniKorn](./examples/yunikorn/yunikorn.md)
* [Run:ai](./examples/runai/runai.md)
* [KAI](./examples/kai/kai.md)
62 changes: 62 additions & 0 deletions resources/templates/kai/mpijob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# MPIJob (kubeflow.org/v2beta1) template whose pods are scheduled by
# kai-scheduler. Placeholders are substituted before the result is parsed as
# YAML. Values consumed by this template:
#   _NAME_     job name (also used as the worker pod "app" label)
#   namespace  namespace of the job
#   queue      value of the runai/queue label
#   workers    number of worker replicas
#   image      container image for launcher and workers
#   cpu, memory, gpu  per-container resource limits
#   ttl        value of the KWOK pod-complete delay annotations
#
# String-typed fields are quoted so that a rendered value that looks like a
# number, boolean or null is still parsed as a string.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: "{{._NAME_}}"
  namespace: "{{.namespace}}"
  labels:
    runai/queue: "{{.queue}}"
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        metadata:
          annotations:
            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"
            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
        spec:
          schedulerName: kai-scheduler
          containers:
            - image: "{{.image}}"
              name: mpi-launcher
              resources:
                limits:
                  cpu: "{{.cpu}}"
                  memory: "{{.memory}}"
                  nvidia.com/gpu: "{{.gpu}}"
    Worker:
      # Left unquoted on purpose: replicas must render as an integer.
      replicas: {{.workers}}
      template:
        metadata:
          annotations:
            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"
            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
          labels:
            app: "{{._NAME_}}"
        spec:
          schedulerName: kai-scheduler
          containers:
            - image: "{{.image}}"
              name: mpi-worker
              resources:
                limits:
                  cpu: "{{.cpu}}"
                  memory: "{{.memory}}"
                  nvidia.com/gpu: "{{.gpu}}"
35 changes: 35 additions & 0 deletions resources/templates/kai/queue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Queue (scheduling.run.ai/v2) template. Values consumed by this template:
#   name         queue name
#   parentQueue  optional; when set, spec.parentQueue is emitted
#
# Every resource (cpu, gpu, memory) uses quota -1, limit -1 and
# overQuotaWeight 1.
# NOTE(review): -1 presumably means "unlimited" - confirm against the KAI
# Scheduler queue documentation.
apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: "{{.name}}"
spec:
  {{- if .parentQueue }}
  parentQueue: "{{.parentQueue}}"
  {{- end }}
  resources:
    cpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    gpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    memory:
      quota: -1
      limit: -1
      overQuotaWeight: 1
70 changes: 70 additions & 0 deletions resources/workflows/kai/test-mpijob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow: registers the KAI queue and MPIJob templates, configures three
# virtual GPU nodes, creates a parent/child queue pair, submits one MPIJob to
# the child queue and checks that its pods reach the Running status.
name: test-kai-mpijob
description: register, deploy and configure KAI scheduler custom resources
tasks:
  # Register the templates that later SubmitObj tasks refer to by task id.
  - id: register-queue
    type: RegisterObj
    params:
      template: "resources/templates/kai/queue.yaml"
  - id: register-mpijob
    type: RegisterObj
    params:
      template: "resources/templates/kai/mpijob.yaml"
      nameFormat: "mpijob{{._ENUM_}}"
      # Matches one launcher pod plus the numbered worker pods of a job.
      podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
      # Expected pod count per job: all workers plus the single launcher.
      podCount: "{{.workers}} + 1"
  # Three nodes of type dgxa100.80g, each labelled with a GPU count of 8.
  - id: configure
    type: Configure
    params:
      nodes:
        - type: dgxa100.80g
          count: 3
          labels:
            nvidia.com/gpu.count: "8"
      timeout: 1m
  # Queue hierarchy: "test" is created with "default" as its parent queue.
  - id: default-queue
    type: SubmitObj
    params:
      refTaskId: register-queue
      params:
        name: default
  - id: test-queue
    type: SubmitObj
    params:
      refTaskId: register-queue
      params:
        name: test
        parentQueue: default
  # One MPIJob with two workers, submitted to the "test" queue.
  - id: mpijob
    type: SubmitObj
    params:
      refTaskId: register-mpijob
      count: 1
      params:
        namespace: default
        queue: test
        workers: 2
        image: ubuntu
        cpu: 100m
        memory: 250M
        gpu: 8
        ttl: "20s"
  # Verify the pods of the submitted job are Running within the timeout.
  - id: status
    type: CheckPod
    params:
      refTaskId: mpijob
      status: Running
      timeout: 10s
12 changes: 9 additions & 3 deletions scripts/create-test-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ if kind get clusters > /dev/null 2>&1; then
read -p "> " choice
if [[ "$choice" == "y" ]]; then
kind delete cluster
kind create cluster --image=kindest/node:v1.29.7
kind create cluster
# --image=kindest/node:v1.29.7
fi
else
kind create cluster --image=kindest/node:v1.29.7
kind create cluster
# --image=kindest/node:v1.29.7
fi

deploy_prometheus
Expand All @@ -52,7 +54,8 @@ cat << EOF
3: volcano (https://github.com/volcano-sh/volcano)
4: yunikorn (https://github.com/apache/yunikorn-core)
5: run:ai (https://www.run.ai)
6: combined: coscheduler plugin + jobset + kueue
6: kai (https://github.com/NVIDIA/KAI-Scheduler)
7: combined: coscheduler plugin + jobset + kueue
EOF
read -p "> " choice

Expand All @@ -73,6 +76,9 @@ case "$choice" in
deploy_runai
;;
6)
deploy_kai
;;
7)
deploy_scheduler_plugins
deploy_jobset
deploy_kueue
Expand Down
21 changes: 21 additions & 0 deletions scripts/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,27 @@ Run:ai deployment requires environment variables:
--set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}'
}

# https://github.com/NVIDIA/KAI-Scheduler/
TRAINING_OPERATOR_VERSION=v1.8.0
MPI_OPERATOR_VERSION=v0.4.0

# deploy_kai installs, in order:
#   1. the Kubeflow training-operator standalone overlay at
#      $TRAINING_OPERATOR_VERSION, patched to enable only the tfjob,
#      pytorchjob and xgboostjob schemes;
#   2. the mpi-operator v2beta1 manifest at $MPI_OPERATOR_VERSION, after
#      deleting the existing mpijobs.kubeflow.org CRD;
#   3. the kai-scheduler Helm chart into the kai-scheduler namespace.
# NOTE(review): the scheme patch and CRD deletion presumably hand MPIJob
# handling over to mpi-operator - confirm.
function deploy_kai() {
  local training_operator_overlay="github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$TRAINING_OPERATOR_VERSION"
  local mpi_operator_manifest="https://raw.githubusercontent.com/kubeflow/mpi-operator/$MPI_OPERATOR_VERSION/deploy/v2beta1/mpi-operator.yaml"
  # JSON patch that sets the container args of the training-operator deployment.
  local enable_schemes_patch='[{"op": "add", "path": "/spec/template/spec/containers/0/args", "value": ["--enable-scheme=tfjob", "--enable-scheme=pytorchjob", "--enable-scheme=xgboostjob"]}]'

  printGreen Deploying kai

  kubectl apply -k "$training_operator_overlay"
  kubectl patch deployment training-operator -n kubeflow --type=json \
    -p="$enable_schemes_patch"

  # Remove the existing MPIJob CRD before applying the mpi-operator manifest.
  kubectl delete crd mpijobs.kubeflow.org
  kubectl apply -f "$mpi_operator_manifest"

  helm repo add --force-update nvidia-k8s https://helm.ngc.nvidia.com/nvidia/k8s
  helm repo update
  helm upgrade --install kai-scheduler nvidia-k8s/kai-scheduler \
    -n kai-scheduler \
    --create-namespace \
    --wait \
    --set "global.registry=nvcr.io/nvidia/k8s"
}

SCHEDULER_PLUGINS_VERSION=v0.29.7
function deploy_scheduler_plugins() {
printGreen Deploying scheduler-plugins
Expand Down