NVIDIA · dmitsh · May 5, 2025 · May 2, 2025
@@ -9,12 +9,17 @@ kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases
 kubectl apply -f charts/overrides/kueue/priority.yaml
 ```
 
-Run a kueue job: 
+Run a Job with kueue:
 ```bash
 ./bin/knavigator -workflow resources/workflows/kueue/test-job.yaml -cleanup
 ```
 
-Run a preemption workflow with kueue: 
+Run a preemption workflow with kueue:
 ```bash
 ./bin/knavigator -workflow resources/workflows/kueue/test-preemption.yaml -cleanup
 ```
+
+Run a RayJob with kueue:
+```bash
+./bin/knavigator -workflow resources/workflows/kueue/test-rayjob.yaml -cleanup
+```
@@ -5,6 +5,7 @@ This directory contains benchmark tests for the following workload managers and
 - Kueue
 - Volcano
 - Yunikorn
+- Kai
 - Run:ai
 
 The benchmark tests involve submitting workloads intended to evaluate the scheduler's performance under specific scenarios.

@@ -26,6 +26,13 @@ tasks:
     nameFormat: "job{{._ENUM_}}"
     podNameFormat: "{{._NAME_}}-[a-z0-9]+"
     podCount: "{{.replicas}}"
+- id: register-lw
+  type: RegisterObj
+  params:
+    template: "resources/benchmarks/templates/kai/mpijob.yaml"
+    nameFormat: "job{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
+    podCount: "{{.workers}} + 1"
 - id: default-queue
   type: SubmitObj
   params:

@@ -34,6 +34,20 @@ tasks:
     nameFormat: "job{{._ENUM_}}"
     podNameFormat: "{{._NAME_}}-[0-9]-.*"
     podCount: "{{.replicas}}"
+#- id: register-lw
+#  type: RegisterObj
+#  params:
+#    template: "resources/benchmarks/templates/kueue/rayjob.yaml"
+#    nameFormat: "job{{._ENUM_}}"
+#    podNameFormat: "{{._NAME_}}-raycluster-.*"
+#    podCount: "{{.workers}} + 1"
+- id: register-lw
+  type: RegisterObj
+  params:
+    template: "resources/benchmarks/templates/kueue/mpijob.yaml"
+    nameFormat: "job{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
+    podCount: "{{.workers}} + 1"
 - id: create-resource-flavor
   type: SubmitObj
   params:

@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: config-runai
+tasks:
+- id: register  # registers the Run:ai training workload template under the id "register"
+  type: RegisterObj
+  params:
+    template: "resources/benchmarks/templates/runai/trainingworkload.yaml"
+    nameFormat: "twl{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-0-0"
+    podCount: 1
+- id: register-lw  # registers the Run:ai distributed workload template under the id "register-lw"
+  type: RegisterObj
+  params:
+    template: "resources/benchmarks/templates/runai/distributedworkload.yaml"
+    nameFormat: "dwl{{._ENUM_}}"
+    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"  # matches launcher or worker pod names
+    podCount: "{{.workers}} + 1"  # presumably the workers plus one launcher pod - TODO confirm
@@ -12,139 +12,124 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: test-gang-scheduling-runai
+name: test-gang-scheduling-lw
 tasks:
-- id: register-trainingworkload
-  type: RegisterObj
-  params:
-    template: "resources/benchmarks/templates/runai/trainingworkload.yaml"
-    nameFormat: "twl{{._ENUM_}}"
-    podNameFormat: "{{._NAME_}}-0-0"
-    podCount: 1
-- id: register-distributedworkload
-  type: RegisterObj
-  params:
-    template: "resources/benchmarks/templates/runai/distributedworkload.yaml"
-    nameFormat: "dwl{{._ENUM_}}"
-    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
-    podCount: "{{.workers}} + 1"
-#
-### Benchmark test
-#
 - id: job1
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 1
     params:
       workers: 31
       ttl: 2m
 - id: job2
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 2
     params:
       workers: 15
       ttl: 2m
 - id: job3
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 3
     params:
       workers: 9
       ttl: 2m
 - id: job3.1
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 1
     params:
       workers: 1
       ttl: 2m
 - id: job4
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 4
     params:
       workers: 7
       ttl: 2m
 - id: job5
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 5
     params:
       workers: 5
       ttl: 2m
 - id: job5.1
   type: SubmitObj
   params:
-    refTaskId: register-trainingworkload
+    refTaskId: register
     count: 2
     params:
+      replicas: 1
       ttl: 2m
 - id: job6
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 6
     params:
       workers: 4
       ttl: 2m
 - id: job6.1
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 1
     params:
       workers: 1
       ttl: 2m
 - id: job7
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 7
     params:
       workers: 3
       ttl: 2m
 - id: job7.1
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 1
     params:
       workers: 1
       ttl: 2m
 - id: job7.2
   type: SubmitObj
   params:
-    refTaskId: register-trainingworkload
+    refTaskId: register
     count: 2
     params:
+      replicas: 1
       ttl: 2m
 - id: job8  # SubmitObj with count 8 against register-lw (workers=3, ttl=2m)
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 8
     params:
       workers: 3
       ttl: 2m
 - id: job9
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 9
     params:
       workers: 2
       ttl: 2m
 - id: job9.1
   type: SubmitObj
   params:
-    refTaskId: register-distributedworkload
+    refTaskId: register-lw
     count: 1
     params:
       workers: 4

@@ -1,17 +1,3 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 apiVersion: batch/v1
 kind: Job
 metadata:

@@ -0,0 +1,48 @@
+apiVersion: kubeflow.org/v2beta1  # MPIJob template with {{...}} placeholders (_NAME_, ttl, workers)
+kind: MPIJob
+metadata:
+  name: "{{._NAME_}}"
+  namespace: "default"
+  labels:
+    runai/queue: "test"
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        metadata:
+          annotations:
+            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"  # quoted: annotation values must be strings
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: busybox
+            name: mpi-launcher
+            resources:
+              limits:
+                cpu: 100m
+                memory: 250M
+                nvidia.com/gpu: "8"
+    Worker:
+      replicas: {{.workers}}  # left unquoted on purpose: replicas must render as an integer
+      template:
+        metadata:
+          annotations:
+            pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"  # quoted: annotation values must be strings
+            pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
+          labels:
+            app: "{{._NAME_}}"  # quoted to match metadata.name and keep the label a string
+        spec:
+          schedulerName: kai-scheduler
+          containers:
+          - image: busybox
+            name: mpi-worker
+            resources:
+              limits:
+                cpu: 100m
+                memory: 250M
+                nvidia.com/gpu: "8"
@@ -1,17 +1,3 @@
-# Copyright (c) 2024-2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 apiVersion: scheduling.run.ai/v2
 kind: Queue
 metadata:

@@ -1,17 +1,3 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 apiVersion: batch/v1
 kind: Job
 metadata:
-Original file line number
+Diff line change
@@ Expand Up @@
     - Kueue
     - Volcano
     - Yunikorn
+    - Kai
     - Run:ai
     The benchmark tests involve submitting workloads intended to evaluate the scheduler's performance under specific scenarios.
@@ Expand Down @@