feat: Specify fraction container name (#654)

itsomri · web-flow · commit 6282e043a7ff · 2025-11-17T15:00:51.000Z
* Allow user to specify container name and type for fractions
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - Added a preferred podAntiAffinity term by default for all services, can be set to required instead by setting `global.requireDefaultPodAffinityTerm`
 - Added support for service-level affinities
 - Added [time aware scheduling](docs/timeaware/README.md) capabilities
+- Added option to specify container name and type for fraction containers
 
 ### Fixed
 - (Openshift only) - High CPU usage for the operator pod due to continuous reconciles
diff --git a/docs/batch/batch-job.yaml b/docs/batch/batch-job.yaml
@@ -11,7 +11,7 @@ spec:
   template:
     metadata:
       labels:
-        kai.scheduler/queue: test
+        kai.scheduler/queue: default-queue
     spec:
       schedulerName: kai-scheduler
       restartPolicy: OnFailure
diff --git a/docs/batch/pytorch-job.yaml b/docs/batch/pytorch-job.yaml
@@ -6,7 +6,7 @@ kind: "PyTorchJob"
 metadata:
   name: "pytorch-dist-mnist-nccl"
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
 spec:
   pytorchReplicaSpecs:
     Master:
diff --git a/docs/dra/gpu-imex-pod.yaml b/docs/dra/gpu-imex-pod.yaml
@@ -16,7 +16,7 @@ kind: Pod
 metadata:
   name: gpu-imex-pod
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
 spec:
   schedulerName: kai-scheduler
   containers:
diff --git a/docs/elastic/pytorch-elastic.yaml b/docs/elastic/pytorch-elastic.yaml
@@ -6,7 +6,7 @@ kind: PyTorchJob
 metadata:
   name: elastic-example-imagenet
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
 spec:
   elasticPolicy:
     rdzvBackend: c10d
diff --git a/docs/gpu-sharing/README.md b/docs/gpu-sharing/README.md
@@ -42,3 +42,31 @@ kubectl apply -f gpu-memory.yaml
 In the gpu-memory.yaml file, the pod includes a `gpu-memory` annotation with a value of 2000 (in MiB), meaning:
 * The pod is allowed to consume up to 2000 MiB of a GPU device's memory
 * The remaining GPU device memory can be shared with other pods in the cluster
+
+### GPU Fraction with Non-Default Container
+By default, GPU fraction allocation is applied to the first container (index 0) in the pod. However, you can specify a different container to receive the GPU allocation using the `gpu-fraction-container-name` annotation.
+
+#### Specific Container
+To allocate GPU fraction to a specific container in a multi-container pod:
+```
+kubectl apply -f gpu-sharing-non-default-container.yaml
+```
+
+In the gpu-sharing-non-default-container.yaml file, the pod includes:
+* `gpu-fraction: "0.5"` - Requests half of a GPU device's memory
+* `gpu-fraction-container-name: "gpu-workload"` - Specifies that the container named "gpu-workload" should receive the GPU allocation instead of the default first container
+
+This is useful for pods with sidecar containers where only one specific container needs GPU access.
+
+#### Init Container
+To allocate GPU fraction to an init container:
+```
+kubectl apply -f gpu-sharing-init-container.yaml
+```
+
+In the gpu-sharing-init-container.yaml file, the pod includes:
+* `gpu-fraction: "0.5"` - Requests half of a GPU device's memory
+* `gpu-fraction-container-name: "gpu-init"` - Specifies the name of the init container that should receive the GPU allocation. If this annotation is not defined, the allocation defaults to the first container.
+* `gpu-fraction-container-type: "InitContainer"` - Indicates the container is an init container
+
+This is useful for workloads that need GPU access during initialization (e.g., model loading, dataset preprocessing) before the main application container starts.
diff --git a/docs/gpu-sharing/gpu-memory.yaml b/docs/gpu-sharing/gpu-memory.yaml
@@ -6,7 +6,7 @@ kind: Pod
 metadata:
   name: gpu-sharing
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
   annotations:
     gpu-memory: "2000" # in MiB
 spec:
diff --git a/docs/gpu-sharing/gpu-sharing-init-container.yaml b/docs/gpu-sharing/gpu-sharing-init-container.yaml
@@ -0,0 +1,26 @@
+# Copyright 2025 NVIDIA CORPORATION
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-sharing-init-container
+  labels:
+    kai.scheduler/queue: default-queue
+  annotations:
+    gpu-fraction: "0.5"
+    # Specify an init container to receive the GPU fraction allocation
+    gpu-fraction-container-name: "gpu-init"
+    gpu-fraction-container-type: "InitContainer"
+spec:
+  schedulerName: kai-scheduler
+  initContainers:
+    - name: gpu-init
+      image: nvidia/cuda:11.0-base
+      command: ["nvidia-smi"]
+      args: ["-L"]
+  containers:
+    - name: main-app
+      image: ubuntu
+      args: ["sleep", "infinity"]
+
diff --git a/docs/gpu-sharing/gpu-sharing-non-default-container.yaml b/docs/gpu-sharing/gpu-sharing-non-default-container.yaml
@@ -0,0 +1,29 @@
+# Copyright 2025 NVIDIA CORPORATION
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-sharing-non-default
+  labels:
+    kai.scheduler/queue: default-queue
+  annotations:
+    gpu-fraction: "0.5"
+    # Specify which container should receive the GPU fraction allocation
+    # By default, the first container (index 0) receives the GPU allocation
+    # Use this annotation to specify a different container by name
+    gpu-fraction-container-name: "gpu-workload"
+spec:
+  schedulerName: kai-scheduler
+  containers:
+    - name: sidecar
+      image: busybox
+      args: ["sleep", "infinity"]
+    - name: gpu-workload
+      image: nvidia/cuda:11.0-base
+      command: ["nvidia-smi"]
+      args: ["-L"]
+    - name: another-sidecar
+      image: busybox
+      args: ["sleep", "infinity"]
+
diff --git a/docs/gpu-sharing/gpu-sharing.yaml b/docs/gpu-sharing/gpu-sharing.yaml
@@ -6,7 +6,7 @@ kind: Pod
 metadata:
   name: gpu-sharing
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
   annotations:
     gpu-fraction: "0.5"
 spec:
diff --git a/docs/gpu-sharing/mps/gpu-sharing-with-mps.yaml b/docs/gpu-sharing/mps/gpu-sharing-with-mps.yaml
@@ -6,7 +6,7 @@ kind: Pod
 metadata:
   name: gpu-sharing-with-mps
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
   annotations:
     gpu-fraction: "0.5"
 spec:
diff --git a/docs/priority/example/build-priority-pod.yaml b/docs/priority/example/build-priority-pod.yaml
@@ -6,7 +6,7 @@ kind: Pod
 metadata:
   name: build-pod
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
     priorityClassName: build
 spec:
   schedulerName: kai-scheduler
diff --git a/docs/priority/example/train-priority-pod.yaml b/docs/priority/example/train-priority-pod.yaml
@@ -6,7 +6,7 @@ kind: Pod
 metadata:
   name: train-pod
   labels:
-    kai.scheduler/queue: test
+    kai.scheduler/queue: default-queue
     priorityClassName: train
 spec:
   schedulerName: kai-scheduler
diff --git a/docs/quickstart/README.md b/docs/quickstart/README.md
@@ -28,7 +28,7 @@ Pods can now be assigned to the new queue and submitted to the cluster for sched
 
 ### Assigning Pods to Queues
 To schedule a pod using KAI Scheduler, ensure the following:
-1. Specify the queue name using the `kai.scheduler/queue: test` label on the pod/workload.
+1. Specify the queue name using the `kai.scheduler/queue: default-queue` label on the pod/workload.
 2. Set the scheduler name in the pod specification as `kai-scheduler`
 This ensures the pod is placed in the correct scheduling queue and managed by KAI Scheduler.
 
diff --git a/pkg/admission/webhook/v1alpha2/gpusharing/gpu_sharing.go b/pkg/admission/webhook/v1alpha2/gpusharing/gpu_sharing.go
@@ -56,11 +56,11 @@ func (p *GPUSharing) Mutate(pod *v1.Pod) error {
 		return nil
 	}
 
-	containerRef := &gpusharingconfigmap.PodContainerRef{
-		Container: &pod.Spec.Containers[fractionContainerIndex],
-		Index:     fractionContainerIndex,
-		Type:      gpusharingconfigmap.RegularContainer,
+	containerRef, err := common.GetFractionContainerRef(pod)
+	if err != nil {
+		return fmt.Errorf("failed to get fraction container ref: %w", err)
 	}
+
 	capabilitiesConfigMapName := gpusharingconfigmap.SetGpuCapabilitiesConfigMapName(pod, containerRef)
 	directEnvVarsMapName, err := gpusharingconfigmap.ExtractDirectEnvVarsConfigMapName(pod, containerRef)
 	if err != nil {
diff --git a/pkg/admission/webhook/v1alpha2/gpusharing/gpu_sharing_test.go b/pkg/admission/webhook/v1alpha2/gpusharing/gpu_sharing_test.go
diff --git a/pkg/binder/common/gpu_access.go b/pkg/binder/common/gpu_access.go
diff --git a/pkg/binder/plugins/gpusharing/gpu_sharing.go b/pkg/binder/plugins/gpusharing/gpu_sharing.go
diff --git a/pkg/binder/plugins/gpusharing/gpu_sharing_test.go b/pkg/binder/plugins/gpusharing/gpu_sharing_test.go
diff --git a/pkg/common/constants/constants.go b/pkg/common/constants/constants.go