Commit 2145d06

support runtimeClasses (#380)

1 parent 7ef4b90 commit 2145d06

10 files changed, +205 -19 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Fixed
 - crd-upgrader respects global values for nodeSelector, affinity and tolerations
+- kai-scheduler will not ignore pod spec.overhead field
 
 ## [v0.7.12] - 2025-08-04
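For context: pod overhead comes from the RuntimeClass a pod references. The RuntimeClass admission controller populates pod spec.overhead from the class's overhead.podFixed, so a scheduler that ignores the field under-counts the pod's real footprint on the node. A minimal sketch of the objects involved, with hypothetical names and values (not taken from this commit):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	nodev1 "k8s.io/api/node/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical RuntimeClass that declares a fixed per-pod overhead.
	rc := nodev1.RuntimeClass{
		ObjectMeta: metav1.ObjectMeta{Name: "kata-example"},
		Handler:    "kata",
		Overhead: &nodev1.Overhead{
			PodFixed: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("250m"),
				v1.ResourceMemory: resource.MustParse("120Mi"),
			},
		},
	}

	// A pod that references the class. In a real cluster the RuntimeClass
	// admission controller fills spec.overhead from overhead.podFixed; it is
	// set explicitly here only to show the field the scheduler now reads.
	pod := v1.Pod{Spec: v1.PodSpec{
		RuntimeClassName: &rc.Name,
		Overhead:         rc.Overhead.PodFixed,
	}}
	fmt.Println(pod.Spec.Overhead.Cpu(), pod.Spec.Overhead.Memory())
}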

pkg/scheduler/api/common_info/errors.go

Lines changed: 11 additions & 1 deletion
@@ -19,6 +19,7 @@ const (
 	ResourcesWereNotFoundMsg = "no nodes with enough resources were found"
 	DefaultPodgroupError = "Unable to schedule podgroup"
 	DefaultPodError = "Unable to schedule pod"
+	OverheadMessage = "Not enough resources due to pod overhead resources"
 )
 
 type FitError struct {
@@ -60,7 +61,7 @@ func NewFitErrorByReasons(name, namespace, nodeName string, err error, reasons ...string) *FitError {
 func NewFitErrorInsufficientResource(
 	name, namespace, nodeName string,
 	resourceRequested *resource_info.ResourceRequirements, usedResource, capacityResource *resource_info.Resource,
-	capacityGpuMemory int64, gangSchedulingJob bool,
+	capacityGpuMemory int64, gangSchedulingJob bool, messageSuffix string,
 ) *FitError {
 	availableResource := capacityResource.Clone()
 	availableResource.Sub(usedResource)
@@ -146,6 +147,15 @@ func NewFitErrorInsufficientResource(
 		}
 	}
 
+	if len(messageSuffix) > 0 {
+		for i, msg := range shortMessages {
+			shortMessages[i] = fmt.Sprintf("%s. %s", msg, messageSuffix)
+		}
+		for i, msg := range detailedMessages {
+			detailedMessages[i] = fmt.Sprintf("%s. %s", msg, messageSuffix)
+		}
+	}
+
 	return NewFitErrorWithDetailedMessage(name, namespace, nodeName, shortMessages, detailedMessages...)
 }
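The suffix is appended to every short and detailed reason with a period separator, so when the caller passes the overhead suffix the final message reads roughly "node(s) didn't have enough resources: CPU cores. Not enough resources due to pod overhead resources. The overhead resources are ...". A minimal standalone sketch of that string handling (names are local to the sketch):

package main

import "fmt"

func main() {
	// Sketch: how a non-empty suffix is folded into the existing reasons.
	shortMessages := []string{"node(s) didn't have enough resources: CPU cores"}
	messageSuffix := "Not enough resources due to pod overhead resources. The overhead resources are cpu: 2"
	for i, msg := range shortMessages {
		shortMessages[i] = fmt.Sprintf("%s. %s", msg, messageSuffix)
	}
	fmt.Println(shortMessages[0])
	// Output: node(s) didn't have enough resources: CPU cores. Not enough resources
	// due to pod overhead resources. The overhead resources are cpu: 2
}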

pkg/scheduler/api/common_info/errors_test.go

Lines changed: 23 additions & 1 deletion
@@ -51,6 +51,7 @@ func TestNewFitErrorInsufficientResource(t *testing.T) {
 		capacityResource *resource_info.Resource
 		capacityGpuMemory int64
 		gangSchedulingJob bool
+		suffix string
 	}
 	tests := []struct {
 		name string
@@ -163,12 +164,33 @@ func TestNewFitErrorInsufficientResource(t *testing.T) {
 				DetailedReasons: []string{"Node didn't have enough resources: Each gpu on the node has a gpu memory capacity of 1000 Mib. 2000 Mib of gpu memory has been requested."},
 			},
 		},
+		{
+			name: "Not enough cpu due to pod overhead",
+			args: args{
+				name: "t1",
+				namespace: "n1",
+				nodeName: "node1",
+				resourceRequested: resource_info.NewResourceRequirements(0, 1500, 1000),
+				usedResource: BuildResource("500m", "1M"),
+				capacityResource: BuildResource("1000m", "2M"),
+				capacityGpuMemory: 0,
+				gangSchedulingJob: false,
+				suffix: "Message suffix",
+			},
+			want: &FitError{
+				taskName: "t1",
+				taskNamespace: "n1",
+				NodeName: "node1",
+				Reasons: []string{"node(s) didn't have enough resources: CPU cores. Message suffix"},
+				DetailedReasons: []string{"Node didn't have enough resources: CPU cores, requested: 1.5, used: 0.5, capacity: 1. Message suffix"},
+			},
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			if got := NewFitErrorInsufficientResource(tt.args.name, tt.args.namespace, tt.args.nodeName,
 				tt.args.resourceRequested, tt.args.usedResource, tt.args.capacityResource, tt.args.capacityGpuMemory,
-				tt.args.gangSchedulingJob); !reflect.DeepEqual(got, tt.want) {
+				tt.args.gangSchedulingJob, tt.args.suffix); !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("NewFitErrorInsufficientResource() = %v, want %v", got, tt.want)
 			}
 		})

pkg/scheduler/api/node_info/mig_node_info_test.go

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ func TestIsTaskAllocatable_Mig(t *testing.T) {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
					allocatable, err := ni.IsTaskAllocatable(task), ni.FittingError(task, false)
					return allocatable, err
				},

pkg/scheduler/api/node_info/node_info.go

Lines changed: 17 additions & 2 deletions
@@ -41,6 +41,7 @@ import (
 	sc_info "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/storagecapacity_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/storageclaim_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/conf"
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/k8s_utils"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log"
 )
 
@@ -258,9 +259,23 @@ func (ni *NodeInfo) FittingError(task *pod_info.PodInfo, isGangTask bool) *common_info.FitError {
 			task.ResReq.GetNumOfGpuDevices(), ni.getResourceGpuPortion(task.ResReq), requestedResources.GpuMemory())
 		}
 
-		return common_info.NewFitErrorInsufficientResource(
+		messageSuffix := ""
+		if len(task.Pod.Spec.Overhead) > 0 {
+			// Adding to node idle instead of subtracting from pod requested resources
+			idleResourcesWithOverhead := ni.Idle.Clone()
+			idleResourcesWithOverhead.Add(resource_info.ResourceFromResourceList(task.Pod.Spec.Overhead))
+			enoughResourcesWithoutOverhead := ni.lessEqualTaskToNodeResources(task.ResReq, idleResourcesWithOverhead)
+			if enoughResourcesWithoutOverhead {
+				messageSuffix = fmt.Sprintf("%s. The overhead resources are %v", common_info.OverheadMessage,
+					k8s_utils.StringResourceList(task.Pod.Spec.Overhead))
+			}
+		}
+
+		fitError := common_info.NewFitErrorInsufficientResource(
 			task.Name, task.Namespace, ni.Name, task.ResReq, totalUsed, totalCapability, ni.MemoryOfEveryGpuOnNode,
-			isGangTask)
+			isGangTask, messageSuffix)
+
+		return fitError
 	}
 
 	allocatable, err := ni.isTaskStorageAllocatable(task)
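The overhead is already folded into task.ResReq (see pkg/scheduler/api/pod_info/pod_info.go below), so the new check adds the overhead back to the node's idle resources instead of subtracting it from the request: if the request would fit against idle plus overhead, the shortfall is attributable to the overhead and the message suffix is attached. A rough standalone sketch of that decision, with made-up millicore values mirroring the test cases further down (helper name is invented):

package main

import "fmt"

// overheadExplainsFailure mirrors the idea of the check above using plain
// int64 millicores instead of the scheduler's resource types (illustrative only).
func overheadExplainsFailure(idleMilli, containersReqMilli, overheadMilli int64) bool {
	totalReq := containersReqMilli + overheadMilli // task.ResReq already includes overhead
	if totalReq <= idleMilli {
		return false // the pod fits, no FitError at all
	}
	// Add the overhead back to idle instead of subtracting it from the request.
	return totalReq <= idleMilli+overheadMilli
}

func main() {
	// Node has 1000m idle; pod asks 500m plus 600m overhead: suffix is added.
	fmt.Println(overheadExplainsFailure(1000, 500, 600)) // true
	// Node has 1000m idle; pod asks 1500m plus 100m overhead: plain fit error.
	fmt.Println(overheadExplainsFailure(1000, 1500, 100)) // false
}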

pkg/scheduler/api/node_info/node_info_test.go

Lines changed: 60 additions & 3 deletions
@@ -464,10 +464,12 @@ type allocatableTestData struct {
 	podsResources []v1.ResourceList
 	podResourcesToAllocate v1.ResourceList
 	podAnnotations map[string]string
+	podOverhead v1.ResourceList
 	expected bool
 	expectedMessageContains []string
+	expectOverheadMessage bool
 }
-type allocatableTestFunction func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error)
+type allocatableTestFunction func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError)
 
 func TestIsTaskAllocatable(t *testing.T) {
 	nodeCapacityDifferent := common_info.BuildNode("n2", common_info.BuildResourceList("1000m", "1G"))
@@ -538,13 +540,49 @@ func TestIsTaskAllocatable(t *testing.T) {
 			expected: true,
 			expectedMessageContains: []string{},
 		},
+		"pod with overhead that fits without overhead but not with overhead": {
+			node: common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources: []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate: common_info.BuildResourceList("500m", "500M"),
+			podOverhead: common_info.BuildResourceList("600m", "600M"),
+			expected: false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage: true,
+		},
+		"pod with overhead that doesn't fit even without overhead": {
+			node: common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources: []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate: common_info.BuildResourceList("1500m", "1500M"),
+			podOverhead: common_info.BuildResourceList("100m", "100M"),
+			expected: false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage: false,
+		},
+		"pod without overhead that doesn't fit": {
+			node: common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources: []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate: common_info.BuildResourceList("1500m", "1500M"),
+			podOverhead: v1.ResourceList{},
+			expected: false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage: false,
+		},
+		"pod with overhead that fits with overhead": {
+			node: common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources: []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate: common_info.BuildResourceList("500m", "500M"),
+			podOverhead: common_info.BuildResourceList("100m", "100M"),
+			expected: true,
+			expectedMessageContains: []string{},
+			expectOverheadMessage: false,
+		},
 	}
 
 	for testName, testData := range tests {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
 					return ni.IsTaskAllocatable(task), ni.FittingError(task, false)
 				},
 			)
@@ -580,7 +618,7 @@ func TestIsTaskAllocatableOnReleasingOrIdle(t *testing.T) {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
 					return ni.IsTaskAllocatableOnReleasingOrIdle(task), nil
 				},
 			)
@@ -611,6 +649,11 @@ func runAllocatableTest(
 		"podToAllocate", "p1", "n1", v1.PodRunning, testData.podResourcesToAllocate,
 		[]metav1.OwnerReference{}, make(map[string]string), testData.podAnnotations)
 	addJobAnnotation(pod)
+
+	if len(testData.podOverhead) > 0 {
+		pod.Spec.Overhead = testData.podOverhead
+	}
+
 	task := pod_info.NewTaskInfo(pod)
 	allocatable, fitErr := testedFunction(ni, task)
 	if allocatable != testData.expected {
@@ -622,6 +665,20 @@ func runAllocatableTest(
 				t.Errorf("%s: expected error message to contain %s, got %s", testName, expectedMessage, fitErr.Error())
 			}
 		}
+
+		if testData.expectOverheadMessage {
+			if !strings.Contains(fitErr.Error(), "Not enough resources due to pod overhead resources") {
+				t.Errorf("%s: expected overhead message, got %s", testName, fitErr.Error())
+			}
+		} else {
+			fitErrMessage := fitErr.Error()
+			if strings.Contains(fitErrMessage, "Not enough resources due to pod overhead resources") {
+				t.Errorf("%s: unexpected overhead message, got %s", testName, fitErrMessage)
+			}
+		}
+	} else if len(testData.expectedMessageContains) > 0 {
+		// If we expected an error but got none, that's a test failure
+		t.Errorf("%s: expected error but got none", testName)
 	}
 }

pkg/scheduler/api/pod_info/pod_info.go

Lines changed: 5 additions & 0 deletions
@@ -310,6 +310,11 @@ func getPodResourceRequest(pod *v1.Pod) *resource_info.ResourceRequirements {
 		}
 	}
 
+	if pod.Spec.Overhead != nil {
+		overheadReq := resource_info.RequirementsFromResourceList(pod.Spec.Overhead)
+		result.Add(&overheadReq.BaseResource)
+	}
+
 	return result
 }
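With this change the request the scheduler accounts for is the pod's aggregated container requests plus spec.overhead, which is what the test case added below expects (1000m + 2000m of container CPU plus 1000m of overhead gives 4000m). A small standalone sketch of that arithmetic using plain Kubernetes quantities rather than the scheduler's resource_info types (helper name is invented; init containers and limits are ignored for brevity):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// effectiveCPURequest sums the container CPU requests and then adds the pod
// overhead on top, mirroring the accounting after this change.
func effectiveCPURequest(pod *v1.Pod) *resource.Quantity {
	total := resource.NewMilliQuantity(0, resource.DecimalSI)
	for _, c := range pod.Spec.Containers {
		if cpu, ok := c.Resources.Requests[v1.ResourceCPU]; ok {
			total.Add(cpu)
		}
	}
	if cpu, ok := pod.Spec.Overhead[v1.ResourceCPU]; ok {
		total.Add(cpu)
	}
	return total
}

func main() {
	pod := &v1.Pod{Spec: v1.PodSpec{
		Containers: []v1.Container{
			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1000m")}}},
			{Resources: v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("2000m")}}},
		},
		Overhead: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1000m")},
	}}
	// Prints "4" cores, i.e. the 4000 millicores expected by the test case below.
	fmt.Println(effectiveCPURequest(pod))
}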

pkg/scheduler/api/pod_info/pod_info_test.go

Lines changed: 21 additions & 0 deletions
@@ -125,6 +125,27 @@ func TestGetPodResourceRequest(t *testing.T) {
 			},
 			expectedResource: resource_info.NewResourceRequirements(1, 3000, 5000000000),
 		},
+		{
+			name: "pod with overhead resources",
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: v1.ResourceRequirements{
+								Requests: common_info.BuildResourceListWithGPU("1000m", "1G", "1"),
+							},
+						},
+						{
+							Resources: v1.ResourceRequirements{
+								Requests: common_info.BuildResourceList("2000m", "1G"),
+							},
+						},
+					},
+					Overhead: common_info.BuildResourceList("1000m", "1G"),
+				},
+			},
+			expectedResource: resource_info.NewResourceRequirements(1, 4000, 3000000000),
+		},
 	}
 	for i, test := range tests {
 		req := getPodResourceRequest(test.pod)

pkg/scheduler/k8s_utils/k8s_utils.go

Lines changed: 9 additions & 0 deletions
@@ -51,3 +51,12 @@ func annotationAndLabelsPatchBytes(annotations, labels map[string]interface{}) (
 		"annotations": annotations,
 	}})
 }
+
+func StringResourceList(resources v1.ResourceList) string {
+	output := ""
+	for name, value := range resources {
+		output += fmt.Sprintf(" %s: %s", name, value.String())
+	}
+
+	return output
+}
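StringResourceList renders the list as a space-prefixed sequence of "name: value" pairs and is only used to enrich the FitError message above. A short usage sketch (values illustrative); note that Go map iteration order is randomized, so the order of the entries in the output is not stable:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/k8s_utils"
)

func main() {
	overhead := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("2"),
		v1.ResourceMemory: resource.MustParse("120Mi"),
	}
	// Prints something like " cpu: 2 memory: 120Mi" (entry order may vary).
	fmt.Println(k8s_utils.StringResourceList(overhead))
}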

test/e2e/suites/integrations/k8s_native/k8s_native_test.go

Lines changed: 57 additions & 11 deletions
@@ -13,6 +13,7 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	v1 "k8s.io/api/core/v1"
+	nodev1 "k8s.io/api/node/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
@@ -48,20 +49,65 @@ var _ = Describe("K8S Native object integrations", Ordered, func() {
 		testCtx.ClusterCleanup(ctx)
 	})
 
-	It("Pod", func(ctx context.Context) {
-		pod := rd.CreatePodObject(testCtx.Queues[0], v1.ResourceRequirements{})
+	Context("Pods", func() {
+		It("schedules simple pods", func(ctx context.Context) {
+			pod := rd.CreatePodObject(testCtx.Queues[0], v1.ResourceRequirements{})
 
-		_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
-		if err != nil {
-			Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
-		}
+			_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
+			if err != nil {
+				Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
+			}
 
-		defer func() {
-			err = testCtx.KubeClientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
-			Expect(err).To(Succeed())
-		}()
+			defer func() {
+				err = testCtx.KubeClientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+				Expect(err).To(Succeed())
+			}()
+
+			wait.ForPodScheduled(ctx, testCtx.ControllerClient, pod)
+		})
+
+		It("considers pod Overhead from runtimeclass", func(ctx context.Context) {
+			runtimeClassName := "my-runtime-class-" + utils.GenerateRandomK8sName(5)
+			limitedQueue := queue.CreateQueueObject("limited-"+utils.GenerateRandomK8sName(10), testCtx.Queues[1].Name)
+			limitedQueue.Spec.Resources.CPU.Limit = 1
+			testCtx.AddQueues(ctx, []*v2.Queue{limitedQueue})
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, limitedQueue)).To(Succeed())
+			}()
+
+			pod := rd.CreatePodObject(limitedQueue, v1.ResourceRequirements{})
+			pod.Spec.RuntimeClassName = &runtimeClassName
 
-		wait.ForPodScheduled(ctx, testCtx.ControllerClient, pod)
+			runtimeClass := &nodev1.RuntimeClass{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: runtimeClassName,
+				},
+				Handler: "runc",
+				Overhead: &nodev1.Overhead{
+					PodFixed: v1.ResourceList{
+						v1.ResourceCPU: resource.MustParse("2"),
+					},
+				},
+			}
+
+			Expect(testCtx.ControllerClient.Create(ctx, runtimeClass)).To(Succeed())
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, runtimeClass)).To(Succeed())
+			}()
+
+			_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
+			if err != nil {
+				Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
+			}
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, pod)).To(Succeed())
+			}()
+
+			wait.ForPodUnschedulable(ctx, testCtx.ControllerClient, pod)
+		})
 	})
 
 	It("ReplicaSet", func(ctx context.Context) {
