Commit 1ef11b7

v0.4 - add runtime class support (#399)
* add pod overhead resources to total requirements
* add e2e test with runtime class
* update changelog
* add failure message due to overhead resources
* update message due to pod overhead
1 parent be5afe2 commit 1ef11b7
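For context, a sketch of the Kubernetes mechanism this commit accounts for: when a pod references a RuntimeClass that declares `overhead`, the RuntimeClass admission controller copies `overhead.podFixed` into `pod.Spec.Overhead`, and a scheduler must count that amount on top of the container requests. The class name and values below are hypothetical, illustrative only:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	nodev1 "k8s.io/api/node/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical RuntimeClass: for every pod that sets
	// spec.runtimeClassName to "kata", admission copies overhead.podFixed
	// into pod.Spec.Overhead, which the scheduler must then add to the
	// pod's container requests.
	rc := &nodev1.RuntimeClass{
		ObjectMeta: metav1.ObjectMeta{Name: "kata"},
		Handler:    "kata",
		Overhead: &nodev1.Overhead{
			PodFixed: v1.ResourceList{
				v1.ResourceCPU:    resource.MustParse("250m"),
				v1.ResourceMemory: resource.MustParse("120Mi"),
			},
		},
	}
	fmt.Printf("pods using %q carry +%s CPU, +%s memory overhead\n",
		rc.Name, rc.Overhead.PodFixed.Cpu().String(), rc.Overhead.PodFixed.Memory().String())
}
```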

File tree: 10 files changed, +209 −19 lines


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## [Unreleased]
+
+### Fixed
+- kai-scheduler will not ignore pod spec.overhead field
+
 ## [v0.4.12] - 2025-07-18
 
 ### Fixed
```

pkg/scheduler/api/common_info/errors.go

Lines changed: 11 additions & 1 deletion
```diff
@@ -19,6 +19,7 @@ const (
 	ResourcesWereNotFoundMsg = "no nodes with enough resources were found"
 	DefaultPodgroupError = "Unable to schedule podgroup"
 	DefaultPodError = "Unable to schedule pod"
+	OverheadMessage = "Not enough resources due to pod overhead resources"
 )
 
 type FitError struct {
@@ -60,7 +61,7 @@ func NewFitErrorByReasons(name, namespace, nodeName string, err error, reasons .
 func NewFitErrorInsufficientResource(
 	name, namespace, nodeName string,
 	resourceRequested *resource_info.ResourceRequirements, usedResource, capacityResource *resource_info.Resource,
-	capacityGpuMemory int64, gangSchedulingJob bool,
+	capacityGpuMemory int64, gangSchedulingJob bool, messageSuffix string,
 ) *FitError {
 	availableResource := capacityResource.Clone()
 	availableResource.Sub(usedResource)
@@ -146,6 +147,15 @@ func NewFitErrorInsufficientResource(
 		}
 	}
 
+	if len(messageSuffix) > 0 {
+		for i, msg := range shortMessages {
+			shortMessages[i] = fmt.Sprintf("%s. %s", msg, messageSuffix)
+		}
+		for i, msg := range detailedMessages {
+			detailedMessages[i] = fmt.Sprintf("%s. %s", msg, messageSuffix)
+		}
+	}
+
 	return NewFitErrorWithDetailedMessage(name, namespace, nodeName, shortMessages, detailedMessages...)
 }
```
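Combined with the node_info.go change below, the appended suffix makes a fit reason read roughly as follows. A minimal sketch with made-up values, not output captured from a real run:

```go
package main

import "fmt"

func main() {
	// Illustrative composition of the final reason string; the short message
	// and OverheadMessage text are from the diffs above, "cpu: 250m" is made up.
	short := "node(s) didn't have enough resources: CPU cores"
	suffix := fmt.Sprintf("%s. The overhead resources are %s",
		"Not enough resources due to pod overhead resources", " cpu: 250m")
	fmt.Printf("%s. %s\n", short, suffix)
	// node(s) didn't have enough resources: CPU cores. Not enough resources
	// due to pod overhead resources. The overhead resources are  cpu: 250m
}
```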

pkg/scheduler/api/common_info/errors_test.go

Lines changed: 23 additions & 1 deletion
```diff
@@ -51,6 +51,7 @@ func TestNewFitErrorInsufficientResource(t *testing.T) {
 		capacityResource *resource_info.Resource
 		capacityGpuMemory int64
 		gangSchedulingJob bool
+		suffix string
 	}
 	tests := []struct {
 		name string
@@ -163,12 +164,33 @@ func TestNewFitErrorInsufficientResource(t *testing.T) {
 				DetailedReasons: []string{"Node didn't have enough resources: Each gpu on the node has a gpu memory capacity of 1000 Mib. 2000 Mib of gpu memory has been requested."},
 			},
 		},
+		{
+			name: "Not enough cpu due to pod overhead",
+			args: args{
+				name:              "t1",
+				namespace:         "n1",
+				nodeName:          "node1",
+				resourceRequested: resource_info.NewResourceRequirements(0, 1500, 1000),
+				usedResource:      BuildResource("500m", "1M"),
+				capacityResource:  BuildResource("1000m", "2M"),
+				capacityGpuMemory: 0,
+				gangSchedulingJob: false,
+				suffix:            "Message suffix",
+			},
+			want: &FitError{
+				taskName:        "t1",
+				taskNamespace:   "n1",
+				NodeName:        "node1",
+				Reasons:         []string{"node(s) didn't have enough resources: CPU cores. Message suffix"},
+				DetailedReasons: []string{"Node didn't have enough resources: CPU cores, requested: 1.5, used: 0.5, capacity: 1. Message suffix"},
+			},
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			if got := NewFitErrorInsufficientResource(tt.args.name, tt.args.namespace, tt.args.nodeName,
 				tt.args.resourceRequested, tt.args.usedResource, tt.args.capacityResource, tt.args.capacityGpuMemory,
-				tt.args.gangSchedulingJob); !reflect.DeepEqual(got, tt.want) {
+				tt.args.gangSchedulingJob, tt.args.suffix); !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("NewFitErrorInsufficientResource() = %v, want %v", got, tt.want)
 			}
 		})
```

pkg/scheduler/api/node_info/mig_node_info_test.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -58,7 +58,7 @@ func TestIsTaskAllocatable_Mig(t *testing.T) {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
 					allocatable, err := ni.IsTaskAllocatable(task), ni.FittingError(task, false)
 					return allocatable, err
 				},
```

pkg/scheduler/api/node_info/node_info.go

Lines changed: 17 additions & 2 deletions
```diff
@@ -24,6 +24,7 @@ import (
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info"
 	sc_info "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/storagecapacity_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/storageclaim_info"
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/k8s_utils"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log"
 )
 
@@ -243,9 +244,23 @@ func (ni *NodeInfo) FittingError(task *pod_info.PodInfo, isGangTask bool) *commo
 			task.ResReq.GetNumOfGpuDevices(), ni.getResourceGpuPortion(task.ResReq), requestedResources.GpuMemory())
 		}
 
-		return common_info.NewFitErrorInsufficientResource(
+		messageSuffix := ""
+		if len(task.Pod.Spec.Overhead) > 0 {
+			// Adding to node idle instead of subtracting from pod requested resources
+			idleResourcesWithOverhead := ni.Idle.Clone()
+			idleResourcesWithOverhead.Add(resource_info.ResourceFromResourceList(task.Pod.Spec.Overhead))
+			enoughResourcesWithoutOverhead := ni.lessEqualTaskToNodeResources(task.ResReq, idleResourcesWithOverhead)
+			if enoughResourcesWithoutOverhead {
+				messageSuffix = fmt.Sprintf("%s. The overhead resources are %v", common_info.OverheadMessage,
+					k8s_utils.StringResourceList(task.Pod.Spec.Overhead))
+			}
+		}
+
+		fitError := common_info.NewFitErrorInsufficientResource(
 			task.Name, task.Namespace, ni.Name, task.ResReq, totalUsed, totalCapability, ni.MemoryOfEveryGpuOnNode,
-			isGangTask)
+			isGangTask, messageSuffix)
+
+		return fitError
 	}
 
 	allocatable, err := ni.isTaskStorageAllocatable(task)
```
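A note on the in-code comment above: because `task.ResReq` already includes the overhead (see the pod_info.go change below), the question "would the pod have fit without its overhead?" is answered by adding the overhead to a clone of the node's idle resources instead of subtracting it from the cached request. A minimal sketch of the equivalence with plain millicore integers, illustrative numbers only, not the scheduler's types:

```go
package main

import "fmt"

func main() {
	// Illustrative millicore values: the request already includes overhead.
	const (
		reqWithOverhead = 750 // 500m containers + 250m overhead
		overhead        = 250
		idle            = 600
	)

	fitsWithOverhead := reqWithOverhead <= idle             // false
	fitsWithoutOverhead := reqWithOverhead <= idle+overhead // same as req-overhead <= idle: true

	if !fitsWithOverhead && fitsWithoutOverhead {
		fmt.Println("pod fails only because of overhead -> attach the overhead suffix")
	}
}
```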

pkg/scheduler/api/node_info/node_info_test.go

Lines changed: 60 additions & 3 deletions
```diff
@@ -445,10 +445,12 @@ type allocatableTestData struct {
 	podsResources []v1.ResourceList
 	podResourcesToAllocate v1.ResourceList
 	podAnnotations map[string]string
+	podOverhead v1.ResourceList
 	expected bool
 	expectedMessageContains []string
+	expectOverheadMessage bool
 }
-type allocatableTestFunction func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error)
+type allocatableTestFunction func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError)
 
 func TestIsTaskAllocatable(t *testing.T) {
 	nodeCapacityDifferent := common_info.BuildNode("n2", common_info.BuildResourceList("1000m", "1G"))
@@ -519,13 +521,49 @@ func TestIsTaskAllocatable(t *testing.T) {
 			expected: true,
 			expectedMessageContains: []string{},
 		},
+		"pod with overhead that fits without overhead but not with overhead": {
+			node:                    common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources:           []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate:  common_info.BuildResourceList("500m", "500M"),
+			podOverhead:             common_info.BuildResourceList("600m", "600M"),
+			expected:                false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage:   true,
+		},
+		"pod with overhead that doesn't fit even without overhead": {
+			node:                    common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources:           []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate:  common_info.BuildResourceList("1500m", "1500M"),
+			podOverhead:             common_info.BuildResourceList("100m", "100M"),
+			expected:                false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage:   false,
+		},
+		"pod without overhead that doesn't fit": {
+			node:                    common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources:           []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate:  common_info.BuildResourceList("1500m", "1500M"),
+			podOverhead:             v1.ResourceList{},
+			expected:                false,
+			expectedMessageContains: []string{"CPU cores", "memory"},
+			expectOverheadMessage:   false,
+		},
+		"pod with overhead that fits with overhead": {
+			node:                    common_info.BuildNode("n1", common_info.BuildResourceList("2000m", "2G")),
+			podsResources:           []v1.ResourceList{common_info.BuildResourceList("1000m", "1G")},
+			podResourcesToAllocate:  common_info.BuildResourceList("500m", "500M"),
+			podOverhead:             common_info.BuildResourceList("100m", "100M"),
+			expected:                true,
+			expectedMessageContains: []string{},
+			expectOverheadMessage:   false,
+		},
 	}
 
 	for testName, testData := range tests {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
 					return ni.IsTaskAllocatable(task), ni.FittingError(task, false)
 				},
 			)
@@ -561,7 +599,7 @@ func TestIsTaskAllocatableOnReleasingOrIdle(t *testing.T) {
 		t.Run(testName, func(t *testing.T) {
 			runAllocatableTest(
 				t, testData, testName,
-				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, error) {
+				func(ni *NodeInfo, task *pod_info.PodInfo) (bool, *common_info.FitError) {
 					return ni.IsTaskAllocatableOnReleasingOrIdle(task), nil
 				},
 			)
@@ -592,6 +630,11 @@ func runAllocatableTest(
 		"podToAllocate", "p1", "n1", v1.PodRunning, testData.podResourcesToAllocate,
 		[]metav1.OwnerReference{}, make(map[string]string), testData.podAnnotations)
 	addJobAnnotation(pod)
+
+	if len(testData.podOverhead) > 0 {
+		pod.Spec.Overhead = testData.podOverhead
+	}
+
 	task := pod_info.NewTaskInfo(pod)
 	allocatable, fitErr := testedFunction(ni, task)
@@ -603,6 +646,20 @@ func runAllocatableTest(
 				t.Errorf("%s: expected error message to contain %s, got %s", testName, expectedMessage, fitErr.Error())
 			}
 		}
+
+		if testData.expectOverheadMessage {
+			if !strings.Contains(fitErr.Error(), "Not enough resources due to pod overhead resources") {
+				t.Errorf("%s: expected overhead message, got %s", testName, fitErr.Error())
+			}
+		} else {
+			fitErrMessage := fitErr.Error()
+			if strings.Contains(fitErrMessage, "Not enough resources due to pod overhead resources") {
+				t.Errorf("%s: unexpected overhead message, got %s", testName, fitErrMessage)
+			}
+		}
+	} else if len(testData.expectedMessageContains) > 0 {
+		// If we expected an error but got none, that's a test failure
+		t.Errorf("%s: expected error but got none", testName)
 	}
 }
```

pkg/scheduler/api/pod_info/pod_info.go

Lines changed: 5 additions & 0 deletions
```diff
@@ -307,6 +307,11 @@ func getPodResourceRequest(pod *v1.Pod) *resource_info.ResourceRequirements {
 		}
 	}
 
+	if pod.Spec.Overhead != nil {
+		overheadReq := resource_info.RequirementsFromResourceList(pod.Spec.Overhead)
+		result.Add(&overheadReq.BaseResource)
+	}
+
 	return result
 }
```
pkg/scheduler/api/pod_info/pod_info_test.go

Lines changed: 21 additions & 0 deletions
```diff
@@ -109,6 +109,27 @@ func TestGetPodResourceRequest(t *testing.T) {
 			},
 			expectedResource: resource_info.NewResourceRequirements(1, 3000, 5000000000),
 		},
+		{
+			name: "pod with overhead resources",
+			pod: &v1.Pod{
+				Spec: v1.PodSpec{
+					Containers: []v1.Container{
+						{
+							Resources: v1.ResourceRequirements{
+								Requests: common_info.BuildResourceListWithGPU("1000m", "1G", "1"),
+							},
+						},
+						{
+							Resources: v1.ResourceRequirements{
+								Requests: common_info.BuildResourceList("2000m", "1G"),
+							},
+						},
+					},
+					Overhead: common_info.BuildResourceList("1000m", "1G"),
+				},
+			},
+			expectedResource: resource_info.NewResourceRequirements(1, 4000, 3000000000),
+		},
 	}
 	for i, test := range tests {
 		req := getPodResourceRequest(test.pod)
```

pkg/scheduler/k8s_utils/k8s_utils.go

Lines changed: 9 additions & 0 deletions
```diff
@@ -51,3 +51,12 @@ func annotationAndLabelsPatchBytes(annotations, labels map[string]interface{}) (
 		"annotations": annotations,
 	}})
 }
+
+func StringResourceList(resources v1.ResourceList) string {
+	output := ""
+	for name, value := range resources {
+		output += fmt.Sprintf(" %s: %s", name, value.String())
+	}
+
+	return output
+}
```
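A usage sketch of the new helper. It emits a leading space before each `name: value` pair, which is why the composed scheduler message ends up with a double space, and Go map iteration order makes multi-entry output order nondeterministic:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// Local copy of the helper so this sketch is self-contained.
func StringResourceList(resources v1.ResourceList) string {
	output := ""
	for name, value := range resources {
		output += fmt.Sprintf(" %s: %s", name, value.String())
	}
	return output
}

func main() {
	overhead := v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m")}
	// Mirrors the format string used in node_info.go; note the double space
	// produced by "are %v" plus the helper's leading space.
	fmt.Println(fmt.Sprintf("The overhead resources are %v", StringResourceList(overhead)))
	// The overhead resources are  cpu: 250m
}
```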

test/e2e/suites/integrations/k8s_native/k8s_native_test.go

Lines changed: 57 additions & 11 deletions
```diff
@@ -13,6 +13,7 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	v1 "k8s.io/api/core/v1"
+	nodev1 "k8s.io/api/node/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
@@ -48,20 +49,65 @@ var _ = Describe("K8S Native object integrations", Ordered, func() {
 		testCtx.ClusterCleanup(ctx)
 	})
 
-	It("Pod", func(ctx context.Context) {
-		pod := rd.CreatePodObject(testCtx.Queues[0], v1.ResourceRequirements{})
+	Context("Pods", func() {
+		It("schedules simple pods", func(ctx context.Context) {
+			pod := rd.CreatePodObject(testCtx.Queues[0], v1.ResourceRequirements{})
 
-		_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
-		if err != nil {
-			Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
-		}
+			_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
+			if err != nil {
+				Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
+			}
 
-		defer func() {
-			err = testCtx.KubeClientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
-			Expect(err).To(Succeed())
-		}()
+			defer func() {
+				err = testCtx.KubeClientset.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+				Expect(err).To(Succeed())
+			}()
+
+			wait.ForPodScheduled(ctx, testCtx.ControllerClient, pod)
+		})
+
+		It("considers pod Overhead from runtimeclass", func(ctx context.Context) {
+			runtimeClassName := "my-runtime-class-" + utils.GenerateRandomK8sName(5)
+			limitedQueue := queue.CreateQueueObject("limited-"+utils.GenerateRandomK8sName(10), testCtx.Queues[1].Name)
+			limitedQueue.Spec.Resources.CPU.Limit = 1
+			testCtx.AddQueues(ctx, []*v2.Queue{limitedQueue})
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, limitedQueue)).To(Succeed())
+			}()
+
+			pod := rd.CreatePodObject(limitedQueue, v1.ResourceRequirements{})
+			pod.Spec.RuntimeClassName = &runtimeClassName
 
-		wait.ForPodScheduled(ctx, testCtx.ControllerClient, pod)
+			runtimeClass := &nodev1.RuntimeClass{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: runtimeClassName,
+				},
+				Handler: "runc",
+				Overhead: &nodev1.Overhead{
+					PodFixed: v1.ResourceList{
+						v1.ResourceCPU: resource.MustParse("2"),
+					},
+				},
+			}
+
+			Expect(testCtx.ControllerClient.Create(ctx, runtimeClass)).To(Succeed())
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, runtimeClass)).To(Succeed())
+			}()
+
+			_, err := rd.CreatePod(ctx, testCtx.KubeClientset, pod)
+			if err != nil {
+				Expect(err).NotTo(HaveOccurred(), "Failed to create pod-job")
+			}
+
+			defer func() {
+				Expect(testCtx.ControllerClient.Delete(ctx, pod)).To(Succeed())
+			}()
+
+			wait.ForPodUnschedulable(ctx, testCtx.ControllerClient, pod)
+		})
 	})
 
 	It("ReplicaSet", func(ctx context.Context) {
```
