2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

### Fixed
- kai-scheduler will not ignore pod spec.overhead field
- Fixed wrong GPU memory unit conversion from node `nvidia.com/gpu.memory` labels
- Fixed incorrect MIG GPU usage calculation leading to wrong scheduling decisions

## [v0.4.12] - 2025-07-18

10 changes: 1 addition & 9 deletions pkg/scheduler/api/node_info/node_info.go
@@ -12,7 +12,6 @@ import (
"go.uber.org/multierr"
"golang.org/x/exp/maps"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"

commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
@@ -622,8 +621,7 @@ func getNodeGpuMemory(node *v1.Node) (int64, bool) {
gpuMemoryLabelValue = convertBytesToMib(gpuMemoryLabelValue)
}

gpuMemoryInMb := convertMibToMb(gpuMemoryLabelValue)
return gpuMemoryInMb - (gpuMemoryInMb % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
return gpuMemoryLabelValue - (gpuMemoryLabelValue % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
}

func checkGpuMemoryIsInMib(gpuMemoryValue int64) bool {
@@ -634,12 +632,6 @@ func convertBytesToMib(gpuMemoryValue int64) int64 {
return gpuMemoryValue / BitToMib
}

func convertMibToMb(countInMib int64) int64 {
resourceMemory := resource.NewQuantity(countInMib*1024*1024, resource.BinarySI)
mbResourceMemory := resource.NewQuantity(resourceMemory.Value(), resource.DecimalSI)
return mbResourceMemory.Value() / MbToBRatio
}

func (ni *NodeInfo) IsCPUOnlyNode() bool {
if ni.IsMIGEnabled() {
return false
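For reference, a minimal runnable sketch of the corrected flooring behavior, assuming the `nvidia.com/gpu.memory` label reports MiB (or, from some GPU operator versions, raw bytes). The helper name and the bytes-detection threshold below are illustrative assumptions, not the scheduler's actual code:

```go
package main

import "fmt"

// floorGpuMemoryMib sketches the fixed behavior: keep the label value in MiB
// (converting from bytes when the value is implausibly large for MiB) and floor
// it to the nearest 100 MiB so fractional-GPU jobs never round to the same bytes.
// The bytes-vs-MiB threshold is an assumed heuristic, not the repo's actual check.
func floorGpuMemoryMib(labelValue int64) int64 {
	const bytesPerMib = 1024 * 1024
	if labelValue > 10*1024*1024 { // assumed heuristic: treat very large values as bytes
		labelValue /= bytesPerMib
	}
	return labelValue - (labelValue % 100)
}

func main() {
	fmt.Println(floorGpuMemoryMib(4096))       // 4000 — MiB label, as in TestGpuOperatorHasMemoryError_MibInput
	fmt.Println(floorGpuMemoryMib(4295000001)) // 4000 — bytes label, as in TestGpuOperatorHasMemoryError_Bytes
}
```

The removed `convertMibToMb` step is what previously produced the 4200 values now dropped from the tests below.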
11 changes: 2 additions & 9 deletions pkg/scheduler/api/node_info/node_info_test.go
@@ -433,13 +433,6 @@ func TestAddRemovePods(t *testing.T) {
}
}

func TestConvertMibToMb(t *testing.T) {
mibSize := int64(100)
mbSize := convertMibToMb(mibSize)
mbSizeManualConversion := int64(float64(mibSize) * MibToMbScale)
assert.Equal(t, mbSize, mbSizeManualConversion)
}

type allocatableTestData struct {
node *v1.Node
podsResources []v1.ResourceList
@@ -668,15 +661,15 @@ func TestGpuOperatorHasMemoryError_MibInput(t *testing.T) {
testNode.Labels[GpuMemoryLabel] = "4096"
gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
assert.Equal(t, true, ok)
assert.Equal(t, int64(4200), gpuMemoryInMb)
assert.Equal(t, int64(4000), gpuMemoryInMb)
}

func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
testNode := common_info.BuildNode("n1", common_info.BuildResourceList("8000m", "10G"))
testNode.Labels[GpuMemoryLabel] = "4295000001"
gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
assert.Equal(t, true, ok)
assert.Equal(t, int64(4200), gpuMemoryInMb)
assert.Equal(t, int64(4000), gpuMemoryInMb)
}

func addJobAnnotation(pod *v1.Pod) {
4 changes: 0 additions & 4 deletions pkg/scheduler/api/pod_info/pod_info.go
@@ -365,10 +365,6 @@ func getTaskStatus(pod *v1.Pod, bindRequest *bindrequest_info.BindRequestInfo) p
}

func (pi *PodInfo) updatePodAdditionalFields(bindRequest *bindrequest_info.BindRequestInfo) {
if len(pi.Job) == 0 {
return
}

if bindRequest != nil && len(bindRequest.BindRequest.Spec.SelectedGPUGroups) > 0 {
pi.GPUGroups = bindRequest.BindRequest.Spec.SelectedGPUGroups
} else {
51 changes: 51 additions & 0 deletions pkg/scheduler/cache/cluster_info/cluster_info_test.go
@@ -169,6 +169,15 @@ func TestSnapshotNodes(t *testing.T) {
Phase: v1core.PodRunning,
},
}
exampleMIGPod := examplePod.DeepCopy()
exampleMIGPod.Name = "mig-pod"
exampleMIGPod.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
exampleMIGPodWithPG := examplePod.DeepCopy()
exampleMIGPodWithPG.Name = "mig-pod-with-pg"
exampleMIGPodWithPG.Annotations = map[string]string{
commonconstants.PodGroupAnnotationForPod: "pg-1",
}
exampleMIGPodWithPG.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
tests := map[string]struct {
objs []runtime.Object
resultNodes []*node_info.NodeInfo
@@ -307,6 +316,48 @@ func TestSnapshotNodes(t *testing.T) {
resultPodsLen: 1,
nodePoolName: "pool-a",
},
"MIG Job": {
objs: []runtime.Object{
&v1core.Node{
ObjectMeta: v1.ObjectMeta{
Name: "node-1",
},
Status: v1core.NodeStatus{
Allocatable: v1core.ResourceList{
"cpu": resource.MustParse("10"),
"nvidia.com/mig-1g.5gb": resource.MustParse("10"),
},
},
},
exampleMIGPod,
exampleMIGPodWithPG,
},
resultNodes: []*node_info.NodeInfo{
{
Name: "node-1",
Idle: resource_info.ResourceFromResourceList(
v1core.ResourceList{
"cpu": resource.MustParse("6"),
"nvidia.com/mig-1g.5gb": resource.MustParse("6"),
},
),
Used: resource_info.ResourceFromResourceList(
v1core.ResourceList{
"cpu": resource.MustParse("4"),
"memory": resource.MustParse("0"),
"nvidia.com/mig-1g.5gb": resource.MustParse("4"),
},
),
Releasing: resource_info.ResourceFromResourceList(
v1core.ResourceList{
"cpu": resource.MustParse("0"),
"memory": resource.MustParse("0"),
},
),
},
},
resultPodsLen: 2,
},
}

for name, test := range tests {
9 changes: 9 additions & 0 deletions pkg/scheduler/cache/status_updater/concurrency_test.go
@@ -156,6 +156,15 @@ var _ = Describe("Status Updater Concurrency - large scale: increase queue size"
podGroupsFromCluster = append(podGroupsFromCluster, podGroup.DeepCopy())
}

Eventually(func() int {
numberOfPodGroupInFlight := 0
statusUpdater.inFlightPodGroups.Range(func(key any, value any) bool {
numberOfPodGroupInFlight += 1
return true
})
return numberOfPodGroupInFlight
}, time.Second*20, time.Millisecond*50).Should(Equal(0))

statusUpdater.SyncPodGroupsWithPendingUpdates(podGroupsFromCluster)

// check that the pods groups are now not updated anymore
13 changes: 0 additions & 13 deletions pkg/scheduler/plugins/proportion/proportion.go
@@ -9,7 +9,6 @@ import (

commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status"
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -146,18 +145,6 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou
nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0))
} else {
nodeResource.Add(utils.QuantifyResource(node.Allocatable))
migEnabledGpus := 0
for resource, qty := range node.Node.Status.Allocatable {
if resource_info.IsMigResource(resource) {
gpu, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(string(resource))
if err != nil {
log.InfraLogger.Errorf("Failed to extract gpu and memory from mig resource %v: %v", resource, err)
continue
}
migEnabledGpus += int(qty.Value()) * gpu
}
}
nodeResource[rs.GpuResource] += float64(migEnabledGpus)
}

// Subtract resources of non-related pods
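The block removed above derived an extra GPU count from MIG resource names and added it on top of the node's quantified allocatable; after the fix, only `utils.QuantifyResource(node.Allocatable)` contributes, which is what the new `mig gpu node` test below expects (`GpuResource: 1`). For readers unfamiliar with the MIG naming scheme, here is a minimal sketch of the kind of parsing `ExtractGpuAndMemoryFromMigResourceName` performs, assuming the standard `nvidia.com/mig-<slices>g.<memory>gb` format; the regex and function names are illustrative, not the repo's actual helper:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// migName matches MIG resource names such as "nvidia.com/mig-1g.5gb".
// The pattern assumes the standard <slices>g.<memory>gb naming convention.
var migName = regexp.MustCompile(`^nvidia\.com/mig-(\d+)g\.(\d+)gb$`)

// parseMigResource returns the compute-slice count and memory (in GB) encoded
// in a MIG resource name, or an error when the name is not a MIG resource.
func parseMigResource(name string) (slices, memGB int, err error) {
	m := migName.FindStringSubmatch(name)
	if m == nil {
		return 0, 0, fmt.Errorf("not a MIG resource: %s", name)
	}
	slices, _ = strconv.Atoi(m[1])
	memGB, _ = strconv.Atoi(m[2])
	return slices, memGB, nil
}

func main() {
	slices, memGB, err := parseMigResource("nvidia.com/mig-1g.5gb")
	fmt.Println(slices, memGB, err) // 1 5 <nil>
}
```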
22 changes: 22 additions & 0 deletions pkg/scheduler/plugins/proportion/proportion_test.go
@@ -564,6 +564,28 @@ var _ = Describe("Set Fair Share in Proportion", func() {
rs.GpuResource: 2,
},
},
{
name: "mig gpu node",
isRestrictNode: true,
node: &node_info.NodeInfo{
Name: "n1",
Node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
node_info.GpuWorkerNode: "true",
},
},
},
Allocatable: resource_info.ResourceFromResourceList(
common_info.BuildResourceListWithMig("8000m", "10G", "nvidia.com/mig-1g.5gb"),
),
},
want: rs.ResourceQuantities{
rs.CpuResource: 8000,
rs.MemoryResource: 10000000000,
rs.GpuResource: 1,
},
},
{
name: "ignore extra resources",
isRestrictNode: true,