From 79a484fa6803d646523a73bc58dc7614a4b7f671 Mon Sep 17 00:00:00 2001
From: Hoa Ngo
Date: Mon, 25 Aug 2025 13:59:39 +0700
Subject: [PATCH 1/5] fix: GPU memory calculation using incorrect unit conversion (#417)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: GPU memory calculation using incorrect unit conversion

Fixes #416

The GPU memory validation was incorrectly converting MiB to MB and back,
causing the scheduler to use inflated memory values (48300 MB instead of
46068 MiB) when checking shared GPU allocations. This allowed pods to be
scheduled beyond the actual GPU memory capacity.

Changes:
- Use original MiB value from nvidia.com/gpu.memory node label directly
- Remove unnecessary MiB->MB->MiB conversion that caused precision loss
- Maintain the flooring logic for memory alignment but in MiB units

* Remove unused convertMibToMb function and explanatory comments

* Remove unused resource import after convertMibToMb cleanup

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude

* test: update GPU memory unit conversion test expectations

Fix test expectations after removing MiB to MB conversion.
Tests now correctly expect 4000 MiB instead of 4200 MB.

Fixes #416

* Format code with go fmt

---------

Co-authored-by: Ngô Quang Hòa
Co-authored-by: Claude
---
 pkg/scheduler/api/node_info/node_info.go      | 10 +---------
 pkg/scheduler/api/node_info/node_info_test.go | 11 ++---------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/pkg/scheduler/api/node_info/node_info.go b/pkg/scheduler/api/node_info/node_info.go
index 1a8162cc0..e64cc6fe9 100644
--- a/pkg/scheduler/api/node_info/node_info.go
+++ b/pkg/scheduler/api/node_info/node_info.go
@@ -12,7 +12,6 @@ import (
 	"go.uber.org/multierr"
 	"golang.org/x/exp/maps"
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
@@ -622,8 +621,7 @@ func getNodeGpuMemory(node *v1.Node) (int64, bool) {
 		gpuMemoryLabelValue = convertBytesToMib(gpuMemoryLabelValue)
 	}
 
-	gpuMemoryInMb := convertMibToMb(gpuMemoryLabelValue)
-	return gpuMemoryInMb - (gpuMemoryInMb % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
+	return gpuMemoryLabelValue - (gpuMemoryLabelValue % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
 }
 
 func checkGpuMemoryIsInMib(gpuMemoryValue int64) bool {
@@ -634,12 +632,6 @@ func convertBytesToMib(gpuMemoryValue int64) int64 {
 	return gpuMemoryValue / BitToMib
 }
 
-func convertMibToMb(countInMib int64) int64 {
-	resourceMemory := resource.NewQuantity(countInMib*1024*1024, resource.BinarySI)
-	mbResourceMemory := resource.NewQuantity(resourceMemory.Value(), resource.DecimalSI)
-	return mbResourceMemory.Value() / MbToBRatio
-}
-
 func (ni *NodeInfo) IsCPUOnlyNode() bool {
 	if ni.IsMIGEnabled() {
 		return false
diff --git a/pkg/scheduler/api/node_info/node_info_test.go b/pkg/scheduler/api/node_info/node_info_test.go
index 40cef7513..3e985e709 100644
--- a/pkg/scheduler/api/node_info/node_info_test.go
+++ b/pkg/scheduler/api/node_info/node_info_test.go
@@ -433,13 +433,6 @@ func TestAddRemovePods(t *testing.T) {
 	}
 }
 
-func TestConvertMibToMb(t *testing.T) {
-	mibSize := int64(100)
-	mbSize := convertMibToMb(mibSize)
-	mbSizeManualConversion := int64(float64(mibSize) * MibToMbScale)
-	assert.Equal(t, mbSize, mbSizeManualConversion)
-}
-
 type allocatableTestData struct {
 	node          *v1.Node
 	podsResources []v1.ResourceList
@@ -668,7 +661,7 @@ func TestGpuOperatorHasMemoryError_MibInput(t *testing.T) {
 	testNode.Labels[GpuMemoryLabel] = "4096"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
@@ -676,7 +669,7 @@ func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
 	testNode.Labels[GpuMemoryLabel] = "4295000001"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func addJobAnnotation(pod *v1.Pod) {
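To make the unit handling concrete, the sketch below mirrors what getNodeGpuMemory does after this patch: the nvidia.com/gpu.memory label value stays in MiB and is only floored to a multiple of 100. This is a standalone illustration, not the scheduler's own code, and the byte-detection threshold is an assumed stand-in for checkGpuMemoryIsInMib.

```go
package main

import "fmt"

const mib = 1024 * 1024

// floorGpuMemoryMib keeps the label value in MiB and floors it to a multiple
// of 100, matching the flooring comment in getNodeGpuMemory. The threshold
// used to detect a value given in bytes is an assumption for illustration only.
func floorGpuMemoryMib(labelValue int64) int64 {
	if labelValue > 10_000_000 { // assumed heuristic: no GPU exposes 10M MiB, so treat larger values as bytes
		labelValue /= mib
	}
	return labelValue - (labelValue % 100)
}

func main() {
	fmt.Println(floorGpuMemoryMib(46068))      // 46000 (label already in MiB)
	fmt.Println(floorGpuMemoryMib(4295000001)) // 4000 (label mistakenly in bytes, as in the test above)
}
```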
From f2cac2a794a2bc23bf0f1858a8aa0331abffd59c Mon Sep 17 00:00:00 2001
From: Billy Mao
Date: Mon, 25 Aug 2025 15:29:05 +0800
Subject: [PATCH 2/5] Fix: incorrect scheduling decision and calculation when using MIG (#422)

* fix mig scheduling issue

* clean up the code

---------

Co-authored-by: bmao
---
 pkg/scheduler/api/pod_info/pod_info.go        |  4 --
 .../cache/cluster_info/cluster_info_test.go   | 51 +++++++++++++++++++
 .../plugins/proportion/proportion.go          | 13 -----
 .../plugins/proportion/proportion_test.go     | 22 ++++++++
 4 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/pkg/scheduler/api/pod_info/pod_info.go b/pkg/scheduler/api/pod_info/pod_info.go
index c735d2d7e..2c74adebb 100644
--- a/pkg/scheduler/api/pod_info/pod_info.go
+++ b/pkg/scheduler/api/pod_info/pod_info.go
@@ -365,10 +365,6 @@ func getTaskStatus(pod *v1.Pod, bindRequest *bindrequest_info.BindRequestInfo) p
 }
 
 func (pi *PodInfo) updatePodAdditionalFields(bindRequest *bindrequest_info.BindRequestInfo) {
-	if len(pi.Job) == 0 {
-		return
-	}
-
 	if bindRequest != nil && len(bindRequest.BindRequest.Spec.SelectedGPUGroups) > 0 {
 		pi.GPUGroups = bindRequest.BindRequest.Spec.SelectedGPUGroups
 	} else {
diff --git a/pkg/scheduler/cache/cluster_info/cluster_info_test.go b/pkg/scheduler/cache/cluster_info/cluster_info_test.go
index c064bd270..1110292e8 100644
--- a/pkg/scheduler/cache/cluster_info/cluster_info_test.go
+++ b/pkg/scheduler/cache/cluster_info/cluster_info_test.go
@@ -169,6 +169,15 @@ func TestSnapshotNodes(t *testing.T) {
 			Phase: v1core.PodRunning,
 		},
 	}
+	exampleMIGPod := examplePod.DeepCopy()
+	exampleMIGPod.Name = "mig-pod"
+	exampleMIGPod.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
+	exampleMIGPodWithPG := examplePod.DeepCopy()
+	exampleMIGPodWithPG.Name = "mig-pod-with-pg"
+	exampleMIGPodWithPG.Annotations = map[string]string{
+		commonconstants.PodGroupAnnotationForPod: "pg-1",
+	}
+	exampleMIGPodWithPG.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
 	tests := map[string]struct {
 		objs        []runtime.Object
 		resultNodes []*node_info.NodeInfo
@@ -307,6 +316,48 @@ func TestSnapshotNodes(t *testing.T) {
 			resultPodsLen: 1,
 			nodePoolName:  "pool-a",
 		},
+		"MIG Job": {
+			objs: []runtime.Object{
+				&v1core.Node{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node-1",
+					},
+					Status: v1core.NodeStatus{
+						Allocatable: v1core.ResourceList{
+							"cpu":                   resource.MustParse("10"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("10"),
+						},
+					},
+				},
+				exampleMIGPod,
+				exampleMIGPodWithPG,
+			},
+			resultNodes: []*node_info.NodeInfo{
+				{
+					Name: "node-1",
+					Idle: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("6"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("6"),
+						},
+					),
+					Used: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("4"),
+							"memory":                resource.MustParse("0"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("4"),
+						},
+					),
+					Releasing: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":    resource.MustParse("0"),
+							"memory": resource.MustParse("0"),
+						},
+					),
+				},
+			},
+			resultPodsLen: 2,
+		},
 	}
 
 	for name, test := range tests {
diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go
index 4696e062d..d83d24625 100644
--- a/pkg/scheduler/plugins/proportion/proportion.go
+++ b/pkg/scheduler/plugins/proportion/proportion.go
@@ -9,7 +9,6 @@ import (
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
-	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -146,18 +145,6 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou
 		nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0))
 	} else {
 		nodeResource.Add(utils.QuantifyResource(node.Allocatable))
-		migEnabledGpus := 0
-		for resource, qty := range node.Node.Status.Allocatable {
-			if resource_info.IsMigResource(resource) {
-				gpu, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(string(resource))
-				if err != nil {
-					log.InfraLogger.Errorf("Failed to extract gpu and memory from mig resource %v: %v", resource, err)
-					continue
-				}
-				migEnabledGpus += int(qty.Value()) * gpu
-			}
-		}
-		nodeResource[rs.GpuResource] += float64(migEnabledGpus)
 	}
 
 	// Subtract resources of non-related pods
diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go
index d242695c4..2022c5d2b 100644
--- a/pkg/scheduler/plugins/proportion/proportion_test.go
+++ b/pkg/scheduler/plugins/proportion/proportion_test.go
@@ -564,6 +564,28 @@ var _ = Describe("Set Fair Share in Proportion", func() {
 				rs.GpuResource:    2,
 			},
 		},
+		{
+			name:           "mig gpu node",
+			isRestrictNode: true,
+			node: &node_info.NodeInfo{
+				Name: "n1",
+				Node: &v1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: map[string]string{
+							"node-role.kubernetes.io/gpu-worker": "true",
+						},
+					},
+				},
+				Allocatable: resource_info.ResourceFromResourceList(
+					common_info.BuildResourceListWithMig("8000m", "10G", "nvidia.com/mig-1g.5gb"),
+				),
+			},
+			want: rs.ResourceQuantities{
+				rs.CpuResource:    8000,
+				rs.MemoryResource: 10000000000,
+				rs.GpuResource:    1,
+			},
+		},
 		{
 			name:           "ignore extra resources",
 			isRestrictNode: true,
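The removed proportion-plugin loop parsed MIG resource names and added their GPU-slice counts on top of the node's already quantified allocatable, which inflated GPU capacity on MIG nodes. For context, a MIG resource name encodes the profile's compute-slice count and memory; the small parser below is only a rough illustration of that naming scheme, not the scheduler's ExtractGpuAndMemoryFromMigResourceName helper.

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// migProfile matches names such as "nvidia.com/mig-1g.5gb", where the first
// number is the profile's GPU compute-slice count and the second its memory in GB.
// Extended profile suffixes (e.g. "+me") are ignored here for brevity.
var migProfile = regexp.MustCompile(`^nvidia\.com/mig-(\d+)g\.(\d+)gb$`)

func parseMigProfile(name string) (slices, memGB int, err error) {
	m := migProfile.FindStringSubmatch(name)
	if m == nil {
		return 0, 0, fmt.Errorf("not a MIG resource: %s", name)
	}
	slices, _ = strconv.Atoi(m[1])
	memGB, _ = strconv.Atoi(m[2])
	return slices, memGB, nil
}

func main() {
	slices, memGB, err := parseMigProfile("nvidia.com/mig-3g.20gb")
	fmt.Println(slices, memGB, err) // 3 20 <nil>
}
```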
From b8189b2b8bd3dc0c4e79bf71c2011aa9a61b7155 Mon Sep 17 00:00:00 2001
From: Erez Freiberger
Date: Mon, 25 Aug 2025 09:35:04 +0200
Subject: [PATCH 3/5] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a455b73c..722dd03eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Fixed
 - kai-scheduler will not ignore pod spec.overhead field
+- Fixed wrong GPU memory unit conversion from node `nvidia.com/gpu.memory` labels
+- Fixed incorrect MIG GPU usage calculation leading to wrong scheduling decision
 
 ## [v0.4.12] - 2025-07-18
 
From 0d3b53969dd6b39885df42a398024ebd75cb5b29 Mon Sep 17 00:00:00 2001
From: Erez Freiberger
Date: Mon, 25 Aug 2025 14:30:44 +0200
Subject: [PATCH 4/5] fix proportion test

---
 pkg/scheduler/plugins/proportion/proportion_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go
index 2022c5d2b..32139e401 100644
--- a/pkg/scheduler/plugins/proportion/proportion_test.go
+++ b/pkg/scheduler/plugins/proportion/proportion_test.go
@@ -572,7 +572,7 @@ var _ = Describe("Set Fair Share in Proportion", func() {
 			Node: &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{
 					Labels: map[string]string{
-						"node-role.kubernetes.io/gpu-worker": "true",
+						node_info.GpuWorkerNode: "true",
 					},
 				},
 			},

From 47df9b7ec7431ff7bf953ac7c31f3757809af8f5 Mon Sep 17 00:00:00 2001
From: Erez Freiberger
Date: Tue, 26 Aug 2025 12:16:45 +0200
Subject: [PATCH 5/5] fix concurrency test in v0.4

---
 pkg/scheduler/cache/status_updater/concurrency_test.go | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pkg/scheduler/cache/status_updater/concurrency_test.go b/pkg/scheduler/cache/status_updater/concurrency_test.go
index 74968c23a..143d66ea8 100644
--- a/pkg/scheduler/cache/status_updater/concurrency_test.go
+++ b/pkg/scheduler/cache/status_updater/concurrency_test.go
@@ -156,6 +156,15 @@ var _ = Describe("Status Updater Concurrency - large scale: increase queue size"
 		podGroupsFromCluster = append(podGroupsFromCluster, podGroup.DeepCopy())
 	}
 
+	Eventually(func() int {
+		numberOfPodGroupInFlight := 0
+		statusUpdater.inFlightPodGroups.Range(func(key any, value any) bool {
+			numberOfPodGroupInFlight += 1
+			return true
+		})
+		return numberOfPodGroupInFlight
+	}, time.Second*20, time.Millisecond*50).Should(Equal(0))
+
 	statusUpdater.SyncPodGroupsWithPendingUpdates(podGroupsFromCluster)
 
 	// check that the pods groups are now not updated anymore
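The new Eventually block waits for the status updater to drain its in-flight pod groups before syncing. Assuming inFlightPodGroups is a sync.Map (which the Range call suggests), counting its entries requires ranging over it, since sync.Map has no Len method; a minimal sketch of that counting pattern:

```go
package main

import (
	"fmt"
	"sync"
)

// syncMapLen counts entries by ranging over the map; sync.Map deliberately has
// no Len method, so the test above tallies in-flight pod groups this way inside
// an Eventually poll before calling SyncPodGroupsWithPendingUpdates.
func syncMapLen(m *sync.Map) int {
	n := 0
	m.Range(func(_, _ any) bool {
		n++
		return true
	})
	return n
}

func main() {
	var m sync.Map
	m.Store("pg-1", struct{}{})
	m.Store("pg-2", struct{}{})
	fmt.Println(syncMapLen(&m)) // 2
}
```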