Skip to content

Commit dacb465

Browse files
enoodleeveningcafeNgô Quang Hòaclaudehello2mao
authored
v0.4 - cherry pick mig and memory fixes (#432)
* fix: GPU memory calculation using incorrect unit conversion (#417) * fix: GPU memory calculation using incorrect unit conversion Fixes #416 The GPU memory validation was incorrectly converting MiB to MB and back, causing the scheduler to use inflated memory values (48300 MB instead of 46068 MiB) when checking shared GPU allocations. This allowed pods to be scheduled beyond the actual GPU memory capacity. Changes: - Use original MiB value from nvidia.com/gpu.memory node label directly - Remove unnecessary MiB->MB->MiB conversion that caused precision loss - Maintain the flooring logic for memory alignment but in MiB units * Remove unused convertMibToMb function and explanatory comments * Remove unused resource import after convertMibToMb cleanup 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <[email protected]> * test: update GPU memory unit conversion test expectations Fix test expectations after removing MiB to MB conversion. Tests now correctly expect 4000 MiB instead of 4200 MB. Fixes #416 * Format code with go fmt --------- Co-authored-by: Ngô Quang Hòa <[email protected]> Co-authored-by: Claude <[email protected]> * Fix: incorrect scheduling decision and calculation when using MIG (#422) * fix mig scheduling issue * clean up the code --------- Co-authored-by: bmao <[email protected]> * update changelog * fix proportion test * fix concurrency test in v0.4 --------- Co-authored-by: Hoa Ngo <[email protected]> Co-authored-by: Ngô Quang Hòa <[email protected]> Co-authored-by: Claude <[email protected]> Co-authored-by: Billy Mao <[email protected]> Co-authored-by: bmao <[email protected]>
1 parent d9cdc9d commit dacb465

File tree

8 files changed

+87
-35
lines changed

8 files changed

+87
-35
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
88

99
### Fixed
1010
- kai-scheduler will not ignore pod spec.overhead field
11+
- Fixed wrong GPU memory unit conversion from node `nvidia.com/gpu.memory` labels
12+
- Fixed incorrect MIG GPU usage calculation leading to wrong scheduling decision
1113

1214
## [v0.4.12] - 2025-07-18
1315

pkg/scheduler/api/node_info/node_info.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ import (
1212
"go.uber.org/multierr"
1313
"golang.org/x/exp/maps"
1414
v1 "k8s.io/api/core/v1"
15-
"k8s.io/apimachinery/pkg/api/resource"
1615

1716
commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
1817
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
@@ -622,8 +621,7 @@ func getNodeGpuMemory(node *v1.Node) (int64, bool) {
622621
gpuMemoryLabelValue = convertBytesToMib(gpuMemoryLabelValue)
623622
}
624623

625-
gpuMemoryInMb := convertMibToMb(gpuMemoryLabelValue)
626-
return gpuMemoryInMb - (gpuMemoryInMb % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
624+
return gpuMemoryLabelValue - (gpuMemoryLabelValue % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
627625
}
628626

629627
func checkGpuMemoryIsInMib(gpuMemoryValue int64) bool {
@@ -634,12 +632,6 @@ func convertBytesToMib(gpuMemoryValue int64) int64 {
634632
return gpuMemoryValue / BitToMib
635633
}
636634

637-
func convertMibToMb(countInMib int64) int64 {
638-
resourceMemory := resource.NewQuantity(countInMib*1024*1024, resource.BinarySI)
639-
mbResourceMemory := resource.NewQuantity(resourceMemory.Value(), resource.DecimalSI)
640-
return mbResourceMemory.Value() / MbToBRatio
641-
}
642-
643635
func (ni *NodeInfo) IsCPUOnlyNode() bool {
644636
if ni.IsMIGEnabled() {
645637
return false

pkg/scheduler/api/node_info/node_info_test.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -433,13 +433,6 @@ func TestAddRemovePods(t *testing.T) {
433433
}
434434
}
435435

436-
func TestConvertMibToMb(t *testing.T) {
437-
mibSize := int64(100)
438-
mbSize := convertMibToMb(mibSize)
439-
mbSizeManualConversion := int64(float64(mibSize) * MibToMbScale)
440-
assert.Equal(t, mbSize, mbSizeManualConversion)
441-
}
442-
443436
type allocatableTestData struct {
444437
node *v1.Node
445438
podsResources []v1.ResourceList
@@ -668,15 +661,15 @@ func TestGpuOperatorHasMemoryError_MibInput(t *testing.T) {
668661
testNode.Labels[GpuMemoryLabel] = "4096"
669662
gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
670663
assert.Equal(t, true, ok)
671-
assert.Equal(t, int64(4200), gpuMemoryInMb)
664+
assert.Equal(t, int64(4000), gpuMemoryInMb)
672665
}
673666

674667
func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
675668
testNode := common_info.BuildNode("n1", common_info.BuildResourceList("8000m", "10G"))
676669
testNode.Labels[GpuMemoryLabel] = "4295000001"
677670
gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
678671
assert.Equal(t, true, ok)
679-
assert.Equal(t, int64(4200), gpuMemoryInMb)
672+
assert.Equal(t, int64(4000), gpuMemoryInMb)
680673
}
681674

682675
func addJobAnnotation(pod *v1.Pod) {

pkg/scheduler/api/pod_info/pod_info.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,6 @@ func getTaskStatus(pod *v1.Pod, bindRequest *bindrequest_info.BindRequestInfo) p
365365
}
366366

367367
func (pi *PodInfo) updatePodAdditionalFields(bindRequest *bindrequest_info.BindRequestInfo) {
368-
if len(pi.Job) == 0 {
369-
return
370-
}
371-
372368
if bindRequest != nil && len(bindRequest.BindRequest.Spec.SelectedGPUGroups) > 0 {
373369
pi.GPUGroups = bindRequest.BindRequest.Spec.SelectedGPUGroups
374370
} else {

pkg/scheduler/cache/cluster_info/cluster_info_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,15 @@ func TestSnapshotNodes(t *testing.T) {
169169
Phase: v1core.PodRunning,
170170
},
171171
}
172+
exampleMIGPod := examplePod.DeepCopy()
173+
exampleMIGPod.Name = "mig-pod"
174+
exampleMIGPod.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
175+
exampleMIGPodWithPG := examplePod.DeepCopy()
176+
exampleMIGPodWithPG.Name = "mig-pod-with-pg"
177+
exampleMIGPodWithPG.Annotations = map[string]string{
178+
commonconstants.PodGroupAnnotationForPod: "pg-1",
179+
}
180+
exampleMIGPodWithPG.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
172181
tests := map[string]struct {
173182
objs []runtime.Object
174183
resultNodes []*node_info.NodeInfo
@@ -307,6 +316,48 @@ func TestSnapshotNodes(t *testing.T) {
307316
resultPodsLen: 1,
308317
nodePoolName: "pool-a",
309318
},
319+
"MIG Job": {
320+
objs: []runtime.Object{
321+
&v1core.Node{
322+
ObjectMeta: v1.ObjectMeta{
323+
Name: "node-1",
324+
},
325+
Status: v1core.NodeStatus{
326+
Allocatable: v1core.ResourceList{
327+
"cpu": resource.MustParse("10"),
328+
"nvidia.com/mig-1g.5gb": resource.MustParse("10"),
329+
},
330+
},
331+
},
332+
exampleMIGPod,
333+
exampleMIGPodWithPG,
334+
},
335+
resultNodes: []*node_info.NodeInfo{
336+
{
337+
Name: "node-1",
338+
Idle: resource_info.ResourceFromResourceList(
339+
v1core.ResourceList{
340+
"cpu": resource.MustParse("6"),
341+
"nvidia.com/mig-1g.5gb": resource.MustParse("6"),
342+
},
343+
),
344+
Used: resource_info.ResourceFromResourceList(
345+
v1core.ResourceList{
346+
"cpu": resource.MustParse("4"),
347+
"memory": resource.MustParse("0"),
348+
"nvidia.com/mig-1g.5gb": resource.MustParse("4"),
349+
},
350+
),
351+
Releasing: resource_info.ResourceFromResourceList(
352+
v1core.ResourceList{
353+
"cpu": resource.MustParse("0"),
354+
"memory": resource.MustParse("0"),
355+
},
356+
),
357+
},
358+
},
359+
resultPodsLen: 2,
360+
},
310361
}
311362

312363
for name, test := range tests {

pkg/scheduler/cache/status_updater/concurrency_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ var _ = Describe("Status Updater Concurrency - large scale: increase queue size"
156156
podGroupsFromCluster = append(podGroupsFromCluster, podGroup.DeepCopy())
157157
}
158158

159+
Eventually(func() int {
160+
numberOfPodGroupInFlight := 0
161+
statusUpdater.inFlightPodGroups.Range(func(key any, value any) bool {
162+
numberOfPodGroupInFlight += 1
163+
return true
164+
})
165+
return numberOfPodGroupInFlight
166+
}, time.Second*20, time.Millisecond*50).Should(Equal(0))
167+
159168
statusUpdater.SyncPodGroupsWithPendingUpdates(podGroupsFromCluster)
160169

161170
// check that the pods groups are now not updated anymore

pkg/scheduler/plugins/proportion/proportion.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99

1010
commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
1111
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
12-
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources"
1312
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info"
1413
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status"
1514
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -146,18 +145,6 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou
146145
nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0))
147146
} else {
148147
nodeResource.Add(utils.QuantifyResource(node.Allocatable))
149-
migEnabledGpus := 0
150-
for resource, qty := range node.Node.Status.Allocatable {
151-
if resource_info.IsMigResource(resource) {
152-
gpu, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(string(resource))
153-
if err != nil {
154-
log.InfraLogger.Errorf("Failed to extract gpu and memory from mig resource %v: %v", resource, err)
155-
continue
156-
}
157-
migEnabledGpus += int(qty.Value()) * gpu
158-
}
159-
}
160-
nodeResource[rs.GpuResource] += float64(migEnabledGpus)
161148
}
162149

163150
// Subtract resources of non-related pods

pkg/scheduler/plugins/proportion/proportion_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,28 @@ var _ = Describe("Set Fair Share in Proportion", func() {
564564
rs.GpuResource: 2,
565565
},
566566
},
567+
{
568+
name: "mig gpu node",
569+
isRestrictNode: true,
570+
node: &node_info.NodeInfo{
571+
Name: "n1",
572+
Node: &v1.Node{
573+
ObjectMeta: metav1.ObjectMeta{
574+
Labels: map[string]string{
575+
node_info.GpuWorkerNode: "true",
576+
},
577+
},
578+
},
579+
Allocatable: resource_info.ResourceFromResourceList(
580+
common_info.BuildResourceListWithMig("8000m", "10G", "nvidia.com/mig-1g.5gb"),
581+
),
582+
},
583+
want: rs.ResourceQuantities{
584+
rs.CpuResource: 8000,
585+
rs.MemoryResource: 10000000000,
586+
rs.GpuResource: 1,
587+
},
588+
},
567589
{
568590
name: "ignore extra resources",
569591
isRestrictNode: true,

0 commit comments

Comments
 (0)