v0.4 - cherry pick mig and memory fixes (#432)

enoodle · eveningcafe · Ngô Quang Hòa · web-flow · commit dacb465a43bd · 2025-08-26T12:53:17.000+02:00
* fix: GPU memory calculation using incorrect unit conversion (#417) * fix: GPU memory calculation using incorrect unit conversion Fixes #416 The GPU memory validation was incorrectly converting MiB to MB and back, causing the scheduler to use inflated memory values (48300 MB instead of 46068 MiB) when checking shared GPU allocations. This allowed pods to be scheduled beyond the actual GPU memory capacity. Changes: - Use original MiB value from nvidia.com/gpu.memory node label directly - Remove unnecessary MiB->MB->MiB conversion that caused precision loss - Maintain the flooring logic for memory alignment but in MiB units * Remove unused convertMibToMb function and explanatory comments * Remove unused resource import after convertMibToMb cleanup 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * test: update GPU memory unit conversion test expectations Fix test expectations after removing MiB to MB conversion. Tests now correctly expect 4000 MiB instead of 4200 MB. Fixes #416 * Format code with go fmt --------- Co-authored-by: Ngô Quang Hòa <hoanq3@vng.com.vn> Co-authored-by: Claude <noreply@anthropic.com> * Fix: incorrect scheduling decision and calculation when using MIG (#422) * fix mig scheduling issue * clean up the code --------- Co-authored-by: bmao <bmao@ebay.com> * update changelog * fix proportion test * fix concurrency test in v0.4 --------- Co-authored-by: Hoa Ngo <ngohoa211@gmail.com> Co-authored-by: Ngô Quang Hòa <hoanq3@vng.com.vn> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Billy Mao <hello2mao@gmail.com> Co-authored-by: bmao <bmao@ebay.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Fixed
 - kai-scheduler will not ignore pod spec.overhead field
+- Fixed wrong GPU memory unit conversion from node `nvidia.com/gpu.memory` labels
+- Fixed incorrect MIG GPU usage calculation leading to wrong scheduling decision
 
 ## [v0.4.12] - 2025-07-18
 
diff --git a/pkg/scheduler/api/node_info/node_info.go b/pkg/scheduler/api/node_info/node_info.go
@@ -12,7 +12,6 @@ import (
 	"go.uber.org/multierr"
 	"golang.org/x/exp/maps"
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
@@ -622,8 +621,7 @@ func getNodeGpuMemory(node *v1.Node) (int64, bool) {
 		gpuMemoryLabelValue = convertBytesToMib(gpuMemoryLabelValue)
 	}
 
-	gpuMemoryInMb := convertMibToMb(gpuMemoryLabelValue)
-	return gpuMemoryInMb - (gpuMemoryInMb % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
+	return gpuMemoryLabelValue - (gpuMemoryLabelValue % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
 }
 
 func checkGpuMemoryIsInMib(gpuMemoryValue int64) bool {
@@ -634,12 +632,6 @@ func convertBytesToMib(gpuMemoryValue int64) int64 {
 	return gpuMemoryValue / BitToMib
 }
 
-func convertMibToMb(countInMib int64) int64 {
-	resourceMemory := resource.NewQuantity(countInMib*1024*1024, resource.BinarySI)
-	mbResourceMemory := resource.NewQuantity(resourceMemory.Value(), resource.DecimalSI)
-	return mbResourceMemory.Value() / MbToBRatio
-}
-
 func (ni *NodeInfo) IsCPUOnlyNode() bool {
 	if ni.IsMIGEnabled() {
 		return false
diff --git a/pkg/scheduler/api/node_info/node_info_test.go b/pkg/scheduler/api/node_info/node_info_test.go
@@ -433,13 +433,6 @@ func TestAddRemovePods(t *testing.T) {
 	}
 }
 
-func TestConvertMibToMb(t *testing.T) {
-	mibSize := int64(100)
-	mbSize := convertMibToMb(mibSize)
-	mbSizeManualConversion := int64(float64(mibSize) * MibToMbScale)
-	assert.Equal(t, mbSize, mbSizeManualConversion)
-}
-
 type allocatableTestData struct {
 	node                    *v1.Node
 	podsResources           []v1.ResourceList
@@ -668,15 +661,15 @@ func TestGpuOperatorHasMemoryError_MibInput(t *testing.T) {
 	testNode.Labels[GpuMemoryLabel] = "4096"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
 	testNode := common_info.BuildNode("n1", common_info.BuildResourceList("8000m", "10G"))
 	testNode.Labels[GpuMemoryLabel] = "4295000001"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func addJobAnnotation(pod *v1.Pod) {
diff --git a/pkg/scheduler/api/pod_info/pod_info.go b/pkg/scheduler/api/pod_info/pod_info.go
@@ -365,10 +365,6 @@ func getTaskStatus(pod *v1.Pod, bindRequest *bindrequest_info.BindRequestInfo) p
 }
 
 func (pi *PodInfo) updatePodAdditionalFields(bindRequest *bindrequest_info.BindRequestInfo) {
-	if len(pi.Job) == 0 {
-		return
-	}
-
 	if bindRequest != nil && len(bindRequest.BindRequest.Spec.SelectedGPUGroups) > 0 {
 		pi.GPUGroups = bindRequest.BindRequest.Spec.SelectedGPUGroups
 	} else {
diff --git a/pkg/scheduler/cache/cluster_info/cluster_info_test.go b/pkg/scheduler/cache/cluster_info/cluster_info_test.go
@@ -169,6 +169,15 @@ func TestSnapshotNodes(t *testing.T) {
 			Phase: v1core.PodRunning,
 		},
 	}
+	exampleMIGPod := examplePod.DeepCopy()
+	exampleMIGPod.Name = "mig-pod"
+	exampleMIGPod.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
+	exampleMIGPodWithPG := examplePod.DeepCopy()
+	exampleMIGPodWithPG.Name = "mig-pod-with-pg"
+	exampleMIGPodWithPG.Annotations = map[string]string{
+		commonconstants.PodGroupAnnotationForPod: "pg-1",
+	}
+	exampleMIGPodWithPG.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
 	tests := map[string]struct {
 		objs          []runtime.Object
 		resultNodes   []*node_info.NodeInfo
@@ -307,6 +316,48 @@ func TestSnapshotNodes(t *testing.T) {
 			resultPodsLen: 1,
 			nodePoolName:  "pool-a",
 		},
+		"MIG Job": {
+			objs: []runtime.Object{
+				&v1core.Node{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node-1",
+					},
+					Status: v1core.NodeStatus{
+						Allocatable: v1core.ResourceList{
+							"cpu":                   resource.MustParse("10"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("10"),
+						},
+					},
+				},
+				exampleMIGPod,
+				exampleMIGPodWithPG,
+			},
+			resultNodes: []*node_info.NodeInfo{
+				{
+					Name: "node-1",
+					Idle: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("6"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("6"),
+						},
+					),
+					Used: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("4"),
+							"memory":                resource.MustParse("0"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("4"),
+						},
+					),
+					Releasing: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":    resource.MustParse("0"),
+							"memory": resource.MustParse("0"),
+						},
+					),
+				},
+			},
+			resultPodsLen: 2,
+		},
 	}
 
 	for name, test := range tests {
diff --git a/pkg/scheduler/cache/status_updater/concurrency_test.go b/pkg/scheduler/cache/status_updater/concurrency_test.go
@@ -156,6 +156,15 @@ var _ = Describe("Status Updater Concurrency - large scale: increase queue size"
 				podGroupsFromCluster = append(podGroupsFromCluster, podGroup.DeepCopy())
 			}
 
+			Eventually(func() int {
+				numberOfPodGroupInFlight := 0
+				statusUpdater.inFlightPodGroups.Range(func(key any, value any) bool {
+					numberOfPodGroupInFlight += 1
+					return true
+				})
+				return numberOfPodGroupInFlight
+			}, time.Second*20, time.Millisecond*50).Should(Equal(0))
+
 			statusUpdater.SyncPodGroupsWithPendingUpdates(podGroupsFromCluster)
 
 			// check that the pods groups are now not updated anymore
diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go
@@ -9,7 +9,6 @@ import (
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
-	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -146,18 +145,6 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou
 		nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0))
 	} else {
 		nodeResource.Add(utils.QuantifyResource(node.Allocatable))
-		migEnabledGpus := 0
-		for resource, qty := range node.Node.Status.Allocatable {
-			if resource_info.IsMigResource(resource) {
-				gpu, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(string(resource))
-				if err != nil {
-					log.InfraLogger.Errorf("Failed to extract gpu and memory from mig resource %v: %v", resource, err)
-					continue
-				}
-				migEnabledGpus += int(qty.Value()) * gpu
-			}
-		}
-		nodeResource[rs.GpuResource] += float64(migEnabledGpus)
 	}
 
 	// Subtract resources of non-related pods
diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go
@@ -564,6 +564,28 @@ var _ = Describe("Set Fair Share in Proportion", func() {
 					rs.GpuResource:    2,
 				},
 			},
+			{
+				name:           "mig gpu node",
+				isRestrictNode: true,
+				node: &node_info.NodeInfo{
+					Name: "n1",
+					Node: &v1.Node{
+						ObjectMeta: metav1.ObjectMeta{
+							Labels: map[string]string{
+								node_info.GpuWorkerNode: "true",
+							},
+						},
+					},
+					Allocatable: resource_info.ResourceFromResourceList(
+						common_info.BuildResourceListWithMig("8000m", "10G", "nvidia.com/mig-1g.5gb"),
+					),
+				},
+				want: rs.ResourceQuantities{
+					rs.CpuResource:    8000,
+					rs.MemoryResource: 10000000000,
+					rs.GpuResource:    1,
+				},
+			},
 			{
 				name:           "ignore extra resources",
 				isRestrictNode: true,