diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a455b73c..722dd03eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ### Fixed
 - kai-scheduler will not ignore pod spec.overhead field
+- Fixed wrong GPU memory unit conversion from node `nvidia.com/gpu.memory` labels
+- Fixed incorrect MIG GPU usage calculation leading to wrong scheduling decision
 
 ## [v0.4.12] - 2025-07-18
 
diff --git a/pkg/scheduler/api/node_info/node_info.go b/pkg/scheduler/api/node_info/node_info.go
index 1a8162cc0..e64cc6fe9 100644
--- a/pkg/scheduler/api/node_info/node_info.go
+++ b/pkg/scheduler/api/node_info/node_info.go
@@ -12,7 +12,6 @@ import (
 	"go.uber.org/multierr"
 	"golang.org/x/exp/maps"
 	v1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
@@ -622,8 +621,7 @@ func getNodeGpuMemory(node *v1.Node) (int64, bool) {
 		gpuMemoryLabelValue = convertBytesToMib(gpuMemoryLabelValue)
 	}
 
-	gpuMemoryInMb := convertMibToMb(gpuMemoryLabelValue)
-	return gpuMemoryInMb - (gpuMemoryInMb % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
+	return gpuMemoryLabelValue - (gpuMemoryLabelValue % 100), true // Floor the memory count to make sure its divided by 100 so there will not be 2 jobs that get same bytes
 }
 
 func checkGpuMemoryIsInMib(gpuMemoryValue int64) bool {
@@ -634,12 +632,6 @@ func convertBytesToMib(gpuMemoryValue int64) int64 {
 	return gpuMemoryValue / BitToMib
 }
 
-func convertMibToMb(countInMib int64) int64 {
-	resourceMemory := resource.NewQuantity(countInMib*1024*1024, resource.BinarySI)
-	mbResourceMemory := resource.NewQuantity(resourceMemory.Value(), resource.DecimalSI)
-	return mbResourceMemory.Value() / MbToBRatio
-}
-
 func (ni *NodeInfo) IsCPUOnlyNode() bool {
 	if ni.IsMIGEnabled() {
 		return false
diff --git a/pkg/scheduler/api/node_info/node_info_test.go b/pkg/scheduler/api/node_info/node_info_test.go
index 40cef7513..3e985e709 100644
--- a/pkg/scheduler/api/node_info/node_info_test.go
+++ b/pkg/scheduler/api/node_info/node_info_test.go
@@ -433,13 +433,6 @@ func TestAddRemovePods(t *testing.T) {
 	}
 }
 
-func TestConvertMibToMb(t *testing.T) {
-	mibSize := int64(100)
-	mbSize := convertMibToMb(mibSize)
-	mbSizeManualConversion := int64(float64(mibSize) * MibToMbScale)
-	assert.Equal(t, mbSize, mbSizeManualConversion)
-}
-
 type allocatableTestData struct {
 	node          *v1.Node
 	podsResources []v1.ResourceList
@@ -668,7 +661,7 @@ func TestGpuOperatorHasMemoryError_MibInput(t *testing.T) {
 	testNode.Labels[GpuMemoryLabel] = "4096"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
@@ -676,7 +669,7 @@ func TestGpuOperatorHasMemoryError_Bytes(t *testing.T) {
 	testNode.Labels[GpuMemoryLabel] = "4295000001"
 	gpuMemoryInMb, ok := getNodeGpuMemory(testNode)
 	assert.Equal(t, true, ok)
-	assert.Equal(t, int64(4200), gpuMemoryInMb)
+	assert.Equal(t, int64(4000), gpuMemoryInMb)
 }
 
 func addJobAnnotation(pod *v1.Pod) {
diff --git a/pkg/scheduler/api/pod_info/pod_info.go b/pkg/scheduler/api/pod_info/pod_info.go
index c735d2d7e..2c74adebb 100644
--- a/pkg/scheduler/api/pod_info/pod_info.go
+++ b/pkg/scheduler/api/pod_info/pod_info.go
@@ -365,10 +365,6 @@ func getTaskStatus(pod *v1.Pod, bindRequest *bindrequest_info.BindRequestInfo) p
 }
 
 func (pi *PodInfo) updatePodAdditionalFields(bindRequest *bindrequest_info.BindRequestInfo) {
-	if len(pi.Job) == 0 {
-		return
-	}
-
 	if bindRequest != nil && len(bindRequest.BindRequest.Spec.SelectedGPUGroups) > 0 {
 		pi.GPUGroups = bindRequest.BindRequest.Spec.SelectedGPUGroups
 	} else {
diff --git a/pkg/scheduler/cache/cluster_info/cluster_info_test.go b/pkg/scheduler/cache/cluster_info/cluster_info_test.go
index c064bd270..1110292e8 100644
--- a/pkg/scheduler/cache/cluster_info/cluster_info_test.go
+++ b/pkg/scheduler/cache/cluster_info/cluster_info_test.go
@@ -169,6 +169,15 @@ func TestSnapshotNodes(t *testing.T) {
 			Phase: v1core.PodRunning,
 		},
 	}
+	exampleMIGPod := examplePod.DeepCopy()
+	exampleMIGPod.Name = "mig-pod"
+	exampleMIGPod.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
+	exampleMIGPodWithPG := examplePod.DeepCopy()
+	exampleMIGPodWithPG.Name = "mig-pod-with-pg"
+	exampleMIGPodWithPG.Annotations = map[string]string{
+		commonconstants.PodGroupAnnotationForPod: "pg-1",
+	}
+	exampleMIGPodWithPG.Spec.Containers[0].Resources.Requests["nvidia.com/mig-1g.5gb"] = resource.MustParse("2")
 	tests := map[string]struct {
 		objs        []runtime.Object
 		resultNodes []*node_info.NodeInfo
@@ -307,6 +316,48 @@
 			resultPodsLen: 1,
 			nodePoolName:  "pool-a",
 		},
+		"MIG Job": {
+			objs: []runtime.Object{
+				&v1core.Node{
+					ObjectMeta: v1.ObjectMeta{
+						Name: "node-1",
+					},
+					Status: v1core.NodeStatus{
+						Allocatable: v1core.ResourceList{
+							"cpu":                   resource.MustParse("10"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("10"),
+						},
+					},
+				},
+				exampleMIGPod,
+				exampleMIGPodWithPG,
+			},
+			resultNodes: []*node_info.NodeInfo{
+				{
+					Name: "node-1",
+					Idle: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("6"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("6"),
+						},
+					),
+					Used: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":                   resource.MustParse("4"),
+							"memory":                resource.MustParse("0"),
+							"nvidia.com/mig-1g.5gb": resource.MustParse("4"),
+						},
+					),
+					Releasing: resource_info.ResourceFromResourceList(
+						v1core.ResourceList{
+							"cpu":    resource.MustParse("0"),
+							"memory": resource.MustParse("0"),
+						},
+					),
+				},
+			},
+			resultPodsLen: 2,
+		},
 	}
 
 	for name, test := range tests {
diff --git a/pkg/scheduler/cache/status_updater/concurrency_test.go b/pkg/scheduler/cache/status_updater/concurrency_test.go
index 74968c23a..143d66ea8 100644
--- a/pkg/scheduler/cache/status_updater/concurrency_test.go
+++ b/pkg/scheduler/cache/status_updater/concurrency_test.go
@@ -156,6 +156,15 @@ var _ = Describe("Status Updater Concurrency - large scale: increase queue size"
 			podGroupsFromCluster = append(podGroupsFromCluster, podGroup.DeepCopy())
 		}
 
+		Eventually(func() int {
+			numberOfPodGroupInFlight := 0
+			statusUpdater.inFlightPodGroups.Range(func(key any, value any) bool {
+				numberOfPodGroupInFlight += 1
+				return true
+			})
+			return numberOfPodGroupInFlight
+		}, time.Second*20, time.Millisecond*50).Should(Equal(0))
+
 		statusUpdater.SyncPodGroupsWithPendingUpdates(podGroupsFromCluster)
 
 		// check that the pods groups are now not updated anymore
diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go
index 4696e062d..d83d24625 100644
--- a/pkg/scheduler/plugins/proportion/proportion.go
+++ b/pkg/scheduler/plugins/proportion/proportion.go
@@ -9,7 +9,6 @@ import (
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
-	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info/resources"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info"
@@ -146,18 +145,6 @@ func getNodeResources(ssn *framework.Session, node *node_info.NodeInfo) rs.Resou
 		nodeResource.Add(rs.NewResourceQuantities(node.Allocatable.Cpu(), node.Allocatable.Memory(), 0))
 	} else {
 		nodeResource.Add(utils.QuantifyResource(node.Allocatable))
-		migEnabledGpus := 0
-		for resource, qty := range node.Node.Status.Allocatable {
-			if resource_info.IsMigResource(resource) {
-				gpu, _, err := resources.ExtractGpuAndMemoryFromMigResourceName(string(resource))
-				if err != nil {
-					log.InfraLogger.Errorf("Failed to extract gpu and memory from mig resource %v: %v", resource, err)
-					continue
-				}
-				migEnabledGpus += int(qty.Value()) * gpu
-			}
-		}
-		nodeResource[rs.GpuResource] += float64(migEnabledGpus)
 	}
 
 	// Subtract resources of non-related pods
diff --git a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go
index d242695c4..32139e401 100644
--- a/pkg/scheduler/plugins/proportion/proportion_test.go
+++ b/pkg/scheduler/plugins/proportion/proportion_test.go
@@ -564,6 +564,28 @@ var _ = Describe("Set Fair Share in Proportion", func() {
 				rs.GpuResource:    2,
 			},
 		},
+		{
+			name:           "mig gpu node",
+			isRestrictNode: true,
+			node: &node_info.NodeInfo{
+				Name: "n1",
+				Node: &v1.Node{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: map[string]string{
+							node_info.GpuWorkerNode: "true",
+						},
+					},
+				},
+				Allocatable: resource_info.ResourceFromResourceList(
+					common_info.BuildResourceListWithMig("8000m", "10G", "nvidia.com/mig-1g.5gb"),
+				),
+			},
+			want: rs.ResourceQuantities{
+				rs.CpuResource:    8000,
+				rs.MemoryResource: 10000000000,
+				rs.GpuResource:    1,
+			},
+		},
 		{
 			name:           "ignore extra resources",
 			isRestrictNode: true,