
Commit 9d1b132

enoodle and romanbaron authored
0.6 - cherry pick fix resources checks (#327)
* Fixed a scenario where only GPU resources were checked for job and node, causing it to be bound instead of being pipelined (#322)
* update changelog

Co-authored-by: Roman Baron <[email protected]>
1 parent 3d961a4 commit 9d1b132
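
In short: for fractional-GPU and GPU-memory requests the allocator verified only that the target GPU device had room, not that the node as a whole could still host the task, so a pending job could be bound to a node whose remaining CPU memory was insufficient instead of being pipelined behind releasing tasks. The sketch below illustrates the intended bind-versus-pipeline decision; the resources and decide types, the helper names, and the numbers are hypothetical stand-ins (not the scheduler's real API), and the figures only roughly mirror the new test topology.

package main

import "fmt"

// resources is an illustrative per-node bundle; the real scheduler tracks
// many more dimensions (CPU, pods, whole GPUs, ...).
type resources struct {
	cpuMemory int64 // MiB of node memory
	gpuMemory int64 // MiB free on the shared GPU device
}

// fits reports whether req fits entirely within avail.
func (avail resources) fits(req resources) bool {
	return req.cpuMemory <= avail.cpuMemory && req.gpuMemory <= avail.gpuMemory
}

type decision string

const (
	bind          decision = "bind"          // allocate immediately
	pipeline      decision = "pipeline"      // wait for releasing resources
	unschedulable decision = "unschedulable" // cannot fit even after releases
)

// decide shows the intended logic: bind only if every idle resource fits;
// otherwise pipeline if idle plus releasing resources would fit.
// The reported bug amounted to checking only the gpuMemory term here.
func decide(req, idle, releasing resources) decision {
	if idle.fits(req) {
		return bind
	}
	withReleasing := resources{
		cpuMemory: idle.cpuMemory + releasing.cpuMemory,
		gpuMemory: idle.gpuMemory + releasing.gpuMemory,
	}
	if withReleasing.fits(req) {
		return pipeline
	}
	return unschedulable
}

func main() {
	// Roughly the new test topology: the node has 2000 memory, a running job
	// holds 1000 and a releasing job holds 500, so only 500 is idle while the
	// pending job asks for 750. GPU-memory figures are purely illustrative.
	req := resources{cpuMemory: 750, gpuMemory: 50}
	idle := resources{cpuMemory: 500, gpuMemory: 75}
	releasing := resources{cpuMemory: 500, gpuMemory: 0}

	fmt.Println(decide(req, idle, releasing)) // prints "pipeline", not "bind"
}

Dropping the cpuMemory term from fits makes the same inputs report bind, which is the bound-instead-of-pipelined symptom this commit fixes.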

File tree

4 files changed: +180 −14 lines changed

- CHANGELOG.md
- pkg/scheduler/actions/allocate/allocateGpuMemory_test.go
- pkg/scheduler/actions/common/allocate.go
- pkg/scheduler/gpu_sharing/gpuSharing.go


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ## [Unreleased]
 
+## [v0.6.9] - 2025-07-18
+
+### Fixed
+- Fixed a scenario where only GPU resources were checked for job and node, causing it to be bound instead of being pipelined
+
 ## [v0.6.8] - 2025-07-13
 
 ### Fixed

pkg/scheduler/actions/allocate/allocateGpuMemory_test.go

Lines changed: 167 additions & 0 deletions
@@ -246,5 +246,172 @@ func getMemoryGPUTestsMetadata() []integration_tests_utils.TestTopologyMetadata
 				},
 			},
 		},
+		{
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "Pending job requests GPU memory, assigned to an already shared GPU device, memory resource cannot be allocated",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name:                  "pending_job-0",
+						RequiredGpuMemory:     50,
+						RequiredMemoryPerTask: 750,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Pending,
+								GPUGroups: []string{"0"},
+							},
+						},
+					},
+					{
+						Name:                  "running_job-0",
+						RequiredMemoryPerTask: 1000,
+						RequiredGpuMemory:     25,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Running,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+					{
+						Name:                  "running_job-1",
+						RequiredMemoryPerTask: 500,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Releasing,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs:      1,
+						CPUMemory: 2000,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name:         "queue0",
+						DeservedGPUs: 1,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"pending_job-0": {
+						Status:         pod_status.Pipelined,
+						MemoryRequired: 750,
+						GPUGroups:      []string{"0"},
+					},
+					"running_job-0": {
+						Status:         pod_status.Running,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 1000,
+						NodeName:       "node0",
+					},
+					"running_job-1": {
+						Status:         pod_status.Releasing,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 500,
+						NodeName:       "node0",
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds:      0,
+						NumberOfPipelineActions: 1,
+					},
+				},
+			},
+		},
+		{
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "Pending job requests gpu memory, new shared GPU device selected, memory cannot be allocated",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name:                  "pending_job-0",
+						RequiredGpuMemory:     50,
+						RequiredMemoryPerTask: 750,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Pending,
+								GPUGroups: []string{"0"},
+							},
+						},
+					},
+					{
+						Name:                  "running_job-0",
+						RequiredMemoryPerTask: 1000,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Running,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+					{
+						Name:                  "running_job-1",
+						RequiredMemoryPerTask: 500,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Releasing,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs:      1,
+						CPUMemory: 2000,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name:         "queue0",
+						DeservedGPUs: 1,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"pending_job-0": {
+						Status:         pod_status.Pipelined,
+						MemoryRequired: 750,
+						GPUGroups:      []string{"0"},
+					},
+					"running_job-0": {
+						Status:         pod_status.Running,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 1000,
+						NodeName:       "node0",
+					},
+					"running_job-1": {
+						Status:         pod_status.Releasing,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 500,
+						NodeName:       "node0",
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds:      0,
+						NumberOfPipelineActions: 1,
+					},
+				},
+			},
+		},
 	}
 }

pkg/scheduler/actions/common/allocate.go

Lines changed: 5 additions & 13 deletions
@@ -64,15 +64,7 @@ func allocateTask(ssn *framework.Session, stmt *framework.Statement, nodes []*no
 		if !ssn.FittingNode(task, node, !isPipelineOnly) {
 			continue
 		}
-
-		if task.IsFractionRequest() {
-			success = gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		} else if task.IsMemoryRequest() {
-			success = allocateGpuMemoryTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		} else {
-			success = allocateTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		}
-
+		success = allocateTaskToNode(ssn, stmt, task, node, isPipelineOnly)
 		if success {
 			break
 		}
@@ -91,16 +83,16 @@ func allocateTask(ssn *framework.Session, stmt *framework.Statement, nodes []*no
 }

 func allocateTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, isPipelineOnly bool) bool {
+	if task.IsFractionRequest() || task.IsMemoryRequest() {
+		return gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
+	}
+
 	if taskAllocatable := node.IsTaskAllocatable(task); !isPipelineOnly && taskAllocatable {
 		return bindTaskToNode(ssn, stmt, task, node)
 	}
 	return pipelineTaskToNode(ssn, stmt, task, node, !isPipelineOnly)
 }

-func allocateGpuMemoryTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, isPipelineOnly bool) bool {
-	return gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-}
-
 func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo) bool {
 	log.InfraLogger.V(6).Infof("Binding Task <%v/%v> to node <%v>, requires: %v GPUs",
 		task.Namespace, task.Name, node.Name, task.ResReq)

pkg/scheduler/gpu_sharing/gpuSharing.go

Lines changed: 3 additions & 1 deletion
@@ -53,7 +53,9 @@ func getNodePreferableGpuForSharing(fittingGPUsOnNode []string, node *node_info.
 			}
 		} else {
 			nodeGpusSharing.IsReleasing =
-				nodeGpusSharing.IsReleasing || !node.EnoughIdleResourcesOnGpu(pod.ResReq, gpuIdx)
+				nodeGpusSharing.IsReleasing ||
+					!node.EnoughIdleResourcesOnGpu(pod.ResReq, gpuIdx) ||
+					!node.IsTaskAllocatable(pod)
 			nodeGpusSharing.Groups = append(nodeGpusSharing.Groups, gpuIdx)
 		}
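
Taken together, the Go changes route all fractional-GPU and GPU-memory requests through allocateTaskToNode (the allocateGpuMemoryTaskToNode wrapper is removed), and getNodePreferableGpuForSharing now marks a shared GPU as releasing-dependent not only when the GPU itself lacks idle resources but also when the task is not allocatable on the node as a whole (node.IsTaskAllocatable). That extra condition is what the new test cases exercise: the pending GPU-memory jobs end up Pipelined rather than bound.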
