
Commit 9d1b132

enoodle and romanbaron authored
0.6 - cherry pick fix resources checks (#327)
* Fixed a scenario where only GPU resources were checked for job and node, causing it to be bound instead of being pipelined (#322)
* update changelog

Co-authored-by: Roman Baron <[email protected]>
1 parent 3d961a4 commit 9d1b132
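
In short: for fractional-GPU and GPU-memory requests the allocator verified only that the target GPU device had room, not that the node as a whole could still host the task, so a pending job could be bound to a node whose remaining CPU memory was insufficient instead of being pipelined behind releasing tasks. The sketch below illustrates the intended bind-versus-pipeline decision; the resources and decide types, the helper names, and the numbers are hypothetical stand-ins (not the scheduler's real API), and the figures only roughly mirror the new test topology.

package main

import "fmt"

// resources is an illustrative per-node bundle; the real scheduler tracks
// many more dimensions (CPU, pods, whole GPUs, ...).
type resources struct {
	cpuMemory int64 // MiB of node memory
	gpuMemory int64 // MiB free on the shared GPU device
}

// fits reports whether req fits entirely within avail.
func (avail resources) fits(req resources) bool {
	return req.cpuMemory <= avail.cpuMemory && req.gpuMemory <= avail.gpuMemory
}

type decision string

const (
	bind          decision = "bind"          // allocate immediately
	pipeline      decision = "pipeline"      // wait for releasing resources
	unschedulable decision = "unschedulable" // cannot fit even after releases
)

// decide shows the intended logic: bind only if every idle resource fits;
// otherwise pipeline if idle plus releasing resources would fit.
// The reported bug amounted to checking only the gpuMemory term here.
func decide(req, idle, releasing resources) decision {
	if idle.fits(req) {
		return bind
	}
	withReleasing := resources{
		cpuMemory: idle.cpuMemory + releasing.cpuMemory,
		gpuMemory: idle.gpuMemory + releasing.gpuMemory,
	}
	if withReleasing.fits(req) {
		return pipeline
	}
	return unschedulable
}

func main() {
	// Roughly the new test topology: the node has 2000 memory, a running job
	// holds 1000 and a releasing job holds 500, so only 500 is idle while the
	// pending job asks for 750. GPU-memory figures are purely illustrative.
	req := resources{cpuMemory: 750, gpuMemory: 50}
	idle := resources{cpuMemory: 500, gpuMemory: 75}
	releasing := resources{cpuMemory: 500, gpuMemory: 0}

	fmt.Println(decide(req, idle, releasing)) // prints "pipeline", not "bind"
}

Dropping the cpuMemory term from fits makes the same inputs report bind, which is the bound-instead-of-pipelined symptom this commit fixes.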

File tree

4 files changed: +180 −14 lines changed

- CHANGELOG.md
- pkg/scheduler/actions/allocate/allocateGpuMemory_test.go
- pkg/scheduler/actions/common/allocate.go
- pkg/scheduler/gpu_sharing/gpuSharing.go


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ## [Unreleased]
 
+## [v0.6.9] - 2025-07-18
+
+### Fixed
+- Fixed a scenario where only GPU resources were checked for job and node, causing it to be bound instead of being pipelined
+
 ## [v0.6.8] - 2025-07-13
 
 ### Fixed

pkg/scheduler/actions/allocate/allocateGpuMemory_test.go

Lines changed: 167 additions & 0 deletions
@@ -246,5 +246,172 @@ func getMemoryGPUTestsMetadata() []integration_tests_utils.TestTopologyMetadata
 				},
 			},
 		},
+		{
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "Pending job requests GPU memory, assigned to an already shared GPU device, memory resource cannot be allocated",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name:                  "pending_job-0",
+						RequiredGpuMemory:     50,
+						RequiredMemoryPerTask: 750,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Pending,
+								GPUGroups: []string{"0"},
+							},
+						},
+					},
+					{
+						Name:                  "running_job-0",
+						RequiredMemoryPerTask: 1000,
+						RequiredGpuMemory:     25,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Running,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+					{
+						Name:                  "running_job-1",
+						RequiredMemoryPerTask: 500,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Releasing,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs:      1,
+						CPUMemory: 2000,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name:         "queue0",
+						DeservedGPUs: 1,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"pending_job-0": {
+						Status:         pod_status.Pipelined,
+						MemoryRequired: 750,
+						GPUGroups:      []string{"0"},
+					},
+					"running_job-0": {
+						Status:         pod_status.Running,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 1000,
+						NodeName:       "node0",
+					},
+					"running_job-1": {
+						Status:         pod_status.Releasing,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 500,
+						NodeName:       "node0",
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds:      0,
+						NumberOfPipelineActions: 1,
+					},
+				},
+			},
+		},
+		{
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "Pending job requests gpu memory, new shared GPU device selected, memory cannot be allocated",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name:                  "pending_job-0",
+						RequiredGpuMemory:     50,
+						RequiredMemoryPerTask: 750,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Pending,
+								GPUGroups: []string{"0"},
+							},
+						},
+					},
+					{
+						Name:                  "running_job-0",
+						RequiredMemoryPerTask: 1000,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Running,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+					{
+						Name:                  "running_job-1",
+						RequiredMemoryPerTask: 500,
+						Priority:              constants.PriorityBuildNumber,
+						QueueName:             "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State:     pod_status.Releasing,
+								GPUGroups: []string{"0"},
+								NodeName:  "node0",
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs:      1,
+						CPUMemory: 2000,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name:         "queue0",
+						DeservedGPUs: 1,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"pending_job-0": {
+						Status:         pod_status.Pipelined,
+						MemoryRequired: 750,
+						GPUGroups:      []string{"0"},
+					},
+					"running_job-0": {
+						Status:         pod_status.Running,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 1000,
+						NodeName:       "node0",
+					},
+					"running_job-1": {
+						Status:         pod_status.Releasing,
+						GPUGroups:      []string{"0"},
+						MemoryRequired: 500,
+						NodeName:       "node0",
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds:      0,
+						NumberOfPipelineActions: 1,
+					},
+				},
+			},
+		},
 	}
 }

pkg/scheduler/actions/common/allocate.go

Lines changed: 5 additions & 13 deletions
@@ -64,15 +64,7 @@ func allocateTask(ssn *framework.Session, stmt *framework.Statement, nodes []*no
 		if !ssn.FittingNode(task, node, !isPipelineOnly) {
 			continue
 		}
-
-		if task.IsFractionRequest() {
-			success = gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		} else if task.IsMemoryRequest() {
-			success = allocateGpuMemoryTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		} else {
-			success = allocateTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-		}
-
+		success = allocateTaskToNode(ssn, stmt, task, node, isPipelineOnly)
 		if success {
 			break
 		}
@@ -91,16 +83,16 @@ func allocateTask(ssn *framework.Session, stmt *framework.Statement, nodes []*no
 }

 func allocateTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, isPipelineOnly bool) bool {
+	if task.IsFractionRequest() || task.IsMemoryRequest() {
+		return gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
+	}
+
 	if taskAllocatable := node.IsTaskAllocatable(task); !isPipelineOnly && taskAllocatable {
 		return bindTaskToNode(ssn, stmt, task, node)
 	}
 	return pipelineTaskToNode(ssn, stmt, task, node, !isPipelineOnly)
 }

-func allocateGpuMemoryTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo, isPipelineOnly bool) bool {
-	return gpu_sharing.AllocateFractionalGPUTaskToNode(ssn, stmt, task, node, isPipelineOnly)
-}
-
 func bindTaskToNode(ssn *framework.Session, stmt *framework.Statement, task *pod_info.PodInfo, node *node_info.NodeInfo) bool {
 	log.InfraLogger.V(6).Infof("Binding Task <%v/%v> to node <%v>, requires: %v GPUs",
 		task.Namespace, task.Name, node.Name, task.ResReq)

pkg/scheduler/gpu_sharing/gpuSharing.go

Lines changed: 3 additions & 1 deletion
@@ -53,7 +53,9 @@ func getNodePreferableGpuForSharing(fittingGPUsOnNode []string, node *node_info.
 			}
 		} else {
 			nodeGpusSharing.IsReleasing =
-				nodeGpusSharing.IsReleasing || !node.EnoughIdleResourcesOnGpu(pod.ResReq, gpuIdx)
+				nodeGpusSharing.IsReleasing ||
+					!node.EnoughIdleResourcesOnGpu(pod.ResReq, gpuIdx) ||
+					!node.IsTaskAllocatable(pod)
 			nodeGpusSharing.Groups = append(nodeGpusSharing.Groups, gpuIdx)
 		}
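
Taken together, the Go changes route all fractional-GPU and GPU-memory requests through allocateTaskToNode (the allocateGpuMemoryTaskToNode wrapper is removed), and getNodePreferableGpuForSharing now marks a shared GPU as releasing-dependent not only when the GPU itself lacks idle resources but also when the task is not allocatable on the node as a whole (node.IsTaskAllocatable). That extra condition is what the new test cases exercise: the pending GPU-memory jobs end up Pipelined rather than bound.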
