Skip to content

Commit 1d3b0a0

Browse files
authored
Utilize SubGroupOrderFn in allocation_info.go for partial allocation scenarios (#367)
Utilize SubGroupOrderFn in allocation_info.go for partial allocation scenarios
1 parent 373dbc3 commit 1d3b0a0

File tree

5 files changed

+672
-4
lines changed

5 files changed

+672
-4
lines changed

pkg/scheduler/actions/allocate/allocate_subgroups_test.go

Lines changed: 296 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,302 @@ func getAllocationSubGroupsTestsMetadata() []integration_tests_utils.TestTopolog
167167
},
168168
},
169169
},
170+
{
171+
TestTopologyBasic: test_utils.TestTopologyBasic{
172+
Name: "Allocate job with SubGroups - allocate resource to unsatisfied sub group",
173+
Jobs: []*jobs_fake.TestJobBasic{
174+
{
175+
Name: "job0",
176+
QueueName: "queue0",
177+
Priority: constants.PriorityTrainNumber,
178+
SubGroups: map[string]*podgroup_info.SubGroupInfo{
179+
"sub0": podgroup_info.NewSubGroupInfo("sub0", 2),
180+
"sub1": podgroup_info.NewSubGroupInfo("sub1", 2),
181+
},
182+
Tasks: []*tasks_fake.TestTaskBasic{
183+
{
184+
NodeName: "node0",
185+
State: pod_status.Running,
186+
SubGroupName: "sub0",
187+
RequiredGPUs: ptr.To(int64(1)),
188+
},
189+
{
190+
NodeName: "node0",
191+
State: pod_status.Running,
192+
SubGroupName: "sub0",
193+
RequiredGPUs: ptr.To(int64(1)),
194+
},
195+
{
196+
State: pod_status.Pending,
197+
SubGroupName: "sub0",
198+
RequiredGPUs: ptr.To(int64(1)),
199+
},
200+
{
201+
State: pod_status.Pending,
202+
SubGroupName: "sub1",
203+
RequiredGPUs: ptr.To(int64(1)),
204+
},
205+
{
206+
State: pod_status.Pending,
207+
SubGroupName: "sub1",
208+
RequiredGPUs: ptr.To(int64(1)),
209+
},
210+
},
211+
MinAvailable: pointer.Int32(4),
212+
},
213+
},
214+
Nodes: map[string]nodes_fake.TestNodeBasic{
215+
"node0": {
216+
GPUs: 4,
217+
},
218+
},
219+
Queues: []test_utils.TestQueueBasic{
220+
{
221+
Name: "queue0",
222+
DeservedGPUs: 1,
223+
},
224+
},
225+
Mocks: &test_utils.TestMock{
226+
CacheRequirements: &test_utils.CacheMocking{
227+
NumberOfCacheBinds: 2,
228+
},
229+
},
230+
TaskExpectedResults: map[string]test_utils.TestExpectedResultBasic{
231+
"job0-0": {
232+
NodeName: "node0",
233+
GPUsRequired: 1,
234+
Status: pod_status.Running,
235+
},
236+
"job0-1": {
237+
NodeName: "node0",
238+
GPUsRequired: 1,
239+
Status: pod_status.Running,
240+
},
241+
"job0-2": {
242+
GPUsRequired: 1,
243+
Status: pod_status.Pending,
244+
},
245+
"job0-3": {
246+
NodeName: "node0",
247+
GPUsRequired: 1,
248+
Status: pod_status.Binding,
249+
},
250+
"job0-4": {
251+
NodeName: "node0",
252+
GPUsRequired: 1,
253+
Status: pod_status.Binding,
254+
},
255+
},
256+
},
257+
},
258+
{
259+
TestTopologyBasic: test_utils.TestTopologyBasic{
260+
Name: "Allocate job with SubGroups - allocate resource up to MinAvailable",
261+
Jobs: []*jobs_fake.TestJobBasic{
262+
{
263+
Name: "job0",
264+
QueueName: "queue0",
265+
Priority: constants.PriorityTrainNumber,
266+
SubGroups: map[string]*podgroup_info.SubGroupInfo{
267+
"sub0": podgroup_info.NewSubGroupInfo("sub0", 2),
268+
"sub1": podgroup_info.NewSubGroupInfo("sub1", 1),
269+
},
270+
Tasks: []*tasks_fake.TestTaskBasic{
271+
{
272+
NodeName: "node0",
273+
State: pod_status.Running,
274+
SubGroupName: "sub0",
275+
RequiredGPUs: ptr.To(int64(2)),
276+
},
277+
{
278+
State: pod_status.Pending,
279+
SubGroupName: "sub0",
280+
RequiredGPUs: ptr.To(int64(1)),
281+
},
282+
{
283+
State: pod_status.Pending,
284+
SubGroupName: "sub0",
285+
RequiredGPUs: ptr.To(int64(1)),
286+
},
287+
{
288+
State: pod_status.Pending,
289+
SubGroupName: "sub0",
290+
RequiredGPUs: ptr.To(int64(1)),
291+
},
292+
{
293+
State: pod_status.Pending,
294+
SubGroupName: "sub0",
295+
RequiredGPUs: ptr.To(int64(1)),
296+
},
297+
{
298+
State: pod_status.Pending,
299+
SubGroupName: "sub1",
300+
RequiredGPUs: ptr.To(int64(1)),
301+
},
302+
{
303+
State: pod_status.Pending,
304+
SubGroupName: "sub1",
305+
RequiredGPUs: ptr.To(int64(1)),
306+
},
307+
},
308+
MinAvailable: pointer.Int32(3),
309+
},
310+
},
311+
Nodes: map[string]nodes_fake.TestNodeBasic{
312+
"node0": {
313+
GPUs: 4,
314+
},
315+
},
316+
Queues: []test_utils.TestQueueBasic{
317+
{
318+
Name: "queue0",
319+
DeservedGPUs: 1,
320+
},
321+
},
322+
Mocks: &test_utils.TestMock{
323+
CacheRequirements: &test_utils.CacheMocking{
324+
NumberOfCacheBinds: 3,
325+
},
326+
},
327+
TaskExpectedResults: map[string]test_utils.TestExpectedResultBasic{
328+
"job0-0": {
329+
NodeName: "node0",
330+
GPUsRequired: 2,
331+
Status: pod_status.Running,
332+
},
333+
"job0-1": {
334+
NodeName: "node0",
335+
GPUsRequired: 1,
336+
Status: pod_status.Binding,
337+
},
338+
"job0-2": {
339+
GPUsRequired: 1,
340+
Status: pod_status.Pending,
341+
},
342+
"job0-3": {
343+
GPUsRequired: 1,
344+
Status: pod_status.Pending,
345+
},
346+
"job0-4": {
347+
GPUsRequired: 1,
348+
Status: pod_status.Pending,
349+
},
350+
"job0-5": {
351+
NodeName: "node0",
352+
GPUsRequired: 1,
353+
Status: pod_status.Binding,
354+
},
355+
"job0-6": {
356+
GPUsRequired: 1,
357+
Status: pod_status.Pending,
358+
},
359+
},
360+
},
361+
},
362+
{
363+
TestTopologyBasic: test_utils.TestTopologyBasic{
364+
Name: "Allocate job with SubGroups - allocate resource beyond MinAvailable",
365+
Jobs: []*jobs_fake.TestJobBasic{
366+
{
367+
Name: "job0",
368+
QueueName: "queue0",
369+
Priority: constants.PriorityTrainNumber,
370+
SubGroups: map[string]*podgroup_info.SubGroupInfo{
371+
"sub0": podgroup_info.NewSubGroupInfo("sub0", 2),
372+
"sub1": podgroup_info.NewSubGroupInfo("sub1", 1),
373+
},
374+
Tasks: []*tasks_fake.TestTaskBasic{
375+
{
376+
NodeName: "node0",
377+
State: pod_status.Running,
378+
SubGroupName: "sub0",
379+
RequiredGPUs: ptr.To(int64(1)),
380+
},
381+
{
382+
State: pod_status.Pending,
383+
SubGroupName: "sub0",
384+
RequiredGPUs: ptr.To(int64(1)),
385+
},
386+
{
387+
State: pod_status.Pending,
388+
SubGroupName: "sub0",
389+
RequiredGPUs: ptr.To(int64(1)),
390+
},
391+
{
392+
State: pod_status.Pending,
393+
SubGroupName: "sub0",
394+
RequiredGPUs: ptr.To(int64(1)),
395+
},
396+
{
397+
State: pod_status.Pending,
398+
SubGroupName: "sub0",
399+
RequiredGPUs: ptr.To(int64(1)),
400+
},
401+
{
402+
State: pod_status.Pending,
403+
SubGroupName: "sub1",
404+
RequiredGPUs: ptr.To(int64(1)),
405+
},
406+
{
407+
State: pod_status.Pending,
408+
SubGroupName: "sub1",
409+
RequiredGPUs: ptr.To(int64(1)),
410+
},
411+
},
412+
MinAvailable: pointer.Int32(3),
413+
},
414+
},
415+
Nodes: map[string]nodes_fake.TestNodeBasic{
416+
"node0": {
417+
GPUs: 4,
418+
},
419+
},
420+
Queues: []test_utils.TestQueueBasic{
421+
{
422+
Name: "queue0",
423+
DeservedGPUs: 1,
424+
},
425+
},
426+
Mocks: &test_utils.TestMock{
427+
CacheRequirements: &test_utils.CacheMocking{
428+
NumberOfCacheBinds: 3,
429+
},
430+
},
431+
TaskExpectedResults: map[string]test_utils.TestExpectedResultBasic{
432+
"job0-0": {
433+
NodeName: "node0",
434+
GPUsRequired: 1,
435+
Status: pod_status.Running,
436+
},
437+
"job0-1": {
438+
NodeName: "node0",
439+
GPUsRequired: 1,
440+
Status: pod_status.Binding,
441+
},
442+
"job0-2": {
443+
GPUsRequired: 1,
444+
Status: pod_status.Binding,
445+
},
446+
"job0-3": {
447+
GPUsRequired: 1,
448+
Status: pod_status.Pending,
449+
},
450+
"job0-4": {
451+
GPUsRequired: 1,
452+
Status: pod_status.Pending,
453+
},
454+
"job0-5": {
455+
NodeName: "node0",
456+
GPUsRequired: 1,
457+
Status: pod_status.Binding,
458+
},
459+
"job0-6": {
460+
GPUsRequired: 1,
461+
Status: pod_status.Pending,
462+
},
463+
},
464+
},
465+
},
170466
{
171467
TestTopologyBasic: test_utils.TestTopologyBasic{
172468
Name: "Allocate job with SubGroups - cannot satisfy sub group gang",

0 commit comments

Comments
 (0)