Skip to content

Commit da36a65

Browse files
authored
Roman/subgroups eviction info (#368)
Added SubGroups support to eviction_info.go
1 parent 1d3b0a0 commit da36a65

File tree

6 files changed

+1009
-19
lines changed

6 files changed

+1009
-19
lines changed

pkg/scheduler/actions/consolidation/consolidation_subgroups_test.go

Lines changed: 187 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -345,5 +345,192 @@ func getSubGroupsConsolidationTestsMetadata() []integration_tests_utils.TestTopo
345345
},
346346
},
347347
},
348+
{
349+
TestTopologyBasic: test_utils.TestTopologyBasic{
350+
Name: "job with sub groups consolidated by a pending job - partial eviction",
351+
Jobs: []*jobs_fake.TestJobBasic{
352+
{
353+
Name: "running_job",
354+
Priority: constants.PriorityTrainNumber,
355+
QueueName: "queue0",
356+
SubGroups: map[string]*podgroup_info.SubGroupInfo{
357+
"sub-0": podgroup_info.NewSubGroupInfo("sub-0", 1),
358+
"sub-1": podgroup_info.NewSubGroupInfo("sub-1", 1),
359+
},
360+
Tasks: []*tasks_fake.TestTaskBasic{
361+
{
362+
State: pod_status.Running,
363+
NodeName: "node0",
364+
SubGroupName: "sub-0",
365+
RequiredGPUs: ptr.To(int64(1)),
366+
},
367+
{
368+
State: pod_status.Running,
369+
NodeName: "node1",
370+
SubGroupName: "sub-1",
371+
RequiredGPUs: ptr.To(int64(3)),
372+
},
373+
},
374+
MinAvailable: ptr.To(int32(2)),
375+
},
376+
{
377+
Name: "pending_job",
378+
RequiredGPUsPerTask: 4,
379+
Priority: constants.PriorityTrainNumber,
380+
QueueName: "queue0",
381+
Tasks: []*tasks_fake.TestTaskBasic{
382+
{
383+
State: pod_status.Pending,
384+
},
385+
},
386+
},
387+
},
388+
Nodes: map[string]nodes_fake.TestNodeBasic{
389+
"node0": {
390+
GPUs: 4,
391+
},
392+
"node1": {
393+
GPUs: 4,
394+
},
395+
},
396+
Queues: []test_utils.TestQueueBasic{
397+
{
398+
Name: "queue0",
399+
DeservedGPUs: 2,
400+
},
401+
},
402+
TaskExpectedResults: map[string]test_utils.TestExpectedResultBasic{
403+
"running_job-0": {
404+
NodeName: "node0",
405+
GPUsRequired: 1,
406+
Status: pod_status.Running,
407+
},
408+
"running_job-1": {
409+
NodeName: "node0",
410+
GPUsRequired: 3,
411+
Status: pod_status.Pipelined,
412+
},
413+
"pending_job-0": {
414+
GPUsRequired: 4,
415+
NodeName: "node1",
416+
Status: pod_status.Pipelined,
417+
},
418+
},
419+
Mocks: &test_utils.TestMock{
420+
CacheRequirements: &test_utils.CacheMocking{
421+
NumberOfCacheEvictions: 1,
422+
NumberOfPipelineActions: 2,
423+
},
424+
},
425+
},
426+
},
427+
{
428+
TestTopologyBasic: test_utils.TestTopologyBasic{
429+
Name: "job with sub groups consolidated by a pending job - complete eviction",
430+
Jobs: []*jobs_fake.TestJobBasic{
431+
{
432+
Name: "running_job0",
433+
Priority: constants.PriorityTrainNumber,
434+
QueueName: "queue0",
435+
Tasks: []*tasks_fake.TestTaskBasic{
436+
{
437+
State: pod_status.Running,
438+
NodeName: "node0",
439+
RequiredGPUs: ptr.To(int64(2)),
440+
},
441+
},
442+
MinAvailable: ptr.To(int32(2)),
443+
},
444+
{
445+
Name: "running_job1",
446+
Priority: constants.PriorityTrainNumber,
447+
QueueName: "queue0",
448+
SubGroups: map[string]*podgroup_info.SubGroupInfo{
449+
"sub-0": podgroup_info.NewSubGroupInfo("sub-0", 1),
450+
"sub-1": podgroup_info.NewSubGroupInfo("sub-1", 1),
451+
},
452+
Tasks: []*tasks_fake.TestTaskBasic{
453+
{
454+
State: pod_status.Running,
455+
NodeName: "node1",
456+
SubGroupName: "sub-0",
457+
RequiredGPUs: ptr.To(int64(1)),
458+
},
459+
{
460+
State: pod_status.Running,
461+
NodeName: "node2",
462+
SubGroupName: "sub-1",
463+
RequiredGPUs: ptr.To(int64(1)),
464+
},
465+
},
466+
MinAvailable: ptr.To(int32(2)),
467+
},
468+
{
469+
Name: "pending_job",
470+
RequiredGPUsPerTask: 4,
471+
Priority: constants.PriorityTrainNumber,
472+
QueueName: "queue0",
473+
Tasks: []*tasks_fake.TestTaskBasic{
474+
{
475+
State: pod_status.Pending,
476+
},
477+
{
478+
State: pod_status.Pending,
479+
},
480+
},
481+
},
482+
},
483+
Nodes: map[string]nodes_fake.TestNodeBasic{
484+
"node0": {
485+
GPUs: 4,
486+
},
487+
"node1": {
488+
GPUs: 4,
489+
},
490+
"node2": {
491+
GPUs: 4,
492+
},
493+
},
494+
Queues: []test_utils.TestQueueBasic{
495+
{
496+
Name: "queue0",
497+
DeservedGPUs: 2,
498+
},
499+
},
500+
TaskExpectedResults: map[string]test_utils.TestExpectedResultBasic{
501+
"running_job0-0": {
502+
NodeName: "node0",
503+
GPUsRequired: 2,
504+
Status: pod_status.Running,
505+
},
506+
"running_job1-0": {
507+
NodeName: "node0",
508+
GPUsRequired: 1,
509+
Status: pod_status.Pipelined,
510+
},
511+
"running_job1-1": {
512+
NodeName: "node0",
513+
GPUsRequired: 1,
514+
Status: pod_status.Pipelined,
515+
},
516+
"pending_job-0": {
517+
GPUsRequired: 4,
518+
NodeName: "node1",
519+
Status: pod_status.Pipelined,
520+
},
521+
"pending_job-1": {
522+
GPUsRequired: 4,
523+
NodeName: "node2",
524+
Status: pod_status.Pipelined,
525+
},
526+
},
527+
Mocks: &test_utils.TestMock{
528+
CacheRequirements: &test_utils.CacheMocking{
529+
NumberOfCacheEvictions: 2,
530+
NumberOfPipelineActions: 4,
531+
},
532+
},
533+
},
534+
},
348535
}
349536
}

0 commit comments

Comments
 (0)