
Commit 7ef4b90

Allow reclaiming with lower utilization ratio (#374)
1 parent 509e82f commit 7ef4b90

6 files changed: +604 additions, -44 deletions


docs/fairness/README.md

Lines changed: 16 additions & 3 deletions
@@ -18,12 +18,25 @@ These two steps are repeated across all hierarchy levels until every leaf queue
 ## Fair Share
 Once the fair share for each queue is calculated, it serves two primary purposes:
 1. Queue Order - Queues with a fair share further below their allocation will be prioritized for scheduling.
-2. Reclaim action - If scheduling cannot be performed due to limited resources in the cluster, the scheduler will evict workloads from queues that have exceeded their fair share, giving priority to queues that are below their fair share. For more details, refer to the reclaim strategies.
+2. Reclaim action - When reclamation is required, the scheduler compares the **Saturation Ratio** (`Allocated / FairShare`) of queues that share the same parent. A queue can only reclaim resources if, **after** the transfer, its utilization ratio remains lower than that of every sibling queue. For more details, see the reclaim strategies.
 
 ## Reclaim Strategies
 There are two main reclaim strategies:
 1. Workloads from queues with resources below their fair share can evict workloads from queues that have exceeded their fair share.
 2. Workloads from queues under their quota can evict workloads from queues that have exceeded their quota.
 
-In both strategies, the scheduler ensures that the initial state remains unchanged after resource reclamation. Specifically, a queue below its fair share will not exceed that share after reclamation, and a queue below its quota will not exceed the quota.
-The scheduler will prioritize the first strategy.
+In both strategies, the scheduler ensures that the relative ordering is preserved: a queue that had the lowest utilization ratio in its level before reclamation will still have the lowest ratio afterwards. Likewise, a queue that was below its quota will remain below its quota.
+The scheduler will prioritize the first strategy.
+
+### Reclaim Ratio Adjustment
+The Saturation Ratio comparison can be adjusted using the `reclaimerUtilizationMultiplier` plugin argument. This multiplier is applied to the reclaimer's Saturation Ratio before comparison:
+- Values > 1.0 make it harder for jobs to reclaim resources (more conservative)
+- Minimum value is 1.0 (standard comparison, default)
+- Values < 1.0 are not allowed and will be set to 1.0, since they could cause infinite reclaim cycles.
+
+Example configuration:
+```yaml
+pluginArguments:
+  proportion:
+    reclaimerUtilizationMultiplier: "1.2" # Makes reclamation 20% more conservative
+```
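
To make the documented comparison concrete, here is a minimal, self-contained Go sketch of the saturation-ratio check with the multiplier applied to the reclaimer's side. The helper names (`saturationRatio`, `canReclaim`) and the simplification of comparing the reclaimer against only the single sibling it reclaims from are illustrative assumptions; the actual check lives in the reclaimable sub-plugin, whose internals are not part of this diff.

```go
package main

import "fmt"

// saturationRatio is a hypothetical helper: Allocated / FairShare for a queue.
func saturationRatio(allocated, fairShare float64) float64 {
	if fairShare == 0 {
		return 0 // no fair share: treat as fully unsaturated (assumption)
	}
	return allocated / fairShare
}

// canReclaim sketches the comparison described above: the reclaimer's ratio,
// recomputed as if the requested resource had already moved and scaled by the
// multiplier, must stay below the reclaimee's post-transfer ratio.
func canReclaim(reclaimerAlloc, reclaimerFair, reclaimeeAlloc, reclaimeeFair, request, multiplier float64) bool {
	reclaimerAfter := saturationRatio(reclaimerAlloc+request, reclaimerFair) * multiplier
	reclaimeeAfter := saturationRatio(reclaimeeAlloc-request, reclaimeeFair)
	return reclaimerAfter < reclaimeeAfter
}

func main() {
	// Two sibling queues with a fair share of 4 GPUs each: one holds 1 GPU, the other 7.
	fmt.Println(canReclaim(1, 4, 7, 4, 1, 1.0)) // true: 0.5 < 1.5
	fmt.Println(canReclaim(1, 4, 7, 4, 1, 1.2)) // true, but more conservative: 0.6 < 1.5
	fmt.Println(canReclaim(3, 4, 5, 4, 1, 1.0)) // false: both ratios would be 1.0 after the transfer
}
```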

pkg/scheduler/actions/reclaim/reclaimDepartments_test.go

Lines changed: 296 additions & 0 deletions
@@ -765,5 +765,301 @@ func getTestsDepartmentsMetadata() []integration_tests_utils.TestTopologyMetadat
 				},
 			},
 		},
+		{
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "Reclaim from overquota department with multiple departments",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name: "d1_p1_pending_job",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityInferenceNumber,
+						QueueName: "d1_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Pending,
+							},
+						},
+					},
+					{
+						Name: "d1_p2_job1",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d1_project2",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d1_p2_job2",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d1_project2",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d1_p2_job3",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d1_project2",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d1_p2_job4",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d1_project2",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job1",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job2",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job3",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job4",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job5",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job6",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job7",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d2_job8",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d2_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d3_job1",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d3_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d3_job2",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d3_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d3_job3",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d3_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d3_job4",
+						RequiredGPUsPerTask: 1,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d3_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Running,
+								NodeName: "node0",
+							},
+						},
+					},
+					{
+						Name: "d4_job1",
+						RequiredGPUsPerTask: 5,
+						Priority: constants.PriorityTrainNumber,
+						QueueName: "d4_project1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Pending,
+								NodeName: "node0",
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs: 16,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name: "d1_project1",
+						DeservedGPUs: 3,
+						GPUOverQuotaWeight: 3,
+						ParentQueue: "d1",
+					},
+					{
+						Name: "d1_project2",
+						DeservedGPUs: 1,
+						GPUOverQuotaWeight: 1,
+						ParentQueue: "d1",
+					},
+					{
+						Name: "d2_project1",
+						DeservedGPUs: 4,
+						GPUOverQuotaWeight: 4,
+						ParentQueue: "d2",
+					},
+					{
+						Name: "d3_project1",
+						DeservedGPUs: 4,
+						GPUOverQuotaWeight: 4,
+						ParentQueue: "d3",
+					},
+					{
+						Name: "d4_project1",
+						DeservedGPUs: 4,
+						GPUOverQuotaWeight: 4,
+						ParentQueue: "d4",
+					},
+				},
+				Departments: []test_utils.TestDepartmentBasic{
+					{
+						Name: "d1",
+						DeservedGPUs: 4,
+					},
+					{
+						Name: "d2",
+						DeservedGPUs: 4,
+					},
+					{
+						Name: "d3",
+						DeservedGPUs: 4,
+					},
+					{
+						Name: "d4",
+						DeservedGPUs: 4,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"d1_p1_pending_job": {
+						NodeName: "node0",
+						GPUsRequired: 1,
+						Status: pod_status.Pipelined,
+					},
+					"d2_job8": {
+						NodeName: "node0",
+						GPUsRequired: 1,
+						Status: pod_status.Releasing,
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds: 1,
+						NumberOfCacheEvictions: 1,
+						NumberOfPipelineActions: 1,
+					},
+				},
+			},
+		},
 	}
 }
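
For orientation, here is a rough back-of-the-envelope for this test case, under the simplifying assumption that each department's fair share equals its 4 deserved GPUs (the real fair-share calculation also considers requested quota and over-quota weights, so treat this only as an illustration of why the expected results name `d2_job8`):

```go
package main

import "fmt"

func main() {
	// Running GPUs per department in the test topology above (node0 has 16 GPUs).
	deserved := 4.0
	allocated := map[string]float64{"d1": 4, "d2": 8, "d3": 4, "d4": 0}

	for _, d := range []string{"d1", "d2", "d3", "d4"} {
		fmt.Printf("%s saturation before: %.2f\n", d, allocated[d]/deserved)
	}

	// d2 is the department furthest over its deserved share (8/4 = 2.00), so the
	// 1-GPU reclaim for d1_p1_pending_job is expected to release a d2 task,
	// matching the test's expectation that d2_job8 moves to Releasing.
	fmt.Printf("d1 after: %.2f, d2 after: %.2f\n",
		(allocated["d1"]+1)/deserved, (allocated["d2"]-1)/deserved)
}
```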

pkg/scheduler/plugins/proportion/proportion.go

Lines changed: 26 additions & 10 deletions
@@ -21,6 +21,7 @@ package proportion
 
 import (
 	"math"
+	"strconv"
 
 	commonconstants "github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api"
@@ -54,19 +55,34 @@ type proportionPlugin struct {
 	queues              map[common_info.QueueID]*rs.QueueAttributes
 	jobSimulationQueues map[common_info.QueueID]*rs.QueueAttributes
 	// Arguments given for the plugin
-	pluginArguments           map[string]string
-	subGroupOrderFn           common_info.LessFn
-	taskOrderFunc             common_info.LessFn
-	reclaimablePlugin         *rec.Reclaimable
-	isInferencePreemptible    bool
-	allowConsolidatingReclaim bool
+	pluginArguments               map[string]string
+	subGroupOrderFn               common_info.LessFn
+	taskOrderFunc                 common_info.LessFn
+	reclaimablePlugin             *rec.Reclaimable
+	isInferencePreemptible        bool
+	allowConsolidatingReclaim     bool
+	relcaimerSaturationMultiplier float64
 }
 
 func New(arguments map[string]string) framework.Plugin {
+	multiplier := 1.0
+	if val, exists := arguments["relcaimerSaturationMultiplier"]; exists {
+		if m, err := strconv.ParseFloat(val, 64); err == nil {
+			if m < 1.0 {
+				log.InfraLogger.Warningf("relcaimerSaturationMultiplier must be >= 1.0, got %v. Using default value of 1.0", m)
+			} else {
+				multiplier = m
+			}
+		} else {
+			log.InfraLogger.V(1).Errorf("Failed to parse relcaimerSaturationMultiplier: %s. Using default 1.", val)
+		}
+	}
+
 	return &proportionPlugin{
-		totalResource:   rs.EmptyResourceQuantities(),
-		queues:          map[common_info.QueueID]*rs.QueueAttributes{},
-		pluginArguments: arguments,
+		totalResource:                 rs.EmptyResourceQuantities(),
+		queues:                        map[common_info.QueueID]*rs.QueueAttributes{},
+		pluginArguments:               arguments,
+		relcaimerSaturationMultiplier: multiplier,
 	}
 }
 
@@ -78,7 +94,7 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) {
 	pp.calculateResourcesProportion(ssn)
 	pp.subGroupOrderFn = ssn.SubGroupOrderFn
 	pp.taskOrderFunc = ssn.TaskOrderFn
-	pp.reclaimablePlugin = rec.New(ssn.IsInferencePreemptible())
+	pp.reclaimablePlugin = rec.New(pp.relcaimerSaturationMultiplier)
 	pp.isInferencePreemptible = ssn.IsInferencePreemptible()
 	capacityPolicy := cp.New(pp.queues, ssn.IsInferencePreemptible())
 	ssn.AddQueueOrderFn(pp.queueOrder)
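
The snippet below is a standalone sketch of the same parsing-and-fallback behavior that `New()` applies to the plugin argument, using the argument key exactly as it appears in the code above. `parseMultiplier` is a hypothetical helper written only for illustration, not part of the plugin's exported API, and it omits the logging that the real code performs.

```go
package main

import (
	"fmt"
	"strconv"
)

// parseMultiplier mirrors the fallback behavior sketched in New() above:
// unparsable values and values below 1.0 both fall back to the default of 1.0.
func parseMultiplier(args map[string]string) float64 {
	multiplier := 1.0
	if val, ok := args["relcaimerSaturationMultiplier"]; ok {
		if m, err := strconv.ParseFloat(val, 64); err == nil && m >= 1.0 {
			multiplier = m
		}
	}
	return multiplier
}

func main() {
	fmt.Println(parseMultiplier(map[string]string{"relcaimerSaturationMultiplier": "1.2"}))  // used as-is
	fmt.Println(parseMultiplier(map[string]string{"relcaimerSaturationMultiplier": "0.5"}))  // below 1.0: falls back to the default
	fmt.Println(parseMultiplier(map[string]string{"relcaimerSaturationMultiplier": "oops"})) // parse error: falls back to the default
	fmt.Println(parseMultiplier(map[string]string{}))                                        // not set: stays at the default
}
```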
