Commit a49253b

Use tasks from ssn to validate scenario (#294)
* Use tasks from ssn to validate scenario
1 parent a8e6a21 commit a49253b

File tree

4 files changed: +351 −10 lines changed

- CHANGELOG.md
- pkg/scheduler/actions/common/solvers/scenario/base_scenario.go
- pkg/scheduler/actions/reclaim/reclaim_test.go
- pkg/scheduler/plugins/proportion/proportion.go


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - Added optional pod and namespace label selectors to limit the scope of monitored pods
 - Added a plugin extension point for scheduler plugins to add annotations to BindRequests.
 
+### Fixes
+- Fixed cases where reclaim validation operated on outdated info, allowing invalid reclaim scenarios
+
 ## [v0.6.0] - 2025-06-16
 
 ### Changed

pkg/scheduler/actions/common/solvers/scenario/base_scenario.go

Lines changed: 6 additions & 0 deletions
@@ -180,5 +180,11 @@ func (s *BaseScenario) GetPreemptor() *podgroup_info.PodGroupInfo {
 }
 
 func (s *BaseScenario) GetVictims() map[common_info.PodGroupID]*api.VictimInfo {
+	for _, victim := range s.victims {
+		for i, task := range victim.Tasks {
+			ogTask := s.getJobForTask(task).PodInfos[task.UID]
+			victim.Tasks[i] = ogTask
+		}
+	}
	return s.victims
 }
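For context on the change above: GetVictims now refreshes each victim's task list from the session's PodGroupInfo (looked up by task UID) before returning, so downstream validation operates on current task state rather than snapshots captured when the scenario was built. Below is a minimal, self-contained sketch of that refresh pattern; PodInfo, PodGroupInfo, and VictimInfo here are simplified stand-ins for the scheduler's real api and podgroup_info types, not the actual implementation.

package main

import "fmt"

// Simplified stand-ins for the scheduler's task and pod-group records.
type PodInfo struct {
	UID    string
	Status string
}

type PodGroupInfo struct {
	PodInfos map[string]*PodInfo // canonical, session-owned task state
}

type VictimInfo struct {
	Tasks []*PodInfo // may hold stale snapshots from scenario construction
}

// refreshVictimTasks swaps each snapshot for the session's canonical task,
// mirroring victim.Tasks[i] = s.getJobForTask(task).PodInfos[task.UID] above.
func refreshVictimTasks(victim *VictimInfo, job *PodGroupInfo) {
	for i, task := range victim.Tasks {
		victim.Tasks[i] = job.PodInfos[task.UID]
	}
}

func main() {
	canonical := &PodInfo{UID: "task-0", Status: "Running"}
	job := &PodGroupInfo{PodInfos: map[string]*PodInfo{"task-0": canonical}}

	stale := &PodInfo{UID: "task-0", Status: "Releasing"} // outdated snapshot
	victim := &VictimInfo{Tasks: []*PodInfo{stale}}

	refreshVictimTasks(victim, job)
	fmt.Println(victim.Tasks[0].Status) // prints "Running": live state, not the snapshot
}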

pkg/scheduler/actions/reclaim/reclaim_test.go

Lines changed: 339 additions & 0 deletions
@@ -3710,5 +3710,344 @@ func getTestsMetadata() []integration_tests_utils.TestTopologyMetadata {
 				},
 			},
 		},
+		{
+			// In some cases, when multiple tasks and nodes are involved, tasks are re-written to the PodGroupInfo object
+			// in the session; this test ensures scenario validation still behaves correctly when that happens.
+			TestTopologyBasic: test_utils.TestTopologyBasic{
+				Name: "queue0 is in deserved share, queue1 is under fair share - do not reclaim - multiple tasks",
+				Jobs: []*jobs_fake.TestJobBasic{
+					{
+						Name:                "q0_n0_job0",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node0",
+								State:    pod_status.Running,
+							},
+						},
+					}, {
+						Name:                "q0_n0_job1",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node0",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n0_job2",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node0",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n0_job3",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node0",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n1_job0",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node1",
+								State:    pod_status.Running,
+							},
+						},
+					}, {
+						Name:                "q0_n1_job1",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node1",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n1_job2",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node1",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n1_job3",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node1",
+								State:    pod_status.Running,
+							},
+						},
+					}, {
+						Name:                "q0_n2_job0",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node2",
+								State:    pod_status.Running,
+							},
+						},
+					}, {
+						Name:                "q0_n2_job1",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node2",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n2_job2",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node2",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n2_job3",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node2",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n3_job0",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node3",
+								State:    pod_status.Running,
+							},
+						},
+					}, {
+						Name:                "q0_n3_job1",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node3",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n3_job2",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node3",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q0_n3_job3",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue0",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								NodeName: "node3",
+								State:    pod_status.Running,
+							},
+						},
+					},
+					{
+						Name:                "q1_job1",
+						RequiredGPUsPerTask: 1,
+						Priority:            constants.PriorityTrainNumber,
+						QueueName:           "queue1",
+						Tasks: []*tasks_fake.TestTaskBasic{
+							{
+								State: pod_status.Pending,
+							},
+							{
+								State: pod_status.Pending,
+							},
+							{
+								State: pod_status.Pending,
+							},
+							{
+								State: pod_status.Pending,
+							},
+							{
+								State: pod_status.Pending,
+							},
+						},
+					},
+				},
+				Nodes: map[string]nodes_fake.TestNodeBasic{
+					"node0": {
+						GPUs: 4,
+					},
+					"node1": {
+						GPUs: 4,
+					},
+					"node2": {
+						GPUs: 4,
+					},
+					"node3": {
+						GPUs: 4,
+					},
+				},
+				Queues: []test_utils.TestQueueBasic{
+					{
+						Name:               "queue0",
+						DeservedGPUs:       12,
+						GPUOverQuotaWeight: 0,
+					},
+					{
+						Name:               "queue1",
+						DeservedGPUs:       5,
+						GPUOverQuotaWeight: 1,
+					},
+				},
+				JobExpectedResults: map[string]test_utils.TestExpectedResultBasic{
+					"q0_n0_job0": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n0_job1": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n0_job2": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n0_job3": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n1_job0": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n1_job1": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n1_job2": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n1_job3": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n2_job0": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n2_job1": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n2_job2": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n2_job3": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n3_job0": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n3_job1": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n3_job2": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q0_n3_job3": {
+						GPUsRequired:         1,
+						Status:               pod_status.Running,
+						DontValidateGPUGroup: true,
+					},
+					"q1_job1": {
+						GPUsRequired:         5,
+						Status:               pod_status.Pending,
+						DontValidateGPUGroup: true,
+					},
+				},
+				Mocks: &test_utils.TestMock{
+					CacheRequirements: &test_utils.CacheMocking{
+						NumberOfCacheBinds:      0,
+						NumberOfCacheEvictions:  0,
+						NumberOfPipelineActions: 0,
+					},
+				},
+			},
+		},
 	}
 }
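A plausible reading of why no reclaim is expected in this fixture (the commit doesn't spell it out): the four nodes expose 16 GPUs in total, all held by queue0's 16 single-GPU tasks. With DeservedGPUs: 12 and GPUOverQuotaWeight: 0, at most 16 − 12 = 4 GPUs could be reclaimed from queue0 without dropping it below its deserved share, but q1_job1 is a five-task gang that needs all 5 GPUs at once. Reclaim therefore cannot produce a valid scenario, and the mock asserts zero evictions, binds, and pipeline actions.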

pkg/scheduler/plugins/proportion/proportion.go

Lines changed: 3 additions & 10 deletions
@@ -138,16 +138,9 @@ func (pp *proportionPlugin) reclaimableFn(
 }
 
 func (pp *proportionPlugin) getVictimResources(victim *api.VictimInfo) []*resource_info.Resource {
-	var victimTasks []*pod_info.PodInfo
-	for _, job := range victim.RepresentativeJobs {
-		for _, task := range job.PodInfos {
-			victimTasks = append(victimTasks, task)
-		}
-	}
-
 	var victimResources []*resource_info.Resource
-	if len(victimTasks) > int(victim.Job.MinAvailable) {
-		elasticTasks := victimTasks[victim.Job.MinAvailable:]
+	if len(victim.Tasks) > int(victim.Job.MinAvailable) {
+		elasticTasks := victim.Tasks[victim.Job.MinAvailable:]
 		for _, task := range elasticTasks {
 			resources := getResources(pp.allowConsolidatingReclaim, task)
 			if resources == nil {
@@ -157,7 +150,7 @@ func (pp *proportionPlugin) getVictimResources(victim *api.VictimInfo) []*resour
 		}
 	}
 
-	resources := getResources(pp.allowConsolidatingReclaim, victimTasks[:victim.Job.MinAvailable]...)
+	resources := getResources(pp.allowConsolidatingReclaim, victim.Tasks[:victim.Job.MinAvailable]...)
 	if resources != nil {
 		victimResources = append(victimResources, resources)
 	}
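The rewritten getVictimResources relies on victim.Tasks being ordered and current: the first MinAvailable tasks form the job's gang core, whose resources are aggregated into a single reclaimable unit, while any tasks beyond MinAvailable are elastic and contribute one unit each. Below is a minimal sketch of that split, using a hypothetical Task stand-in rather than the scheduler's pod_info types, and assuming minAvailable <= len(tasks) as the real code appears to.

package main

import "fmt"

// Task is a hypothetical stand-in for a victim's task record.
type Task struct {
	Name string
	GPUs int
}

// splitVictimTasks partitions a victim's tasks the way getVictimResources does:
// the first minAvailable tasks are the gang core (counted together as one
// resource unit), and any remainder are elastic tasks reclaimable one by one.
// Assumes minAvailable <= len(tasks).
func splitVictimTasks(tasks []*Task, minAvailable int) (core, elastic []*Task) {
	if len(tasks) > minAvailable {
		elastic = tasks[minAvailable:]
	}
	return tasks[:minAvailable], elastic
}

func main() {
	tasks := []*Task{{Name: "t0", GPUs: 1}, {Name: "t1", GPUs: 1}, {Name: "t2", GPUs: 1}}
	core, elastic := splitVictimTasks(tasks, 2) // MinAvailable = 2
	fmt.Printf("core=%d elastic=%d\n", len(core), len(elastic)) // core=2 elastic=1
}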
