diff --git a/pkg/scheduler/api/podgroup_info/job_info.go b/pkg/scheduler/api/podgroup_info/job_info.go
index b290b8224..2269667e3 100644
--- a/pkg/scheduler/api/podgroup_info/job_info.go
+++ b/pkg/scheduler/api/podgroup_info/job_info.go
@@ -407,7 +407,7 @@ func (pgi *PodGroupInfo) IsStale() bool {
 		return true
 	}
 	for _, subGroup := range pgi.GetActiveSubGroupInfos() {
-		if !subGroup.IsGangSatisfied() {
+		if subGroup.IsStale() {
 			return true
 		}
 	}
@@ -416,8 +416,8 @@
 }
 
 func (pgi *PodGroupInfo) IsGangSatisfied() bool {
-	numActiveTasks := pgi.GetNumActiveUsedTasks()
-	if numActiveTasks < int(pgi.GetDefaultMinAvailable()) {
+	numActiveAllocatedTasks := pgi.GetActiveAllocatedTasksCount()
+	if numActiveAllocatedTasks < int(pgi.GetDefaultMinAvailable()) {
 		return false
 	}
 	for _, subGroup := range pgi.SubGroups {
diff --git a/pkg/scheduler/api/podgroup_info/subgroup_info.go b/pkg/scheduler/api/podgroup_info/subgroup_info.go
index 2314f3d12..7e6004ab3 100644
--- a/pkg/scheduler/api/podgroup_info/subgroup_info.go
+++ b/pkg/scheduler/api/podgroup_info/subgroup_info.go
@@ -113,10 +113,15 @@ func (sgi *SubGroupInfo) IsReadyForScheduling() bool {
 }
 
 func (sgi *SubGroupInfo) IsGangSatisfied() bool {
-	numActiveTasks := sgi.GetNumActiveUsedTasks()
+	numActiveTasks := sgi.GetNumActiveAllocatedTasks()
 	return numActiveTasks >= int(sgi.minAvailable)
 }
 
+func (sgi *SubGroupInfo) IsStale() bool {
+	numActiveUsedTasks := sgi.GetNumActiveUsedTasks()
+	return numActiveUsedTasks < int(sgi.minAvailable)
+}
+
 func (sgi *SubGroupInfo) GetNumActiveAllocatedTasks() int {
 	return sgi.numActiveAllocatedTasks
 }
diff --git a/pkg/scheduler/api/podgroup_info/subgroup_info_test.go b/pkg/scheduler/api/podgroup_info/subgroup_info_test.go
index e9f621d48..682ae69e1 100644
--- a/pkg/scheduler/api/podgroup_info/subgroup_info_test.go
+++ b/pkg/scheduler/api/podgroup_info/subgroup_info_test.go
@@ -254,6 +254,77 @@ func TestIsGangSatisfied(t *testing.T) {
 	}
 }
 
+func TestIsStale(t *testing.T) {
+	tests := []struct {
+		name         string
+		minAvailable int32
+		pods         []*pod_info.PodInfo
+		expected     bool
+	}{
+		{
+			name:         "not stale when used >= minAvailable",
+			minAvailable: 2,
+			pods: []*pod_info.PodInfo{
+				{UID: "1", Status: pod_status.Running},
+				{UID: "2", Status: pod_status.Pipelined},
+			},
+			expected: false,
+		},
+		{
+			name:         "not stale with releasing status counted as used",
+			minAvailable: 2,
+			pods: []*pod_info.PodInfo{
+				{UID: "1", Status: pod_status.Running},
+				{UID: "2", Status: pod_status.Releasing},
+			},
+			expected: false,
+		},
+		{
+			name:         "stale when used < minAvailable",
+			minAvailable: 3,
+			pods: []*pod_info.PodInfo{
+				{UID: "1", Status: pod_status.Running},
+				{UID: "2", Status: pod_status.Releasing},
+			},
+			expected: true,
+		},
+		{
+			name:         "stale when no used pods",
+			minAvailable: 1,
+			pods:         []*pod_info.PodInfo{},
+			expected:     true,
+		},
+		{
+			name:         "not stale when used equals minAvailable",
+			minAvailable: 1,
+			pods: []*pod_info.PodInfo{
+				{UID: "1", Status: pod_status.Releasing},
+			},
+			expected: false,
+		},
+		{
+			name:         "stale with only non-used statuses",
+			minAvailable: 1,
+			pods: []*pod_info.PodInfo{
+				{UID: "1", Status: pod_status.Gated},
+			},
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			sgi := NewSubGroupInfo("test", tt.minAvailable)
+			for _, pod := range tt.pods {
+				sgi.AssignTask(pod)
+			}
+			if got := sgi.IsStale(); got != tt.expected {
+				t.Errorf("IsStale() = %v, want %v", got, tt.expected)
+			}
+		})
+	}
+}
+
 func TestGetNumAliveTasks(t *testing.T) {
 	sgi := NewSubGroupInfo("test", 2)
 	pods := []*pod_info.PodInfo{
diff --git a/pkg/scheduler/cache/status_updater/default_status_updater.go b/pkg/scheduler/cache/status_updater/default_status_updater.go
index 6e8e137e2..9c476bed9 100644
--- a/pkg/scheduler/cache/status_updater/default_status_updater.go
+++ b/pkg/scheduler/cache/status_updater/default_status_updater.go
@@ -247,7 +247,7 @@ func (su *defaultStatusUpdater) recordStaleJobEvent(job *podgroup_info.PodGroupI
 		job.GetNumActiveUsedTasks(), job.GetDefaultMinAvailable())
 	for _, subGroup := range job.GetActiveSubGroupInfos() {
-		if !subGroup.IsGangSatisfied() {
+		if subGroup.IsStale() {
 			message += fmt.Sprintf(", subGroup %s minMember is %d and %d pods are active",
 				subGroup.GetName(), subGroup.GetMinAvailable(), subGroup.GetNumActiveUsedTasks())
 		}