Skip to content

Commit 305ae3c

Browse files
authored
Merge branch 'main' into erez/move-crds-into-chart
2 parents 3eb0f4a + 3c1e2c9 commit 305ae3c

File tree

3 files changed

+392
-96
lines changed

3 files changed

+392
-96
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
99
### Changed
1010
- Moved the CRDs into the helm chart so that they are also installed by helm and not only by the crd-upgrader, but removed the external kueue clone of topology CRD from being automatically installed.
1111

12+
### Fixed
13+
- Fixed a bug where workload with subgroups would not consider additional tasks above minAvailable
14+
1215
## [v0.8.3] - 20250-8-31
1316

1417
### Removed

pkg/scheduler/api/podgroup_info/allocation_info.go

Lines changed: 41 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -32,35 +32,26 @@ func GetTasksToAllocate(
3232
}
3333

3434
var tasksToAllocate []*pod_info.PodInfo
35-
priorityQueueMap := getTasksPriorityQueuePerSubGroup(podGroupInfo, taskOrderFn, isRealAllocation)
36-
maxNumOfTasksToAllocateMap := getNumTasksToAllocatePerSubGroup(podGroupInfo, isRealAllocation)
37-
3835
subGroupPriorityQueue := getSubGroupsPriorityQueue(podGroupInfo.GetSubGroups(), subGroupOrderFn)
39-
maxNumOfSubGroups := getNumOfSubGroupsToAllocate(podGroupInfo)
40-
numAllocatedSubGroups := 0
36+
maxNumSubGroups := getMaxNumSubGroupsToAllocate(podGroupInfo)
37+
numSubGroupsToAllocate := 0
4138

42-
for !subGroupPriorityQueue.Empty() && (numAllocatedSubGroups < maxNumOfSubGroups) {
39+
for !subGroupPriorityQueue.Empty() && (numSubGroupsToAllocate < maxNumSubGroups) {
4340
nextSubGroup := subGroupPriorityQueue.Pop().(*SubGroupInfo)
44-
taskPriorityQueue := priorityQueueMap[nextSubGroup.GetName()]
45-
maxNumOfTasksToAllocate := maxNumOfTasksToAllocateMap[nextSubGroup.GetName()]
41+
taskPriorityQueue := getTasksPriorityQueue(nextSubGroup, taskOrderFn, isRealAllocation)
42+
if taskPriorityQueue.Empty() {
43+
continue
44+
}
45+
maxNumOfTasksToAllocate := getNumTasksToAllocate(nextSubGroup, isRealAllocation)
4646
subGroupTasks := getTasksFromQueue(taskPriorityQueue, maxNumOfTasksToAllocate)
4747
tasksToAllocate = append(tasksToAllocate, subGroupTasks...)
48-
numAllocatedSubGroups += 1
48+
numSubGroupsToAllocate += 1
4949
}
5050

5151
podGroupInfo.tasksToAllocate = tasksToAllocate
5252
return tasksToAllocate
5353
}
5454

55-
func getTasksFromQueue(priorityQueue *scheduler_util.PriorityQueue, maxNumTasks int) []*pod_info.PodInfo {
56-
var tasksToAllocate []*pod_info.PodInfo
57-
for !priorityQueue.Empty() && (len(tasksToAllocate) < maxNumTasks) {
58-
nextPod := priorityQueue.Pop().(*pod_info.PodInfo)
59-
tasksToAllocate = append(tasksToAllocate, nextPod)
60-
}
61-
return tasksToAllocate
62-
}
63-
6455
func GetTasksToAllocateRequestedGPUs(
6556
podGroupInfo *PodGroupInfo, subGroupOrderFn common_info.LessFn, taskOrderFn common_info.LessFn,
6657
isRealAllocation bool,
@@ -107,20 +98,25 @@ func GetTasksToAllocateInitResource(
10798
return tasksTotalRequestedResource
10899
}
109100

110-
func getTasksPriorityQueuePerSubGroup(
111-
podGroupInfo *PodGroupInfo, taskOrderFn common_info.LessFn, isRealAllocation bool,
112-
) map[string]*scheduler_util.PriorityQueue {
113-
priorityQueuesMap := map[string]*scheduler_util.PriorityQueue{}
114-
for name, subGroup := range podGroupInfo.GetSubGroups() {
115-
priorityQueue := scheduler_util.NewPriorityQueue(taskOrderFn, scheduler_util.QueueCapacityInfinite)
116-
for _, task := range subGroup.podInfos {
117-
if task.ShouldAllocate(isRealAllocation) {
118-
priorityQueue.Push(task)
119-
}
101+
func getTasksPriorityQueue(
102+
subGroup *SubGroupInfo, taskOrderFn common_info.LessFn, isRealAllocation bool,
103+
) *scheduler_util.PriorityQueue {
104+
priorityQueue := scheduler_util.NewPriorityQueue(taskOrderFn, scheduler_util.QueueCapacityInfinite)
105+
for _, task := range subGroup.podInfos {
106+
if task.ShouldAllocate(isRealAllocation) {
107+
priorityQueue.Push(task)
120108
}
121-
priorityQueuesMap[name] = priorityQueue
122109
}
123-
return priorityQueuesMap
110+
return priorityQueue
111+
}
112+
113+
func getTasksFromQueue(priorityQueue *scheduler_util.PriorityQueue, maxNumTasks int) []*pod_info.PodInfo {
114+
var tasksToAllocate []*pod_info.PodInfo
115+
for !priorityQueue.Empty() && (len(tasksToAllocate) < maxNumTasks) {
116+
nextPod := priorityQueue.Pop().(*pod_info.PodInfo)
117+
tasksToAllocate = append(tasksToAllocate, nextPod)
118+
}
119+
return tasksToAllocate
124120
}
125121

126122
func getSubGroupsPriorityQueue(subGroups map[string]*SubGroupInfo,
@@ -132,18 +128,14 @@ func getSubGroupsPriorityQueue(subGroups map[string]*SubGroupInfo,
132128
return priorityQueue
133129
}
134130

135-
func getNumTasksToAllocatePerSubGroup(podGroupInfo *PodGroupInfo, isRealAllocation bool) map[string]int {
136-
maxTasksToAllocate := map[string]int{}
137-
for name, subGroup := range podGroupInfo.GetSubGroups() {
138-
numAllocatedTasks := subGroup.GetNumActiveAllocatedTasks()
139-
if numAllocatedTasks >= int(subGroup.minAvailable) {
140-
numTasksToAllocate := getNumAllocatableTasks(subGroup, isRealAllocation)
141-
maxTasksToAllocate[name] = int(math.Min(float64(numTasksToAllocate), 1))
142-
} else {
143-
maxTasksToAllocate[name] = int(subGroup.minAvailable) - numAllocatedTasks
144-
}
131+
func getNumTasksToAllocate(subGroup *SubGroupInfo, isRealAllocation bool) int {
132+
numAllocatedTasks := subGroup.GetNumActiveAllocatedTasks()
133+
if numAllocatedTasks >= int(subGroup.minAvailable) {
134+
numTasksToAllocate := getNumAllocatableTasks(subGroup, isRealAllocation)
135+
return int(math.Min(float64(numTasksToAllocate), 1))
136+
} else {
137+
return int(subGroup.minAvailable) - numAllocatedTasks
145138
}
146-
return maxTasksToAllocate
147139
}
148140

149141
func getNumAllocatableTasks(subGroup *SubGroupInfo, isRealAllocation bool) int {
@@ -156,12 +148,16 @@ func getNumAllocatableTasks(subGroup *SubGroupInfo, isRealAllocation bool) int {
156148
return numTasksToAllocate
157149
}
158150

159-
func getNumOfSubGroupsToAllocate(podGroupInfo *PodGroupInfo) int {
151+
func getMaxNumSubGroupsToAllocate(podGroupInfo *PodGroupInfo) int {
152+
numUnsatisfied := 0
160153
for _, subGroup := range podGroupInfo.GetSubGroups() {
161154
allocatedTasks := subGroup.GetNumActiveAllocatedTasks()
162-
if allocatedTasks >= int(subGroup.GetMinAvailable()) {
163-
return 1
155+
if allocatedTasks < int(subGroup.GetMinAvailable()) {
156+
numUnsatisfied += 1
164157
}
165158
}
166-
return len(podGroupInfo.SubGroups)
159+
if numUnsatisfied > 0 {
160+
return numUnsatisfied
161+
}
162+
return 1
167163
}

0 commit comments

Comments
 (0)