-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Feat: partitionable devices support #8559
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
MenD32
wants to merge
19
commits into
kubernetes:master
Choose a base branch
from
MenD32:feat/partitionable-devices-support
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+474
−5
Open
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
9dc8e0f
feat: added consumedCounters into pool util calculation
MenD32 9a450a1
feat: added consumedCounters into pool util calculation
MenD32 ff1d30f
tests: added test to check partionable devices are calculated correctly
MenD32 d0f230e
tests: added test to check partionable devices are calculated correctly
MenD32 e92825e
tests: added test to check partionable devices are calculated correctly
MenD32 862482f
Merge branch 'master' into feat/partitionable-devices-support
MenD32 bb5c8d9
fix: bumped crypto and net
MenD32 f9f397a
Merge branch 'master' into feat/partitionable-devices-support
MenD32 4453bf2
Add Capacity Buffer controller logic
abdelrahman882 6e4f48b
feat: added flag to set deletion candidate taint TTL
MenD32 85a0d94
Add rapid release channel to GKE cluster creation command
laoj2 b09676c
change kwok nodegroup annotation key
drmorr0 212869b
merge: merging from main
MenD32 4fa5202
feat: added flag to set deletion candidate taint TTL
MenD32 9f2c8db
Merge branch 'master' into feat/partitionable-devices-support
MenD32 3956443
fix: updated resourceapi to v1
MenD32 c61b8ad
feat: Partionable Devices Support
MenD32 c2633bf
fix: added weighting to summation in order to consider a mix of parti…
MenD32 2ed4602
fix: added weighting to summation in order to consider a mix of parti…
MenD32 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import ( | |
|
|
||
| v1 "k8s.io/api/core/v1" | ||
| resourceapi "k8s.io/api/resource/v1" | ||
| "k8s.io/apimachinery/pkg/api/resource" | ||
| "k8s.io/autoscaler/cluster-autoscaler/simulator/framework" | ||
| ) | ||
|
|
||
|
|
@@ -43,7 +44,7 @@ func CalculateDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (map[stri | |
| poolDevices := getAllDevices(currentSlices) | ||
| allocatedDeviceNames := allocatedDevices[driverName][poolName] | ||
| unallocated, allocated := splitDevicesByAllocation(poolDevices, allocatedDeviceNames) | ||
| result[driverName][poolName] = calculatePoolUtil(unallocated, allocated) | ||
| result[driverName][poolName] = calculatePoolUtil(unallocated, allocated, currentSlices) | ||
| } | ||
| } | ||
| return result, nil | ||
|
|
@@ -69,10 +70,116 @@ func HighestDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (v1.Resourc | |
| return highestResourceName, highestUtil, nil | ||
| } | ||
|
|
||
| func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 { | ||
| numAllocated := float64(len(allocated)) | ||
| numUnallocated := float64(len(unallocated)) | ||
| return numAllocated / (numAllocated + numUnallocated) | ||
| func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlices []*resourceapi.ResourceSlice) float64 { | ||
| TotalConsumedCounters := map[string]map[string]resource.Quantity{} | ||
| for _, resourceSlice := range resourceSlices { | ||
| for _, sharedCounter := range resourceSlice.Spec.SharedCounters { | ||
| if _, ok := TotalConsumedCounters[sharedCounter.Name]; !ok { | ||
| TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{} | ||
| } | ||
| for counter, value := range sharedCounter.Counters { | ||
| TotalConsumedCounters[sharedCounter.Name][counter] = value.Value | ||
| } | ||
| } | ||
| } | ||
| allocatedConsumedCounters := calculateConsumedCounters(allocated) | ||
|
|
||
| // not all devices are partitionable, so fallback to the ratio of non-partionable devices | ||
| allocatedDevicesWithoutCounters := 0 | ||
| devicesWithoutCounters := 0 | ||
|
|
||
| for _, device := range allocated { | ||
| if device.ConsumesCounters == nil { | ||
| devicesWithoutCounters++ | ||
| allocatedDevicesWithoutCounters++ | ||
| } | ||
| } | ||
| for _, device := range unallocated { | ||
| if device.ConsumesCounters == nil { | ||
| devicesWithoutCounters++ | ||
| } | ||
| } | ||
|
|
||
| // we want to find the counter that is most utilized, since it is the "bottleneck" of the pool | ||
| var partitionableUtilization float64 = 0 | ||
| var atomicDevicesUtilization float64 = 0 | ||
| if devicesWithoutCounters != 0 { | ||
| atomicDevicesUtilization = float64(allocatedDevicesWithoutCounters) / float64(devicesWithoutCounters) | ||
| } | ||
| if len(TotalConsumedCounters) == 0 { | ||
| return atomicDevicesUtilization | ||
| } | ||
| for counterSet, counters := range TotalConsumedCounters { | ||
| for counterName, totalValue := range counters { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: is this easier to follow? (rather then checking for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, this is wayyy cleaner. I'll change it |
||
| if totalValue.IsZero() { | ||
| continue | ||
| } | ||
| if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists { | ||
| if allocatedValue, exists := allocatedSet[counterName]; exists { | ||
| utilization := float64(allocatedValue.Value()) / float64(totalValue.Value()) | ||
| if utilization > partitionableUtilization { | ||
| partitionableUtilization = utilization | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| var uniquePartitionableDevicesCount float64 = float64(getUniquePartitionableDevicesCount(allocated)) | ||
| var totalUniqueDevices float64 = uniquePartitionableDevicesCount + float64(devicesWithoutCounters) | ||
| var partitionableDevicesUtilizationWeight float64 = uniquePartitionableDevicesCount / totalUniqueDevices | ||
| var nonPartitionableDevicesUtilizationWeight float64 = 1 - partitionableDevicesUtilizationWeight | ||
| // when a pool has both atomic and partitionable devices, we sum their utilizations since they are mutually exclusive | ||
| return partitionableUtilization*partitionableDevicesUtilizationWeight + atomicDevicesUtilization*nonPartitionableDevicesUtilizationWeight | ||
| } | ||
|
|
||
| // calculateConsumedCounters calculates the total counters consumed by a list of devices | ||
| func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity { | ||
| countersConsumed := map[string]map[string]resource.Quantity{} | ||
| for _, device := range devices { | ||
| if device.ConsumesCounters == nil { | ||
| continue | ||
| } | ||
| for _, consumedCounter := range device.ConsumesCounters { | ||
| if _, ok := countersConsumed[consumedCounter.CounterSet]; !ok { | ||
| countersConsumed[consumedCounter.CounterSet] = map[string]resource.Quantity{} | ||
| } | ||
| for counter, value := range consumedCounter.Counters { | ||
| if _, ok := countersConsumed[consumedCounter.CounterSet][counter]; !ok { | ||
| countersConsumed[consumedCounter.CounterSet][counter] = resource.Quantity{} | ||
| } | ||
| v := countersConsumed[consumedCounter.CounterSet][counter] | ||
| v.Add(value.Value) | ||
| countersConsumed[consumedCounter.CounterSet][counter] = v | ||
| } | ||
| } | ||
| } | ||
| return countersConsumed | ||
| } | ||
|
|
||
| // getUniquePartitionableDevicesCount returns the count of unique partitionable devices in the provided list. | ||
| // a partitionable device can be represented by multiple devices with different names and properties, and for utilization purposes we'd like to count the hardware and not the software. | ||
| func getUniquePartitionableDevicesCount(devices []resourceapi.Device) int { | ||
| var deviceCount int = 0 | ||
| var counted bool | ||
| consumedCounters := map[string]bool{} | ||
| for _, device := range devices { | ||
| // the assumption here is that a partitionable device will consume the actual resources from the hardware, which will be represented by consumedCounters. | ||
| // if a device consumes multiple counters of the same device, we count them both at the same time in order to not "overcount" devices with multiple counters, the assumption here is that a device will always consume some of every resource in a device. (f.e. a GPU DRA request cannot use VRAM without using GPU cycles and vice versa) | ||
| if device.ConsumesCounters != nil { | ||
| counted = false | ||
| for _, consumedCounter := range device.ConsumesCounters { | ||
| if _, exists := consumedCounters[consumedCounter.CounterSet]; !exists { | ||
| consumedCounters[consumedCounter.CounterSet] = true | ||
| } else { | ||
| counted = true | ||
| } | ||
| } | ||
| if !counted { | ||
| deviceCount++ | ||
| } | ||
| } | ||
| } | ||
| return deviceCount | ||
| } | ||
|
|
||
| func splitDevicesByAllocation(devices []resourceapi.Device, allocatedNames []string) (unallocated, allocated []resourceapi.Device) { | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a chance that more than one resource slice from the passed in
resourceSliceswill have a counterCounterSetwith the same name (Nameproperty)? That would be the only reason to check for existence before initializingTotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}. Also, if that's true, are we confident that they won't have any collisions with any of the names of theCountersin theirCounters map[string]Counter? Otherwise we're overwriting them below.tl;dr we may be able to simplify this and simply assign
TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}without having to first check if it's already there, or if not, there may be more checks.I did this and UT still pass:
$ git diff diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go index c717fdfd6..98f7480a6 100644 --- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go +++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go @@ -74,9 +74,7 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlic TotalConsumedCounters := map[string]map[string]resource.Quantity{} for _, resourceSlice := range resourceSlices { for _, sharedCounter := range resourceSlice.Spec.SharedCounters { - if _, ok := TotalConsumedCounters[sharedCounter.Name]; !ok { - TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{} - } + TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{} for counter, value := range sharedCounter.Counters { TotalConsumedCounters[sharedCounter.Name][counter] = value.Value }Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My impression from the KEP is that there shouldn't be any collisions in counterset names, since these are unique within a resource pool.
I got the impression that there could be a collision of the same sharedcounter from 2 different resource pools, but this would be high improbable since it'd imply that the same exact device (same device ID) appears in multiple resource pools.
Since this code is within a pool's scope, I think I'll simplify it the way you suggested