Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9dc8e0f
feat: added consumedCounters into pool util calculation
MenD32 May 23, 2025
9a450a1
feat: added consumedCounters into pool util calculation
MenD32 May 23, 2025
ff1d30f
tests: added test to check partitionable devices are calculated correctly
MenD32 May 23, 2025
d0f230e
tests: added test to check partitionable devices are calculated correctly
MenD32 May 23, 2025
e92825e
tests: added test to check partitionable devices are calculated correctly
MenD32 May 23, 2025
862482f
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Jun 4, 2025
bb5c8d9
fix: bumped crypto and net
MenD32 Jun 10, 2025
f9f397a
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Jun 11, 2025
4453bf2
Add Capacity Buffer controller logic
abdelrahman882 Sep 11, 2025
6e4f48b
feat: added flag to set deletion candidate taint TTL
MenD32 Sep 12, 2025
85a0d94
Add rapid release channel to GKE cluster creation command
laoj2 Sep 17, 2025
b09676c
change kwok nodegroup annotation key
drmorr0 Sep 17, 2025
212869b
merge: merging from main
MenD32 Sep 22, 2025
4fa5202
feat: added flag to set deletion candidate taint TTL
MenD32 Sep 12, 2025
9f2c8db
Merge branch 'master' into feat/partitionable-devices-support
MenD32 Sep 22, 2025
3956443
fix: updated resourceapi to v1
MenD32 Oct 2, 2025
c61b8ad
feat: Partitionable Devices Support
MenD32 Nov 7, 2025
c2633bf
fix: added weighting to summation in order to consider a mix of parti…
MenD32 Nov 21, 2025
2ed4602
fix: added weighting to summation in order to consider a mix of parti…
MenD32 Nov 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 112 additions & 5 deletions cluster-autoscaler/simulator/dynamicresources/utils/utilization.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
)

Expand All @@ -43,7 +44,7 @@ func CalculateDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (map[stri
poolDevices := getAllDevices(currentSlices)
allocatedDeviceNames := allocatedDevices[driverName][poolName]
unallocated, allocated := splitDevicesByAllocation(poolDevices, allocatedDeviceNames)
result[driverName][poolName] = calculatePoolUtil(unallocated, allocated)
result[driverName][poolName] = calculatePoolUtil(unallocated, allocated, currentSlices)
}
}
return result, nil
Expand All @@ -69,10 +70,116 @@ func HighestDynamicResourceUtilization(nodeInfo *framework.NodeInfo) (v1.Resourc
return highestResourceName, highestUtil, nil
}

func calculatePoolUtil(unallocated, allocated []resourceapi.Device) float64 {
numAllocated := float64(len(allocated))
numUnallocated := float64(len(unallocated))
return numAllocated / (numAllocated + numUnallocated)
func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlices []*resourceapi.ResourceSlice) float64 {
TotalConsumedCounters := map[string]map[string]resource.Quantity{}
for _, resourceSlice := range resourceSlices {
for _, sharedCounter := range resourceSlice.Spec.SharedCounters {
if _, ok := TotalConsumedCounters[sharedCounter.Name]; !ok {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a chance that more than one resource slice from the passed in resourceSlices will have a counter CounterSet with the same name (Name property)? That would be the only reason to check for existence before initializing TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}. Also, if that's true, are we confident that they won't have any collisions with any of the names of the Counters in their Counters map[string]Counter? Otherwise we're overwriting them below.

tl;dr we may be able to simplify this and simply assign TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{} without having to first check if it's already there, or if not, there may be more checks.

I did this and UT still pass:

$ git diff
diff --git a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go
index c717fdfd6..98f7480a6 100644
--- a/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go
+++ b/cluster-autoscaler/simulator/dynamicresources/utils/utilization.go
@@ -74,9 +74,7 @@ func calculatePoolUtil(unallocated, allocated []resourceapi.Device, resourceSlic
        TotalConsumedCounters := map[string]map[string]resource.Quantity{}
        for _, resourceSlice := range resourceSlices {
                for _, sharedCounter := range resourceSlice.Spec.SharedCounters {
-                       if _, ok := TotalConsumedCounters[sharedCounter.Name]; !ok {
-                               TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}
-                       }
+                       TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}
                        for counter, value := range sharedCounter.Counters {
                                TotalConsumedCounters[sharedCounter.Name][counter] = value.Value
                        }

Copy link
Contributor Author

@MenD32 MenD32 Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My impression from the KEP is that there shouldn't be any collisions in counterset names, since these are unique within a resource pool.

I got the impression that there could be a collision of the same sharedcounter from 2 different resource pools, but this would be highly improbable since it'd imply that the same exact device (same device ID) appears in multiple resource pools.

Since this code is within a pool's scope, I think I'll simplify it the way you suggested

TotalConsumedCounters[sharedCounter.Name] = map[string]resource.Quantity{}
}
for counter, value := range sharedCounter.Counters {
TotalConsumedCounters[sharedCounter.Name][counter] = value.Value
}
}
}
allocatedConsumedCounters := calculateConsumedCounters(allocated)

// not all devices are partitionable, so fallback to the ratio of non-partionable devices
allocatedDevicesWithoutCounters := 0
devicesWithoutCounters := 0

for _, device := range allocated {
if device.ConsumesCounters == nil {
devicesWithoutCounters++
allocatedDevicesWithoutCounters++
}
}
for _, device := range unallocated {
if device.ConsumesCounters == nil {
devicesWithoutCounters++
}
}

// we want to find the counter that is most utilized, since it is the "bottleneck" of the pool
var partitionableUtilization float64 = 0
var atomicDevicesUtilization float64 = 0
if devicesWithoutCounters != 0 {
atomicDevicesUtilization = float64(allocatedDevicesWithoutCounters) / float64(devicesWithoutCounters)
}
if len(TotalConsumedCounters) == 0 {
return atomicDevicesUtilization
}
for counterSet, counters := range TotalConsumedCounters {
for counterName, totalValue := range counters {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: is this easier to follow?

			if totalValue.IsZero() {
				continue
			}

(rather than checking for !totalValue.IsZero() two nested iterations later)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, this is wayyy cleaner. I'll change it

if totalValue.IsZero() {
continue
}
if allocatedSet, exists := allocatedConsumedCounters[counterSet]; exists {
if allocatedValue, exists := allocatedSet[counterName]; exists {
utilization := float64(allocatedValue.Value()) / float64(totalValue.Value())
if utilization > partitionableUtilization {
partitionableUtilization = utilization
}
}
}
}
}
var uniquePartitionableDevicesCount float64 = float64(getUniquePartitionableDevicesCount(allocated))
var totalUniqueDevices float64 = uniquePartitionableDevicesCount + float64(devicesWithoutCounters)
var partitionableDevicesUtilizationWeight float64 = uniquePartitionableDevicesCount / totalUniqueDevices
var nonPartitionableDevicesUtilizationWeight float64 = 1 - partitionableDevicesUtilizationWeight
// when a pool has both atomic and partitionable devices, we sum their utilizations since they are mutually exclusive
return partitionableUtilization*partitionableDevicesUtilizationWeight + atomicDevicesUtilization*nonPartitionableDevicesUtilizationWeight
}

// calculateConsumedCounters sums, per counter set and counter name, the
// counters consumed by the given devices. Devices that do not consume
// counters are ignored.
func calculateConsumedCounters(devices []resourceapi.Device) map[string]map[string]resource.Quantity {
	totals := map[string]map[string]resource.Quantity{}
	for _, device := range devices {
		// Ranging over a nil ConsumesCounters slice is a no-op, so devices
		// without counters are skipped naturally.
		for _, consumed := range device.ConsumesCounters {
			set, ok := totals[consumed.CounterSet]
			if !ok {
				set = map[string]resource.Quantity{}
				totals[consumed.CounterSet] = set
			}
			for counter, value := range consumed.Counters {
				// A missing key yields the zero Quantity, which Add handles.
				quantity := set[counter]
				quantity.Add(value.Value)
				set[counter] = quantity
			}
		}
	}
	return totals
}

// getUniquePartitionableDevicesCount returns the count of unique partitionable devices in the provided list.
// A partitionable device can be represented by multiple devices with different names and properties,
// and for utilization purposes we'd like to count the hardware and not the software.
func getUniquePartitionableDevicesCount(devices []resourceapi.Device) int {
	// The assumption here is that a partitionable device consumes the actual
	// resources of the hardware, represented by its consumed counter sets. A
	// device that consumes multiple counter sets is counted only once so we do
	// not "overcount" hardware; the assumption is that a device always consumes
	// some of every resource of its hardware (e.g. a GPU DRA request cannot use
	// VRAM without using GPU cycles and vice versa).
	seenCounterSets := map[string]bool{}
	count := 0
	for _, device := range devices {
		if device.ConsumesCounters == nil {
			continue
		}
		isNew := true
		for _, consumed := range device.ConsumesCounters {
			if seenCounterSets[consumed.CounterSet] {
				isNew = false
			} else {
				seenCounterSets[consumed.CounterSet] = true
			}
		}
		if isNew {
			count++
		}
	}
	return count
}

func splitDevicesByAllocation(devices []resourceapi.Device, allocatedNames []string) (unallocated, allocated []resourceapi.Device) {
Expand Down
Loading