Skip to content

Commit d4ffdad

Browse files
authored
feat: cpu resource limits removed during boost (#59)
1 parent 472be27 commit d4ffdad

File tree

8 files changed

+109
-39
lines changed

8 files changed

+109
-39
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Note: this is not an officially supported Google product.
2424
* [[Boost resources] fixed target](#boost-resources-fixed-target)
2525
* [[Boost duration] fixed time](#boost-duration-fixed-time)
2626
* [[Boost duration] POD condition](#boost-duration-pod-condition)
27+
* [Configuration](#configuration)
2728
* [License](#license)
2829

2930
## Description
@@ -35,7 +36,9 @@ The Kube Startup CPU Boost leverages [In-place Resource Resize for Kubernetes Po
3536
feature introduced in Kubernetes 1.27. It allows reverting the workload's CPU resource requests and limits
3637
back to their original values without the need to recreate the Pods.
3738

38-
The increase of resources is achieved by Mutating Admission Webhook.
39+
The increase of resources is achieved by a Mutating Admission Webhook. By default, the webhook also
40+
removes CPU resource limits if present. The original resource values are restored by the operator after a given
41+
period of time or when the POD condition is met.
3942

4043
## Installation
4144

@@ -203,6 +206,23 @@ Define the POD condition, the resource boost effect will last until the conditio
203206
status: "True"
204207
```
205208

209+
## Configuration
210+
211+
The Kube Startup CPU Boost operator can be configured with environment variables.
212+
213+
| Variable | Type | Default | Description |
214+
| --- | --- | --- | --- |
215+
| `POD_NAMESPACE` | `string` | `kube-startup-cpu-boost-system` | Kube Startup CPU Boost operator namespace |
216+
| `MGR_CHECK_INTERVAL` | `int` | `5` | Duration in seconds between boost manager checks for time-based boost duration policy |
217+
| `LEADER_ELECTION` | `bool` | `false` | Enables leader election for controller manager |
218+
| `METRICS_PROBE_BIND_ADDR` | `string` | `:8080` | Address the metrics endpoint binds to |
219+
| `HEALTH_PROBE_BIND_ADDR` | `string` | `:8081` | Address the health probe endpoint binds to |
220+
| `SECURE_METRICS` | `bool` | `false` | Determines if the metrics endpoint is served securely |
221+
| `ZAP_LOG_LEVEL` | `int` | `0` | Log level for ZAP logger |
222+
| `ZAP_DEVELOPMENT` | `bool` | `false` | Enables development mode for ZAP logger |
223+
| `HTTP2` | `bool` | `false` | Determines if the HTTP/2 protocol is used for webhook and metrics servers |
224+
| `REMOVE_LIMITS` | `bool` | `true` | Enables the operator to remove container CPU limits during the boost time |
225+
206226
## License
207227

208228
[Apache License 2.0](LICENSE)

cmd/main.go

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -87,17 +87,6 @@ func main() {
8787
HealthProbeBindAddress: cfg.HealthProbeBindAddr,
8888
LeaderElection: cfg.LeaderElection,
8989
LeaderElectionID: leaderElectionID,
90-
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
91-
// when the Manager ends. This requires the binary to immediately end when the
92-
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
93-
// speeds up voluntary leader transitions as the new leader don't have to wait
94-
// LeaseDuration time first.
95-
//
96-
// In the default scaffold provided, the program ends immediately after
97-
// the manager stops, so would be fine to enable this option. However,
98-
// if you are doing or is intended to do any operation such as perform cleanups
99-
// after the manager stops then its usage might be unsafe.
100-
// LeaderElectionReleaseOnCancel: true,
10190
})
10291
if err != nil {
10392
setupLog.Error(err, "unable to start manager")
@@ -111,7 +100,7 @@ func main() {
111100
}
112101

113102
boostMgr := boost.NewManager(mgr.GetClient())
114-
go setupControllers(mgr, boostMgr, certsReady)
103+
go setupControllers(mgr, boostMgr, cfg, certsReady)
115104

116105
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
117106
setupLog.Error(err, "unable to set up health check")
@@ -131,7 +120,7 @@ func main() {
131120
}
132121
}
133122

134-
func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan struct{}) {
123+
func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, cfg *config.Config, certsReady chan struct{}) {
135124
setupLog.Info("Waiting for certificate generation to complete")
136125
<-certsReady
137126
setupLog.Info("Certificate generation has completed")
@@ -140,7 +129,7 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan
140129
setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook)
141130
os.Exit(1)
142131
}
143-
cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme)
132+
cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme, cfg.RemoveLimits)
144133
mgr.GetWebhookServer().Register("/mutate-v1-pod", cpuBoostWebHook)
145134
boostCtrl := &controller.StartupCPUBoostReconciler{
146135
Client: mgr.GetClient(),
@@ -153,11 +142,5 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan
153142
setupLog.Error(err, "unable to create controller", "controller", "StartupCPUBoost")
154143
os.Exit(1)
155144
}
156-
/*
157-
if err = (&autoscalingv1alpha1.StartupCPUBoost{}).SetupWebhookWithManager(mgr); err != nil {
158-
setupLog.Error(err, "unable to create webhook", "webhook", "StartupCPUBoost")
159-
os.Exit(1)
160-
}
161-
*/
162145
//+kubebuilder:scaffold:builder
163146
}

internal/config/config.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const (
2525
ZapLogLevelDefault = 0 // zapcore.InfoLevel
2626
ZapDevelopmentDefault = false
2727
HTTP2Default = false
28+
RemoveLimitsDefault = true
2829
)
2930

3031
// ConfigProvider provides the Kube Startup CPU Boost configuration
@@ -42,9 +43,9 @@ type Config struct {
4243
// LeaderElection enables leader election for controller manager
4344
// Enabling this will ensure there is only one active controller manager
4445
LeaderElection bool
45-
// MetricsProbeBindAddr is the address the metric endpoint binds to
46+
// MetricsProbeBindAddr is the address the metrics endpoint binds to
4647
MetricsProbeBindAddr string
47-
// HeathProbeBindAddr is the address the probe endpoint binds to
48+
// HealthProbeBindAddr is the address the health probe endpoint binds to
4849
HealthProbeBindAddr string
4950
// SecureMetrics determines if the metrics endpoint is served securely
5051
SecureMetrics bool
@@ -54,6 +55,8 @@ type Config struct {
5455
ZapDevelopment bool
5556
// HTTP2 determines if the HTTP/2 protocol is used for webhook and metrics servers
5657
HTTP2 bool
58+
// RemoveLimits determines if CPU resource limits should be removed during boost
59+
RemoveLimits bool
5760
}
5861

5962
// LoadDefaults loads the default configuration values
@@ -67,4 +70,5 @@ func (c *Config) LoadDefaults() {
6770
c.ZapLogLevel = ZapLogLevelDefault
6871
c.ZapDevelopment = ZapDevelopmentDefault
6972
c.HTTP2 = HTTP2Default
73+
c.RemoveLimits = RemoveLimitsDefault
7074
}

internal/config/config_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,14 @@ var _ = Describe("Config", func() {
5050
It("has valid ZAP log level", func() {
5151
Expect(cfg.ZapLogLevel).To(Equal(config.ZapLogLevelDefault))
5252
})
53-
It("has valid ZAP development ", func() {
53+
It("has valid ZAP development", func() {
5454
Expect(cfg.ZapDevelopment).To(Equal(config.ZapDevelopmentDefault))
5555
})
56-
It("has valid HTTP2 ", func() {
56+
It("has valid HTTP2", func() {
5757
Expect(cfg.HTTP2).To(Equal(config.HTTP2Default))
5858
})
59+
It("has valid RemoveLimits", func() {
60+
Expect(cfg.RemoveLimits).To(Equal(config.RemoveLimitsDefault))
61+
})
5962
})
6063
})

internal/config/env_provider.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ const (
3030
ZapLogLevelEnvVar = "ZAP_LOG_LEVEL"
3131
ZapDevelopmentEnvVar = "ZAP_DEVELOPMENT"
3232
HTTP2EnvVar = "HTTP2"
33+
RemoveLimitsEnvVar = "REMOVE_LIMITS"
3334
)
3435

3536
type LookupEnvFunc func(key string) (string, bool)
@@ -57,6 +58,7 @@ func (p *EnvConfigProvider) LoadConfig() (*Config, error) {
5758
errs = p.loadZapLogLevel(&config, errs)
5859
errs = p.loadZapDevelopment(&config, errs)
5960
errs = p.loadHTTP2(&config, errs)
61+
errs = p.loadRemoveLimits(&config, errs)
6062
var err error
6163
if len(errs) > 0 {
6264
err = errors.Join(errs...)
@@ -142,7 +144,18 @@ func (p *EnvConfigProvider) loadHTTP2(config *Config, curErrs []error) (errs []e
142144
boolVal, err := strconv.ParseBool(v)
143145
config.HTTP2 = boolVal
144146
if err != nil {
145-
errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", LeaderElectionEnvVar, err))
147+
errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", HTTP2EnvVar, err))
148+
}
149+
}
150+
return
151+
}
152+
153+
func (p *EnvConfigProvider) loadRemoveLimits(config *Config, curErrs []error) (errs []error) {
154+
if v, ok := p.lookupFunc(RemoveLimitsEnvVar); ok {
155+
boolVal, err := strconv.ParseBool(v)
156+
config.RemoveLimits = boolVal
157+
if err != nil {
158+
errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", RemoveLimitsEnvVar, err))
146159
}
147160
}
148161
return

internal/config/env_provider_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,5 +131,13 @@ var _ = Describe("EnvProvider", func() {
131131
Expect(cfg.HTTP2).To(BeTrue())
132132
})
133133
})
134+
When("removeLimits variable is set", func() {
135+
BeforeEach(func() {
136+
lookupFuncMap[config.RemoveLimitsEnvVar] = "false"
137+
})
138+
It("has valid remove limits", func() {
139+
Expect(cfg.RemoveLimits).To(BeFalse())
140+
})
141+
})
134142
})
135143
})

internal/webhook/podcpuboost_webhook.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,17 @@ import (
3232
// +kubebuilder:webhook:path=/mutate-v1-pod,mutating=true,failurePolicy=ignore,sideEffects=None,timeoutSeconds=2,groups="",resources=pods,verbs=create,versions=v1,name=cpuboost.autoscaling.x-k8s.io,admissionReviewVersions=v1
3333

3434
type podCPUBoostHandler struct {
35-
decoder admission.Decoder
36-
manager boost.Manager
35+
decoder admission.Decoder
36+
manager boost.Manager
37+
removeLimits bool
3738
}
3839

39-
func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme) *webhook.Admission {
40+
func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme, removeLimits bool) *webhook.Admission {
4041
return &webhook.Admission{
4142
Handler: &podCPUBoostHandler{
42-
manager: mgr,
43-
decoder: admission.NewDecoder(scheme),
43+
manager: mgr,
44+
decoder: admission.NewDecoder(scheme),
45+
removeLimits: removeLimits,
4446
},
4547
}
4648
}
@@ -89,6 +91,9 @@ func (h *podCPUBoostHandler) boostContainerResources(ctx context.Context, b boos
8991
"newCpuRequests", resources.Requests.Cpu().String(),
9092
"newCpuLimits", resources.Limits.Cpu().String(),
9193
)
94+
if h.removeLimits {
95+
delete(resources.Limits, corev1.ResourceCPU)
96+
}
9297
pod.Spec.Containers[i].Resources = *resources
9398
log.Info("pod resources increased")
9499
}

internal/webhook/podcpuboost_webhook_test.go

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,12 @@ import (
4141
var _ = Describe("Pod CPU Boost Webhook", func() {
4242
Describe("Handles admission requests", func() {
4343
var (
44-
mockCtrl *gomock.Controller
45-
manager *mock.MockManager
46-
managerCall *gomock.Call
47-
pod *corev1.Pod
48-
response webhook.AdmissionResponse
44+
mockCtrl *gomock.Controller
45+
manager *mock.MockManager
46+
managerCall *gomock.Call
47+
pod *corev1.Pod
48+
response webhook.AdmissionResponse
49+
removeLimits bool
4950
)
5051
BeforeEach(func() {
5152
pod = podTemplate.DeepCopy()
@@ -72,7 +73,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
7273
},
7374
},
7475
}
75-
hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme)
76+
hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme, removeLimits)
7677
response = hook.Handle(context.TODO(), admissionReq)
7778
})
7879
When("there is no matching Startup CPU Boost", func() {
@@ -130,6 +131,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
130131
resPolicyCallOne = boost.EXPECT().ResourcePolicy(gomock.Eq(containerOneName)).Return(resPolicy, true)
131132
resPolicyCallTwo = boost.EXPECT().ResourcePolicy(gomock.Eq(containerTwoName)).Return(nil, false)
132133
managerCall.Return(boost, true)
134+
removeLimits = true
133135
})
134136
It("retrieves resource policy for containers", func() {
135137
resPolicyCallOne.Times(1)
@@ -162,10 +164,28 @@ var _ = Describe("Pod CPU Boost Webhook", func() {
162164
patch := containerResourcePatch(pod, resPolicy, "requests", 0)
163165
Expect(response.Patches).To(ContainElement(patch))
164166
})
165-
It("returns admission with container-one limits patch", func() {
166-
patch := containerResourcePatch(pod, resPolicy, "limits", 0)
167+
It("returns admission with container-one remove limits patch", func() {
168+
patch := containerRemoveRequirementPatch("limits", 0)
167169
Expect(response.Patches).To(ContainElement(patch))
168170
})
171+
When("container has memory limits set", func() {
172+
BeforeEach(func() {
173+
pod.Spec.Containers[0].Resources.Limits[corev1.ResourceMemory] = apiResource.MustParse("100Mi")
174+
})
175+
It("returns admission with container-one remove CPU limits patch", func() {
176+
patch := containerRemoveCPURequirementPatch("limits", 0)
177+
Expect(response.Patches).To(ContainElement(patch))
178+
})
179+
})
180+
When("removeLimits is not set", func() {
181+
BeforeEach(func() {
182+
removeLimits = false
183+
})
184+
It("returns admission with container-one limits patch", func() {
185+
patch := containerResourcePatch(pod, resPolicy, "limits", 0)
186+
Expect(response.Patches).To(ContainElement(patch))
187+
})
188+
})
169189
When("container has no request and no limits set", func() {
170190
BeforeEach(func() {
171191
pod.Spec.Containers[0].Resources.Requests = nil
@@ -294,3 +314,17 @@ func containerResourcePatch(pod *corev1.Pod, policy resource.ContainerPolicy, re
294314
Value: newQuantity.String(),
295315
}
296316
}
317+
318+
func containerRemoveCPURequirementPatch(requirement string, containerIdx int) jsonpatch.Operation {
319+
return jsonpatch.Operation{
320+
Operation: "remove",
321+
Path: fmt.Sprintf("/spec/containers/%d/resources/%s/cpu", containerIdx, requirement),
322+
}
323+
}
324+
325+
func containerRemoveRequirementPatch(requirement string, containerIdx int) jsonpatch.Operation {
326+
return jsonpatch.Operation{
327+
Operation: "remove",
328+
Path: fmt.Sprintf("/spec/containers/%d/resources/%s", containerIdx, requirement),
329+
}
330+
}

0 commit comments

Comments
 (0)