ComputeDomains: adjust task reconciliation behavior for large CD formation #732
@@ -33,7 +33,10 @@ import (
 )

 const (
-	informerResyncPeriod = 10 * time.Minute
+	// Detecting when a CD daemon transitions from NotReady to Ready (based on
+	// the startup probe) at the moment sometimes requires an informer resync,
+	// see https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/742.
+	informerResyncPeriod = 4 * time.Minute
 	mutationCacheTTL = time.Hour
 )
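The hunk above only changes the constant. For readers unfamiliar with informer resyncs, here is a hedged, generic client-go sketch of where such a period is applied; it is not this repository's actual informer wiring, just an illustration of the mechanism the shorter period leans on.

```go
package sketch

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// buildFactory shows where a resync period like the 4-minute value above would
// be plugged in when using plain client-go shared informers.
func buildFactory() (informers.SharedInformerFactory, error) {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		return nil, err
	}
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return nil, err
	}
	// On every resync tick, the informer re-delivers all cached objects as
	// Update events, so a missed status transition is eventually picked up.
	return informers.NewSharedInformerFactory(clientset, 4*time.Minute), nil
}
```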
@@ -113,12 +116,14 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {

 	m.podManager = NewPodManager(m.config, m.Get, m.mutationCache)

+	// Use `WithKey` with hard-coded key, to cancel any previous update task (we
+	// want to make sure that the latest CD status update wins).
 	_, err = m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
-			m.config.workQueue.Enqueue(obj, m.onAddOrUpdate)
+			m.config.workQueue.EnqueueWithKey(obj, "cd", m.onAddOrUpdate)
 		},
 		UpdateFunc: func(objOld, objNew any) {
-			m.config.workQueue.Enqueue(objNew, m.onAddOrUpdate)
+			m.config.workQueue.EnqueueWithKey(objNew, "cd", m.onAddOrUpdate)
 		},
 	})
 	if err != nil {
@@ -213,18 +218,20 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
 		return nil
 	}

-	// Update node info in ComputeDomain.
-	if err := m.UpdateComputeDomainNodeInfo(ctx, cd); err != nil {
-		return fmt.Errorf("error updating node info in ComputeDomain: %w", err)
+	// Update node info in ComputeDomain, if required.
+	if err := m.EnsureNodeInfoInCD(ctx, cd); err != nil {
+		return fmt.Errorf("CD update: failed to insert/update node info in CD: %w", err)
 	}

 	return nil
 }

-// UpdateComputeDomainNodeInfo updates the Nodes field in the ComputeDomain with
-// info about the ComputeDomain daemon running on this node. Upon success, it
-// reflects the mutation in `m.mutationCache`.
-func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context, cd *nvapi.ComputeDomain) (rerr error) {
+// EnsureNodeInfoInCD makes sure that the current node (by node name) is
+// represented in the `Nodes` field in the ComputeDomain object, and that it
+// reports the IP address of this current pod running the CD daemon. If mutation
+// is needed (first insertion, or IP address update) and successful, it reflects
+// the mutation in `m.mutationCache`.
+func (m *ComputeDomainManager) EnsureNodeInfoInCD(ctx context.Context, cd *nvapi.ComputeDomain) (rerr error) {
 	var nodeInfo *nvapi.ComputeDomainNode

 	// Create a deep copy of the ComputeDomain to avoid modifying the original
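To make the renamed method's contract concrete, here is a minimal, self-contained sketch of the "ensure" semantics with simplified local types. The real method operates on `nvapi.ComputeDomain` via the API server and the mutation cache; the field names below only mirror those visible in this diff.

```go
package sketch

// NodeInfo mirrors (in simplified form) the fields seen in the diff above.
type NodeInfo struct {
	Name      string
	IPAddress string
	Status    string
}

// ensureNodeInfo returns the (possibly updated) node list and whether a write
// to the ComputeDomain status is needed at all.
func ensureNodeInfo(nodes []NodeInfo, nodeName, podIP string) ([]NodeInfo, bool) {
	for i := range nodes {
		if nodes[i].Name != nodeName {
			continue
		}
		if nodes[i].IPAddress == podIP {
			return nodes, false // no-op: already listed with the current pod IP
		}
		nodes[i].IPAddress = podIP // pod restarted with a new IP: update in place
		return nodes, true
	}
	// First insertion: start out as NotReady; the pod manager flips it to Ready later.
	return append(nodes, NodeInfo{Name: nodeName, IPAddress: podIP, Status: "NotReady"}), true
}
```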
@@ -246,6 +253,7 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,

 	// If there is one and its IP is the same as this one, we are done
 	if nodeInfo != nil && nodeInfo.IPAddress == m.config.podIP {
+		klog.V(6).Infof("EnsureNodeInfoInCD noop: pod IP unchanged (%s)", m.config.podIP)
 		return nil
 	}
@@ -261,7 +269,8 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
 			Name:     m.config.nodeName,
 			CliqueID: m.config.cliqueID,
 			Index:    nextIndex,
-			Status:   nvapi.ComputeDomainStatusNotReady,
+			// This is going to be switched to Ready by podmanager.
+			Status: nvapi.ComputeDomainStatusNotReady,
 		}

 		klog.Infof("CD status does not contain node name '%s' yet, try to insert myself: %v", m.config.nodeName, nodeInfo)
@@ -286,7 +295,7 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
 	}
 	m.mutationCache.Mutation(newCD)

-	klog.V(2).Infof("Successfully updated CD")
+	klog.Infof("Successfully inserted/updated node in CD (nodeinfo: %v)", nodeInfo)
 	return nil
 }
@@ -364,7 +373,9 @@ func (m *ComputeDomainManager) MaybePushNodesUpdate(cd *nvapi.ComputeDomain) {
 	// perform a stable sort of IP addresses before writing them to the nodes
 	// config file.
 	if !maps.Equal(newIPs, previousIPs) {
-		klog.Infof("IP set changed: previous: %v; new: %v", previousIPs, newIPs)
+		klog.V(2).Infof("IP set changed")
+		// This log message gets large for large node numbers
+		klog.V(6).Infof("previous: %v; new: %v", previousIPs, newIPs)
 		m.previousNodes = cd.Status.Nodes
 		m.updatedNodesChan <- cd.Status.Nodes
 	} else {
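For the IP-set comparison above, a small sketch of the idea, assuming the sets are plain `map[string]struct{}` keyed by IP address (the real code derives them from `cd.Status.Nodes`), including the stable sort mentioned in the comment:

```go
package sketch

import (
	"maps"
	"slices"
)

// ipSet builds a set from a list of IP addresses.
func ipSet(ips []string) map[string]struct{} {
	s := make(map[string]struct{}, len(ips))
	for _, ip := range ips {
		s[ip] = struct{}{}
	}
	return s
}

// changedIPs reports whether the set differs from the previous one and, if so,
// returns a stable (sorted) slice suitable for writing to a nodes config file.
func changedIPs(previous, current map[string]struct{}) ([]string, bool) {
	if maps.Equal(previous, current) {
		return nil, false
	}
	return slices.Sorted(maps.Keys(current)), true
}
```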
---
@@ -72,11 +72,13 @@ func (pm *PodManager) Start(ctx context.Context) error {
 	pm.cancelContext = cancel

+	// Use `WithKey` with hard-coded key, to cancel any previous update task
+	// (we want to make sure that the latest pod status update wins).
 	_, err := pm.podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj any) {
-			pm.config.workQueue.Enqueue(obj, pm.addOrUpdate)
+			pm.config.workQueue.EnqueueWithKey(obj, "pod", pm.addOrUpdate)
 		},
 		UpdateFunc: func(oldObj, newObj any) {
-			pm.config.workQueue.Enqueue(newObj, pm.addOrUpdate)
+			pm.config.workQueue.EnqueueWithKey(newObj, "pod", pm.addOrUpdate)
 		},
 	})
 	if err != nil {
@@ -130,7 +132,7 @@ func (pm *PodManager) addOrUpdate(ctx context.Context, obj any) error {
 	}

 	if err := pm.updateNodeStatus(ctx, status); err != nil {
-		return fmt.Errorf("failed to update node status: %w", err)
+		return fmt.Errorf("pod update: failed to update node status in CD (%s): %w", status, err)
 	}

 	return nil

Author comment: the wrapper (workqueue) does not enrich the error message with meaningful context information, and so I added that context to the error message here.
@@ -153,7 +155,8 @@ func (pm *PodManager) isPodReady(pod *corev1.Pod) bool {
 	return false
 }

-// updateNodeStatus updates the status of the current node in the CD status.
+// updateNodeStatus updates the status of the current node (the status of the
+// pod running the CD daemon) in the CD status.
 func (pm *PodManager) updateNodeStatus(ctx context.Context, status string) error {
 	// Get the current CD using the provided function
 	cd, err := pm.getComputeDomain(pm.config.computeDomainUUID)
@@ -176,13 +179,19 @@ func (pm *PodManager) updateNodeStatus(ctx context.Context, status string) error
 		}
 	}

-	// If node not found, exit early
+	// If node not found, exit early. Here, we could also assert `status ==
+	// NotReady`; assumption: the CD daemon only starts after the CD manager has
+	// performed the node info insert (leading to node != nil below), and the
+	// pod can only change its status to Ready after the CD daemon has started.
+	// Return an explicit error that is being retried, and rely on the retry
+	// chain to be canceled by a newer incoming pod update.
 	if node == nil {
-		return nil
+		return fmt.Errorf("node not yet listed in CD (waiting for insertion)")
 	}

 	// If status hasn't changed, exit early
 	if node.Status == status {
+		klog.V(6).Infof("updateNodeStatus noop: status not changed (%s)", status)
 		return nil
 	}
@@ -198,6 +207,6 @@ func (pm *PodManager) updateNodeStatus(ctx context.Context, status string) error
 	}
 	pm.computeDomainMutationCache.Mutation(newCD)

-	klog.Infof("Successfully updated node %s status to %s", pm.config.nodeName, status)
+	klog.Infof("Successfully updated node status in CD (new nodeinfo: %v)", node)
 	return nil
 }
---
@@ -0,0 +1,67 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package workqueue

import (
	"math/rand"
	"time"

	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
)

// Jitter relative to the delay yielded by the inner limiter. Example: a factor
// of 0.1 translates to a jitter interval with a width of 10 % compared to the
// inner delay, and centered around the inner delay time (resulting in +/- 5 %
// deviation compared to the inner delay time).
type JitterRL[T comparable] struct {
	inner  workqueue.TypedRateLimiter[T]
	factor float64
}

func NewJitterRateLimiter[T comparable](inner workqueue.TypedRateLimiter[T], factor float64) workqueue.TypedRateLimiter[T] {
	if factor >= 1.0 {
		panic("factor must be < 1.0")
	}
	return &JitterRL[T]{inner: inner, factor: factor}
}

func (j *JitterRL[T]) When(item T) time.Duration {
	// Get inner limiter's delay.
	d := j.inner.When(item)

	// Calculate jitter interval width W_j relative to the delay time given by
	// the inner limiter.
	jitterWidthSeconds := d.Seconds() * j.factor

	// Get random number in the interval [-W_j/2, W_j/2).
	jitterSeconds := jitterWidthSeconds * (rand.Float64() - 0.5)

	delay := d + time.Duration(jitterSeconds*float64(time.Second))
	klog.V(7).Infof("inner: %.5f s, jittered: %.5f s", d.Seconds(), delay.Seconds())

	return delay
}

func (j *JitterRL[T]) Forget(item T) {
	j.inner.Forget(item)
}

func (j *JitterRL[T]) NumRequeues(item T) int {
	return j.inner.NumRequeues(item)
}
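A usage sketch for the new limiter (the inner limiter and the numbers below are arbitrary, picked only for illustration): with factor 0.1 and an inner delay of 2 s, the jitter window is 200 ms wide and centered on 2 s, so `When()` returns a value in [1.9 s, 2.1 s).

```go
package workqueue

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

// exampleJitterUsage wraps a per-item exponential backoff limiter with jitter.
func exampleJitterUsage() {
	inner := workqueue.NewTypedItemExponentialFailureRateLimiter[string](250*time.Millisecond, 10*time.Second)
	rl := NewJitterRateLimiter(inner, 0.1)

	for attempt := 0; attempt < 4; attempt++ {
		// Each When() call bumps the per-item failure count of the inner limiter.
		d := rl.When("some-key")
		fmt.Printf("attempt %d: next retry in %v\n", attempt, d)
	}
	rl.Forget("some-key") // reset backoff once the item finally succeeds
}
```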
---
@@ -51,13 +51,17 @@ func DefaultPrepUnprepRateLimiter() workqueue.TypedRateLimiter[any] {
 		// This is a per-item exponential backoff limiter. Each time an item
 		// fails and is retried, the delay grows exponentially starting from the
 		// lower value up to the upper bound.
-		workqueue.NewTypedItemExponentialFailureRateLimiter[any](250*time.Millisecond, 3000*time.Second),
+		workqueue.NewTypedItemExponentialFailureRateLimiter[any](250*time.Millisecond, 3000*time.Millisecond),
 		// Global (not per-item) rate limiter. Allows up to 5 retries per
 		// second, with bursts of up to 10.
 		&workqueue.TypedBucketRateLimiter[any]{Limiter: rate.NewLimiter(rate.Limit(5), 10)},
 	)
 }

+func DefaultCDDaemonRateLimiter() workqueue.TypedRateLimiter[any] {
+	return NewJitterRateLimiter(workqueue.NewTypedItemExponentialFailureRateLimiter[any](5*time.Millisecond, 6000*time.Millisecond), 0.5)
+}
+
 func DefaultControllerRateLimiter() workqueue.TypedRateLimiter[any] {
 	return workqueue.DefaultTypedControllerRateLimiter[any]()
 }

Author comment: I thought quite a bit about these numbers, but of course these are just an attempt to pick something meaningful -- we will see over time if and how we want to change method and parameters.
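With the parameters chosen here (and assuming client-go's per-item exponential limiter doubles the delay on each consecutive failure), the per-item schedule runs roughly 5 ms, 10 ms, 20 ms, ... and hits the 6 s cap around the 12th consecutive failure (5 ms · 2^10 ≈ 5.12 s, 2^11 ≈ 10.24 s, capped), with the 0.5 jitter factor spreading each delay by up to ±25 %. A throwaway snippet to eyeball that schedule:

```go
package workqueue

import "fmt"

// printCDDaemonBackoff prints successive (jittered) backoff delays for one item.
// Illustrative only; exact values vary because of the jitter term.
func printCDDaemonBackoff() {
	rl := DefaultCDDaemonRateLimiter()
	total := 0.0
	for attempt := 0; attempt < 12; attempt++ {
		d := rl.When("item") // any comparable item works; the value here is arbitrary
		total += d.Seconds()
		fmt.Printf("attempt %2d: delay %v (cumulative %.2fs)\n", attempt, d, total)
	}
	rl.Forget("item")
}
```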
@@ -136,6 +140,11 @@ func (q *WorkQueue) EnqueueWithKey(obj any, key string, callback func(ctx contex

 	q.Lock()
 	q.activeOps[key] = workItem
+	// Do we also want to make sure here that a previously enqueued task for
+	// this key isn't going to be run anymore, if not yet started? Currently,
+	// the next-scheduled retry attempt is still executed, and business logic is
+	// hopefully resilient enough.
+	klog.V(7).Infof("enqueue with key: %s", key)
 	q.queue.AddRateLimited(workItem)
 	q.Unlock()
 }
@@ -153,16 +162,17 @@ func (q *WorkQueue) processNextWorkItem(ctx context.Context) {
 		return
 	}

+	attempts := q.queue.NumRequeues(item)
 	err := q.reconcile(ctx, workItem)
 	if err != nil {
 		// Most often, this is an expected, retryable error in the context of an
 		// eventually consistent system. Hence, do not log on an error level. Rely
 		// on inner business logic to log unexpected errors on an error level.
-		klog.V(1).Infof("Reconcile: %v", err)
+		klog.Infof("Reconcile: %v (attempt %d)", err, attempts)
 		// Only retry if we're still the current operation for this key
 		q.Lock()
 		if q.activeOps[workItem.Key] != nil && q.activeOps[workItem.Key] != workItem {
-			klog.Errorf("Work item with key '%s' has been replaced with a newer enqueued one, not retrying", workItem.Key)
+			klog.Infof("Do not re-enqueue failed work item with key '%s': a newer item was enqueued", workItem.Key)
 			q.queue.Forget(workItem)
 		} else {
 			q.queue.AddRateLimited(workItem)
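Taken together with `EnqueueWithKey` above, the pattern is: remember the newest work item per key, and when a failed item comes up for retry, re-enqueue it only if it is still the newest one for its key. A condensed, self-contained sketch of just that bookkeeping (names and types here are illustrative, not the repository's actual ones; rate limiting and informer plumbing are omitted):

```go
package sketch

import "sync"

type workItem struct {
	Key string
	Run func() error
}

type keyedQueue struct {
	mu        sync.Mutex
	activeOps map[string]*workItem
}

func (q *keyedQueue) enqueue(w *workItem) {
	q.mu.Lock()
	defer q.mu.Unlock()
	// Remember the newest item per key; older in-flight items for the same key
	// notice on failure that they have been superseded and stop retrying.
	q.activeOps[w.Key] = w
}

// process runs the item once and reports whether it should be retried.
func (q *keyedQueue) process(w *workItem) (retry bool) {
	if err := w.Run(); err == nil {
		return false
	}
	q.mu.Lock()
	defer q.mu.Unlock()
	// Only retry if this item is still the current one for its key.
	return q.activeOps[w.Key] == w
}
```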
Author comment: I felt like renaming this from `UpdateComputeDomainNodeInfo` to `EnsureNodeInfoInCD` after I repeatedly found myself slightly confused about the high-level responsibility of this method.