Skip to content

Commit f1fdbe0

Browse files
chore: update conflict with retry and webhook retry with error log detail (#28)
* feat: add session affinity for service * chore: update conflict with retry and webhook retry with error log detail * chore: webhook retry with error log detail --------- Co-authored-by: hardy <[email protected]>
1 parent 87b5170 commit f1fdbe0

File tree

2 files changed

+113
-30
lines changed

2 files changed

+113
-30
lines changed

internal/controller/app/applicationdefinition_controller.go

Lines changed: 97 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -329,12 +329,28 @@ func (r *ApplicationDefinitionReconciler) handleFinalizer(ctx context.Context, s
329329
if !controllerutil.ContainsFinalizer(appDef, appDefFinalizer) {
330330
logger.Info("Adding Finalizer")
331331
controllerutil.AddFinalizer(appDef, appDefFinalizer)
332-
if err := r.Client.Update(ctx, appDef); err != nil {
333-
logger.Error(err, "Failed to add finalizer")
334-
return false, err // Return error to retry update
332+
// Retry loop for adding finalizer
333+
for i := 0; i < 3; i++ {
334+
if err := r.Client.Update(ctx, appDef); err != nil {
335+
if apierrors.IsConflict(err) {
336+
logger.Info("Conflict adding finalizer, retrying...", "attempt", i+1)
337+
// Re-fetch the latest version
338+
if fetchErr := r.Client.Get(ctx, client.ObjectKeyFromObject(appDef), appDef); fetchErr != nil {
339+
logger.Error(fetchErr, "Failed to re-fetch appDef after conflict")
340+
return false, fetchErr
341+
}
342+
// Re-add the finalizer since appDef was re-fetched
343+
controllerutil.AddFinalizer(appDef, appDefFinalizer)
344+
continue
345+
}
346+
logger.Error(err, "Failed to add finalizer")
347+
return false, err
348+
}
349+
// Success, finalizer added
350+
return false, nil
335351
}
336-
// Finalizer added, no further action needed in this step regarding deletion
337-
return false, nil
352+
// If we get here, all retries failed
353+
return false, fmt.Errorf("failed to add finalizer after 3 attempts")
338354
}
339355
} else {
340356
// Object IS being deleted
@@ -352,11 +368,27 @@ func (r *ApplicationDefinitionReconciler) handleFinalizer(ctx context.Context, s
352368

353369
logger.Info("Cleanup complete, removing Finalizer")
354370
controllerutil.RemoveFinalizer(appDef, appDefFinalizer)
355-
if err := r.Client.Update(ctx, appDef); err != nil {
356-
logger.Error(err, "Failed to remove finalizer")
357-
return true, err // isDeleted=true, return error to retry update
371+
// Retry loop for removing finalizer
372+
for i := 0; i < 3; i++ {
373+
if err := r.Client.Update(ctx, appDef); err != nil {
374+
if apierrors.IsConflict(err) {
375+
logger.Info("Conflict removing finalizer, retrying...", "attempt", i+1)
376+
// Re-fetch the latest version
377+
if fetchErr := r.Client.Get(ctx, client.ObjectKeyFromObject(appDef), appDef); fetchErr != nil {
378+
logger.Error(fetchErr, "Failed to re-fetch appDef after conflict")
379+
return true, fetchErr
380+
}
381+
// Re-remove the finalizer since appDef was re-fetched
382+
controllerutil.RemoveFinalizer(appDef, appDefFinalizer)
383+
continue
384+
}
385+
logger.Error(err, "Failed to remove finalizer")
386+
return true, err
387+
}
388+
// Success, finalizer removed
389+
logger.Info("Finalizer removed successfully")
390+
break
358391
}
359-
logger.Info("Finalizer removed successfully")
360392
}
361393
// Object is being deleted, stop further reconciliation
362394
return true, nil
@@ -378,14 +410,37 @@ func (r *ApplicationDefinitionReconciler) setInitialPhase(ctx context.Context, s
378410
Message: "Starting component processing"})
379411

380412
// 需要持久化更新到cr中,否则status一直不会更新,不会触发后续的apply
381-
err := r.Client.Status().Update(ctx, state.appDef)
382-
if err != nil {
383-
return false, err
413+
// Retry loop for status update
414+
for i := 0; i < 3; i++ {
415+
err := r.Client.Status().Update(ctx, state.appDef)
416+
if err != nil {
417+
if apierrors.IsConflict(err) {
418+
logger := log.FromContext(ctx)
419+
logger.Info("Conflict updating status in setInitialPhase, retrying...", "attempt", i+1)
420+
// Re-fetch the latest status
421+
if fetchErr := r.Client.Get(ctx, client.ObjectKeyFromObject(state.appDef), state.appDef); fetchErr != nil {
422+
logger.Error(fetchErr, "Failed to re-fetch appDef after status update conflict")
423+
return false, fetchErr
424+
}
425+
// Reapply the status changes since appDef was re-fetched
426+
state.appDef.Status.Phase = appv1.ApplicationPhaseCreating
427+
setCondition(state.appDef, metav1.Condition{
428+
Type: string(appv1.ConditionReady),
429+
Status: metav1.ConditionFalse,
430+
Reason: "Processing",
431+
Message: "Starting component processing"})
432+
continue
433+
}
434+
return false, err
435+
}
436+
// Success, status updated
437+
r.recordEvent(state.appDef, "Processing", webrecorder.StatusInProgress, "SyncComponent",
438+
corev1.EventTypeNormal, "Processing", "Starting component processing")
439+
// Status will be updated later if needed, just return true to signal requeue
440+
return true, nil // Signal that phase changed and might need status update + requeue
384441
}
385-
r.recordEvent(state.appDef, "Processing", webrecorder.StatusInProgress, "SyncComponent",
386-
corev1.EventTypeNormal, "Processing", "Starting component processing")
387-
// Status will be updated later if needed, just return true to signal requeue
388-
return true, nil // Signal that phase changed and might need status update + requeue
442+
// If we get here, all retries failed
443+
return false, fmt.Errorf("failed to update status in setInitialPhase after 3 attempts")
389444
}
390445
return false, nil // Phase already set, no update needed now
391446
}
@@ -761,18 +816,35 @@ func (r *ApplicationDefinitionReconciler) updateStatusIfNeeded(ctx context.Conte
761816
return false, nil // No changes detected
762817
}
763818

764-
// Status has changed, attempt update
819+
// Status has changed, attempt update with retry
765820
logger.V(1).Info("Status has changed, attempting update.", "newPhase", currentApp.Status.Phase)
766-
if err := r.Client.Status().Update(ctx, currentApp); err != nil {
767-
if apierrors.IsConflict(err) {
768-
logger.Info("Status update conflict detected, requeuing for retry", "error", err.Error())
769-
return false, err // Return conflict error for immediate requeue
821+
822+
// Create a copy of the desired status to reapply if needed
823+
desiredStatus := currentApp.Status.DeepCopy()
824+
825+
for i := 0; i < 3; i++ {
826+
if err := r.Client.Status().Update(ctx, currentApp); err != nil {
827+
if apierrors.IsConflict(err) {
828+
logger.Info("Status update conflict detected, retrying...", "attempt", i+1, "error", err.Error())
829+
// Re-fetch the latest version
830+
if fetchErr := r.Client.Get(ctx, client.ObjectKeyFromObject(currentApp), currentApp); fetchErr != nil {
831+
logger.Error(fetchErr, "Failed to re-fetch appDef after status update conflict")
832+
return false, fetchErr
833+
}
834+
// Reapply the desired status changes since currentApp was re-fetched
835+
currentApp.Status = *desiredStatus.DeepCopy()
836+
// Update ObservedGeneration again since we're overwriting the status
837+
currentApp.Status.ObservedGeneration = currentApp.Generation
838+
continue
839+
}
840+
logger.Error(err, "Failed to update ApplicationDefinition status")
841+
return false, err // Return other status update errors
770842
}
771-
logger.Error(err, "Failed to update ApplicationDefinition status")
772-
return false, err // Return other status update errors
843+
logger.V(1).Info("ApplicationDefinition status updated successfully")
844+
return true, nil // Status updated successfully
773845
}
774-
logger.V(1).Info("ApplicationDefinition status updated successfully")
775-
return true, nil // Status updated successfully
846+
// If we get here, all retries failed
847+
return false, fmt.Errorf("failed to update status after 3 attempts")
776848
}
777849

778850
// --- Status Comparison Helpers ---

pkg/webrecorder/recorder.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"encoding/json"
77
"fmt"
8+
"io"
89
"math"
910
"net/http"
1011
"strings"
@@ -44,7 +45,7 @@ const (
4445
WebhookRetryMaxAttempts = 3
4546
// WebhookRetryInitialInterval is the base duration to wait before the first retry.
4647
// Subsequent retries will use exponential backoff.
47-
WebhookRetryInitialInterval = 2 * time.Second
48+
WebhookRetryInitialInterval = 10 * time.Second
4849
)
4950

5051
// ResourceChange tracks resource changes (CPU, memory, disk, replicas)
@@ -122,7 +123,7 @@ func NewWebhookEventRecorder(webhookURL, eventID, clusterID string) record.Event
122123
clusterID: clusterID,
123124
logger: log.Log.WithName("Web Event Recorder"),
124125
httpClient: &http.Client{
125-
Timeout: 5 * time.Second, // 降低超时时间,避免长时间阻塞
126+
Timeout: 15 * time.Second,
126127
},
127128
}
128129
}
@@ -207,6 +208,7 @@ func (r *WebhookEventRecorder) sendEvent(data *WebhookEvent) {
207208
// Calculate exponential backoff: 2s, 4s, 8s
208209
backoffDuration := WebhookRetryInitialInterval * time.Duration(math.Pow(2, float64(attempt-1)))
209210
r.logger.Info("Webhook send failed. Retrying...",
211+
"url", r.webhookURL,
210212
"attempt", fmt.Sprintf("%d/%d", attempt+1, WebhookRetryMaxAttempts),
211213
"retry_after", backoffDuration.String())
212214
time.Sleep(backoffDuration)
@@ -226,7 +228,7 @@ func (r *WebhookEventRecorder) sendEvent(data *WebhookEvent) {
226228

227229
resp, err := r.httpClient.Do(req)
228230
if err != nil {
229-
r.logger.V(1).Info("Failed to send event to webhook", "url", r.webhookURL, "error", err.Error())
231+
r.logger.Error(err, "Failed to send webhook event", "url", r.webhookURL)
230232
// 快速失败:如果是连接拒绝或EOF,不再重试
231233
if isNetworkError(err) {
232234
r.logger.Info("Network error detected, skipping remaining retries", "error", err.Error())
@@ -236,18 +238,27 @@ func (r *WebhookEventRecorder) sendEvent(data *WebhookEvent) {
236238
}
237239
defer resp.Body.Close()
238240

241+
// Read the response body
242+
bodyBytes, err := io.ReadAll(resp.Body)
243+
respBody := string(bodyBytes)
244+
239245
// On successful send, check the status code.
240246
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
241247
r.logger.V(1).Info("Successfully sent event to webhook", "url", r.webhookURL, "status", resp.Status)
242248
return // Success, exit the function.
243249
}
244250

245251
// If the status code indicates an error, treat it as a failure.
246-
r.logger.V(1).Info("Webhook endpoint returned an error", "status", resp.Status)
252+
r.logger.Error(nil, "Webhook endpoint returned error status",
253+
"url", r.webhookURL,
254+
"status_code", resp.StatusCode,
255+
"status", resp.Status,
256+
"response_body", respBody)
247257
}
248258

249259
// If the loop completes, all retries have failed.
250-
r.logger.V(1).Info("Failed to send event to webhook after all retries, dropping the event.",
260+
r.logger.Error(nil, "Failed to send webhook event after all retries, dropping the event.",
261+
"url", r.webhookURL,
251262
"max_retries", WebhookRetryMaxAttempts)
252263
}
253264

0 commit comments

Comments
 (0)