Skip to content

Commit dcca4b7

Browse files
author
michael stack
committed
Fix wrong_shard_server handling to avoid retry loops
Instead of adding recursive retry actors that can multiply and cause hangs, let wrong_shard_server errors propagate up to be handled by the higher-level error handlers. This prevents concurrent actors from all incrementing retryCount simultaneously and creating retry storms.
1 parent 761b593 commit dcca4b7

File tree

1 file changed

+3
-8
lines changed

1 file changed

+3
-8
lines changed

fdbserver/DataDistribution.actor.cpp

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4738,14 +4738,9 @@ ACTOR Future<Void> doAuditOnStorageServer(Reference<DataDistributor> self,
4738 4738
} else if (e.code() == error_code_audit_storage_error) {
4739 4739
audit->foundError = true;
4740 4740
} else if (e.code() == error_code_wrong_shard_server) {
4741-
// wrong_shard_server means stale shard location data
4742-
// Retry a few times to see if data distribution stabilizes
4743-
if (audit->retryCount >= 3) {
4744-
// After retries, fail the audit so it can be retried from scratch
4745-
throw audit_storage_failed();
4746-
}
4747-
audit->retryCount++;
4748-
audit->actors.add(scheduleAuditOnRange(self, audit, req.range));
4741+
// wrong_shard_server means stale shard location data - treat as transient error
4742+
// Let the higher-level retry logic handle it
4743+
throw e;
4749 4744
} else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) {
4750 4745
throw audit_storage_failed();
4751 4746
} else {

0 commit comments

Comments
 (0)