Fix wrong_shard_server handling to avoid retry loops

michael stack · michael stack · commit dcca4b75008a · 2025-11-21T16:29:52.000-08:00
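Previously, doAuditOnStorageServer handled wrong_shard_server by adding
another scheduleAuditOnRange actor for the same range while
audit->retryCount was below 3, and threw audit_storage_failed after
that. These recursive retry actors can multiply and cause hangs.
Instead, rethrow wrong_shard_server so that it propagates up to be
handled by the higher-level error handlers. This prevents concurrent
actors from all incrementing retryCount simultaneously and creating
retry storms.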
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
@@ -4738,14 +4738,9 @@ ACTOR Future<Void> doAuditOnStorageServer(Reference<DataDistributor> self,
 		} else if (e.code() == error_code_audit_storage_error) {
 			audit->foundError = true;
 		} else if (e.code() == error_code_wrong_shard_server) {
-			// wrong_shard_server means stale shard location data
-			// Retry a few times to see if data distribution stabilizes
-			if (audit->retryCount >= 3) {
-				// After retries, fail the audit so it can be retried from scratch
-				throw audit_storage_failed();
-			}
-			audit->retryCount++;
-			audit->actors.add(scheduleAuditOnRange(self, audit, req.range));
+			// wrong_shard_server means stale shard location data - treat as transient error
+			// Let the higher-level retry logic handle it
+			throw e;
 		} else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) {
 			throw audit_storage_failed();
 		} else {