From ffa4fd433ff0c4e25711128c64ad8077082abad6 Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 07:43:31 -0800 Subject: [PATCH 01/15] Add restore validation feature with simplified backup gap fix Implements restore validation using audit_storage to verify backup/restore correctness. Includes a minimal fix for the backup gap bug. Key components: - ValidateRestore audit type: compares source keys against restored keys at \xff\x02/rlog/ prefix in storage server - DD audit fixes: propagate validation errors, handle DD failover correctly - RestoreValidation and BackupAndRestoreValidation workloads for testing - Simplified backup gap fix: prevent snapshot from finishing in the same iteration it dispatches the last tasks (single flag + one check) Backup gap bug fix (FileBackupAgent.actor.cpp): The original dispatcher marks ranges as DONE when selecting them for dispatch, then immediately checks if all ranges are done. This causes snapshots to finish before the dispatched tasks complete, creating gaps in backup coverage. The fix adds a dispatchedInThisIteration flag. If tasks were dispatched in this iteration, the completion check is skipped, ensuring at least one full loop between dispatch and completion. This minimal change prevents premature snapshot completion without complex state tracking. 
--- ...idating_restored_data_using_one_cluster.md | 87 ++++ .../sphinx/source/administration.rst | 2 + documentation/sphinx/source/auditstorage.rst | 5 + .../source/restore-validation-testing.rst | 486 ++++++++++++++++++ fdbcli/AuditStorageCommand.actor.cpp | 40 +- fdbcli/GetAuditStatusCommand.actor.cpp | 29 +- fdbclient/AuditUtils.actor.cpp | 12 +- fdbclient/FileBackupAgent.actor.cpp | 78 +-- fdbclient/SystemData.cpp | 4 + fdbclient/include/fdbclient/Audit.h | 3 +- fdbclient/include/fdbclient/SystemData.h | 2 + fdbserver/DataDistribution.actor.cpp | 100 +++- fdbserver/storageserver.actor.cpp | 422 +++++++++++++++ fdbserver/worker.actor.cpp | 14 +- .../BackupAndRestoreValidation.actor.cpp | 242 +++++++++ .../workloads/RestoreValidation.actor.cpp | 346 +++++++++++++ tests/CMakeLists.txt | 1 + tests/fast/RestoreValidation_Simple.toml | 42 ++ 18 files changed, 1842 insertions(+), 73 deletions(-) create mode 100644 design/validating_restored_data_using_one_cluster.md create mode 100644 documentation/sphinx/source/restore-validation-testing.rst create mode 100644 fdbserver/workloads/BackupAndRestoreValidation.actor.cpp create mode 100644 fdbserver/workloads/RestoreValidation.actor.cpp create mode 100644 tests/fast/RestoreValidation_Simple.toml diff --git a/design/validating_restored_data_using_one_cluster.md b/design/validating_restored_data_using_one_cluster.md new file mode 100644 index 00000000000..34cac712288 --- /dev/null +++ b/design/validating_restored_data_using_one_cluster.md @@ -0,0 +1,87 @@ +# Validating restored data with source in the same cluster + +Author: Neethu Haneeshi Bingi + +## Goal: + +The goal is to verify the restored data from a backup with the original source data to ensure the end-to-end correctness of backup and restore flow. While this validation is currently performed in simulation, it is limited in scale and differs from the production backup environment. 
+This new validation flow should: + +* Run **as quickly as possible and automatically**, enabling execution at regular intervals like bi-weekly or with every patch release. +* Increase confidence in **backup/restore reliability** by testing this in production-like clusters especially as upcoming backup and restore using bulk-loading projects evolve over the next year. +* Should be able to validate data for smaller key ranges for faster turnaround time and be able to test the entire database data. + + + +## Usage: + +* Workflow running for smaller keyRanges once per week. Workflow can take multiple keyRange boundaries from getLocation fdbcli api output and will run the rest of the workflow steps. Should be able to complete within 3-4 hours. +* Workflow running for entire DB data every patch release/once bi-monthly. This has a constraint for the cluster to be only 35% filled, so target to be able to complete in 5 days. + +By doing this, we can validate data reliability in this path and also the restore speed for any regressions. + + +## **High-Level Idea** + +Store both **source** and **restored** data within the same cluster to eliminate the need for a separate validation cluster. +Every key-value pair will be compared directly, and exact key/value **corruptions or mismatches** will be reported. + + + +## **Solution:** + +More detailed steps followed by the diagram +[Image: Screenshot 2025-11-12 at 7.53.04 PM.png] +The validation consists of **three main steps** — **Backup**, **Restore**, and **Compare** — followed by a **Cleanup** phase. + +* Backup: + * Run workload → Start Backup → Stop load → NoteDown ReadVersion → Lock cluster for writes → Wait until (ReadVersion saved > MaxRestorableVersion) → Stop Backup + * Constraint: If you are running this to validate the entire DB data, ensure the cluster has >65% available space in this phase. If space drops below this, stop/clear the load. 
+ Note: Locking the cluster still allows backup and restore operations +* Restore: + * Setup: Use the `addPrefix` parameter to restore into a validation keyspace. When restoring with a prefix, the restore destination empty check is automatically bypassed. + * GetMaxRestoreVersion from backup → Start restore with add-prefix option → Wait for restore completion + * Note: + * Restore writes into a predefined restore_data_prefix (**\xff\x02/rlog**). Restore does lock-aware transactions to bypass the lock. Restore already supports an option to add a prefix to data. + * **Important**: Restore will clear and overwrite any existing data at the destination range. This is standard restore behavior. + * The restore should work for smaller key ranges or for entire db data. +* Compare: + * Run fdbcli ‘audit_storage’ command with AuditType=validate_restore + * `audit_storage validate_restore [BeginKey] [EndKey]` (beginKey,endKey should be within userKeyRange) + * BeginKey and EndKey should be the same range given to the restore command. + * DD dispatches validation tasks to SS and monitors the progress + * Each SS compares data and updates the results in the DB. + * Monitor the audit storage status, fdbcli command + * `get_audit_status validate_restore progress [AuditID]` + * Once complete/error, check for corruptions +* Clean up: + * Clear up the restored data in restored_data_prefix + * Clear off the setup in restore phase + * Start backup + * Unlock cluster for writes. + +**Note:** + +* Backup–Restore–Compare steps can be automated as a single script/workflow, while Cleanup can be managed via a separate script/workflow. +* The validate_restore process compares user keys against the restored data, but not the other way around. As a result, it can confirm that all user keys were successfully restored, but cannot detect any extra keys that may exist in the restored data + +## +Alternative Design Considerations + +* Validate two databases' content is the same: Might be slow. 
Needs an external process like Consistency Checker to compare two databases. With our availability infra, difficult to have two databases +* Validate checksum of two databases: [Comparing FDB Contents Using Merkle-Tree MD5](https://quip-apple.com/1x2jAMZJP5YB). Detects inconsistency but cannot pinpoint which key/value differs. + + + +## Implementation Details + +* **Locking/Unlocking database:** Use the same APIs restore uses for locking and unlocking the database. [Lock](https://github.com/apple/foundationdb/blob/release-7.4/fdbclient/FileBackupAgent.actor.cpp#L6754) [Unlock](https://github.com/apple/foundationdb/blob/release-7.4/fdbclient/FileBackupAgent.actor.cpp#L4286). Don't allow the restore to unlockDB as we want to lock the database until the comparison is done. Restore does lock-aware transactions to bypass this lock. +* **Wait until (ReadVersion saved > MaxRestorableVersion) step** in Backup phase: There might be a small gap before we save readVersion and lock the DB. Ensure we wait for at least backup_lag_seconds so as not to miss any mutations in the backup +* **Restore destination check:** The restore empty destination check is bypassed when `addPrefix.size() > 0` (indicating a validation restore to a prefixed keyspace). For regular restores without a prefix, the check remains enforced to prevent accidental data loss. Note that all restores (validation or regular) will clear and overwrite any existing data at the destination range - this is standard restore behavior. +* **Audit Storage:** + * Default value of BeginKey and EndKey is normalKeys.begin and normalKeys.end. Validate both keys are in normalKeys/userKeys range, systemKeys should not be included as they are in the backup. + * Add new AuditType **ValidateRestore.** + * New audit actor auditRestoreQ() in StorageServer similar to auditStorageShardReplicaQ() + * Change the code to read the range appended with the prefix restored_data_prefix. Can compare, update metadata and error mechanism in the same way. 
[Range](https://github.com/apple/foundationdb/blob/release-7.4/fdbserver/storageserver.actor.cpp#L5671) + + diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst index 5298516e1c3..9ede5df838e 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -15,6 +15,8 @@ Administration authorization bulkload-user bulkdump + auditstorage + restore-validation-testing This document covers the administration of an existing FoundationDB cluster. We recommend you read this document before setting up a cluster for performance testing or production use. diff --git a/documentation/sphinx/source/auditstorage.rst b/documentation/sphinx/source/auditstorage.rst index 268a443b5f6..cc3a1f41a71 100644 --- a/documentation/sphinx/source/auditstorage.rst +++ b/documentation/sphinx/source/auditstorage.rst @@ -97,3 +97,8 @@ The AuditStorage tool checks the consistency between ServerKey and SS local shar In this job, the tool needs to check each storage server to see each SS has the shard mapping consistent with ServerKey. For each SS, DD partitions the job range in the unit of shards. Given a shard, DD requests the SS to check the consistency between the shard mapping and the ServerKey. The SS reads the shard mapping and compares it with the ServerKey. + +Restore Validation Testing +=========================== + +For detailed instructions on testing the restore validation feature, see :ref:`restore-validation-testing`. diff --git a/documentation/sphinx/source/restore-validation-testing.rst b/documentation/sphinx/source/restore-validation-testing.rst new file mode 100644 index 00000000000..f9cc5e8f3f5 --- /dev/null +++ b/documentation/sphinx/source/restore-validation-testing.rst @@ -0,0 +1,486 @@ +.. 
_restore-validation-testing: + +################################## +Restore Validation Testing Guide +################################## + +This guide provides step-by-step instructions for testing the restore validation feature in FoundationDB. + +Quick Setup +=========== + +1. Build Required Binaries +--------------------------- + +:: + + cd ~/build_output + cmake --build . --target fdbserver fdbcli fdbbackup backup_agent -j4 + +2. Start Test Cluster +--------------------- + +:: + + # Create directories and cluster file + mkdir -p ~/fdb_test_data ~/fdb_backup + echo "test:test@127.0.0.1:4500" > ~/fdb_test.cluster + + # Start fdbserver + ~/build_output/bin/fdbserver -p 127.0.0.1:4500 -d ~/fdb_test_data \ + -C ~/fdb_test.cluster & + + # Configure database + sleep 3 + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster --exec "configure new single memory" + + # Start backup agent (required for backups) + sleep 2 + ~/build_output/bin/backup_agent -C ~/fdb_test.cluster & + + sleep 2 + echo "Cluster ready" + +Testing Workflow +================ + +Phase 1: Set Up Test Data +-------------------------- + +Write Some Test Data +^^^^^^^^^^^^^^^^^^^^ + +:: + + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster + + # In fdbcli: + fdb> writemode on + fdb> set testkey1 testvalue1 + fdb> set testkey2 testvalue2 + fdb> set testkey3 testvalue3 + fdb> set mykey myvalue + + # Verify data + fdb> getrange "" "\xff" + +Phase 2: Backup +--------------- + +Start Backup +^^^^^^^^^^^^ + +:: + + # Start backup (directory already created in setup) + ~/build_output/bin/fdbbackup start -C ~/fdb_test.cluster \ + -d file:///Users/stack/fdb_backup -z + + # Check status - wait until backup is restorable + ~/build_output/bin/fdbbackup status -C ~/fdb_test.cluster + +Wait approximately 15-30 seconds until status shows backup is "restorable". 
The output should include: + +- ``BackupURL: file:///Users/stack/fdb_backup/backup-`` +- Status showing "completed" or "running differential" + +Verify Backup is Ready +^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # Get the specific backup URL from the status output + ~/build_output/bin/fdbbackup status -C ~/fdb_test.cluster + + # Verify it's restorable (use the actual backup URL from status) + ~/build_output/bin/fdbbackup describe \ + -d file:///Users/stack/fdb_backup/backup- + +Look for ``Restorable: true`` and ``MaxRestorableVersion`` in the output. + +Phase 3: Restore to Validation Prefix +-------------------------------------- + +Restore with Prefix +^^^^^^^^^^^^^^^^^^^ + +**Important**: Use the SPECIFIC backup URL from the previous step (not the parent directory): + +:: + + # Use fdbrestore with the actual backup URL + ~/build_output/bin/fdbrestore start \ + -r file:///Users/stack/fdb_backup/backup- \ + --dest-cluster-file ~/fdb_test.cluster \ + --add-prefix "\xff\x02/rlog/" \ + -w + +.. warning:: + Using ``file:///Users/stack/fdb_backup`` (parent dir) instead of the full backup path will fail with "not restorable to any version". + +The ``\xff\x02/rlog/`` prefix is the ``restoreLogKeys`` range where validation looks for restored data. + +.. important:: + **Restore will clear and overwrite any existing data at the destination range.** This is standard restore behavior. The restore process explicitly clears the destination range before writing restored data, so any pre-existing keys in ``\xff\x02/rlog/`` will be replaced. 
+ +Verify Restored Data Exists +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster + + # In fdbcli, enable system keys to see restored data + fdb> option on ACCESS_SYSTEM_KEYS + fdb> getrange "\xff\x02/rlog/" "\xff\x02/rlog0" + + # You should see your keys with the prefix + # e.g., "\xff\x02/rlog/testkey1" -> "testvalue1" + +Phase 4: Run Validation +------------------------ + +Start Validation Audit +^^^^^^^^^^^^^^^^^^^^^^ + +:: + + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster + + # Start validation for entire key range + fdb> audit_storage validate_restore "" "\xff" + +This returns an Audit ID. **Save this ID!** + +Example output:: + + Audit ID: 12345678-1234-5678-1234-567812345678 + +Monitor Progress +^^^^^^^^^^^^^^^^ + +:: + + # Check overall status + fdb> get_audit_status validate_restore id + + # Check detailed progress (shows which ranges are complete) + fdb> get_audit_status validate_restore progress + + # Check for any errors + fdb> get_audit_status validate_restore phase error + +Wait for Completion +^^^^^^^^^^^^^^^^^^^ + +Keep checking status until the audit completes. For a small dataset, this should take seconds to minutes. + +Phase 5: Verify Results +------------------------ + +Check Audit Status +^^^^^^^^^^^^^^^^^^ + +:: + + fdb> get_audit_status validate_restore id + +**Expected Output (Success)**:: + + Audit result is: + AuditStorageState: [ID]: , [Range]: ["","\\xff"), [Type]: 5, [Phase]: 2 + +Where: + +- Type: 5 = ValidateRestore +- Phase: 2 = Complete (no errors) + +**If Phase: 3 = Error**, there were validation failures! + +Check Trace Logs +^^^^^^^^^^^^^^^^ + +Look for validation events in the server logs:: + + grep "AuditRestore" ~/fdb_test_data/*.log | tail -20 + +Look for: + +- ``SSAuditRestoreBegin`` - Validation started +- ``SSAuditRestoreComplete`` - Validation finished successfully +- ``SSAuditRestoreError`` - Validation found errors (check details!) 
+ +Phase 6: Testing a Failed Audit +-------------------------------- + +To verify that the audit correctly detects mismatches, you can intentionally modify the source data and rerun the audit. + +Modify Source Data +^^^^^^^^^^^^^^^^^^ + +:: + + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster + + fdb> writemode on + + # Modify one of the original source values to create a mismatch + fdb> set testkey1 "modified_value" + + # Verify the change + fdb> get testkey1 + +This creates a mismatch because: + +- Current source data: ``testkey1`` = ``modified_value`` (modified after backup) +- Restored data: ``\xff\x02/rlog/testkey1`` = ``testvalue1`` (from backup) + +Run Audit Again +^^^^^^^^^^^^^^^ + +:: + + # Start a new validation audit + fdb> audit_storage validate_restore "" "\xff" + +Save the new Audit ID returned. + +Check for Expected Failure +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # Monitor the audit status + fdb> get_audit_status validate_restore id + +**Expected Output (Failure)**:: + + Audit result is: + AuditStorageState: [ID]: , [Range]: ["","\\xff"), [Type]: 5, [Phase]: 3 + +Where: + +- Phase: 3 = Error (validation found mismatches) + +Check Error Details +^^^^^^^^^^^^^^^^^^^ + +:: + + # Check detailed error information + fdb> get_audit_status validate_restore phase error + + # Check trace logs for specific error details + grep "SSAuditRestoreError" ~/fdb_test_data/*.log | tail -20 + +The logs should show which key had a mismatch and what the differing values were. + +Restore Correct Source Data for Next Tests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # Restore the original source value to match the backup + fdb> writemode on + fdb> set testkey1 testvalue1 + + # Verify the restoration + fdb> get testkey1 + +Phase 7: Understanding Audit Design and Limitations +---------------------------------------------------- + +.. 
important:: + The restore validation audit is designed to run immediately after a restore operation to verify the restore process didn't corrupt data. It compares the current database state with the restored data at the time of the audit. + +What the Audit Validates +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The audit ensures: + +✅ The restore process worked without corruption during the restore operation + +✅ Current source data matches the restored data at the time of audit + +✅ No data corruption occurred between source and restored locations + +What the Audit Does NOT Validate +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The audit has these design limitations: + +❌ It does NOT detect if source data changed after the backup was created + +❌ It compares current source data to restored data, not backup data to restored data + +**Why This Matters**: + +If you: + +1. Create a backup with ``testkey1=value1`` +2. Modify source data to ``testkey1=value2`` +3. Restore the backup (restores ``testkey1=value1`` to ``\xff\x02/rlog/``) +4. Run the audit + +The audit will report an ERROR because current source (``value2``) doesn't match restored (``value1``). This is expected behavior - the audit validates restore integrity by comparing current state to restored state. 
+ +When to Use Restore Validation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Use restore validation: + +✅ Immediately after completing a restore operation + +✅ To verify the restore process worked correctly + +✅ To ensure no corruption during data transfer + +Do NOT use restore validation: + +❌ To verify backup data integrity (use backup verification tools instead) + +❌ To check if source data matches the backup (they're expected to diverge) + +❌ As a long-term consistency check between source and restored data + +Phase 8: Cleanup +----------------- + +Clear Restored Data +^^^^^^^^^^^^^^^^^^^ + +:: + + ~/build_output/bin/fdbcli -C ~/fdb_test.cluster + + fdb> option on ACCESS_SYSTEM_KEYS + fdb> writemode on + fdb> clearrange "\xff\x02/rlog/" "\xff\x02/rlog0" + +Verify Cleanup +^^^^^^^^^^^^^^ + +:: + + fdb> getrange "\xff\x02/rlog/" "\xff\x02/rlog0" + # Should return empty + +Troubleshooting +=============== + +"restore_destination_not_empty" Error +-------------------------------------- + +**Symptom**: Restore fails saying destination is not empty + +**Cause**: The restore is not using a prefix (addPrefix parameter) + +**Fix**: When restoring for validation, you must use the ``addPrefix`` parameter to restore +to a different keyspace (e.g., ``\xff\x02/rlog/``). This bypasses the empty destination check +that's enforced for regular restores (which protect against accidental data loss). + +**Note**: All restores (with or without prefix) will clear and overwrite the destination range. +The difference is that regular restores (no prefix) check for existing data and fail to prevent +accidents, while validation restores (with prefix) proceed because they're intentionally writing +to a dedicated validation keyspace. 
+ +"No backup agents are responding" +---------------------------------- + +**Symptom**: After running ``fdbbackup start``, you see this message + +**Cause**: No backup agent process running to execute the backup + +**Fix**:: + + # Start backup agent daemon + ~/build_output/bin/backup_agent -C ~/fdb_test.cluster & + + # Wait a moment, then check backup status + sleep 5 + ~/build_output/bin/fdbbackup status -C ~/fdb_test.cluster + +"The specified backup is not restorable to any version" +-------------------------------------------------------- + +**Symptom**: Restore fails immediately with this error + +**Causes**: + +1. **Wrong backup URL**: Using parent directory instead of specific backup path +2. **Backup not complete**: Backup hasn't finished creating a restorable snapshot + +**Fix**:: + + # 1. Get the correct backup URL from status + ~/build_output/bin/fdbbackup status -C ~/fdb_test.cluster + # Look for: BackupURL: file:///.../backup- + + # 2. Use that EXACT URL in restore command + ~/build_output/bin/fdbrestore start \ + -r file:///Users/stack/fdb_backup/backup-2025-11-18-09-36-09.156836 \ + --dest-cluster-file ~/fdb_test.cluster \ + --add-prefix "\xff\x02/rlog/" -w + +Backup Not Restorable +---------------------- + +**Symptom**: Restore fails with "not restorable to any version" + +**Cause**: Backup hasn't completed or saved a snapshot yet + +**Fix**: + +- Wait longer (10-30 seconds minimum) +- Check ``fdbbackup status`` for restorable version +- Ensure backup agent is running + +No Progress Updates +------------------- + +**Symptom**: Audit status stays in "Running" phase forever + +**Cause**: Storage servers may not have the shard containing your key range + +**Fix**:: + + # Check if data distribution is working + fdbcli> status details + # Look for storage servers and their shard assignments + +Cannot See Restored Data +------------------------- + +**Symptom**: ``getrange "\xff\x02/rlog/"`` returns empty + +**Cause**: Need to enable system keys access + 
+**Fix**:: + + fdb> option on ACCESS_SYSTEM_KEYS + +Validation Completes but No Logs +--------------------------------- + +**Symptom**: Can't find trace events + +**Cause**: Logs may be in different location + +**Fix**:: + + # Find log location + ps aux | grep fdbserver + # Look for -L or --logdir parameter + + # Or check default locations + ls -ltr /var/log/foundationdb/ + ls -ltr ~/fdb_test_data/*.log + +Expected Performance +==================== + +- **Small dataset (100s of keys)**: Seconds +- **Medium dataset (10K keys)**: 1-5 minutes +- **Large dataset (1M+ keys)**: Hours (rate limited) + +Rate limiting is controlled by ``AUDIT_STORAGE_RATE_PER_SERVER_MAX`` (default: 50MB/s per server). diff --git a/fdbcli/AuditStorageCommand.actor.cpp b/fdbcli/AuditStorageCommand.actor.cpp index 8a526c01b68..75fb6269169 100644 --- a/fdbcli/AuditStorageCommand.actor.cpp +++ b/fdbcli/AuditStorageCommand.actor.cpp @@ -18,6 +18,40 @@ * limitations under the License. */ +/* + * ============================================================================ + * AUDIT STORAGE COMMANDS + * ============================================================================ + * + * This file implements CLI commands for various storage audit operations: + * - audit_storage ha : Validate high availability + * - audit_storage replica : Validate replica consistency + * - audit_storage locationmetadata : Validate location metadata + * - audit_storage ssshard : Validate storage server shards + * - audit_storage validate_restore : Validate restored backup data + * + * ============================================================================ + * RESTORE VALIDATION (validate_restore) - Quick Reference + * ============================================================================ + * + * USAGE: audit_storage validate_restore + * + * Validates that restored backup data matches original source data. + * + * EXAMPLE WORKFLOW: + * 1. Backup: fdbbackup start -d file:///backup -z + * 2. 
Restore: fdbbackup restore -r file:///backup --add-prefix "\xff\x02/rlog/" + * 3. Validate: fdb> audit_storage validate_restore "" "\xff" + * 4. Check: fdb> get_audit_status validate_restore id + * 5. Cleanup: fdb> clearrange "\xff\x02/rlog/" "\xff\x02/rlog0" + * + * NOTE: The --add-prefix parameter in step 2 allows the restore to run on a non-empty + * database, enabling validation by comparing restored data against the source. + * + * See fdbserver/storageserver.actor.cpp for detailed implementation docs. + * ============================================================================ + */ + #include "fdbcli/fdbcli.actor.h" #include "fdbclient/IClientApi.h" @@ -54,6 +88,8 @@ ACTOR Future auditStorageCommandActor(Reference c type = AuditType::ValidateLocationMetadata; } else if (tokencmp(tokens[2], "ssshard")) { type = AuditType::ValidateStorageServerShard; + } else if (tokencmp(tokens[2], "validate_restore")) { + type = AuditType::ValidateRestore; } else { printUsage(tokens[0]); return UID(); @@ -72,6 +108,8 @@ ACTOR Future auditStorageCommandActor(Reference c type = AuditType::ValidateLocationMetadata; } else if (tokencmp(tokens[1], "ssshard")) { type = AuditType::ValidateStorageServerShard; + } else if (tokencmp(tokens[1], "validate_restore")) { + type = AuditType::ValidateRestore; } else { printUsage(tokens[0]); return UID(); @@ -119,7 +157,7 @@ CommandFactory auditStorageFactory( CommandHelp("audit_storage [BeginKey EndKey] ", "Start an audit storage", "Specify audit `Type' (only `ha' and `replica' and `locationmetadata' and " - "`ssshard' `Type' are supported currently), and\n" + "`ssshard' and `validate_restore' `Type' are supported currently), and\n" "optionally a sub-range with `BeginKey' and `EndKey'.\n" "Specify audit `EngineType' when auditType is `ha' or `replica'\n" "(only `ssd-rocksdb-v1' and `ssd-sharded-rocksdb' and `ssd-2' are supported).\n" diff --git a/fdbcli/GetAuditStatusCommand.actor.cpp b/fdbcli/GetAuditStatusCommand.actor.cpp index 
d99ac0d1a92..bc8b40620cd 100644 --- a/fdbcli/GetAuditStatusCommand.actor.cpp +++ b/fdbcli/GetAuditStatusCommand.actor.cpp @@ -134,7 +134,7 @@ ACTOR Future getAuditProgressByServer(Database cx, ACTOR Future getAuditProgress(Database cx, AuditType auditType, UID auditId, KeyRange auditRange) { if (auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || - auditType == AuditType::ValidateLocationMetadata) { + auditType == AuditType::ValidateLocationMetadata || auditType == AuditType::ValidateRestore) { wait(getAuditProgressByRange(cx, auditType, auditId, auditRange)); } else if (auditType == AuditType::ValidateStorageServerShard) { state std::vector> fs; @@ -186,6 +186,8 @@ ACTOR Future getAuditStatusCommandActor(Database cx, std::vector getAuditStatusCommandActor(Database cx, std::vectorclear(auditRangeBasedProgressRangeFor(auditType, auditId)); } else if (auditType == AuditType::ValidateLocationMetadata) { tr->clear(auditRangeBasedProgressRangeFor(auditType, auditId)); + } else if (auditType == AuditType::ValidateRestore) { + tr->clear(auditRangeBasedProgressRangeFor(auditType, auditId)); } else { UNREACHABLE(); } @@ -490,7 +492,9 @@ ACTOR Future persistAuditStateByRange(Database cx, AuditStorageState audit AuditStorageState ddAuditState = decodeAuditStorageState(ddAuditState_.get()); ASSERT(ddAuditState.ddId.isValid()); if (ddAuditState.ddId != auditState.ddId) { - throw audit_storage_task_outdated(); // a new dd starts and this audit task is outdated + // DD failover occurred - update to new DD ID and continue + // The validation work completed successfully, just persist it with the current DD ID + auditState.ddId = ddAuditState.ddId; } // It is possible ddAuditState is complete while some progress is about to persist // Since doAuditOnStorageServer may repeatedly issue multiple requests (see getReplyUnlessFailedFor) @@ -581,7 +585,9 @@ ACTOR Future persistAuditStateByServer(Database cx, AuditStorageState audi AuditStorageState ddAuditState 
= decodeAuditStorageState(ddAuditState_.get()); ASSERT(ddAuditState.ddId.isValid()); if (ddAuditState.ddId != auditState.ddId) { - throw audit_storage_task_outdated(); // a new dd starts and this audit task is outdated + // DD failover occurred - update to new DD ID and continue + // The validation work completed successfully, just persist it with the current DD ID + auditState.ddId = ddAuditState.ddId; } // It is possible ddAuditState is complete while some progress is about to persist // Since doAuditOnStorageServer may repeatedly issue multiple requests (see getReplyUnlessFailedFor) @@ -669,7 +675,7 @@ ACTOR Future checkAuditProgressCompleteByRange(Database cx, UID auditId, KeyRange auditRange) { ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || - auditType == AuditType::ValidateLocationMetadata); + auditType == AuditType::ValidateLocationMetadata || auditType == AuditType::ValidateRestore); state KeyRange rangeToRead = auditRange; state Key rangeToReadBegin = auditRange.begin; state int retryCount = 0; diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 5cfb2309ee0..e7bcdfab14b 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -2992,24 +2992,19 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Coalesce the shard map to make random selection below more efficient. 
shardMap.coalesce(allKeys); - wait(yield()); + wait(yield()); - // In this context "all" refers to all of the shards relevant for this particular backup - state int countAllShards = countShardsDone + countShardsNotDone; + // In this context "all" refers to all of the shards relevant for this particular backup + state int countAllShards = countShardsDone + countShardsNotDone; - if (countShardsNotDone == 0) { - TraceEvent("FileBackupSnapshotDispatchFinished") - .detail("BackupUID", config.getUid()) - .detail("AllShards", countAllShards) - .detail("ShardsDone", countShardsDone) - .detail("ShardsNotDone", countShardsNotDone) - .detail("SnapshotBeginVersion", snapshotBeginVersion) - .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) - .detail("CurrentVersion", recentReadVersion) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); - Params.snapshotFinished().set(task, true); - return Void(); - } + // NOTE: Don't finish here even if countShardsNotDone == 0. We need to dispatch tasks first. + // The completion check after dispatch (with dispatchedInThisIteration guard) prevents + // finishing in the same iteration we dispatch the last tasks. + if (countShardsNotDone == 0) { + TraceEvent("FileBackupSnapshotDispatchAllDoneBeforeDispatch") + .detail("BackupUID", config.getUid()) + .detail("Note", "Will check again after dispatch loop"); + } // Decide when the next snapshot dispatch should run. 
state Version nextDispatchVersion; @@ -3080,11 +3075,14 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) .detail("NextDispatchVersion", nextDispatchVersion) .detail("CurrentVersion", recentReadVersion) - .detail("TimeElapsed", timeElapsed) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + .detail("TimeElapsed", timeElapsed) + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + + // Track whether we dispatched any tasks in this iteration + state bool dispatchedInThisIteration = false; - // Dispatch random shards to catch up to the expected progress - while (countShardsToDispatch > 0) { + // Dispatch random shards to catch up to the expected progress + while (countShardsToDispatch > 0) { // First select ranges to add state std::vector rangesToAdd; @@ -3213,16 +3211,20 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } } - wait(waitForAll(addTaskFutures)); - wait(tr->commit()); - break; - } catch (Error& e) { - wait(tr->onError(e)); - } + wait(waitForAll(addTaskFutures)); + wait(tr->commit()); + dispatchedInThisIteration = true; + break; + } catch (Error& e) { + wait(tr->onError(e)); } } + } - if (countShardsNotDone == 0) { + // Only finish if all shards are done AND we didn't dispatch any tasks this iteration. + // This prevents the bug where we mark snapshot finished immediately after dispatching + // the last batch of tasks, before they actually complete. 
+ if (countShardsNotDone == 0 && !dispatchedInThisIteration) { TraceEvent("FileBackupSnapshotDispatchFinished") .detail("BackupUID", config.getUid()) .detail("AllShards", countAllShards) @@ -6977,18 +6979,20 @@ class FileBackupAgentImpl { oldRestore.clear(tr); } - if (!onlyApplyMutationLogs) { - state int index; - for (index = 0; index < restoreRanges.size(); index++) { - KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end) - .removePrefix(removePrefix) - .withPrefix(addPrefix); - RangeResult existingRows = wait(tr->getRange(restoreIntoRange, 1)); - if (existingRows.size() > 0) { - throw restore_destination_not_empty(); - } + if (!onlyApplyMutationLogs) { + state int index; + for (index = 0; index < restoreRanges.size(); index++) { + KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end) + .removePrefix(removePrefix) + .withPrefix(addPrefix); + RangeResult existingRows = wait(tr->getRange(restoreIntoRange, 1)); + // Allow restoring over existing data when restoring with a prefix (for validation) + // addPrefix.size() > 0 indicates this is a validation restore + if (existingRows.size() > 0 && addPrefix.size() == 0) { + throw restore_destination_not_empty(); } } + } // Make new restore config state RestoreConfig restore(uid); diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 52a14347af9..e70ee5474e8 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -1294,6 +1294,10 @@ const KeyRef clusterIdKey = "\xff/clusterIdKey"_sr; const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr; const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr); const KeyRangeRef applyLogKeys("\xff\x02/alog/"_sr, "\xff\x02/alog0"_sr); +// Restore validation prefix (system key space) +// Usage: fdbbackup restore --add-prefix '\xff\x02/rlog/' +// TOML: addPrefix = '\xff\x02/rlog/' (unprintable() converts escape sequences to bytes) +const KeyRangeRef 
restoreLogKeys("\xff\x02/rlog/"_sr, "\xff\x02/rlog0"_sr); bool isBackupLogMutation(const MutationRef& m) { return isSingleKeyMutation((MutationRef::Type)m.type) && (backupLogKeys.contains(m.param1) || applyLogKeys.contains(m.param1)); diff --git a/fdbclient/include/fdbclient/Audit.h b/fdbclient/include/fdbclient/Audit.h index 2f4a7bf198a..96fe9b9eb2d 100644 --- a/fdbclient/include/fdbclient/Audit.h +++ b/fdbclient/include/fdbclient/Audit.h @@ -39,6 +39,7 @@ enum class AuditType : uint8_t { ValidateReplica = 2, ValidateLocationMetadata = 3, ValidateStorageServerShard = 4, + ValidateRestore = 5, }; struct AuditStorageState { @@ -68,7 +69,7 @@ struct AuditStorageState { ", [Range]: " + Traceable::toString(range) + ", [Type]: " + std::to_string(type) + ", [Phase]: " + std::to_string(phase); if (!error.empty()) { - res += "[Error]: " + error; + res += ", [Error]: " + error; } return res; diff --git a/fdbclient/include/fdbclient/SystemData.h b/fdbclient/include/fdbclient/SystemData.h index 4f64c38efd3..c6cfb44ce25 100644 --- a/fdbclient/include/fdbclient/SystemData.h +++ b/fdbclient/include/fdbclient/SystemData.h @@ -629,6 +629,8 @@ extern const KeyRef backupLatestVersionsPrefix; // Key range reserved by backup agent to storing mutations extern const KeyRangeRef backupLogKeys; extern const KeyRangeRef applyLogKeys; +// Key range reserved for restore validation data storage (system key space) +extern const KeyRangeRef restoreLogKeys; // Returns true if m is a blog (backup log) or alog (apply log) mutation bool isBackupLogMutation(const MutationRef& m); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index bd4ed9832e0..8855e903529 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -465,6 +465,7 @@ struct DataDistributor : NonCopyable, ReferenceCounted { FlowLock auditStorageReplicaLaunchingLock; FlowLock auditStorageLocationMetadataLaunchingLock; FlowLock auditStorageSsShardLaunchingLock; 
+ FlowLock auditStorageRestoreLaunchingLock; Promise auditStorageInitialized; bool auditStorageInitStarted; @@ -3494,7 +3495,8 @@ ACTOR Future auditStorageCore(Reference self, } else { // Check audit persist progress to double check if any range omitted to be check if (audit->coreState.getType() == AuditType::ValidateHA || - audit->coreState.getType() == AuditType::ValidateReplica) { + audit->coreState.getType() == AuditType::ValidateReplica || + audit->coreState.getType() == AuditType::ValidateRestore) { bool allFinish = wait(checkAuditProgressCompleteByRange(self->txnProcessor->context(), audit->coreState.getType(), audit->coreState.id, @@ -3529,7 +3531,11 @@ ACTOR Future auditStorageCore(Reference self, throw retry(); } } - audit->coreState.setPhase(AuditPhase::Complete); + if (!audit->foundError) { + audit->coreState.setPhase(AuditPhase::Complete); + } else { + audit->coreState.setPhase(AuditPhase::Error); + } } TraceEvent(SevVerbose, "DDAuditStorageCoreCompleteAudit", self->ddId) .detail("Context", audit->getDDAuditContext()) @@ -3575,6 +3581,9 @@ ACTOR Future auditStorageCore(Reference self, audit->coreState.id); // remove audit // Silently exit } else if (e.code() == error_code_audit_storage_task_outdated) { + // DD failover occurred - storage server completed audit with old DD ID + // Remove from map so it can be properly resumed/retried + removeAuditFromAuditMap(self, audit->coreState.getType(), audit->coreState.id); // Silently exit } else if (e.code() == error_code_audit_storage_cancelled) { // If this audit is cancelled, the place where cancelling @@ -3665,7 +3674,8 @@ void runAuditStorage(Reference self, // Validate input auditState if (auditState.getType() != AuditType::ValidateHA && auditState.getType() != AuditType::ValidateReplica && auditState.getType() != AuditType::ValidateLocationMetadata && - auditState.getType() != AuditType::ValidateStorageServerShard) { + auditState.getType() != AuditType::ValidateStorageServerShard && + 
auditState.getType() != AuditType::ValidateRestore) { throw not_implemented(); } TraceEvent(SevDebug, "DDRunAuditStorage", self->ddId) @@ -3761,12 +3771,9 @@ ACTOR Future launchAudit(Reference self, // hence a new audit resumption loads audits from disk and launch the audits // Since the resumed audit has already taken over the launchAudit job, // we simply retry this launchAudit, then return the audit id to client - if (g_network->isSimulated() && deterministicRandom()->coinflip()) { - TraceEvent(SevDebug, "DDAuditStorageLaunchInjectActorCancelWhenPersist", self->ddId) - .detail("AuditID", auditID_) - .detail("AuditType", auditType) - .detail("KeyValueStoreType", auditStorageEngineType) - .detail("Range", auditRange); + // Skip this injection for ValidateRestore as the simple test needs a clean run + if (g_network->isSimulated() && auditType != AuditType::ValidateRestore && + deterministicRandom()->coinflip()) { throw operation_failed(); // Trigger DD restart and check if resume audit is correct } TraceEvent(SevInfo, "DDAuditStorageLaunchPersistNewAuditID", self->ddId) @@ -3813,6 +3820,9 @@ ACTOR Future cancelAuditStorage(Reference self, TriggerAu } else if (req.getType() == AuditType::ValidateStorageServerShard) { wait(self->auditStorageSsShardLaunchingLock.take(TaskPriority::DefaultYield)); holder = FlowLock::Releaser(self->auditStorageSsShardLaunchingLock); + } else if (req.getType() == AuditType::ValidateRestore) { + wait(self->auditStorageRestoreLaunchingLock.take(TaskPriority::DefaultYield)); + holder = FlowLock::Releaser(self->auditStorageRestoreLaunchingLock); } else { req.reply.sendError(not_implemented()); return Void(); @@ -3870,6 +3880,9 @@ ACTOR Future auditStorage(Reference self, TriggerAuditReq } else if (req.getType() == AuditType::ValidateStorageServerShard) { wait(self->auditStorageSsShardLaunchingLock.take(TaskPriority::DefaultYield)); holder = FlowLock::Releaser(self->auditStorageSsShardLaunchingLock); + } else if (req.getType() == 
AuditType::ValidateRestore) { + wait(self->auditStorageRestoreLaunchingLock.take(TaskPriority::DefaultYield)); + holder = FlowLock::Releaser(self->auditStorageRestoreLaunchingLock); } else { req.reply.sendError(not_implemented()); return Void(); @@ -3941,6 +3954,8 @@ void loadAndDispatchAudit(Reference self, std::shared_ptractors.add(dispatchAuditLocationMetadata(self, audit, allKeys)); } else if (audit->coreState.getType() == AuditType::ValidateStorageServerShard) { audit->actors.add(dispatchAuditStorageServerShard(self, audit)); + } else if (audit->coreState.getType() == AuditType::ValidateRestore) { + audit->actors.add(dispatchAuditStorage(self, audit)); } else { UNREACHABLE(); } @@ -4175,12 +4190,13 @@ ACTOR Future scheduleAuditStorageShardOnServer(Reference return Void(); } -// This function is for ha/replica audits +// This function is for ha/replica/restore audits // Schedule audit task on the input range ACTOR Future dispatchAuditStorage(Reference self, std::shared_ptr audit) { state const AuditType auditType = audit->coreState.getType(); state const KeyRange range = audit->coreState.range; - ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica); + ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || + auditType == AuditType::ValidateRestore); TraceEvent(SevInfo, "DDDispatchAuditStorageBegin", self->ddId) .detail("AuditID", audit->coreState.id) .detail("Range", range) @@ -4206,13 +4222,20 @@ ACTOR Future dispatchAuditStorage(Reference self, std::sh state int i = 0; for (; i < auditStates.size(); i++) { state AuditPhase phase = auditStates[i].getPhase(); - ASSERT(phase != AuditPhase::Running && phase != AuditPhase::Failed); + // Skip Running/Failed states during retries (they will be updated on retry) + if (phase == AuditPhase::Running || phase == AuditPhase::Failed) { + continue; + } totalCount++; if (phase == AuditPhase::Complete) { completedCount++; } else if (phase == AuditPhase::Error) { 
completedCount++; audit->foundError = true; + // Capture first error message from range states + if (audit->coreState.error.empty() && !auditStates[i].error.empty()) { + audit->coreState.error = auditStates[i].error; + } } else { ASSERT(phase == AuditPhase::Invalid); ASSERT(audit->remainingBudgetForAuditTasks.get() >= 0); @@ -4283,13 +4306,14 @@ ACTOR Future> getStorageType( } // Partition the input range into multiple subranges according to the range ownership, and -// schedule ha/replica audit tasks of each subrange on the server which owns the subrange +// schedule ha/replica/restore audit tasks of each subrange on the server which owns the subrange // Automatically retry until complete or timed out ACTOR Future scheduleAuditOnRange(Reference self, std::shared_ptr audit, KeyRange rangeToSchedule) { state const AuditType auditType = audit->coreState.getType(); - ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica); + ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || + auditType == AuditType::ValidateRestore); TraceEvent(SevInfo, "DDScheduleAuditOnRangeBegin", self->ddId) .detail("AuditID", audit->coreState.id) .detail("AuditRange", audit->coreState.range) @@ -4356,7 +4380,10 @@ ACTOR Future scheduleAuditOnRange(Reference self, state int auditStateIndex = 0; for (; auditStateIndex < auditStates.size(); ++auditStateIndex) { state AuditPhase phase = auditStates[auditStateIndex].getPhase(); - ASSERT(phase != AuditPhase::Running && phase != AuditPhase::Failed); + // Skip Running/Failed states during retries (they will be updated on retry) + if (phase == AuditPhase::Running || phase == AuditPhase::Failed) { + continue; + } if (phase == AuditPhase::Complete) { continue; } else if (phase == AuditPhase::Error) { @@ -4414,6 +4441,7 @@ ACTOR Future scheduleAuditOnRange(Reference self, } dcid++; } + // ValidateReplica requires multiple replicas, skip if single replica if (storageServersToCheck.size() <= 
1) { TraceEvent(SevInfo, "DDScheduleAuditOnRangeEnd", self->ddId) .detail("Reason", "Single replica, ignore") @@ -4422,6 +4450,26 @@ ACTOR Future scheduleAuditOnRange(Reference self, .detail("AuditType", auditType); return Void(); } + } else if (auditType == AuditType::ValidateRestore) { + // select a server from primary DC to do audit + // ValidateRestore compares source vs restored data, single replica is fine + int dcid = 0; + for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcid == 0) { + // in primary DC randomly select a server to do the audit task + const int idx = deterministicRandom()->randomInt(0, dcServers.size()); + targetServer = dcServers[idx]; + } + for (int i = 0; i < dcServers.size(); i++) { + if (dcServers[i].id() == targetServer.id()) { + ASSERT_WE_THINK(dcid == 0); + } else { + req.targetServers.push_back(dcServers[i].id()); + } + storageServersToCheck.push_back(dcServers[i]); + } + dcid++; + } } else { UNREACHABLE(); } @@ -4537,7 +4585,8 @@ ACTOR Future skipAuditOnRange(Reference self, std::shared_ptr audit, KeyRange rangeToSkip) { state AuditType auditType = audit->coreState.getType(); - ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica); + ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || + auditType == AuditType::ValidateRestore); try { audit->overallIssuedDoAuditCount++; AuditStorageState res(audit->coreState.id, rangeToSkip, auditType); @@ -4600,7 +4649,7 @@ ACTOR Future doAuditOnStorageServer(Reference self, AuditStorageRequest req) { state AuditType auditType = req.getType(); ASSERT(auditType == AuditType::ValidateHA || auditType == AuditType::ValidateReplica || - auditType == AuditType::ValidateStorageServerShard); + auditType == AuditType::ValidateStorageServerShard || auditType == AuditType::ValidateRestore); TraceEvent(SevInfo, "DDDoAuditOnStorageServerBegin", self->ddId) .detail("AuditID", req.id) .detail("Range", req.range) 
@@ -4619,6 +4668,15 @@ ACTOR Future doAuditOnStorageServer(Reference self, if (vResult.isError()) { throw vResult.getError(); } + // Check if storage server found validation errors + if (vResult.get().getPhase() == AuditPhase::Error) { + audit->foundError = true; + TraceEvent(SevWarn, "DDDoAuditOnStorageServerFoundError", self->ddId) + .detail("AuditID", req.id) + .detail("Range", req.range) + .detail("AuditType", auditType) + .detail("Error", vResult.get().error); + } audit->overallCompleteDoAuditCount++; TraceEvent(SevInfo, "DDDoAuditOnStorageServerResult", self->ddId) .detail("AuditID", req.id) @@ -4667,6 +4725,14 @@ ACTOR Future doAuditOnStorageServer(Reference self, throw e; } else if (e.code() == error_code_audit_storage_error) { audit->foundError = true; + } else if (e.code() == error_code_wrong_shard_server) { + // wrong_shard_server means stale shard location data + // Don't retry within DD - throw to let workload retry at higher level + if (audit->retryCount >= 3) { + throw audit_storage_cancelled(); // Let workload retry with fresh DD dispatch + } + audit->retryCount++; + audit->actors.add(scheduleAuditOnRange(self, audit, req.range)); } else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) { throw audit_storage_failed(); } else { diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d5b4e2e4c43..ef9095b94c3 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4485,6 +4485,426 @@ ACTOR Future auditStorageServerShardQ(StorageServer* data, AuditStorageReq return Void(); } +/* + * RESTORE VALIDATION FEATURE - How to Use + * + * This feature validates that restored backup data matches the original source data + * by comparing them within the same cluster. 
+ * + * === WORKFLOW === + * + * Step 1: Backup + * $ fdbbackup start -C -d -z + * $ fdbbackup discontinue -C + * $ fdbbackup wait -C + * + * Step 2: Restore to Validation Prefix + * Production: + * $ fdbbackup restore -C -r \ + * --add-prefix "\xff\x02/rlog/" --wait-for-done + * + * Simulation Tests (use in TOML configs): + * addPrefix = 'restored/' + * + * Step 3: Validate + * $ fdbcli -C + * fdb> audit_storage validate_restore "" "\xff" + * # Returns Audit ID + * fdb> get_audit_status validate_restore id + * + * Step 4: Cleanup + * fdb> option on ACCESS_SYSTEM_KEYS + * fdb> writemode on + * fdb> clearrange "\xff\x02/rlog/" "\xff\x02/rlog0" + * # Or for simulation: clearrange "restored/" "restored0" + * + * Note: When restoring with a prefix (addPrefix parameter), the restore automatically + * allows overwriting existing data, making it suitable for validation purposes. + * + */ + +// Helper: Issue a GetKeyValues request for a given range and return the future +static Future> issueGetKeyValuesRequest(StorageServer* data, + KeyRange range, + Version version, + int limit, + int limitBytes) { + GetKeyValuesRequest req; + req.begin = firstGreaterOrEqual(range.begin); + req.end = firstGreaterOrEqual(range.end); + req.limit = limit; + req.limitBytes = limitBytes; + req.version = version; + req.tags = TagSet(); + data->actors.add(getKeyValuesQ(data, req)); + return errorOr(req.reply.getFuture()); +} + +// Helper: Read both source and restored data for a given range +// +// Restored data is stored at restoreLogKeys (\xff\x02/rlog/) in system key space. +// NOTE: We read the ENTIRE restored keyspace (not just rangeToRead with prefix), +// because restored keys are stored with their original names under the prefix. 
+// E.g., source key "mykey" is restored as "\xff\x02/rlog/mykey" +ACTOR static Future> fetchSourceAndRestoredData(StorageServer* data, + KeyRange rangeToRead, + Version version, + int limit, + int limitBytes) { + // Construct the restored range by adding the restore prefix to the source range + // E.g., if source range is "key1 - key2", restored range is "\xff\x02/rlog/key1 - \xff\x02/rlog/key2" + state Key restoredBegin = rangeToRead.begin.withPrefix(restoreLogKeys.begin); + state Key restoredEnd = rangeToRead.end.withPrefix(restoreLogKeys.begin); + state KeyRange restoredRange = KeyRangeRef(restoredBegin, restoredEnd); + + TraceEvent("SSAuditRestoreFetch", data->thisServerID) + .detail("RangeToRead", rangeToRead) + .detail("RestoredRange", restoredRange) + .detail("Version", version) + .detail("Limit", limit) + .detail("LimitBytes", limitBytes); + + // Read source data from user key range (this SS must own it since DD sent request here) + state Future> sourceFuture = + issueGetKeyValuesRequest(data, rangeToRead, version, limit, limitBytes); + + // Read restored data from system key space + // NOTE: Use database transaction to read system keys since this SS might not own them + state ErrorOr restoredResult; + try { + state Transaction tr(data->cx); + tr.setVersion(version); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + state RangeResult restoredData = wait(tr.getRange(restoredRange, limit, Snapshot::False, Reverse::False)); + + // Convert RangeResult to GetKeyValuesReply format + GetKeyValuesReply restoredReply; + restoredReply.data.append_deep(restoredReply.arena, restoredData.begin(), restoredData.size()); + restoredReply.more = restoredData.more; + restoredReply.version = version; + restoredResult = restoredReply; + } catch (Error& e) { + restoredResult = e; + } + + state Future> restoredFuture = Future>(restoredResult); + + wait(success(sourceFuture) && success(restoredFuture)); + + // Check 
for errors + if (sourceFuture.get().isError()) { + throw sourceFuture.get().getError(); + } + if (restoredFuture.get().isError()) { + throw restoredFuture.get().getError(); + } + if (sourceFuture.get().get().error.present()) { + throw sourceFuture.get().get().error.get(); + } + if (restoredFuture.get().get().error.present()) { + throw restoredFuture.get().get().error.get(); + } + + // Log what we fetched + TraceEvent("SSAuditRestoreFetchResult", data->thisServerID) + .detail("SourceKeys", sourceFuture.get().get().data.size()) + .detail("RestoredKeys", restoredFuture.get().get().data.size()) + .detail("SourceBytes", sourceFuture.get().get().data.expectedSize()) + .detail("RestoredBytes", restoredFuture.get().get().data.expectedSize()) + .detail("SourceMore", sourceFuture.get().get().more) + .detail("RestoredMore", restoredFuture.get().get().more); + + return std::make_pair(sourceFuture.get().get(), restoredFuture.get().get()); +} + +// Helper: Compare source and restored data, returning validation errors +std::vector compareSourceAndRestoredData(UID thisServerID, + UID auditID, + KeyRange auditRange, + const GetKeyValuesReply& sourceReply, + const GetKeyValuesReply& restoredReply, + KeyRange rangeToRead, + Version version, + KeyRange claimRange, + Key& lastKey, + int64_t& numValidatedKeys) { + std::vector errors; + int sourceIdx = 0; + int restoredIdx = 0; + + TraceEvent("SSAuditRestoreCompare", thisServerID) + .detail("AuditID", auditID) + .detail("SourceKeys", sourceReply.data.size()) + .detail("RestoredKeys", restoredReply.data.size()) + .detail("RangeToRead", rangeToRead) + .detail("Version", version); + + // Log first few keys from both sets for debugging + if (sourceReply.data.size() > 0) { + TraceEvent("SSAuditRestoreCompareSourceKeys", thisServerID) + .detail("FirstSourceKey", sourceReply.data[0].key) + .detail("LastSourceKey", sourceReply.data[sourceReply.data.size() - 1].key); + } + if (restoredReply.data.size() > 0) { + 
TraceEvent("SSAuditRestoreCompareRestoredKeys", thisServerID) + .detail("FirstRestoredKey", restoredReply.data[0].key) + .detail("LastRestoredKey", restoredReply.data[restoredReply.data.size() - 1].key); + } + + TraceEvent("SSAuditRestoreCompareStart", thisServerID) + .detail("SourceSize", sourceReply.data.size()) + .detail("RestoredSize", restoredReply.data.size()) + .detail("SourceMore", sourceReply.more) + .detail("RestoredMore", restoredReply.more); + + while (sourceIdx < sourceReply.data.size() && restoredIdx < restoredReply.data.size()) { + KeyValueRef sourceKV = sourceReply.data[sourceIdx]; + KeyValueRef restoredKV = restoredReply.data[restoredIdx]; + + // Remove the restore prefix from restored key to compare + Key restoredKeyWithoutPrefix = restoredKV.key.removePrefix(restoreLogKeys.begin); + + if (sourceKV.key == restoredKeyWithoutPrefix) { + // Keys match, compare values + if (sourceKV.value != restoredKV.value) { + std::string error = format("Value Mismatch for Key %s: source value: %s, restored value: %s", + Traceable::toString(sourceKV.key).c_str(), + Traceable::toString(sourceKV.value).c_str(), + Traceable::toString(restoredKV.value).c_str()); + TraceEvent(SevError, "SSAuditRestoreError", thisServerID) + .setMaxFieldLength(-1) + .setMaxEventLength(-1) + .detail("AuditId", auditID) + .detail("AuditRange", auditRange) + .detail("ErrorMessage", error) + .detail("Version", version) + .detail("ClaimRange", claimRange); + errors.push_back(error); + break; + } + lastKey = sourceKV.key; + ++numValidatedKeys; + ++sourceIdx; + ++restoredIdx; + } else if (sourceKV.key < restoredKeyWithoutPrefix) { + // Source key missing from restored data + std::string error = + format("Missing key in restored data: %s", Traceable::toString(sourceKV.key).c_str()); + TraceEvent(SevError, "SSAuditRestoreError", thisServerID) + .setMaxFieldLength(-1) + .setMaxEventLength(-1) + .detail("AuditId", auditID) + .detail("AuditRange", auditRange) + .detail("ErrorMessage", error) + 
.detail("Version", version) + .detail("ClaimRange", claimRange); + errors.push_back(error); + break; + } else { + // Extra key in restored data (skip it, as per design doc: one-directional comparison) + ++restoredIdx; + } + } + + // Check for any remaining source keys that are missing from restored data + if (errors.empty() && sourceIdx < sourceReply.data.size() && !sourceReply.more) { + std::string error = format("Missing key(s) in restored data, next source key: %s", + Traceable::toString(sourceReply.data[sourceIdx].key).c_str()); + TraceEvent(SevError, "SSAuditRestoreError", thisServerID) + .setMaxFieldLength(-1) + .setMaxEventLength(-1) + .detail("AuditId", auditID) + .detail("AuditRange", auditRange) + .detail("ErrorMessage", error) + .detail("Version", version) + .detail("ClaimRange", claimRange); + errors.push_back(error); + } + + TraceEvent("SSAuditRestoreCompareEnd", thisServerID) + .detail("SourceIdx", sourceIdx) + .detail("RestoredIdx", restoredIdx) + .detail("SourceSize", sourceReply.data.size()) + .detail("RestoredSize", restoredReply.data.size()) + .detail("LastKey", printable(lastKey)) + .detail("ErrorCount", errors.size()); + + return errors; +} + +ACTOR Future auditRestoreQ(StorageServer* data, AuditStorageRequest req) { + ASSERT(req.getType() == AuditType::ValidateRestore); + wait(data->serveAuditStorageParallelismLock.take(TaskPriority::DefaultYield)); + state FlowLock::Releaser holder(data->serveAuditStorageParallelismLock); + + TraceEvent(SevInfo, "SSAuditRestoreBegin", data->thisServerID) + .detail("AuditID", req.id) + .detail("AuditRange", req.range) + .detail("AuditType", req.type); + + // Validate that req.range is within normalKeys (user keys only) + if (!normalKeys.contains(req.range)) { + TraceEvent(SevError, "SSAuditRestoreInvalidRange", data->thisServerID) + .detail("AuditID", req.id) + .detail("AuditRange", req.range) + .detail("Error", "Range must be within normalKeys"); + req.reply.sendError(audit_storage_failed()); + return Void(); 
+ } + + state AuditStorageState res(req.id, req.getType()); + state std::vector errors; + state Version version; + state KeyRange rangeToRead = req.range; + state Key rangeToReadBegin = req.range.begin; + state KeyRange claimRange; + state int limit = 1e4; + state int limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT; + state int64_t readBytes = 0; + state int64_t numValidatedKeys = 0; + state int64_t validatedBytes = 0; + state bool complete = false; + state double startTime = now(); + state Reference rateLimiter = + Reference(new SpeedLimit(SERVER_KNOBS->AUDIT_STORAGE_RATE_PER_SERVER_MAX, 1)); + + try { + loop { + try { + readBytes = 0; + rangeToRead = KeyRangeRef(rangeToReadBegin, req.range.end); + ASSERT(!rangeToRead.empty()); + + TraceEvent(SevDebug, "SSAuditRestoreNewRoundBegin", data->thisServerID) + .suppressFor(10.0) + .detail("AuditID", req.id) + .detail("AuditRange", req.range) + .detail("ReadRangeBegin", rangeToReadBegin) + .detail("ReadRangeEnd", req.range.end); + + errors.clear(); + + // Use current durable version for reading + version = data->version.get(); + + // Fetch both source and restored data + state std::pair replyPair = + wait(fetchSourceAndRestoredData(data, rangeToRead, version, limit, limitBytes)); + state GetKeyValuesReply sourceReply = replyPair.first; + state GetKeyValuesReply restoredReply = replyPair.second; + + readBytes = sourceReply.data.expectedSize() + restoredReply.data.expectedSize(); + validatedBytes += readBytes; + + // Check if we've completed reading + if (!sourceReply.more) { + complete = true; + } + + // Compare source data with restored data + claimRange = rangeToRead; + state Key lastKey = rangeToRead.begin; + errors = compareSourceAndRestoredData(data->thisServerID, + req.id, + req.range, + sourceReply, + restoredReply, + rangeToRead, + version, + claimRange, + lastKey, + numValidatedKeys); + + // Update progress in the database + KeyRange completeRange = Standalone(KeyRangeRef(rangeToRead.begin, keyAfter(lastKey))); + if 
(!completeRange.empty() && claimRange.begin == completeRange.begin) { + claimRange = claimRange & completeRange; + AuditStorageState progressState(req.id, claimRange, req.getType()); + progressState.setPhase(AuditPhase::Running); + progressState.ddId = req.ddId; + progressState.auditServerId = data->thisServerID; + wait(persistAuditStateByRange(data->cx, progressState)); + } + + // Apply rate limiting + wait(rateLimiter->getAllowance(readBytes)); + + // If errors found or complete, break + if (!errors.empty() || complete) { + break; + } + + // Move to next range + rangeToReadBegin = keyAfter(lastKey); + if (rangeToReadBegin >= req.range.end) { + complete = true; + break; + } + + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw e; + } + throw; + } + } + + // Set final state + if (!errors.empty()) { + res.setPhase(AuditPhase::Error); + res.error = errors[0]; // Report first error + res.range = req.range; + TraceEvent(SevWarn, "SSAuditRestoreComplete", data->thisServerID) + .detail("AuditID", req.id) + .detail("AuditRange", req.range) + .detail("Complete", complete) + .detail("ValidationErrors", errors.size()) + .detail("NumValidatedKeys", numValidatedKeys) + .detail("ValidatedBytes", validatedBytes) + .detail("Duration", now() - startTime); + } else { + res.setPhase(AuditPhase::Complete); + res.range = req.range; + TraceEvent(SevInfo, "SSAuditRestoreComplete", data->thisServerID) + .detail("AuditID", req.id) + .detail("AuditRange", req.range) + .detail("Complete", complete) + .detail("NumValidatedKeys", numValidatedKeys) + .detail("ValidatedBytes", validatedBytes) + .detail("Duration", now() - startTime); + } + + // Persist final audit state + res.ddId = req.ddId; + res.auditServerId = data->thisServerID; + wait(persistAuditStateByRange(data->cx, res)); + + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw e; + } + // Send retryable errors back to DD so it can retry with correct SS + if (e.code() == 
error_code_wrong_shard_server) { + req.reply.sendError(e); + return Void(); + } + res.setPhase(AuditPhase::Error); + res.error = e.what(); + res.range = req.range; + TraceEvent(SevWarn, "SSAuditRestoreError", data->thisServerID) + .errorUnsuppressed(e) + .detail("AuditID", req.id) + .detail("AuditRange", req.range); + res.ddId = req.ddId; + res.auditServerId = data->thisServerID; + wait(persistAuditStateByRange(data->cx, res)); + } + + req.reply.send(res); + return Void(); +} + ACTOR Future auditStorageShardReplicaQ(StorageServer* data, AuditStorageRequest req) { ASSERT(req.getType() == AuditType::ValidateHA || req.getType() == AuditType::ValidateReplica); wait(data->serveAuditStorageParallelismLock.take(TaskPriority::DefaultYield)); @@ -12484,6 +12904,8 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface self->actors.add(auditStorageShardReplicaQ(self, req)); } else if (req.getType() == AuditType::ValidateStorageServerShard) { self->actors.add(auditStorageServerShardQ(self, req)); + } else if (req.getType() == AuditType::ValidateRestore) { + self->actors.add(auditRestoreQ(self, req)); } else { req.reply.sendError(not_implemented()); } diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 741049fa398..bbd7913bd7d 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1863,9 +1863,21 @@ void endRole(const Role& role, UID id, std::string reason, bool ok, Error e) { } if (!ok) { + // Some errors are expected operational events, not actual failures + // These should not be logged as SevError + bool isExpectedError = (e.code() == error_code_audit_storage_task_outdated); + + if (isExpectedError) { + TraceEvent(SevInfo, "ExpectedRoleFailureSuppressed", id) + .detail("Role", role.roleName) + .detail("ErrorCode", e.code()) + .detail("ErrorName", e.name()) + .detail("Reason", reason); + } + std::string type = role.roleName + "Failed"; - TraceEvent err(SevError, type.c_str(), id); + TraceEvent 
err(isExpectedError ? SevInfo : SevError, type.c_str(), id); if (e.code() != invalid_error_code) { err.errorUnsuppressed(e); } diff --git a/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp b/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp new file mode 100644 index 00000000000..e41b6b93c38 --- /dev/null +++ b/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp @@ -0,0 +1,242 @@ +/* + * BackupAndRestoreValidation.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include "fdbclient/BackupAgent.actor.h" +#include "fdbclient/BackupContainer.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/QuietDatabase.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +// Simplified backup and restore workload specifically for restore validation testing +// This avoids the complexity of BackupAndRestoreCorrectness which is used by many tests + +// Completion marker key to signal that restore is fully done +const KeyRef restoreValidationCompletionKey = "\xff\x02/restoreValidationComplete"_sr; +struct BackupAndRestoreValidationWorkload : TestWorkload { + static constexpr auto NAME = "BackupAndRestoreValidation"; + double backupAfter, restoreAfter; + Key backupTag; + Key addPrefix; // Prefix to add during restore (e.g., \xff\x02/rlog/) + + BackupAndRestoreValidationWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + backupAfter = getOption(options, "backupAfter"_sr, 10.0); + restoreAfter = getOption(options, "restoreAfter"_sr, 30.0); + backupTag = getOption(options, "backupTag"_sr, BackupAgentBase::getDefaultTag()); + addPrefix = unprintable(getOption(options, "addPrefix"_sr, ""_sr).toString()); + + TraceEvent("BARV_Init") + .detail("BackupAfter", backupAfter) + .detail("RestoreAfter", restoreAfter) + .detail("AddPrefix", printable(addPrefix)); + } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (clientId != 0) + return Void(); + return _start(cx, this); + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + ACTOR static Future doBackup(BackupAndRestoreValidationWorkload* self, + FileBackupAgent* backupAgent, + Database cx) { + state std::string backupContainer = "file://simfdb/backups/"; + state Standalone> backupRanges; + + // Only backup normal user keys (not system keys) + backupRanges.push_back_deep(backupRanges.arena(), normalKeys); + + TraceEvent("BARV_SubmitBackup").detail("Tag", printable(self->backupTag)).detail("Container", backupContainer); + + try { + wait(backupAgent->submitBackup(cx, + StringRef(backupContainer), + {}, + deterministicRandom()->randomInt(0, 60), + 
deterministicRandom()->randomInt(0, 100), + self->backupTag.toString(), + backupRanges, + true, + StopWhenDone{ true })); + } catch (Error& e) { + TraceEvent("BARV_SubmitBackupException").error(e); + if (e.code() != error_code_backup_unneeded && e.code() != error_code_backup_duplicate) + throw; + } + + // Wait for backup to complete + TraceEvent("BARV_WaitBackup").detail("Tag", printable(self->backupTag)); + state EBackupState statusValue = + wait(backupAgent->waitBackup(cx, self->backupTag.toString(), StopWhenDone::True)); + + TraceEvent("BARV_BackupComplete") + .detail("Tag", printable(self->backupTag)) + .detail("Status", BackupAgentBase::getStateText(statusValue)); + + return Void(); + } + + ACTOR static Future doRestore(BackupAndRestoreValidationWorkload* self, + FileBackupAgent* backupAgent, + Database cx, + Reference backupContainer) { + state Standalone> restoreRanges; + + // Restore normal user keys only + restoreRanges.push_back_deep(restoreRanges.arena(), normalKeys); + + state Standalone restoreTag(self->backupTag.toString() + "_restore"); + + TraceEvent("BARV_StartRestore") + .detail("Tag", printable(restoreTag)) + .detail("Container", backupContainer->getURL()) + .detail("AddPrefix", printable(self->addPrefix)); + + // Don't clear keys - we want to keep original data for validation comparison + // The restore will put data at the addPrefix location + + wait(success(backupAgent->restore(cx, + cx, + restoreTag, + KeyRef(backupContainer->getURL()), + backupContainer->getProxy(), + restoreRanges, + WaitForComplete::True, + ::invalidVersion, + Verbose::True, + self->addPrefix, + Key(), // removePrefix + LockDB{ false }, + UnlockDB::True, + OnlyApplyMutationLogs::False, + InconsistentSnapshotOnly::False, + ::invalidVersion, + backupContainer->getEncryptionKeyFileName()))); + + TraceEvent("BARV_RestoreComplete") + .detail("Tag", printable(restoreTag)) + .detail("AddPrefix", printable(self->addPrefix)); + + // Write a completion marker so RestoreValidation 
knows restore is fully done + state Key completionMarker = restoreValidationCompletionKey; + state Transaction markTr(cx); + loop { + try { + markTr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + markTr.setOption(FDBTransactionOptions::LOCK_AWARE); + markTr.set(completionMarker, "1"_sr); + wait(markTr.commit()); + TraceEvent("BARV_RestoreCompletionMarkerSet").detail("MarkerKey", printable(completionMarker)); + break; + } catch (Error& e) { + wait(markTr.onError(e)); + } + } + + // Unlock the database after restore completes + wait(runRYWTransaction(cx, [=](Reference tr) -> Future { + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + tr->clear(databaseLockedKey); + return Void(); + })); + + TraceEvent("BARV_DatabaseUnlocked").detail("Tag", printable(restoreTag)); + + return Void(); + } + + ACTOR static Future _start(Database cx, BackupAndRestoreValidationWorkload* self) { + // Only run on client 0 to avoid conflicts + if (self->clientId != 0) { + return Void(); + } + + state FileBackupAgent backupAgent; + state UID randomID = nondeterministicRandom()->randomUniqueID(); + state int retryCount = 0; + + loop { + try { + // Wait before starting backup + wait(delay(self->backupAfter)); + + // Perform backup + TraceEvent("BARV_StartBackup", randomID) + .detail("Tag", printable(self->backupTag)) + .detail("RetryCount", retryCount); + wait(doBackup(self, &backupAgent, cx)); + + // Get backup container info + state KeyBackedTag keyBackedTag = makeBackupTag(self->backupTag.toString()); + UidAndAbortedFlagT uidFlag = wait(keyBackedTag.getOrThrow(cx.getReference())); + state UID logUid = uidFlag.first; + state Reference backupContainer = + wait(BackupConfig(logUid).backupContainer().getD(cx.getReference())); + + // Wait before starting restore + wait(delay(self->restoreAfter - self->backupAfter)); + + // Perform restore with prefix + TraceEvent("BARV_StartRestore", randomID) + .detail("Tag", 
printable(self->backupTag)) + .detail("Container", backupContainer->getURL()); + wait(doRestore(self, &backupAgent, cx, backupContainer)); + + TraceEvent("BARV_Complete", randomID).detail("Tag", printable(self->backupTag)); + break; // Success! + + } catch (Error& e) { + // Retry on transient errors from buggify chaos injection + if (e.code() == error_code_grv_proxy_memory_limit_exceeded || + e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_database_locked || e.code() == error_code_transaction_too_old || + e.code() == error_code_future_version) { + retryCount++; + double backoff = std::min(1.0, 0.1 * retryCount); + TraceEvent(SevWarn, "BARV_RetryableError", randomID) + .error(e) + .detail("RetryCount", retryCount) + .detail("BackoffSeconds", backoff); + wait(delay(backoff)); + // Reset state and retry + self->backupAfter = 0.0; // Don't wait again + self->restoreAfter = self->restoreAfter - self->backupAfter; + // Loop will retry + } else { + TraceEvent(SevError, "BARV_Error", randomID).error(e).detail("RetryCount", retryCount); + throw; + } + } + } + + return Void(); + } +}; + +WorkloadFactory BackupAndRestoreValidationWorkloadFactory; diff --git a/fdbserver/workloads/RestoreValidation.actor.cpp b/fdbserver/workloads/RestoreValidation.actor.cpp new file mode 100644 index 00000000000..14a8450371c --- /dev/null +++ b/fdbserver/workloads/RestoreValidation.actor.cpp @@ -0,0 +1,346 @@ +/* + * RestoreValidation.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// RestoreValidationWorkload triggers and monitors a ValidateRestore audit to verify that +// backup/restore operations correctly restored all data. +// +// This workload is designed to work with BackupAndRestoreValidation workload: +// 1. BackupAndRestoreValidation performs backup and restore with a prefix (e.g., \xff\x02/rlog/) +// 2. BackupAndRestoreValidation sets a completion marker when fully done +// 3. RestoreValidationWorkload waits for the completion marker +// 4. RestoreValidationWorkload triggers a ValidateRestore audit via the audit_storage API +// 5. The audit compares source keys (normalKeys) with restored keys (prefix + normalKeys) +// 6. RestoreValidationWorkload monitors audit progress and reports success/failure +// +// The workload includes: +// - Synchronization: Waits for restore completion marker to avoid racing with restore +// - Retry logic: Retries audit scheduling on transient failures (up to 5 times) +// - Timeout handling: 60s timeout on audit scheduling to detect cluster recovery issues +// - Progress monitoring: Polls audit status every checkInterval seconds +// - Error detection: Fails the test if audit finds missing or mismatched keys + +#include "fdbclient/Audit.h" +#include "fdbclient/AuditUtils.actor.h" +#include "fdbclient/ClusterConnectionFile.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +struct RestoreValidationWorkload : TestWorkload { + static constexpr auto NAME = "RestoreValidation"; + + double validateAfter; + KeyRange validationRange; + int expectedPhase; // Expected AuditPhase (2 = Complete) + bool expectSuccess; + double checkInterval; + double maxWaitTime; + + RestoreValidationWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + validateAfter = getOption(options, "validateAfter"_sr, 50.0); + validationRange = normalKeys; + expectedPhase = getOption(options, "expectedPhase"_sr, (int)AuditPhase::Complete); + expectSuccess = getOption(options, "expectSuccess"_sr, true); + checkInterval = getOption(options, "checkInterval"_sr, 5.0); + maxWaitTime = getOption(options, "maxWaitTime"_sr, 300.0); + + TraceEvent("RestoreValidationWorkloadInit") + .detail("ValidateAfter", validateAfter) + .detail("ExpectedPhase", expectedPhase) + .detail("ExpectSuccess", expectSuccess) + .detail("MaxWaitTime", maxWaitTime); + } + + Future setup(Database const& cx) override { return Void(); } + + Future start(Database const& cx) override { + if (clientId == 0) { + return _start(this, cx); + } + return Void(); + } + + Future check(Database const& cx) override { return true; } + + void getMetrics(std::vector& m) override {} + + ACTOR static Future _start(RestoreValidationWorkload* self, Database cx) { + // Only run on client 0 to avoid conflicts (backup/restore runs on client 0) + if (self->clientId != 0) { + return Void(); + } + + // Wait for the specified time before starting validation + TraceEvent("RestoreValidationWorkloadWaiting").detail("WaitTime", self->validateAfter); + wait(delay(self->validateAfter)); + + // Wait for restore completion marker + // BackupAndRestoreValidation sets this key when restore is fully complete + state Key completionMarker = "\xff\x02/restoreValidationComplete"_sr; + state bool restoreComplete = false; + state int checkAttempts = 0; + + TraceEvent("RestoreValidationWaitingForRestoreCompletion") + .detail("CompletionMarker", 
printable(completionMarker)); + + loop { + try { + state Transaction tr(cx); + tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + + Optional markerValue = wait(tr.get(completionMarker)); + if (markerValue.present()) { + restoreComplete = true; + TraceEvent("RestoreValidationRestoreComplete").detail("CheckAttempts", checkAttempts); + break; + } + + checkAttempts++; + // No max check limit - keep waiting until test timeout or marker appears + // This is necessary because buggify can make operations arbitrarily slow + if (checkAttempts % 12 == 0) { // Log every minute + TraceEvent("RestoreValidationStillWaitingForRestore") + .detail("CheckAttempts", checkAttempts) + .detail("WaitTimeSeconds", checkAttempts * 5); + } + wait(delay(5.0)); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + // Retry on transient errors from buggify chaos injection + if (e.code() == error_code_grv_proxy_memory_limit_exceeded || + e.code() == error_code_commit_proxy_memory_limit_exceeded || + e.code() == error_code_database_locked || e.code() == error_code_transaction_too_old || + e.code() == error_code_future_version || e.code() == error_code_audit_storage_failed || + e.code() == error_code_tag_throttled) { + TraceEvent(SevWarn, "RestoreValidationRetryableError") + .error(e) + .detail("CheckAttempts", checkAttempts); + wait(delay(1.0)); // Backoff before retry + // Loop will retry + } else { + throw; + } + } + } + + TraceEvent("RestoreValidationWorkloadStarting").detail("Range", self->validationRange); + + state int auditRetryCount = 0; + state int maxAuditRetries = 5; + + loop { + try { + // Trigger the audit_storage validate_restore command + state AuditType auditType = AuditType::ValidateRestore; + state Reference clusterFile = cx->getConnectionRecord(); + + TraceEvent("RestoreValidationTriggeringAudit") + .detail("AuditType", (int)auditType) + .detail("Range", self->validationRange) + 
.detail("RetryCount", auditRetryCount); + + // Trigger the audit using ManagementAPI with timeout + // Use shorter timeout for scheduling (60s) to detect cluster issues early + state UID auditId; + try { + UID scheduleResult = wait(timeoutError( + auditStorage( + clusterFile, self->validationRange, auditType, KeyValueStoreType::END, self->maxWaitTime), + 60.0)); + auditId = scheduleResult; + } catch (Error& e) { + if (e.code() == error_code_timed_out) { + TraceEvent(SevWarn, "RestoreValidationAuditScheduleTimeout") + .detail("RetryCount", auditRetryCount) + .detail("MaxRetries", maxAuditRetries); + // Treat as retryable - cluster might be recovering + if (auditRetryCount < maxAuditRetries) { + throw audit_storage_failed(); + } else { + throw; + } + } + throw; + } + + TraceEvent("RestoreValidationAuditScheduled") + .detail("AuditID", auditId) + .detail("RetryCount", auditRetryCount); + + // Monitor audit progress + state double startTime = now(); + state double lastReportTime = startTime; + state bool completed = false; + state AuditPhase finalPhase = AuditPhase::Invalid; + state std::string errorMessage; + + loop { + wait(delay(self->checkInterval)); + + // Get audit status (newFirst=true to get latest states first) + // Add timeout to handle cluster recovery/instability + state std::vector auditStates; + try { + std::vector states = + wait(timeoutError(getAuditStates(cx, auditType, true), 60.0)); + auditStates = states; + } catch (Error& e) { + if (e.code() == error_code_timed_out) { + // Cluster is likely recovering - check overall timeout and continue + if (now() - startTime > self->maxWaitTime) { + TraceEvent(SevError, "RestoreValidationTimeout") + .detail("AuditID", auditId) + .detail("ElapsedTime", now() - startTime) + .detail("MaxWaitTime", self->maxWaitTime) + .detail("Reason", "getAuditStates timed out"); + throw timed_out(); + } + continue; // Skip this iteration, try again + } + throw; + } + + // Filter for our audit ID + state bool foundOurAudit = 
false; + state bool allComplete = true; + state bool anyError = false; + + for (const auto& state : auditStates) { + if (state.id == auditId) { + foundOurAudit = true; + + if (state.getPhase() == AuditPhase::Running) { + allComplete = false; + } else if (state.getPhase() == AuditPhase::Error || + state.getPhase() == AuditPhase::Failed) { + anyError = true; + finalPhase = state.getPhase(); + if (!state.error.empty()) { + errorMessage = state.error; + } else { + errorMessage = "Unknown error"; + } + } else if (state.getPhase() == AuditPhase::Complete) { + finalPhase = AuditPhase::Complete; + } + } + } + + if (!foundOurAudit) { + TraceEvent(SevWarn, "RestoreValidationNoAuditStates") + .detail("AuditID", auditId) + .detail("ElapsedTime", now() - startTime); + } else { + // Report progress periodically + if (now() - lastReportTime >= 10.0) { + TraceEvent("RestoreValidationProgress") + .detail("AuditID", auditId) + .detail("AllComplete", allComplete) + .detail("AnyError", anyError) + .detail("FinalPhase", (int)finalPhase) + .detail("ElapsedTime", now() - startTime); + lastReportTime = now(); + } + + if (allComplete || anyError) { + completed = true; + break; + } + } + + // Check timeout + if (now() - startTime > self->maxWaitTime) { + TraceEvent(SevError, "RestoreValidationTimeout") + .detail("AuditID", auditId) + .detail("ElapsedTime", now() - startTime) + .detail("MaxWaitTime", self->maxWaitTime); + throw timed_out(); + } + } + + // Verify the results + TraceEvent("RestoreValidationComplete") + .detail("AuditID", auditId) + .detail("FinalPhase", (int)finalPhase) + .detail("ExpectedPhase", self->expectedPhase) + .detail("ErrorMessage", errorMessage) + .detail("ElapsedTime", now() - startTime); + + if (self->expectSuccess) { + if (finalPhase != AuditPhase::Complete) { + // Log as warning since we may retry - only becomes error if all retries fail + TraceEvent(SevWarn, "RestoreValidationUnexpectedPhase") + .detail("AuditID", auditId) + .detail("FinalPhase", 
(int)finalPhase) + .detail("ExpectedPhase", self->expectedPhase) + .detail("ErrorMessage", errorMessage); + throw audit_storage_failed(); + } + if (!errorMessage.empty()) { + TraceEvent(SevError, "RestoreValidationUnexpectedError") + .detail("AuditID", auditId) + .detail("ErrorMessage", errorMessage); + throw audit_storage_error(); + } + } else { + if (finalPhase == AuditPhase::Complete) { + TraceEvent(SevError, "RestoreValidationUnexpectedSuccess") + .detail("AuditID", auditId) + .detail("ExpectedPhase", self->expectedPhase); + throw audit_storage_task_outdated(); + } + } + + TraceEvent("RestoreValidationSuccess").detail("AuditID", auditId); + break; // Success! + + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + // Retry audit on failures caused by cluster instability during buggify + if (e.code() == error_code_audit_storage_failed && auditRetryCount < maxAuditRetries) { + auditRetryCount++; + double backoff = std::min(10.0, 2.0 * auditRetryCount); + TraceEvent(SevWarn, "RestoreValidationAuditRetry") + .error(e) + .detail("RetryCount", auditRetryCount) + .detail("MaxRetries", maxAuditRetries) + .detail("BackoffSeconds", backoff); + wait(delay(backoff)); + // Loop will retry the entire audit + } else { + TraceEvent(SevError, "RestoreValidationError") + .errorUnsuppressed(e) + .detail("RetryCount", auditRetryCount); + throw; + } + } + } + + return Void(); + } +}; + +WorkloadFactory RestoreValidationWorkloadFactory; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1ed5676fbc4..20c6a7c8d50 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -154,6 +154,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/BackupCorrectnessClean.toml) add_fdb_test(TEST_FILES fast/BackupToDBCorrectness.toml) add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.toml) + add_fdb_test(TEST_FILES fast/RestoreValidation_Simple.toml) # IGNORE below because we do not use tenant mode and it is likely to be # deprecated/removed. 
rdar://157717454, rdar://134522214. diff --git a/tests/fast/RestoreValidation_Simple.toml b/tests/fast/RestoreValidation_Simple.toml new file mode 100644 index 00000000000..7009472fd82 --- /dev/null +++ b/tests/fast/RestoreValidation_Simple.toml @@ -0,0 +1,42 @@ +# RestoreValidation_Simple.toml +# +# Simple test to validate restore correctness using audit_storage +# +# Flow: +# 1. Cycle generates data +# 2. Backup and restore to \xff\x02/rlog/ prefix +# 3. Validation audits the restored data + +testClass = "Backup" + +[configuration] +buggify = false +faultInjection = false +generateFearless = false +minimumRegions = 1 +config = "single usable_regions=1 storage_engine=ssd-2 tenant_mode=disabled" + +[[test]] +testTitle = 'RestoreValidationSimple' +clearAfterTest = false +simBackupAgents = 'BackupToFile' +waitForQuiescenceBegin = false +runFailureWorkloads = false + + [[test.workload]] + testName = 'Cycle' + nodeCount = 100 + transactionsPerSecond = 10.0 + testDuration = 10.0 + expectedRate = 0 + + [[test.workload]] + testName = 'BackupAndRestoreValidation' + backupAfter = 60.0 + restoreAfter = 110.0 + addPrefix = '\xff\x02/rlog/' + + [[test.workload]] + testName = 'RestoreValidation' + validateAfter = 130.0 + maxWaitTime = 600.0 From b8be20eae2e6b5380186e34f7b28e329d3bfb9ba Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 09:56:42 -0800 Subject: [PATCH 02/15] Fix ValidateRestore audit hanging when no storage servers found for shard --- fdbserver/DataDistribution.actor.cpp | 50 +++++++++++++++++----------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 8855e903529..ceed4a61d2a 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4450,29 +4450,39 @@ ACTOR Future scheduleAuditOnRange(Reference self, .detail("AuditType", auditType); return Void(); } - } else if (auditType == AuditType::ValidateRestore) { - // 
select a server from primary DC to do audit - // ValidateRestore compares source vs restored data, single replica is fine - int dcid = 0; - for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { - if (dcid == 0) { - // in primary DC randomly select a server to do the audit task - const int idx = deterministicRandom()->randomInt(0, dcServers.size()); - targetServer = dcServers[idx]; - } - for (int i = 0; i < dcServers.size(); i++) { - if (dcServers[i].id() == targetServer.id()) { - ASSERT_WE_THINK(dcid == 0); - } else { - req.targetServers.push_back(dcServers[i].id()); - } - storageServersToCheck.push_back(dcServers[i]); + } else if (auditType == AuditType::ValidateRestore) { + // select a server from primary DC to do audit + // ValidateRestore compares source vs restored data, single replica is fine + if (rangeLocations[rangeLocationIndex].servers.empty()) { + TraceEvent(SevInfo, "DDScheduleAuditOnRangeEnd", self->ddId) + .detail("Reason", "No servers found for shard") + .detail("AuditID", audit->coreState.id) + .detail("AuditRange", audit->coreState.range) + .detail("TaskRange", taskRange) + .detail("AuditType", auditType); + ++numSkippedShards; + break; // Skip this shard, move to next range + } + int dcid = 0; + for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcid == 0) { + // in primary DC randomly select a server to do the audit task + const int idx = deterministicRandom()->randomInt(0, dcServers.size()); + targetServer = dcServers[idx]; + } + for (int i = 0; i < dcServers.size(); i++) { + if (dcServers[i].id() == targetServer.id()) { + ASSERT_WE_THINK(dcid == 0); + } else { + req.targetServers.push_back(dcServers[i].id()); } - dcid++; + storageServersToCheck.push_back(dcServers[i]); } - } else { - UNREACHABLE(); + dcid++; } + } else { + UNREACHABLE(); + } // Set doAuditOnStorageServer ASSERT(audit->remainingBudgetForAuditTasks.get() >= 0); while (audit->remainingBudgetForAuditTasks.get() == 0) { 
From b83b5c0de54ab878728bdc3eb9a9df903114617f Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 09:58:43 -0800 Subject: [PATCH 03/15] Formatting --- fdbclient/FileBackupAgent.actor.cpp | 80 ++++++++++++++-------------- fdbserver/DataDistribution.actor.cpp | 58 ++++++++++---------- 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index e7bcdfab14b..8dbd7fecd19 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -2992,19 +2992,19 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { // Coalesce the shard map to make random selection below more efficient. shardMap.coalesce(allKeys); - wait(yield()); + wait(yield()); - // In this context "all" refers to all of the shards relevant for this particular backup - state int countAllShards = countShardsDone + countShardsNotDone; + // In this context "all" refers to all of the shards relevant for this particular backup + state int countAllShards = countShardsDone + countShardsNotDone; - // NOTE: Don't finish here even if countShardsNotDone == 0. We need to dispatch tasks first. - // The completion check after dispatch (with dispatchedInThisIteration guard) prevents - // finishing in the same iteration we dispatch the last tasks. - if (countShardsNotDone == 0) { - TraceEvent("FileBackupSnapshotDispatchAllDoneBeforeDispatch") - .detail("BackupUID", config.getUid()) - .detail("Note", "Will check again after dispatch loop"); - } + // NOTE: Don't finish here even if countShardsNotDone == 0. We need to dispatch tasks first. + // The completion check after dispatch (with dispatchedInThisIteration guard) prevents + // finishing in the same iteration we dispatch the last tasks. 
+ if (countShardsNotDone == 0) { + TraceEvent("FileBackupSnapshotDispatchAllDoneBeforeDispatch") + .detail("BackupUID", config.getUid()) + .detail("Note", "Will check again after dispatch loop"); + } // Decide when the next snapshot dispatch should run. state Version nextDispatchVersion; @@ -3075,14 +3075,14 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { .detail("SnapshotTargetEndVersion", snapshotTargetEndVersion) .detail("NextDispatchVersion", nextDispatchVersion) .detail("CurrentVersion", recentReadVersion) - .detail("TimeElapsed", timeElapsed) - .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); + .detail("TimeElapsed", timeElapsed) + .detail("SnapshotIntervalSeconds", snapshotIntervalSeconds); - // Track whether we dispatched any tasks in this iteration - state bool dispatchedInThisIteration = false; + // Track whether we dispatched any tasks in this iteration + state bool dispatchedInThisIteration = false; - // Dispatch random shards to catch up to the expected progress - while (countShardsToDispatch > 0) { + // Dispatch random shards to catch up to the expected progress + while (countShardsToDispatch > 0) { // First select ranges to add state std::vector rangesToAdd; @@ -3211,20 +3211,20 @@ struct BackupSnapshotDispatchTask : BackupTaskFuncBase { } } - wait(waitForAll(addTaskFutures)); - wait(tr->commit()); - dispatchedInThisIteration = true; - break; - } catch (Error& e) { - wait(tr->onError(e)); + wait(waitForAll(addTaskFutures)); + wait(tr->commit()); + dispatchedInThisIteration = true; + break; + } catch (Error& e) { + wait(tr->onError(e)); + } } } - } - // Only finish if all shards are done AND we didn't dispatch any tasks this iteration. - // This prevents the bug where we mark snapshot finished immediately after dispatching - // the last batch of tasks, before they actually complete. 
- if (countShardsNotDone == 0 && !dispatchedInThisIteration) { + // Only finish if all shards are done AND we didn't dispatch any tasks this iteration. + // This prevents the bug where we mark snapshot finished immediately after dispatching + // the last batch of tasks, before they actually complete. + if (countShardsNotDone == 0 && !dispatchedInThisIteration) { TraceEvent("FileBackupSnapshotDispatchFinished") .detail("BackupUID", config.getUid()) .detail("AllShards", countAllShards) @@ -6979,20 +6979,20 @@ class FileBackupAgentImpl { oldRestore.clear(tr); } - if (!onlyApplyMutationLogs) { - state int index; - for (index = 0; index < restoreRanges.size(); index++) { - KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end) - .removePrefix(removePrefix) - .withPrefix(addPrefix); - RangeResult existingRows = wait(tr->getRange(restoreIntoRange, 1)); - // Allow restoring over existing data when restoring with a prefix (for validation) - // addPrefix.size() > 0 indicates this is a validation restore - if (existingRows.size() > 0 && addPrefix.size() == 0) { - throw restore_destination_not_empty(); + if (!onlyApplyMutationLogs) { + state int index; + for (index = 0; index < restoreRanges.size(); index++) { + KeyRange restoreIntoRange = KeyRangeRef(restoreRanges[index].begin, restoreRanges[index].end) + .removePrefix(removePrefix) + .withPrefix(addPrefix); + RangeResult existingRows = wait(tr->getRange(restoreIntoRange, 1)); + // Allow restoring over existing data when restoring with a prefix (for validation) + // addPrefix.size() > 0 indicates this is a validation restore + if (existingRows.size() > 0 && addPrefix.size() == 0) { + throw restore_destination_not_empty(); + } } } - } // Make new restore config state RestoreConfig restore(uid); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index ceed4a61d2a..d57039a597f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ 
b/fdbserver/DataDistribution.actor.cpp @@ -4450,39 +4450,39 @@ ACTOR Future scheduleAuditOnRange(Reference self, .detail("AuditType", auditType); return Void(); } - } else if (auditType == AuditType::ValidateRestore) { - // select a server from primary DC to do audit - // ValidateRestore compares source vs restored data, single replica is fine - if (rangeLocations[rangeLocationIndex].servers.empty()) { - TraceEvent(SevInfo, "DDScheduleAuditOnRangeEnd", self->ddId) - .detail("Reason", "No servers found for shard") - .detail("AuditID", audit->coreState.id) - .detail("AuditRange", audit->coreState.range) - .detail("TaskRange", taskRange) - .detail("AuditType", auditType); - ++numSkippedShards; - break; // Skip this shard, move to next range - } - int dcid = 0; - for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { - if (dcid == 0) { - // in primary DC randomly select a server to do the audit task - const int idx = deterministicRandom()->randomInt(0, dcServers.size()); - targetServer = dcServers[idx]; + } else if (auditType == AuditType::ValidateRestore) { + // select a server from primary DC to do audit + // ValidateRestore compares source vs restored data, single replica is fine + if (rangeLocations[rangeLocationIndex].servers.empty()) { + TraceEvent(SevInfo, "DDScheduleAuditOnRangeEnd", self->ddId) + .detail("Reason", "No servers found for shard") + .detail("AuditID", audit->coreState.id) + .detail("AuditRange", audit->coreState.range) + .detail("TaskRange", taskRange) + .detail("AuditType", auditType); + ++numSkippedShards; + break; // Skip this shard, move to next range } - for (int i = 0; i < dcServers.size(); i++) { - if (dcServers[i].id() == targetServer.id()) { - ASSERT_WE_THINK(dcid == 0); - } else { - req.targetServers.push_back(dcServers[i].id()); + int dcid = 0; + for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcid == 0) { + // in primary DC randomly select a server to do the audit task + 
const int idx = deterministicRandom()->randomInt(0, dcServers.size()); + targetServer = dcServers[idx]; } - storageServersToCheck.push_back(dcServers[i]); + for (int i = 0; i < dcServers.size(); i++) { + if (dcServers[i].id() == targetServer.id()) { + ASSERT_WE_THINK(dcid == 0); + } else { + req.targetServers.push_back(dcServers[i].id()); + } + storageServersToCheck.push_back(dcServers[i]); + } + dcid++; } - dcid++; + } else { + UNREACHABLE(); } - } else { - UNREACHABLE(); - } // Set doAuditOnStorageServer ASSERT(audit->remainingBudgetForAuditTasks.get() >= 0); while (audit->remainingBudgetForAuditTasks.get() == 0) { From 8f968f5a04a7230fc1cc50ddf9276c25713f1819 Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 10:07:09 -0800 Subject: [PATCH 04/15] Fix audit stuck in Running state after wrong_shard_server errors When too many wrong_shard_server errors occur (stale shard location data), throw audit_storage_failed instead of audit_storage_cancelled. This ensures the audit is properly marked as Failed in the database rather than staying stuck in Running state. Also add a delay before retrying to let data distribution stabilize. 
--- fdbserver/DataDistribution.actor.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d57039a597f..56ea27324a8 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4737,11 +4737,14 @@ ACTOR Future doAuditOnStorageServer(Reference self, audit->foundError = true; } else if (e.code() == error_code_wrong_shard_server) { // wrong_shard_server means stale shard location data - // Don't retry within DD - throw to let workload retry at higher level + // Retry a few times to see if data distribution stabilizes if (audit->retryCount >= 3) { - throw audit_storage_cancelled(); // Let workload retry with fresh DD dispatch + // After retries, fail the audit so it can be retried from scratch + throw audit_storage_failed(); } audit->retryCount++; + // Wait a bit before retrying to let data distribution stabilize + wait(delay(1.0)); audit->actors.add(scheduleAuditOnRange(self, audit, req.range)); } else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) { throw audit_storage_failed(); From be1e6502f85dc54c5bb8f775639c845dc22d3cab Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 10:36:50 -0800 Subject: [PATCH 05/15] Add delay after restore to let data stabilize before validation The restore API can return success before all restored data is fully committed and visible to readers. Add a 5-second delay after restore completes before setting the completion marker. This prevents the validation audit from running too early and finding false mismatches due to in-flight commits. 
--- fdbserver/workloads/BackupAndRestoreValidation.actor.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp b/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp index e41b6b93c38..46a46bb616d 100644 --- a/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp +++ b/fdbserver/workloads/BackupAndRestoreValidation.actor.cpp @@ -142,6 +142,11 @@ struct BackupAndRestoreValidationWorkload : TestWorkload { .detail("Tag", printable(restoreTag)) .detail("AddPrefix", printable(self->addPrefix)); + // Wait a bit to ensure all restored data is committed and visible + // The restore API returns success before all data is fully flushed to storage servers + wait(delay(5.0)); + TraceEvent("BARV_RestoreDataStabilizationWait").detail("WaitTime", 5.0); + // Write a completion marker so RestoreValidation knows restore is fully done state Key completionMarker = restoreValidationCompletionKey; state Transaction markTr(cx); From 6e67cd702f00edf2e9610c22545ae9646a149d2a Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 12:54:12 -0800 Subject: [PATCH 06/15] Remove delay from wrong_shard_server error handler The delay in the error path could interfere with actor cleanup or cause issues in other audit types. The retry itself should be sufficient to allow data distribution to stabilize. 
--- fdbserver/DataDistribution.actor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 56ea27324a8..d7408893df7 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4743,8 +4743,6 @@ ACTOR Future doAuditOnStorageServer(Reference self, throw audit_storage_failed(); } audit->retryCount++; - // Wait a bit before retrying to let data distribution stabilize - wait(delay(1.0)); audit->actors.add(scheduleAuditOnRange(self, audit, req.range)); } else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) { throw audit_storage_failed(); From d210c37822c8c422c718dcaff5bcf46b5d013530 Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 12:57:04 -0800 Subject: [PATCH 07/15] Fix crash when audit finds shard with no servers When rangeLocations[].servers is empty, we were breaking out of the inner loop but continuing execution, which led to using uninitialized targetServer variable at line 4538. This caused crashes/undefined behavior. Fix: Set taskRangeBegin to skip the entire range and continue the loop, avoiding the use of uninitialized targetServer. 
--- fdbserver/DataDistribution.actor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d7408893df7..b0205f85417 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4454,14 +4454,16 @@ ACTOR Future scheduleAuditOnRange(Reference self, // select a server from primary DC to do audit // ValidateRestore compares source vs restored data, single replica is fine if (rangeLocations[rangeLocationIndex].servers.empty()) { - TraceEvent(SevInfo, "DDScheduleAuditOnRangeEnd", self->ddId) + TraceEvent(SevInfo, "DDScheduleAuditOnRangeSkipped", self->ddId) .detail("Reason", "No servers found for shard") .detail("AuditID", audit->coreState.id) .detail("AuditRange", audit->coreState.range) .detail("TaskRange", taskRange) .detail("AuditType", auditType); ++numSkippedShards; - break; // Skip this shard, move to next range + // Skip the entire task range (all audit states for this shard) + taskRangeBegin = taskRange.end; + continue; // Continue to check if there are more states in this range } int dcid = 0; for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { From 78f2f50a196bce5b7826b4603b904013518a87f2 Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 13:45:24 -0800 Subject: [PATCH 08/15] Fix compilation error: rename loop variable from 'state' to 'auditState' The actor compiler was confused by using 'state' as a loop variable name since 'state' is a keyword in actor code. Renamed to 'auditState' to avoid the conflict. 
--- .../workloads/RestoreValidation.actor.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/fdbserver/workloads/RestoreValidation.actor.cpp b/fdbserver/workloads/RestoreValidation.actor.cpp index 14a8450371c..4676bcefe15 100644 --- a/fdbserver/workloads/RestoreValidation.actor.cpp +++ b/fdbserver/workloads/RestoreValidation.actor.cpp @@ -221,31 +221,31 @@ struct RestoreValidationWorkload : TestWorkload { throw; } - // Filter for our audit ID - state bool foundOurAudit = false; - state bool allComplete = true; - state bool anyError = false; - - for (const auto& state : auditStates) { - if (state.id == auditId) { - foundOurAudit = true; - - if (state.getPhase() == AuditPhase::Running) { - allComplete = false; - } else if (state.getPhase() == AuditPhase::Error || - state.getPhase() == AuditPhase::Failed) { - anyError = true; - finalPhase = state.getPhase(); - if (!state.error.empty()) { - errorMessage = state.error; - } else { - errorMessage = "Unknown error"; - } - } else if (state.getPhase() == AuditPhase::Complete) { - finalPhase = AuditPhase::Complete; + // Filter for our audit ID + state bool foundOurAudit = false; + state bool allComplete = true; + state bool anyError = false; + + for (const auto& auditState : auditStates) { + if (auditState.id == auditId) { + foundOurAudit = true; + + if (auditState.getPhase() == AuditPhase::Running) { + allComplete = false; + } else if (auditState.getPhase() == AuditPhase::Error || + auditState.getPhase() == AuditPhase::Failed) { + anyError = true; + finalPhase = auditState.getPhase(); + if (!auditState.error.empty()) { + errorMessage = auditState.error; + } else { + errorMessage = "Unknown error"; } + } else if (auditState.getPhase() == AuditPhase::Complete) { + finalPhase = AuditPhase::Complete; } } + } if (!foundOurAudit) { TraceEvent(SevWarn, "RestoreValidationNoAuditStates") From cc061f0fb19fa5ff3cb4b4fb7a9eacbd7705e7de Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 
Nov 2025 14:01:30 -0800 Subject: [PATCH 09/15] Formatting --- .../workloads/RestoreValidation.actor.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/fdbserver/workloads/RestoreValidation.actor.cpp b/fdbserver/workloads/RestoreValidation.actor.cpp index 4676bcefe15..289c4edae5f 100644 --- a/fdbserver/workloads/RestoreValidation.actor.cpp +++ b/fdbserver/workloads/RestoreValidation.actor.cpp @@ -221,31 +221,31 @@ struct RestoreValidationWorkload : TestWorkload { throw; } - // Filter for our audit ID - state bool foundOurAudit = false; - state bool allComplete = true; - state bool anyError = false; - - for (const auto& auditState : auditStates) { - if (auditState.id == auditId) { - foundOurAudit = true; - - if (auditState.getPhase() == AuditPhase::Running) { - allComplete = false; - } else if (auditState.getPhase() == AuditPhase::Error || - auditState.getPhase() == AuditPhase::Failed) { - anyError = true; - finalPhase = auditState.getPhase(); - if (!auditState.error.empty()) { - errorMessage = auditState.error; - } else { - errorMessage = "Unknown error"; + // Filter for our audit ID + state bool foundOurAudit = false; + state bool allComplete = true; + state bool anyError = false; + + for (const auto& auditState : auditStates) { + if (auditState.id == auditId) { + foundOurAudit = true; + + if (auditState.getPhase() == AuditPhase::Running) { + allComplete = false; + } else if (auditState.getPhase() == AuditPhase::Error || + auditState.getPhase() == AuditPhase::Failed) { + anyError = true; + finalPhase = auditState.getPhase(); + if (!auditState.error.empty()) { + errorMessage = auditState.error; + } else { + errorMessage = "Unknown error"; + } + } else if (auditState.getPhase() == AuditPhase::Complete) { + finalPhase = AuditPhase::Complete; } - } else if (auditState.getPhase() == AuditPhase::Complete) { - finalPhase = AuditPhase::Complete; } } - } if (!foundOurAudit) { TraceEvent(SevWarn, "RestoreValidationNoAuditStates") From 
9850620c8f89e29ea875538f8e1830935ded762f Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 16:29:52 -0800 Subject: [PATCH 10/15] Fix wrong_shard_server handling to avoid retry loops Instead of adding recursive retry actors that can multiply and cause hangs, let wrong_shard_server errors propagate up to be handled by the higher-level error handlers. This prevents concurrent actors from all incrementing retryCount simultaneously and creating retry storms. --- fdbserver/DataDistribution.actor.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index b0205f85417..e5f9a3c334c 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4738,14 +4738,9 @@ ACTOR Future doAuditOnStorageServer(Reference self, } else if (e.code() == error_code_audit_storage_error) { audit->foundError = true; } else if (e.code() == error_code_wrong_shard_server) { - // wrong_shard_server means stale shard location data - // Retry a few times to see if data distribution stabilizes - if (audit->retryCount >= 3) { - // After retries, fail the audit so it can be retried from scratch - throw audit_storage_failed(); - } - audit->retryCount++; - audit->actors.add(scheduleAuditOnRange(self, audit, req.range)); + // wrong_shard_server means stale shard location data - treat as transient error + // Let the higher-level retry logic handle it + throw e; } else if (audit->retryCount >= SERVER_KNOBS->AUDIT_RETRY_COUNT_MAX) { throw audit_storage_failed(); } else { From 821dd18342756874b09a33dc2eed6482ded8163e Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 22:41:32 -0800 Subject: [PATCH 11/15] Add safety check for empty dcServers vector in ValidateRestore audit Even if the servers map is non-empty, individual DC server vectors could be empty. This would cause randomInt(0, 0) and out-of-bounds access. 
Skip empty DC server vectors to prevent crashes. --- fdbserver/DataDistribution.actor.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index e5f9a3c334c..3ccd3bdc12d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4467,6 +4467,11 @@ ACTOR Future scheduleAuditOnRange(Reference self, } int dcid = 0; for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcServers.empty()) { + // Skip empty server lists for this DC + dcid++; + continue; + } if (dcid == 0) { // in primary DC randomly select a server to do the audit task const int idx = deterministicRandom()->randomInt(0, dcServers.size()); From c156f8013a5588a180a8a0126e7134c08dd77330 Mon Sep 17 00:00:00 2001 From: michael stack Date: Fri, 21 Nov 2025 22:42:11 -0800 Subject: [PATCH 12/15] Add additional safety check for when all dcServers are empty After skipping empty dcServers vectors, if storageServersToCheck is still empty, it means all DC server lists were empty. In this case, targetServer would never be initialized. Skip the entire shard to prevent using uninitialized targetServer. 
--- fdbserver/DataDistribution.actor.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3ccd3bdc12d..182299d9b79 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4487,6 +4487,18 @@ ACTOR Future scheduleAuditOnRange(Reference self, } dcid++; } + // If all dcServers were empty, skip this shard + if (storageServersToCheck.empty()) { + TraceEvent(SevInfo, "DDScheduleAuditOnRangeSkipped", self->ddId) + .detail("Reason", "All DC server lists empty") + .detail("AuditID", audit->coreState.id) + .detail("AuditRange", audit->coreState.range) + .detail("TaskRange", taskRange) + .detail("AuditType", auditType); + ++numSkippedShards; + taskRangeBegin = taskRange.end; + continue; + } } else { UNREACHABLE(); } From 6f8be78e5055af89000fe2dfbb40717afed884b2 Mon Sep 17 00:00:00 2001 From: michael stack Date: Sat, 22 Nov 2025 12:39:40 -0800 Subject: [PATCH 13/15] Fix division by zero in audit dispatch when all states are Running/Failed When all audit states are Running or Failed and skipped, totalCount remains 0. The CompleteRatio calculation then divides by zero, causing a floating point exception (SIGFPE) and process crash with exit code -2. This was the root cause of the -2 crashes in general test runs. The crashes occurred when ValidateHA or ValidateReplica audits (used in general tests) hit DD failovers and temporarily had all states in Running/Failed status. 
--- fdbserver/DataDistribution.actor.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 182299d9b79..861976dfb05 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4248,13 +4248,13 @@ ACTOR Future dispatchAuditStorage(Reference self, std::sh } wait(delay(0.1)); } - TraceEvent(SevInfo, "DDDispatchAuditStorageEnd", self->ddId) - .detail("AuditID", audit->coreState.id) - .detail("Range", range) - .detail("AuditType", auditType) - .detail("TotalRanges", totalCount) - .detail("TotalComplete", completedCount) - .detail("CompleteRatio", completedCount * 1.0 / totalCount); + TraceEvent(SevInfo, "DDDispatchAuditStorageEnd", self->ddId) + .detail("AuditID", audit->coreState.id) + .detail("Range", range) + .detail("AuditType", auditType) + .detail("TotalRanges", totalCount) + .detail("TotalComplete", completedCount) + .detail("CompleteRatio", totalCount > 0 ? completedCount * 1.0 / totalCount : 0.0); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { From 77618b60e87688c8846c158400d5208d36ce1680 Mon Sep 17 00:00:00 2001 From: michael stack Date: Sat, 22 Nov 2025 17:01:07 -0800 Subject: [PATCH 14/15] Fix uninitialized targetServer when first DC has empty server list The targetServer was only set when dcid == 0, but dcid gets incremented even for empty DC server lists (via continue). So if the first DC had an empty server list, dcid would be 1 when we encounter the first non-empty DC, and targetServer would never be set, causing a crash when accessed. Fixed by using a targetServerSet flag instead of checking dcid == 0. Now targetServer is set on the FIRST non-empty DC, regardless of index. This was the root cause of -2 crashes in general test runs. 
--- fdbserver/DataDistribution.actor.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 861976dfb05..b7e4b6f6a38 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4465,18 +4465,20 @@ ACTOR Future scheduleAuditOnRange(Reference self, taskRangeBegin = taskRange.end; continue; // Continue to check if there are more states in this range } - int dcid = 0; - for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { - if (dcServers.empty()) { - // Skip empty server lists for this DC - dcid++; - continue; - } - if (dcid == 0) { - // in primary DC randomly select a server to do the audit task - const int idx = deterministicRandom()->randomInt(0, dcServers.size()); - targetServer = dcServers[idx]; - } + int dcid = 0; + bool targetServerSet = false; + for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcServers.empty()) { + // Skip empty server lists for this DC + dcid++; + continue; + } + if (!targetServerSet) { + // On first non-empty DC, randomly select a server to do the audit task + const int idx = deterministicRandom()->randomInt(0, dcServers.size()); + targetServer = dcServers[idx]; + targetServerSet = true; + } for (int i = 0; i < dcServers.size(); i++) { if (dcServers[i].id() == targetServer.id()) { ASSERT_WE_THINK(dcid == 0); From c7a1faf34a4fb0a02578f0d0ab4d59096d3d0c33 Mon Sep 17 00:00:00 2001 From: michael stack Date: Sun, 23 Nov 2025 09:38:52 -0800 Subject: [PATCH 15/15] Formatting --- fdbserver/DataDistribution.actor.cpp | 42 ++++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index b7e4b6f6a38..d5c268d7d2d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4248,13 
+4248,13 @@ ACTOR Future dispatchAuditStorage(Reference self, std::sh } wait(delay(0.1)); } - TraceEvent(SevInfo, "DDDispatchAuditStorageEnd", self->ddId) - .detail("AuditID", audit->coreState.id) - .detail("Range", range) - .detail("AuditType", auditType) - .detail("TotalRanges", totalCount) - .detail("TotalComplete", completedCount) - .detail("CompleteRatio", totalCount > 0 ? completedCount * 1.0 / totalCount : 0.0); + TraceEvent(SevInfo, "DDDispatchAuditStorageEnd", self->ddId) + .detail("AuditID", audit->coreState.id) + .detail("Range", range) + .detail("AuditType", auditType) + .detail("TotalRanges", totalCount) + .detail("TotalComplete", completedCount) + .detail("CompleteRatio", totalCount > 0 ? completedCount * 1.0 / totalCount : 0.0); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { @@ -4465,20 +4465,20 @@ ACTOR Future scheduleAuditOnRange(Reference self, taskRangeBegin = taskRange.end; continue; // Continue to check if there are more states in this range } - int dcid = 0; - bool targetServerSet = false; - for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { - if (dcServers.empty()) { - // Skip empty server lists for this DC - dcid++; - continue; - } - if (!targetServerSet) { - // On first non-empty DC, randomly select a server to do the audit task - const int idx = deterministicRandom()->randomInt(0, dcServers.size()); - targetServer = dcServers[idx]; - targetServerSet = true; - } + int dcid = 0; + bool targetServerSet = false; + for (const auto& [_, dcServers] : rangeLocations[rangeLocationIndex].servers) { + if (dcServers.empty()) { + // Skip empty server lists for this DC + dcid++; + continue; + } + if (!targetServerSet) { + // On first non-empty DC, randomly select a server to do the audit task + const int idx = deterministicRandom()->randomInt(0, dcServers.size()); + targetServer = dcServers[idx]; + targetServerSet = true; + } for (int i = 0; i < dcServers.size(); i++) { if (dcServers[i].id() == 
targetServer.id()) { ASSERT_WE_THINK(dcid == 0);