From cf3480c49509347e733464d770992766ebd03416 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 12:04:03 -0700 Subject: [PATCH 1/5] Add TTL values for client caches of key locations So the client may automatically remove stale entries, e.g., storage server has since changed IP addresses. --- fdbclient/ClientKnobs.cpp | 4 ++++ fdbclient/DatabaseContext.actor.cpp | 11 ----------- fdbclient/NativeAPI.actor.cpp | 14 ++++++++++++-- fdbclient/include/fdbclient/ClientKnobs.h | 6 ++++++ fdbclient/include/fdbclient/DatabaseContext.h | 13 ++++++++++--- 5 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 6957c0eb446..56d57969328 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -97,6 +97,10 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); + // TTL disabled by default to preserve existing behavior; set > 0 to enable + init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); + // When cache entry is used, extend its expiration by this amount (sliding window) + init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index 8fb45d23c67..e68f2bcc9c7 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -972,17 +972,6 @@ void updateLocationCacheWithCaches(DatabaseContext* self, } } -Reference addCaches(const Reference& loc, - const std::vector>>& other) { - std::vector>> 
interfaces; - interfaces.reserve(loc->size() + other.size()); - for (int i = 0; i < loc->size(); ++i) { - interfaces.emplace_back((*loc)[i]); - } - interfaces.insert(interfaces.end(), other.begin(), other.end()); - return makeReference(interfaces, true); -} - ACTOR static Future handleTssMismatches(DatabaseContext* cx) { state Reference tr; state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index edf38c1ee8f..e0ec33f0a66 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -168,8 +168,13 @@ Optional DatabaseContext::getCachedLocation(const TenantIn auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); - if (range->value()) { - return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value()); + auto& loc = range->value(); + if (loc) { + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) { + loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } + return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc); } return Optional(); @@ -200,6 +205,10 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, result.clear(); return false; } + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) { + r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); if (result.size() == limit || begin == end) { break; @@ -224,6 +233,7 @@ Reference DatabaseContext::setCachedLocation(const KeyRangeRef& ab int maxEvictionAttempts = 100, attempts = 0; auto loc = makeReference(serverRefs); + // 
TODO: ideally remove based on TTL expiration times, instead of random while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted"); attempts++; diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index e174d28484f..b02c33e407b 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -94,6 +94,12 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl 0, each key-location cache entry expires this many seconds after insertion. + // Default 0 disables TTL expiration and keeps current behavior. + double LOCATION_CACHE_ENTRY_TTL; + // If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. + double LOCATION_CACHE_ENTRY_REFRESH_TIME; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index dd76f359651..6bf6ee0342a 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -64,15 +64,22 @@ class StorageServerInfo : public ReferencedInterface { struct LocationInfo : MultiInterface>, FastAllocated { using Locations = MultiInterface>; explicit LocationInfo(const std::vector>>& v) - : Locations(v) {} + : Locations(v), + expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL + : 0.0) {} LocationInfo(const std::vector>>& v, bool hasCaches) - : Locations(v), hasCaches(hasCaches) {} + : Locations(v), hasCaches(hasCaches), + expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? 
now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL + : 0.0) {} LocationInfo(const LocationInfo&) = delete; LocationInfo(LocationInfo&&) = delete; LocationInfo& operator=(const LocationInfo&) = delete; LocationInfo& operator=(LocationInfo&&) = delete; - bool hasCaches = false; Reference locations() { return Reference::addRef(this); } + + bool hasCaches = false; + // Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled). + double expireTime = 0.0; }; using CommitProxyInfo = ModelInterface; From bc933ead9fc1b1b7d614b71451da60b1160a2ec2 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 12:24:13 -0700 Subject: [PATCH 2/5] Add the actor to clean up expired location cache entries --- fdbclient/ClientKnobs.cpp | 2 + fdbclient/DatabaseContext.actor.cpp | 42 +++++++++++++++++++ fdbclient/include/fdbclient/ClientKnobs.h | 3 ++ fdbclient/include/fdbclient/DatabaseContext.h | 1 + 4 files changed, 48 insertions(+) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 56d57969328..50460f295f4 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -101,6 +101,8 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); // When cache entry is used, extend its expiration by this amount (sliding window) init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); + // Run location cache cleanup every 60 seconds when TTL is enabled + init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index e68f2bcc9c7..4f6ba47d9d0 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -972,6 +972,47 @@ void 
updateLocationCacheWithCaches(DatabaseContext* self, } } +ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { + // Only run cleanup if TTL is enabled + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { + return Void(); + } + + loop { + wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); + + double currentTime = now(); + std::vector toRemove; + int totalCount = 0; + + // Scan locationCache for expired entries + auto iter = cx->locationCache.randomRange(); + for (; iter != cx->locationCache.end(); ++iter) { + if (iter->value() && iter->value()->hasCaches) { + // Check the expireTime of the first cache entry as a representative + // All entries in a range typically have similar expiration times + if (iter->value()->locations()->expireTime > 0.0 && + iter->value()->locations()->expireTime <= currentTime) { + toRemove.push_back(iter->range()); + } + } + totalCount++; + if (totalCount > 1000 || toRemove.size() > 100) { + break; // Avoid long blocking scans + } + } + + // Remove expired entries + for (const auto& range : toRemove) { + cx->locationCache.insert(range, Reference()); + } + + if (!toRemove.empty()) { + TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); + } + } +} + ACTOR static Future handleTssMismatches(DatabaseContext* cx) { state Reference tr; state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); @@ -1255,6 +1296,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index b02c33e407b..8d8cebded22 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -100,6 +100,9 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. 
double LOCATION_CACHE_ENTRY_REFRESH_TIME; + // How often to run the background actor that removes expired location cache entries. + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds. + double LOCATION_CACHE_EVICTION_INTERVAL; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index 6bf6ee0342a..daf03c43520 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -383,6 +383,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll Future tssMismatchHandler; PromiseStream>> tssMismatchStream; Future grvUpdateHandler; + Future locationCacheCleanup; Reference commitProxies; Reference grvProxies; bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time. From 2967d312e93dcefe32533a6a237730e7903188b7 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 24 Oct 2025 14:06:34 -0700 Subject: [PATCH 3/5] Add code probe for location cache cleanups --- fdbclient/DatabaseContext.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index 4f6ba47d9d0..72445bda29e 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -987,12 +987,11 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { // Scan locationCache for expired entries auto iter = cx->locationCache.randomRange(); - for (; iter != cx->locationCache.end(); ++iter) { + for (; iter != cx->locationCache.lastItem(); ++iter) { if (iter->value() && iter->value()->hasCaches) { // Check the expireTime of the first cache entry as a representative // All entries in a range typically have similar expiration times - if (iter->value()->locations()->expireTime > 0.0 && - iter->value()->locations()->expireTime <= currentTime) { + if 
(iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { toRemove.push_back(iter->range()); } } @@ -1008,6 +1007,7 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { } if (!toRemove.empty()) { + CODE_PROBE(true, "LocationCacheCleanup removed some entries"); TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); } } From 18b1b4a52bdde62afee3b3da38521703cd8e9bbb Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Sat, 25 Oct 2025 11:59:04 -0700 Subject: [PATCH 4/5] Tweak knobs Doesn't seem to be effective in simulation runs. Will go back to this. --- fdbclient/ClientKnobs.cpp | 6 +++--- fdbclient/NativeAPI.actor.cpp | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 50460f295f4..214ae0d4e47 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -98,11 +98,11 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); // TTL disabled by default to preserve existing behavior; set > 0 to enable - init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(10, 60); + init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(0, 20); // When cache entry is used, extend its expiration by this amount (sliding window) - init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(10, 60); + init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(5, 10); // Run location cache cleanup every 60 seconds when TTL is enabled - init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); + init( 
LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_INTERVAL = 5.0; init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index e0ec33f0a66..f554d6d538e 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -172,6 +172,7 @@ Optional DatabaseContext::getCachedLocation(const TenantIn if (loc) { // Cache hit: extend expiration time if refresh knob is set if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) { + CODE_PROBE(true, "Location cache hit - refresh expire time"); loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; } return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc); @@ -207,6 +208,7 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, } // Cache hit: extend expiration time if refresh knob is set if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) { + CODE_PROBE(true, "Location cache hit2 - refresh expire time"); r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); From 78ca1a2027dfc37f8c2d4645294ce60b9b0bb156 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 20 Nov 2025 11:17:23 -0800 Subject: [PATCH 5/5] Remove hasCaches from LocationInfo This seems to be a field added for StorageCache feature, which was removed. 
--- fdbclient/DatabaseContext.actor.cpp | 27 ++----------------- fdbclient/NativeAPI.actor.cpp | 4 --- fdbclient/include/fdbclient/DatabaseContext.h | 6 +---- 3 files changed, 3 insertions(+), 34 deletions(-) diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index 72445bda29e..84ced1f01b1 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -948,30 +948,6 @@ ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, } } -void updateLocationCacheWithCaches(DatabaseContext* self, - const std::map& removed, - const std::map& added) { - // TODO: this needs to be more clever in the future - auto ranges = self->locationCache.ranges(); - for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { - if (iter->value() && iter->value()->hasCaches) { - auto& val = iter->value(); - std::vector>> interfaces; - interfaces.reserve(val->size() - removed.size() + added.size()); - for (int i = 0; i < val->size(); ++i) { - const auto& interf = (*val)[i]; - if (removed.count(interf->interf.id()) == 0) { - interfaces.emplace_back(interf); - } - } - for (const auto& p : added) { - interfaces.push_back(makeReference>(p.second)); - } - iter->value() = makeReference(interfaces, true); - } - } -} - ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { // Only run cleanup if TTL is enabled if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { @@ -988,7 +964,7 @@ ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { // Scan locationCache for expired entries auto iter = cx->locationCache.randomRange(); for (; iter != cx->locationCache.lastItem(); ++iter) { - if (iter->value() && iter->value()->hasCaches) { + if (iter->value()) { // Check the expireTime of the first cache entry as a representative // All entries in a range typically have similar expiration times if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { @@ -1605,6 +1581,7 @@ 
DatabaseContext::~DatabaseContext() { clientDBInfoMonitor.cancel(); monitorTssInfoChange.cancel(); tssMismatchHandler.cancel(); + locationCacheCleanup.cancel(); storage = nullptr; if (grvUpdateHandler.isValid()) { diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index f554d6d538e..dee747d1a60 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1776,10 +1776,6 @@ Future loadBalance( QueueModel* model = nullptr, bool compareReplicas = false, int requiredReplicas = 0) { - if (alternatives->hasCaches) { - return loadBalance( - alternatives->locations(), channel, request, taskID, atMostOnce, model, compareReplicas, requiredReplicas); - } return fmap( [ctx](auto const& res) { if (res.cached) { diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index daf03c43520..8092262e70c 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -67,17 +67,13 @@ struct LocationInfo : MultiInterface : Locations(v), expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL : 0.0) {} - LocationInfo(const std::vector>>& v, bool hasCaches) - : Locations(v), hasCaches(hasCaches), - expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL - : 0.0) {} + LocationInfo(const LocationInfo&) = delete; LocationInfo(LocationInfo&&) = delete; LocationInfo& operator=(const LocationInfo&) = delete; LocationInfo& operator=(LocationInfo&&) = delete; Reference locations() { return Reference::addRef(this); } - bool hasCaches = false; // Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled). double expireTime = 0.0; };