diff --git a/fdbclient/ClientKnobs.cpp b/fdbclient/ClientKnobs.cpp index 6957c0eb446..214ae0d4e47 100644 --- a/fdbclient/ClientKnobs.cpp +++ b/fdbclient/ClientKnobs.cpp @@ -97,6 +97,12 @@ void ClientKnobs::initialize(Randomize randomize) { init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 ); init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 ); + // TTL disabled by default to preserve existing behavior; set > 0 to enable + init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(0, 20); + // When cache entry is used, extend its expiration by this amount (sliding window) + init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(5, 10); + // Run location cache cleanup every 60 seconds when TTL is enabled + init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_INTERVAL = 5.0; init( GET_RANGE_SHARD_LIMIT, 2 ); init( WARM_RANGE_SHARD_LIMIT, 100 ); diff --git a/fdbclient/DatabaseContext.actor.cpp b/fdbclient/DatabaseContext.actor.cpp index 8fb45d23c67..84ced1f01b1 100644 --- a/fdbclient/DatabaseContext.actor.cpp +++ b/fdbclient/DatabaseContext.actor.cpp @@ -948,39 +948,45 @@ ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, } } -void updateLocationCacheWithCaches(DatabaseContext* self, - const std::map& removed, - const std::map& added) { - // TODO: this needs to be more clever in the future - auto ranges = self->locationCache.ranges(); - for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { - if (iter->value() && iter->value()->hasCaches) { - auto& val = iter->value(); - std::vector>> interfaces; - interfaces.reserve(val->size() - removed.size() + added.size()); - for (int i = 0; i < val->size(); ++i) { - const auto& interf = (*val)[i]; - if (removed.count(interf->interf.id()) == 0) { - interfaces.emplace_back(interf); +ACTOR static Future cleanupLocationCache(DatabaseContext* cx) { + // Only run cleanup if TTL is enabled + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) { + return Void(); + } + + loop { + wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL)); + + double currentTime = now(); + std::vector toRemove; + int totalCount = 0; + + // Scan locationCache for expired entries + auto iter = cx->locationCache.randomRange(); + for (; iter != cx->locationCache.lastItem(); ++iter) { + if (iter->value()) { + // Check the expireTime of the first cache entry as a representative + // All entries in a range typically have similar expiration times + if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) { + toRemove.push_back(iter->range()); } } - for (const auto& p : added) { - interfaces.push_back(makeReference>(p.second)); + totalCount++; + if (totalCount > 1000 || toRemove.size() > 100) { + break; // Avoid long blocking scans } - iter->value() = makeReference(interfaces, true); } - } -} -Reference addCaches(const Reference& loc, - const std::vector>>& other) { - std::vector>> interfaces; - interfaces.reserve(loc->size() + other.size()); - for (int i = 0; i < loc->size(); ++i) { - interfaces.emplace_back((*loc)[i]); + // Remove expired entries + for (const auto& range : toRemove) { + cx->locationCache.insert(range, Reference()); + } + + if (!toRemove.empty()) { + CODE_PROBE(true, "LocationCacheCleanup removed some entries"); + TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size()); + } } - interfaces.insert(interfaces.end(), other.begin(), other.end()); - return makeReference(interfaces, true); } ACTOR static Future handleTssMismatches(DatabaseContext* cx) { @@ -1266,6 +1272,7 @@ DatabaseContext::DatabaseContext(ReferenceINIT_MID_SHARD_BYTES); @@ -1574,6 +1581,7 @@ DatabaseContext::~DatabaseContext() { clientDBInfoMonitor.cancel(); monitorTssInfoChange.cancel(); tssMismatchHandler.cancel(); + locationCacheCleanup.cancel(); storage = nullptr; if (grvUpdateHandler.isValid()) { diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index edf38c1ee8f..dee747d1a60 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -168,8 +168,14 @@ Optional DatabaseContext::getCachedLocation(const TenantIn auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); - if (range->value()) { - return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value()); + auto& loc = range->value(); + if (loc) { + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) { + CODE_PROBE(true, "Location cache hit - refresh expire time"); + loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } + return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc); } return Optional(); @@ -200,6 +206,11 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, result.clear(); return false; } + // Cache hit: extend expiration time if refresh knob is set + if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) { + CODE_PROBE(true, "Location cache hit2 - refresh expire time"); + r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME; + } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); if (result.size() == limit || begin == end) { break; @@ -224,6 +235,7 @@ Reference DatabaseContext::setCachedLocation(const KeyRangeRef& ab int maxEvictionAttempts = 100, attempts = 0; auto loc = makeReference(serverRefs); + // TODO: ideally remove based on TTL expiration times, instead of random while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted"); attempts++; @@ -1764,10 +1776,6 @@ Future loadBalance( QueueModel* model = nullptr, bool compareReplicas = false, int requiredReplicas = 0) { - if (alternatives->hasCaches) { - return loadBalance( - alternatives->locations(), channel, request, taskID, atMostOnce, model, compareReplicas, requiredReplicas); - } return fmap( [ctx](auto const& res) { if (res.cached) { diff --git a/fdbclient/include/fdbclient/ClientKnobs.h b/fdbclient/include/fdbclient/ClientKnobs.h index e174d28484f..8d8cebded22 100644 --- a/fdbclient/include/fdbclient/ClientKnobs.h +++ b/fdbclient/include/fdbclient/ClientKnobs.h @@ -94,6 +94,15 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl 0, each key-location cache entry expires this many seconds after insertion. + // Default 0 disables TTL expiration and keeps current behavior. + double LOCATION_CACHE_ENTRY_TTL; + // If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit). + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. + double LOCATION_CACHE_ENTRY_REFRESH_TIME; + // How often to run the background actor that removes expired location cache entries. + // Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds. + double LOCATION_CACHE_EVICTION_INTERVAL; int GET_RANGE_SHARD_LIMIT; int WARM_RANGE_SHARD_LIMIT; diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index dd76f359651..8092262e70c 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -64,15 +64,18 @@ class StorageServerInfo : public ReferencedInterface { struct LocationInfo : MultiInterface>, FastAllocated { using Locations = MultiInterface>; explicit LocationInfo(const std::vector>>& v) - : Locations(v) {} - LocationInfo(const std::vector>>& v, bool hasCaches) - : Locations(v), hasCaches(hasCaches) {} + : Locations(v), + expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL + : 0.0) {} + LocationInfo(const LocationInfo&) = delete; LocationInfo(LocationInfo&&) = delete; LocationInfo& operator=(const LocationInfo&) = delete; LocationInfo& operator=(LocationInfo&&) = delete; - bool hasCaches = false; Reference locations() { return Reference::addRef(this); } + + // Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled). + double expireTime = 0.0; }; using CommitProxyInfo = ModelInterface; @@ -376,6 +379,7 @@ class DatabaseContext : public ReferenceCounted, public FastAll Future tssMismatchHandler; PromiseStream>> tssMismatchStream; Future grvUpdateHandler; + Future locationCacheCleanup; Reference commitProxies; Reference grvProxies; bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time.