Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions fdbclient/ClientKnobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ void ClientKnobs::initialize(Randomize randomize) {
init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3;
init( LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD, 60 );
init( LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL, 60 );
// TTL disabled by default to preserve existing behavior; set > 0 to enable
init( LOCATION_CACHE_ENTRY_TTL, 0.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_TTL = deterministicRandom()->randomInt(0, 20);
// When cache entry is used, extend its expiration by this amount (sliding window)
init( LOCATION_CACHE_ENTRY_REFRESH_TIME, 300.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_ENTRY_REFRESH_TIME = deterministicRandom()->randomInt(5, 10);
// Run location cache cleanup every 60 seconds when TTL is enabled
init( LOCATION_CACHE_EVICTION_INTERVAL, 60.0 ); if ( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_INTERVAL = 5.0;

init( GET_RANGE_SHARD_LIMIT, 2 );
init( WARM_RANGE_SHARD_LIMIT, 100 );
Expand Down
62 changes: 35 additions & 27 deletions fdbclient/DatabaseContext.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -948,39 +948,45 @@ ACTOR static Future<Void> monitorClientDBInfoChange(DatabaseContext* cx,
}
}

void updateLocationCacheWithCaches(DatabaseContext* self,
const std::map<UID, StorageServerInterface>& removed,
const std::map<UID, StorageServerInterface>& added) {
// TODO: this needs to be more clever in the future
auto ranges = self->locationCache.ranges();
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (iter->value() && iter->value()->hasCaches) {
auto& val = iter->value();
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(val->size() - removed.size() + added.size());
for (int i = 0; i < val->size(); ++i) {
const auto& interf = (*val)[i];
if (removed.count(interf->interf.id()) == 0) {
interfaces.emplace_back(interf);
ACTOR static Future<Void> cleanupLocationCache(DatabaseContext* cx) {
// Only run cleanup if TTL is enabled
if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL == 0.0) {
return Void();
}

loop {
wait(delay(CLIENT_KNOBS->LOCATION_CACHE_EVICTION_INTERVAL));

double currentTime = now();
std::vector<KeyRangeRef> toRemove;
int totalCount = 0;

// Scan locationCache for expired entries
auto iter = cx->locationCache.randomRange();
for (; iter != cx->locationCache.lastItem(); ++iter) {
if (iter->value()) {
// Check the expireTime of the first cache entry as a representative
// All entries in a range typically have similar expiration times
if (iter->value()->expireTime > 0.0 && iter->value()->expireTime <= currentTime) {
toRemove.push_back(iter->range());
}
}
for (const auto& p : added) {
interfaces.push_back(makeReference<ReferencedInterface<StorageServerInterface>>(p.second));
totalCount++;
if (totalCount > 1000 || toRemove.size() > 100) {
break; // Avoid long blocking scans
}
iter->value() = makeReference<LocationInfo>(interfaces, true);
}
}
}

Reference<LocationInfo> addCaches(const Reference<LocationInfo>& loc,
const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& other) {
std::vector<Reference<ReferencedInterface<StorageServerInterface>>> interfaces;
interfaces.reserve(loc->size() + other.size());
for (int i = 0; i < loc->size(); ++i) {
interfaces.emplace_back((*loc)[i]);
// Remove expired entries
for (const auto& range : toRemove) {
cx->locationCache.insert(range, Reference<LocationInfo>());
}

if (!toRemove.empty()) {
CODE_PROBE(true, "LocationCacheCleanup removed some entries");
TraceEvent("LocationCacheCleanup").detail("RemovedRanges", toRemove.size());
}
}
interfaces.insert(interfaces.end(), other.begin(), other.end());
return makeReference<LocationInfo>(interfaces, true);
}

ACTOR static Future<Void> handleTssMismatches(DatabaseContext* cx) {
Expand Down Expand Up @@ -1266,6 +1272,7 @@ DatabaseContext::DatabaseContext(Reference<AsyncVar<Reference<IClusterConnection

clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger);
tssMismatchHandler = handleTssMismatches(this);
locationCacheCleanup = cleanupLocationCache(this);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a cancel() call for this actor in the destructor (e.g. https://github.com/apple/foundationdb/blob/main/fdbclient/DatabaseContext.actor.cpp#L1573)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to explicitly call cancel(), because locationCacheCleanup destructor will automatically cancel the actor. Usually explictly calling cancel is to proactively clean the state to avoid destruction order problems. To be on the safe side, I'll add the cancel call in the destructor.

clientStatusUpdater.actor = clientStatusUpdateActor(this);

smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES);
Expand Down Expand Up @@ -1574,6 +1581,7 @@ DatabaseContext::~DatabaseContext() {
clientDBInfoMonitor.cancel();
monitorTssInfoChange.cancel();
tssMismatchHandler.cancel();
locationCacheCleanup.cancel();
storage = nullptr;

if (grvUpdateHandler.isValid()) {
Expand Down
20 changes: 14 additions & 6 deletions fdbclient/NativeAPI.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,14 @@ Optional<KeyRangeLocationInfo> DatabaseContext::getCachedLocation(const TenantIn

auto range =
isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey);
if (range->value()) {
return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value());
auto& loc = range->value();
if (loc) {
// Cache hit: extend expiration time if refresh knob is set
if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && loc->expireTime > 0.0) {
CODE_PROBE(true, "Location cache hit - refresh expire time");
loc->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME;
}
return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), loc);
}

return Optional<KeyRangeLocationInfo>();
Expand Down Expand Up @@ -200,6 +206,11 @@ bool DatabaseContext::getCachedLocations(const TenantInfo& tenant,
result.clear();
return false;
}
// Cache hit: extend expiration time if refresh knob is set
if (CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME > 0.0 && r->value()->expireTime > 0.0) {
CODE_PROBE(true, "Location cache hit2 - refresh expire time");
r->value()->expireTime = now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_REFRESH_TIME;
}
result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value());
if (result.size() == limit || begin == end) {
break;
Expand All @@ -224,6 +235,7 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation(const KeyRangeRef& ab

int maxEvictionAttempts = 100, attempts = 0;
auto loc = makeReference<LocationInfo>(serverRefs);
// TODO: ideally remove based on TTL expiration times, instead of random
while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) {
CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted");
attempts++;
Expand Down Expand Up @@ -1764,10 +1776,6 @@ Future<REPLY_TYPE(Request)> loadBalance(
QueueModel* model = nullptr,
bool compareReplicas = false,
int requiredReplicas = 0) {
if (alternatives->hasCaches) {
return loadBalance(
alternatives->locations(), channel, request, taskID, atMostOnce, model, compareReplicas, requiredReplicas);
}
return fmap(
[ctx](auto const& res) {
if (res.cached) {
Expand Down
9 changes: 9 additions & 0 deletions fdbclient/include/fdbclient/ClientKnobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ class SWIFT_CXX_IMMORTAL_SINGLETON_TYPE ClientKnobs : public KnobsImpl<ClientKno
int LOCATION_CACHE_EVICTION_SIZE_SIM;
double LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD;
double LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL;
// If > 0, each key-location cache entry expires this many seconds after insertion.
// Default 0 disables TTL expiration and keeps current behavior.
double LOCATION_CACHE_ENTRY_TTL;
// If > 0, extend the expireTime by this many seconds when a cached entry is used (cache hit).
// Only has effect when LOCATION_CACHE_ENTRY_TTL > 0.
double LOCATION_CACHE_ENTRY_REFRESH_TIME;
// How often to run the background actor that removes expired location cache entries.
// Only has effect when LOCATION_CACHE_ENTRY_TTL > 0. Default 60 seconds.
double LOCATION_CACHE_EVICTION_INTERVAL;

int GET_RANGE_SHARD_LIMIT;
int WARM_RANGE_SHARD_LIMIT;
Expand Down
12 changes: 8 additions & 4 deletions fdbclient/include/fdbclient/DatabaseContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,18 @@ class StorageServerInfo : public ReferencedInterface<StorageServerInterface> {
struct LocationInfo : MultiInterface<ReferencedInterface<StorageServerInterface>>, FastAllocated<LocationInfo> {
using Locations = MultiInterface<ReferencedInterface<StorageServerInterface>>;
explicit LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v)
: Locations(v) {}
LocationInfo(const std::vector<Reference<ReferencedInterface<StorageServerInterface>>>& v, bool hasCaches)
: Locations(v), hasCaches(hasCaches) {}
: Locations(v),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please correct me if I'm wrong, but do I understand the code correctly that the LocationInfo holds information of one or more StorageServerInterface and the expire time is set for the LocationInfo and not the information of the individual StorageServerInterface in this LocationInfo?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's correct. Each shard is stored on multiple storage servers, so LocationInfo points to all replicas.

expireTime(CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL > 0.0 ? now() + CLIENT_KNOBS->LOCATION_CACHE_ENTRY_TTL
: 0.0) {}

LocationInfo(const LocationInfo&) = delete;
LocationInfo(LocationInfo&&) = delete;
LocationInfo& operator=(const LocationInfo&) = delete;
LocationInfo& operator=(LocationInfo&&) = delete;
bool hasCaches = false;
Reference<Locations> locations() { return Reference<Locations>::addRef(this); }

// Absolute expiration time for this cache entry. 0 means no expiration (TTL disabled).
double expireTime = 0.0;
};

using CommitProxyInfo = ModelInterface<CommitProxyInterface>;
Expand Down Expand Up @@ -376,6 +379,7 @@ class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAll
Future<Void> tssMismatchHandler;
PromiseStream<std::pair<UID, std::vector<DetailedTSSMismatch>>> tssMismatchStream;
Future<Void> grvUpdateHandler;
Future<Void> locationCacheCleanup;
Reference<CommitProxyInfo> commitProxies;
Reference<GrvProxyInfo> grvProxies;
bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time.
Expand Down