Skip to content

Commit a16075f

Browse files
authored
[Release-7.3] Validate ServerTeam count per server in simulation (#11678)
* Validate server team count in simulation
* Change naming (not relevant to the PR title)
* Address comments and add a new trace event, BuildTeamsLastBuildTeamsFailed, which is emitted when buildTeam fails
1 parent c2a7a5a commit a16075f

File tree

4 files changed

+30
-10
lines changed

4 files changed

+30
-10
lines changed

fdbclient/ServerKnobs.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
364364
init( DD_MAXIMUM_LARGE_TEAM_CLEANUP, 10000 ); if( randomize && BUGGIFY ) DD_MAXIMUM_LARGE_TEAM_CLEANUP = 10;
365365
init( DD_LARGE_TEAM_DELAY, 60.0 );
366366
init( DD_FIX_WRONG_REPLICAS_DELAY, 60.0 );
367+
init (DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM, false ); if (isSimulated) DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM = true;
367368

368369
// TeamRemover
369370
init( TR_LOW_SPACE_PIVOT_DELAY_SEC, 0 ); if (isSimulated) TR_LOW_SPACE_PIVOT_DELAY_SEC = deterministicRandom()->randomInt(0, 3);

fdbclient/include/fdbclient/ServerKnobs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,8 @@ class ServerKnobs : public KnobsImpl<ServerKnobs> {
344344
bool DD_ENABLE_REBALANCE_STORAGE_QUEUE_WITH_LIGHT_WRITE_SHARD; // Enable to allow storage queue rebalancer to move
345345
// light-traffic shards out of the overloading server
346346
double DD_WAIT_TSS_DATA_MOVE_DELAY;
347+
bool DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM; // Enable to validate server team count per server after build
348+
// team
347349

348350
// TeamRemover to remove redundant teams
349351
double TR_LOW_SPACE_PIVOT_DELAY_SEC; // teamRedundant data moves can make the min SS available % smaller in

fdbserver/DDShardTracker.actor.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,14 +217,14 @@ ACTOR Future<Void> shardUsableRegions(DataDistributionTracker::SafeAccessor self
217217
double expectedCompletionSeconds = self()->shards->size() * 1.0 / SERVER_KNOBS->DD_SHARD_USABLE_REGION_CHECK_RATE;
218218
double delayTime = deterministicRandom()->random01() * expectedCompletionSeconds;
219219
wait(delayJittered(delayTime));
220-
auto [destTeams, srcTeams] = self()->shardsAffectedByTeamFailure->getTeamsForFirstShard(keys);
221-
if (destTeams.size() < self()->usableRegions) {
220+
auto [newTeam, previousTeam] = self()->shardsAffectedByTeamFailure->getTeamsForFirstShard(keys);
221+
if (newTeam.size() < self()->usableRegions) {
222222
TraceEvent(SevWarn, "ShardUsableRegionMismatch", self()->distributorId)
223223
.suppressFor(5.0)
224-
.detail("DestTeamSize", destTeams.size())
225-
.detail("SrcTeamSize", srcTeams.size())
226-
.detail("DestServers", describe(destTeams))
227-
.detail("SrcServers", describe(srcTeams))
224+
.detail("NewTeamSize", newTeam.size())
225+
.detail("PreviousTeamSize", previousTeam.size())
226+
.detail("NewServers", describe(newTeam))
227+
.detail("PreviousServers", describe(previousTeam))
228228
.detail("UsableRegion", self()->usableRegions)
229229
.detail("Shard", keys);
230230
RelocateShard rs(keys, DataMovementReason::POPULATE_REGION, RelocateReason::OTHER);

fdbserver/DDTeamCollection.actor.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,8 @@ class DDTeamCollectionImpl {
839839
// If there are too few machines to even build teams or there are too few represented datacenters, can't
840840
// build any team.
841841
self->lastBuildTeamsFailed = true;
842-
TraceEvent(SevWarnAlways, "BuildTeamsNotEnoughUniqueMachines", self->distributorId)
842+
TraceEvent(SevWarnAlways, "BuildTeamsLastBuildTeamsFailed", self->distributorId)
843+
.detail("Reason", "Do not have enough unique machines")
843844
.detail("Primary", self->primary)
844845
.detail("UniqueMachines", uniqueMachines)
845846
.detail("Replication", self->configuration.storageTeamSize);
@@ -4424,7 +4425,8 @@ bool DDTeamCollection::isValidLocality(Reference<IReplicationPolicy> storagePoli
44244425
void DDTeamCollection::evaluateTeamQuality() const {
44254426
int teamCount = teams.size(), serverCount = allServers.size();
44264427
double teamsPerServer = (double)teamCount * configuration.storageTeamSize / serverCount;
4427-
4428+
const int targetTeamNumPerServer =
4429+
(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2;
44284430
ASSERT_EQ(serverCount, server_info.size());
44294431

44304432
int minTeams = std::numeric_limits<int>::max();
@@ -4440,6 +4442,16 @@ void DDTeamCollection::evaluateTeamQuality() const {
44404442
varTeams += (stc - teamsPerServer) * (stc - teamsPerServer);
44414443
// Use zoneId as server's machine id
44424444
machineTeams[info->getLastKnownInterface().locality.zoneId()] += stc;
4445+
// Check invariant: if latest buildTeam succeeds, then each server must have at least
4446+
// targetTeamNumPerServer serverTeams
4447+
// lastBuildTeamsFailed is set only when (1) machine count is less than configured team size;
4448+
// (2) no server team candidate is found when creating a server team; or (3) adding a machine team fails
4449+
if (SERVER_KNOBS->DD_VALIDATE_SERVER_TEAM_COUNT_AFTER_BUILD_TEAM && !lastBuildTeamsFailed &&
4450+
stc < targetTeamNumPerServer) {
4451+
TraceEvent(SevError, "NewAddServerNotMatchTargetSTCount", distributorId)
4452+
.detail("CurrentServerTeams", stc)
4453+
.detail("TargetServerTeams", targetTeamNumPerServer);
4454+
}
44434455
}
44444456
}
44454457
varTeams /= teamsPerServer * teamsPerServer;
@@ -5063,9 +5075,9 @@ int DDTeamCollection::addBestMachineTeams(int machineTeamsToBuild) {
50635075
// When too many teams exist in simulation, traceAllInfo will buffer too many trace logs before
50645076
// trace has a chance to flush its buffer, which causes assertion failure.
50655077
traceAllInfo(!g_network->isSimulated());
5066-
TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId)
5078+
TraceEvent(SevWarn, "BuildTeamsLastBuildTeamsFailed", distributorId)
50675079
.detail("Primary", primary)
5068-
.detail("Reason", "Unable to make desired machine Teams")
5080+
.detail("Reason", "Unable to make desired machineTeams")
50695081
.detail("Hint", "Check TraceAllInfo event");
50705082
lastBuildTeamsFailed = true;
50715083
break;
@@ -5475,6 +5487,11 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max
54755487
if (bestServerTeam.size() != configuration.storageTeamSize) {
54765488
// No team was found, and it is unlikely that one will be found
54775489
lastBuildTeamsFailed = true;
5490+
TraceEvent(SevWarn, "BuildTeamsLastBuildTeamsFailed", distributorId)
5491+
.detail("Reason", "Unable to find any valid serverTeam")
5492+
.detail("Primary", primary)
5493+
.detail("BestServerTeam", describe(bestServerTeam))
5494+
.detail("ConfigStorageTeamSize", configuration.storageTeamSize);
54785495
break;
54795496
}
54805497

0 commit comments

Comments
 (0)