Skip to content

Commit b15632b

Browse files
hsun324meta-codesync[bot]
authored andcommitted
Update server queue length without requiring staleness change
Summary: Introduce a separate info method for updating the server queue length and call it unconditionally whenever we are reevaluating the liveness and staleness of the RAFT server to improve the accuracy of the value. This fixes an issue where the queue length value can get stuck if the RAFT server stops receiving RPCs (for example if a custom distribution implementation withholds RPCs when the queue is too large). Also fixes an issue where the queue length was always being stored with the default name of the RAFT partition. Reviewed By: jaher, SarahDesouky Differential Revision: D86316661 Privacy Context Container: L1384697 fbshipit-source-id: c1c91dea7dd5a1914565803bbfb01a168a104469
1 parent 297688d commit b15632b

File tree

2 files changed

+28
-8
lines changed

2 files changed

+28
-8
lines changed

src/wa_raft_info.erl

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
set_membership/3,
3131
set_live/3,
3232
set_stale/3,
33-
set_state/3
33+
set_state/3,
34+
set_message_queue_length/1,
35+
set_message_queue_length/2
3436
]).
3537

3638
%% Local RAFT server's current FSM state
@@ -132,10 +134,17 @@ set_live(Table, Partition, Live) ->
132134

133135
-spec set_stale(wa_raft:table(), wa_raft:partition(), boolean()) -> true.
134136
set_stale(Table, Partition, Stale) ->
135-
set(?RAFT_STALE_KEY(Table, Partition), Stale),
136-
{message_queue_len, MsgQLen} = process_info(self(), message_queue_len),
137-
set(?RAFT_MSG_QUEUE_LENGTH_KEY(wa_raft_server:default_name(Table, Partition)), MsgQLen).
137+
set(?RAFT_STALE_KEY(Table, Partition), Stale).
138138

139139
-spec set_membership(wa_raft:table(), wa_raft:partition(), wa_raft_server:membership()) -> true.
140140
set_membership(Table, Partition, Membership) ->
141141
set(?RAFT_MEMBERSHIP_KEY(Table, Partition), Membership).
142+
143+
-spec set_message_queue_length(Name :: atom()) -> true.
144+
set_message_queue_length(Name) ->
145+
{message_queue_len, Length} = process_info(self(), message_queue_len),
146+
set_message_queue_length(Name, Length).
147+
148+
-spec set_message_queue_length(Name :: atom(), Length :: non_neg_integer()) -> true.
149+
set_message_queue_length(Name, Length) ->
150+
set(?RAFT_MSG_QUEUE_LENGTH_KEY(Name), Length).

src/wa_raft_server.erl

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -688,12 +688,13 @@ callback_mode() ->
688688
[state_functions, state_enter].
689689

690690
-spec terminate(Reason :: term(), State :: state(), Data :: #raft_state{}) -> ok.
691-
terminate(Reason, State, #raft_state{table = Table, partition = Partition} = Data) ->
691+
terminate(Reason, State, #raft_state{name = Name, table = Table, partition = Partition} = Data) ->
692692
?SERVER_LOG_NOTICE(State, Data, "terminating due to ~0P", [Reason, 20]),
693693
wa_raft_durable_state:sync(Data),
694694
wa_raft_info:delete_state(Table, Partition),
695695
wa_raft_info:set_live(Table, Partition, false),
696696
wa_raft_info:set_stale(Table, Partition, true),
697+
wa_raft_info:set_message_queue_length(Name, 0),
697698
ok.
698699

699700
%%------------------------------------------------------------------------------
@@ -2657,14 +2658,16 @@ check_stale_upon_entry(witness, #raft_state{table = Table, partition = Partition
26572658
wa_raft_info:set_stale(Table, Partition, true),
26582659
ok;
26592660
%% Leaders are always live and never stale upon entry.
2660-
check_stale_upon_entry(leader, #raft_state{table = Table, partition = Partition}) ->
2661+
check_stale_upon_entry(leader, #raft_state{name = Name, table = Table, partition = Partition}) ->
26612662
wa_raft_info:set_live(Table, Partition, true),
26622663
wa_raft_info:set_stale(Table, Partition, false),
2664+
wa_raft_info:set_message_queue_length(Name),
26632665
ok;
26642666
%% Stalled and disabled servers are never live and always stale.
2665-
check_stale_upon_entry(_, #raft_state{table = Table, partition = Partition}) ->
2667+
check_stale_upon_entry(_, #raft_state{name = Name, table = Table, partition = Partition}) ->
26662668
wa_raft_info:set_live(Table, Partition, false),
26672669
wa_raft_info:set_stale(Table, Partition, true),
2670+
wa_raft_info:set_message_queue_length(Name),
26682671
ok.
26692672

26702673
%% Set a new current term and voted-for peer and clear any state that is associated with the previous term.
@@ -3405,6 +3408,7 @@ check_follower_liveness(
34053408
State,
34063409
#raft_state{
34073410
application = App,
3411+
name = Name,
34083412
table = Table,
34093413
partition = Partition,
34103414
leader_heartbeat_ts = LeaderHeartbeatTs
@@ -3428,6 +3432,7 @@ check_follower_liveness(
34283432
wa_raft_info:set_stale(Table, Partition, true)
34293433
end
34303434
end,
3435+
wa_raft_info:set_message_queue_length(Name),
34313436
ok.
34323437

34333438
%% Check the state of a follower or witness's last applied log entry versus the leader's
@@ -3439,6 +3444,7 @@ check_follower_lagging(
34393444
LeaderCommit,
34403445
#raft_state{
34413446
application = App,
3447+
name = Name,
34423448
table = Table,
34433449
partition = Partition,
34443450
last_applied = LastApplied
@@ -3462,6 +3468,7 @@ check_follower_lagging(
34623468
wa_raft_info:set_stale(Table, Partition, false)
34633469
end
34643470
end,
3471+
wa_raft_info:set_message_queue_length(Name),
34653472

34663473
ok.
34673474

@@ -3472,6 +3479,7 @@ check_follower_lagging(
34723479
check_leader_liveness(
34733480
#raft_state{
34743481
application = App,
3482+
name = Name,
34753483
table = Table,
34763484
partition = Partition,
34773485
heartbeat_response_ts = HeartbeatResponse
@@ -3502,7 +3510,10 @@ check_leader_liveness(
35023510
false ->
35033511
?SERVER_LOG_NOTICE(leader, State, "is no longer stale after heartbeat quorum age drops to ~0p ms < ~0p ms max", [QuorumAge, MaxAge]),
35043512
wa_raft_info:set_stale(Table, Partition, ShouldBeStale)
3505-
end.
3513+
end,
3514+
3515+
% Update message queue length
3516+
wa_raft_info:set_message_queue_length(Name).
35063517

35073518
%% Based on information that the leader has available as a result of heartbeat replies, attempt
35083519
%% to discern what the best subsequent replication mode would be for this follower.

0 commit comments

Comments
 (0)