From 2b4e2b42508a5fe860160942eb32f3c6638366ae Mon Sep 17 00:00:00 2001 From: Karl Nilsson Date: Fri, 8 May 2026 10:49:21 +0100 Subject: [PATCH 1/3] Fix snapshot recovery with empty indexes file Updates `ra_log:pre_init` to accept and pass the machine configuration to `ra_snapshot:init`, ensuring proper recovery even when the snapshot `indexes` file is empty. Additionally, fixes `ra_log_pre_init` to properly handle both `{module, ...}` and `{simple, ...}` machine configuration formats, and includes a regression test. --- src/ra_log.erl | 8 ++--- src/ra_log_pre_init.erl | 16 +++++++-- test/ra_log_2_SUITE.erl | 79 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/src/ra_log.erl b/src/ra_log.erl index cc86796a..c0760488 100644 --- a/src/ra_log.erl +++ b/src/ra_log.erl @@ -10,7 +10,7 @@ -include_lib("stdlib/include/assert.hrl"). -compile([inline_list_funcs]). --export([pre_init/1, +-export([pre_init/2, init/1, close/1, begin_tx/1, @@ -181,8 +181,8 @@ -define(CHECKPOINTS_DIR, <<"checkpoints">>). -define(RECOVERY_CHECKPOINT_DIR, <<"recovery_checkpoint">>). -pre_init(#{uid := UId, - system_config := #{data_dir := DataDir}} = Conf) -> +pre_init(Machine, #{uid := UId, + system_config := #{data_dir := DataDir}} = Conf) -> Dir = server_data_dir(DataDir, UId), SnapModule = maps:get(snapshot_module, Conf, ?DEFAULT_SNAPSHOT_MODULE), MaxCheckpoints = maps:get(max_checkpoints, Conf, ?DEFAULT_MAX_CHECKPOINTS), @@ -191,7 +191,7 @@ pre_init(#{uid := UId, RecoveryCheckpointDir = filename:join(Dir, ?RECOVERY_CHECKPOINT_DIR), _ = ra_snapshot:init(UId, SnapModule, SnapshotsDir, CheckpointsDir, RecoveryCheckpointDir, - undefined, undefined, MaxCheckpoints), + Machine, undefined, MaxCheckpoints), ok. -spec init(ra_log_init_args()) -> state(). diff --git a/src/ra_log_pre_init.erl b/src/ra_log_pre_init.erl index 40c3ad2b..2ec275e6 100644 --- a/src/ra_log_pre_init.erl +++ b/src/ra_log_pre_init.erl @@ -89,9 +89,19 @@ pre_init(System, UId) -> case ra_lib:is_dir(Dir) of true -> case ra_log:read_config(Dir) of - {ok, #{log_init_args := Log}} -> - ok = ra_log:pre_init( - Log#{system_config => SysCfg}), + {ok, #{log_init_args := Log, + machine := MachineConf}} -> + Machine = case MachineConf of + {simple, Fun, S} -> + {machine, ra_machine_simple, + #{simple_fun => Fun, + initial_state => S}}; + {module, Mod, Args} -> + {machine, Mod, Args} + end, + ok = ra_log:pre_init(Machine, + Log#{system_config => + SysCfg}), ok; {error, Err} when Err == parsing orelse diff --git a/test/ra_log_2_SUITE.erl b/test/ra_log_2_SUITE.erl index a4914579..718671fe 100644 --- a/test/ra_log_2_SUITE.erl +++ b/test/ra_log_2_SUITE.erl @@ -92,8 +92,9 @@ all_tests() -> concurrent_snapshot_install_and_compaction, snapshot_installation_with_live_indexes, init_with_dangling_symlink, - init_after_missing_segments_event - ]. + init_after_missing_segments_event, + snapshot_install_with_empty_indexes_file + ]. groups() -> [ @@ -684,6 +685,80 @@ recover_after_snapshot(Config) -> last_written_index_term := {2, 1}}, Overview), ok. +snapshot_install_with_empty_indexes_file(Config) -> + UId = ?config(uid, Config), + MachineConf = {module, ?MODULE, #{}}, + LogConf = #{uid => UId, + initial_access_pattern => ?config(access_pattern, Config)}, + ServerConf = #{cluster_name => ?MODULE, + id => {?MODULE, node()}, + uid => UId, + log_init_args => LogConf, + initial_members => [], + machine => MachineConf}, + Log0 = ra_log_init(Config, LogConf), + %% write config to check recovery + ok = ra_log:write_config(ServerConf, Log0), + + {0, 0} = ra_log:last_index_term(Log0), + Log1 = assert_log_events(write_n(1, 6, 2, Log0), + fun (L) -> + LW = ra_log:last_written(L), + {5, 2} == LW + end), + + %% snapshot at 10 + SnapIdx = 10, + SnapTerm = 2, + Meta = meta(SnapIdx, SnapTerm, [?N1]), + Chunk = create_snapshot_chunk(Config, Meta, #{}), + SnapState0 = ra_log:snapshot_state(Log1), + {ok, SnapState1} = ra_snapshot:begin_accept(Meta, SnapState0), + Machine = {machine, ?MODULE, #{}}, + {SnapState, _, LiveIndexes, AEffs} = ra_snapshot:complete_accept(Chunk, 1, + Machine, + SnapState1), + run_effs(AEffs), + {ok, Log2, Effs4} = ra_log:install_snapshot({SnapIdx, SnapTerm}, ?MODULE, + LiveIndexes, + ra_log:set_snapshot_state(SnapState, Log1)), + run_effs(Effs4), + + {SnapIdx, _} = ra_log:last_index_term(Log2), + {SnapIdx, _} = ra_log:last_written(Log2), + + %% append after snapshot + Log3 = append_n(11, 16, SnapTerm, Log2), + Log4 = assert_log_events(Log3, fun (L) -> + {15, SnapTerm} == ra_log:last_written(L) + end), + + SnapStateForDir = ra_log:snapshot_state(Log4), + SnapDir = ra_snapshot:current_snapshot_dir(SnapStateForDir), + + ra_log:close(Log4), + + IndexesFile = filename:join(SnapDir, <<"indexes">>), + ok = file:write_file(IndexesFile, <<>>), + + application:stop(ra), + start_ra(Config), + timer:sleep(100), + ct:pal("snapshot state ~p", + [ra_log_snapshot_state:read(ra_log_snapshot_state, UId)]), + + + Log5 = ra_log_init(Config, LogConf), + + %% Fetch items 1..10 (should be dropped because of snapshot) + {[], _} = ra_log_take(1, 10, Log5), + + %% Fetch items 11..15 + {[_, _, _, _, _], _} = ra_log_take(11, 16, Log5), + + ra_log:close(Log5), + ok. + writes_lower_than_snapshot_index_are_dropped(Config) -> logger:set_primary_config(level, debug), Log0 = ra_log_init(Config, #{min_snapshot_interval => 1}), From f4bc68de760c4df1fc8458cf70ab8a3469249b1b Mon Sep 17 00:00:00 2001 From: Karl Nilsson Date: Fri, 8 May 2026 11:00:06 +0100 Subject: [PATCH 2/3] reduce log output on snapshot send errors --- src/ra_server_proc.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ra_server_proc.erl b/src/ra_server_proc.erl index f7f78e8b..391fb03a 100644 --- a/src/ra_server_proc.erl +++ b/src/ra_server_proc.erl @@ -2183,8 +2183,8 @@ send_snapshots(Id, Term, {_, ToNode} = To, ChunkSize, ok catch Class:Err:Stack -> ?INFO("~ts: send_pre_snapshot_entries " - "encountered an error: ~w", - [LogId, Err]), + "encountered an error: ~W", + [LogId, Err, 10]), erlang:raise(Class, Err, safe_stacktrace(Stack)) end, From 759e80ab1cadcede8d679ccac8103576b3055478 Mon Sep 17 00:00:00 2001 From: Karl Nilsson Date: Fri, 8 May 2026 13:34:29 +0100 Subject: [PATCH 3/3] improve logging --- src/ra_snapshot.erl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ra_snapshot.erl b/src/ra_snapshot.erl index 1f562fac..e0a982dd 100644 --- a/src/ra_snapshot.erl +++ b/src/ra_snapshot.erl @@ -323,6 +323,9 @@ recover_indexes(UId, Module, Machine, SnapDir, Err) -> Idxs = ra_machine:live_indexes( MacMod, MacState), ok = write_indexes(SnapDir, Idxs), + ?INFO("ra_snapshot: ~ts: indexes file recovered " + "~b live indexes recovered from snapshot", + [UId, ra_seq:length(Idxs)]), Idxs; {error, RecoverErr} -> ?WARN("ra_snapshot: ~ts: failed to "