From 13b55d1010e23e6fa2f11ac5ba273b7bc0be2aef Mon Sep 17 00:00:00 2001
From: Serge Tupchii
Date: Tue, 18 Jun 2024 19:19:46 +0300
Subject: [PATCH] fix: apply autoheal once majority of (core) nodes are alive (reachable)

The create_splitview step used to proceed only when
mria_membership:is_all_alive() returned true. It now proceeds as soon as a
strict majority of the nodes returned by mria_mnesia:cluster_nodes(all) is
connected to the coordinating node.

"Strict majority" means 2 * NotAlive < Total, so 2 of 3 or 3 of 5 nodes are
sufficient, while 1 of 2 or 2 of 4 are not.
---
 src/mria_autoheal.erl        | 12 +++++++++++-
 test/mria_autoheal_SUITE.erl | 28 ++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/mria_autoheal.erl b/src/mria_autoheal.erl
index 0a9b214..247c71c 100644
--- a/src/mria_autoheal.erl
+++ b/src/mria_autoheal.erl
@@ -77,7 +77,7 @@ handle_msg({report_partition, Node}, Autoheal = #autoheal{delay = Delay, timer =
 handle_msg(Msg = {create_splitview, Node}, Autoheal = #autoheal{delay = Delay, timer = TRef})
   when Node =:= node() ->
     ensure_cancel_timer(TRef),
-    case mria_membership:is_all_alive() of
+    case is_majority_alive() of
         true ->
             Nodes = mria_mnesia:db_nodes(),
             RPCResult = erpc:multicall(Nodes, mria_mnesia, running_nodes, []),
@@ -160,3 +160,13 @@ ensure_cancel_timer(undefined) ->
     ok;
 ensure_cancel_timer(TRef) ->
     catch erlang:cancel_timer(TRef).
+
+%% Return `true' when a strict majority of the nodes returned by
+%% mria_mnesia:cluster_nodes(all) is connected to this node (the local node
+%% counts as alive). An empty node list yields `false'.
+is_majority_alive() ->
+    All = mria_mnesia:cluster_nodes(all),
+    NotAliveLen = length(All -- [node() | nodes()]),
+    %% alive > N/2  <=>  2 * not_alive < N. Comparing against `N div 2' would
+    %% be too strict for odd N (3 nodes with 1 down, 5 nodes with 2 down).
+    2 * NotAliveLen < length(All).
diff --git a/test/mria_autoheal_SUITE.erl b/test/mria_autoheal_SUITE.erl
index 7b2d373..7c77e63 100644
--- a/test/mria_autoheal_SUITE.erl
+++ b/test/mria_autoheal_SUITE.erl
@@ -87,6 +87,34 @@ t_autoheal_with_replicants(Config) when is_list(Config) ->
        end,
        [fun ?MODULE:prop_callbacks/1]).
 
+%% Autoheal must still happen when one core node (N5) is stopped and stays
+%% down, as long as the remaining four of five nodes can reach each other.
+t_autoheal_majority_reachable(Config) when is_list(Config) ->
+    Cluster = mria_ct:cluster([core, core, core, core, core], [{mria, cluster_autoheal, 200}]),
+    ?check_trace(
+       #{timetrap => 25000},
+       try
+           Nodes = [N1, N2, N3, N4, N5] = mria_ct:start_cluster(mria, Cluster),
+           %% Simulate netsplit
+           true = rpc:cast(N4, erlang, disconnect_node, [N1]),
+           true = rpc:cast(N5, erlang, disconnect_node, [N1]),
+           ok = mria_ct:stop_slave(N5),
+           ok = timer:sleep(1000),
+           AliveMajorityNodes = [N1, N2, N3, N4],
+           %% Wait for autoheal, it should happen automatically:
+           ?retry(1000, 20,
+                  begin
+                      ?assertMatch({AliveMajorityNodes, [N5]}, view(N1)),
+                      ?assertMatch({AliveMajorityNodes, [N5]}, view(N2)),
+                      ?assertMatch({AliveMajorityNodes, [N5]}, view(N3)),
+                      ?assertMatch({AliveMajorityNodes, [N5]}, view(N4))
+                  end),
+           Nodes
+       after
+           ok = mria_ct:teardown_cluster(lists:sublist(Cluster, 4))
+       end,
+       [fun ?MODULE:prop_callbacks/1]).
+
 todo_t_reboot_rejoin(Config) when is_list(Config) ->
     %% FIXME: Flaky and somewhat broken, disable for now
     CommonEnv = [ {mria, cluster_autoheal, 200} , {mria, db_backend, rlog}