From 5b2846d8a10dff2f2f79a3dc812399f4be8c368c Mon Sep 17 00:00:00 2001 From: Andrew Mayorov Date: Wed, 18 Dec 2024 12:29:41 +0100 Subject: [PATCH] fix(autoheal): simplify split view computation Specifically, stop using locality as a sorting criterion and avoid deduplicating entries for better diagnostics. --- src/ekka_autoheal.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/ekka_autoheal.erl b/src/ekka_autoheal.erl index d63c7ad..c621a64 100644 --- a/src/ekka_autoheal.erl +++ b/src/ekka_autoheal.erl @@ -82,7 +82,7 @@ handle_msg(Msg = {create_splitview, Node}, Autoheal = #autoheal{delay = Delay, t case HealPlan of {Candidates = [_ | _], Minority} -> %% Non-empty list of candidates, choose a coordinator. - CoordNode = ekka_membership:coordinator(Candidates), + CoordNode = pick_coordinator(Candidates), ekka_node_monitor:cast(CoordNode, {heal_cluster, Minority, SplitView}); {[], Cluster} -> %% It's very unlikely but possible to have empty list of candidates. @@ -146,7 +146,7 @@ find_split_view(Nodes, Views) -> Nodes, Views ), - MajorityView = lists:usort(fun compare_node_views/2, ClusterView), + MajorityView = lists:sort(fun compare_node_views/2, ClusterView), find_split_view(MajorityView). compare_node_views({_N1, Running1, _}, {_N2, Running2, _}) -> @@ -155,10 +155,8 @@ compare_node_views({_N1, Running1, _}, {_N2, Running2, _}) -> case Len1 of %% Prefer partitions with higher number of surviving nodes. L when L > Len2 -> true; - %% If number of nodes is the same, prefer those where current node is a survivor. - %% Otherwise, sort by list of running nodes. If lists happen to be the same, this - %% view will be excluded by usort. - Len2 -> lists:member(node(), Running1) orelse Running1 < Running2; + %% If number of nodes is the same, sort by list of running nodes. + Len2 -> Running1 < Running2; L when L < Len2 -> false end. @@ -184,6 +182,12 @@ find_heal_plan([{_Node, R0, P0} | Rest]) -> find_heal_plan([]) -> {}. +pick_coordinator(Candidates) -> + case lists:member(node(), Candidates) of + true -> node(); + false -> ekka_membership:coordinator(Candidates) + end. + heal_partition([{Nodes, []} | _] = SplitView) -> %% Symmetric partition. ?LOG(info, "Healing partition: ~p", [SplitView]),