From 768a6f9359b34ed59a1e92ac4633923e735ceb43 Mon Sep 17 00:00:00 2001
From: Michael Davis
Date: Tue, 13 Feb 2024 14:38:21 -0500
Subject: [PATCH] ra_SUITE: Fix flake in consistent_query_leader_change

The line:

    {ok, 9, B} = ra:consistent_query(B, fun(S) -> S end),

would flake because `B` might fail to become the leader.
`ra:transfer_leadership/2` is asynchronous and fallible and in cases
where the test flaked, `B` was recovering and catching up to the latest
changes, so it didn't assume leadership.

We can wait for `B` to catch up enough that it should win pre-votes and
then wait for it to assume leadership before asserting that `B` is the
leader to fix the flake.
---
 test/ra_SUITE.erl | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/test/ra_SUITE.erl b/test/ra_SUITE.erl
index bf6eb4ca..b3a71584 100644
--- a/test/ra_SUITE.erl
+++ b/test/ra_SUITE.erl
@@ -629,7 +629,8 @@ consistent_query_minority(Config) ->
 consistent_query_leader_change(Config) ->
     %% this test reproduces a scenario that could cause a stale
     %% read to be returned from `ra:consistent_query/2`
-    [A, B, C, D, E] = Cluster = start_local_cluster(5, ?config(test_name, Config),
+    ClusterName = ?config(test_name, Config),
+    [A, B, C, D, E] = Cluster = start_local_cluster(5, ClusterName,
                                                     add_machine()),
     ok = ra:transfer_leadership(A, A),
     {ok, _, A} = ra:process_command(A, 9, ?PROCESS_COMMAND_TIMEOUT),
@@ -640,9 +641,22 @@ consistent_query_leader_change(Config) ->
     %% restart B
     ok = ra:stop_server(B),
     ok = ra:restart_server(B),
+    %% Wait for B to recover and catch up.
+    {ok, #{log := #{last_written_index_term := CurrentIdxTerm}}, _} =
+        ra:member_overview(A),
+    await_condition(
+      fun() ->
+              {ok, #{log := #{last_written_index_term := IdxTermB}}, _} =
+                  ra:member_overview(B),
+              IdxTermB =:= CurrentIdxTerm
+      end, 20),
     %% B's query_index is now 0
     %% Make B leader
     ok = ra:transfer_leadership(A, B),
+    await_condition(
+      fun() ->
+              ra_leaderboard:lookup_leader(ClusterName) =:= B
+      end, 20),
     %% restart E
     ok = ra:restart_server(E),
     {ok, 9, B} = ra:consistent_query(B, fun(S) -> S end),
@@ -1310,3 +1324,19 @@ search_paths() ->
     Ld = code:lib_dir(),
     lists:filter(fun (P) -> string:prefix(P, Ld) =:= nomatch end,
                  code:get_path()).
+
+%% Polls `Fun' until it returns `true', making at most `Attempts' calls
+%% and sleeping 100ms after each unsuccessful one. Returns `ok' on success
+%% and exits with `condition_did_not_materialise' when attempts run out.
+await_condition(_Fun, 0) ->
+    exit(condition_did_not_materialise);
+await_condition(Fun, Attempts) ->
+    %% An exception inside `Fun' (e.g. a failed match on a reply) counts as
+    %% "not yet". `try' is used instead of the old-style `catch' so that a
+    %% thrown `true' cannot be mistaken for success.
+    case try Fun() catch _:_ -> false end of
+        true -> ok;
+        _ ->
+            timer:sleep(100),
+            await_condition(Fun, Attempts - 1)
+    end.