Skip to content

Commit

Permalink
Merge pull request #183 from keynslug/fix/race-system-limit
Browse files Browse the repository at this point in the history
fix(bootstrap): wait core tables are ready before copying
  • Loading branch information
keynslug authored Oct 21, 2024
2 parents ad1cbd7 + 6084346 commit 4cab8da
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 6 deletions.
6 changes: 3 additions & 3 deletions src/mria_rlog.hrl
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
-define(unexpected_event_kind, "Mria worker received unexpected event").
-define(unexpected_event_tp(Params),
?tp(warning, ?unexpected_event_kind,
Params#{ process => ?MODULE
, callback => ?FUNCTION_NAME
})).
(begin Params end)#{ process => ?MODULE
, callback => ?FUNCTION_NAME
})).

-define(terminate_tp,
?tp(debug, mria_worker_terminate, #{process => ?MODULE, callback => terminate})).
Expand Down
11 changes: 8 additions & 3 deletions src/mria_schema.erl
Original file line number Diff line number Diff line change
Expand Up @@ -317,19 +317,20 @@ converge_schema(Entries, InitialState) ->
%% is needed so we can replicate schema updates just like regular
%% transactions.
bootstrap() ->
Storage = ram_copies,
Opts = [{type, ordered_set},
{record_name, ?schema},
{attributes, record_info(fields, ?schema)}
],
MetaSpec = #?schema{ mnesia_table = ?schema
, shard = ?mria_meta_shard
, storage = Storage
, storage = ram_copies
, config = Opts
},
%% Create (or copy) the mnesia table and wait for it:
ok = create_table(MetaSpec),
ok = mria_mnesia:copy_table(?schema, Storage),
%% Ensure replicas are available before starting copy:
ok = mria_mnesia:wait_for_tables([?schema]),
ok = mria_mnesia:copy_table(?schema, ram_copies),
RlogSyncOpts = [{record_name, ?rlog_sync},
{attributes, record_info(fields, ?rlog_sync)}
],
Expand All @@ -339,6 +340,10 @@ bootstrap() ->
, config = RlogSyncOpts
},
ok = create_table(RlogSyncSpec),
%% Ensure replicas are available before starting copy:
%% If we've managed to sync only mnesia schema up to this point, `copy_table/2` may
%% fail if other nodes suddenly become unavailable.
ok = mria_mnesia:wait_for_tables([?rlog_sync]),
ok = mria_mnesia:copy_table(?rlog_sync, null_copies),
mria_mnesia:wait_for_tables([?schema, ?rlog_sync]),
%% Seed the table with the metadata:
Expand Down
47 changes: 47 additions & 0 deletions test/mria_mnesia_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,53 @@ t_join_after_node_down(_) ->
end,
[]).

%% Scenario: N1 and N2 form a two-node cluster. N3 is then told to join N1, but
%% `mria_app:start/2` on N3 is mocked with `suicide/2`, so the N3 VM halts during
%% the join. After that N1 and N2 are stopped as well. Finally all three nodes are
%% brought back and the test verifies that N3 is healthy and reports all three
%% nodes as running.
t_cluster_down_after_join(_) ->
%% Describe a cluster of three core nodes sharing the common test environment:
ClusterEnv = mria_mnesia_test_util:common_env(),
Cluster = [C1, C2, C3] = mria_ct:cluster([core, core, core], ClusterEnv),
?check_trace(
%% NOTE(review): the timetrap value is presumably in milliseconds -- confirm.
#{timetrap => 10_000},
try
%% Prepare cluster with 3 nodes:
Ns = [N1, N2, N3] = mria_ct:start_cluster(node, Cluster),
?assertEqual([{ok, ok} || _ <- Ns], erpc:multicall(Ns, mria, start, [])),
%% Join together first 2 nodes:
?assertEqual(ok, erpc:call(N1, mria, join, [N2])),
?assertEqual([N1, N2], lists:sort(erpc:call(N1, mria_mnesia, running_nodes, []))),
?assertEqual(ok, erpc:call(N1, mria_transaction_gen, init, [])),
%% Tell N3 to join, but simulate it going down after joining and before bootstrap:
%% `mria_app:start/2` is replaced on N3 by `suicide/2`, which halts the VM.
?assertEqual(ok, erpc:call(N3, meck, new, [mria_app, [no_link, passthrough]])),
?assertEqual(ok, erpc:call(N3, meck, expect, [mria_app, start, fun ?MODULE:suicide/2])),
%% Node N3 dies as expected, so both the join call and any follow-up call
%% must fail with an `erpc` error:
?assertError({erpc, _}, erpc:call(N3, mria, join, [N1])),
?assertError({erpc, _}, erpc:call(N3, mria_mnesia, running_nodes, [])),
%% Tell N1 and N2 to stop, then shut down their nodes:
?assertEqual(ok, erpc:call(N1, mria, stop, [])),
?assertEqual(ok, erpc:call(N2, mria, stop, [])),
?assertEqual([ok, ok], [slave:stop(N) || N <- [N1, N2]]),
%% Restart N3 and tell mria to start.
%% Matching against the already bound `N3` asserts the node name is unchanged.
N3 = mria_ct:start_slave(node, C3),
%% This will hang waiting for N1 or N2 to go online, thus `cast/4`:
?assertEqual(ok, erpc:cast(N3, mria, start, [])),
%% Tell N1 and N2 to get back up (again asserting the node names are unchanged):
[N1, N2] = [mria_ct:start_slave(node, C) || C <- [C1, C2]],
%% Again, use `cast/4` to avoid hanging waiting for another node:
?assertEqual(ok, erpc:cast(N1, mria, start, [])),
?assertEqual(ok, erpc:cast(N2, mria, start, [])),
%% Now that the peers are coming up, a synchronous start on N3 is expected
%% to return `ok`:
?assertEqual(ok, erpc:call(N3, mria, start, [])),
%% Verify that bootstrap process has finished and the node is alive:
%% `sys:get_state/1` returns only once `mria_schema` handles the request.
_ = erpc:call(N3, sys, get_state, [mria_schema]),
?assertEqual([N1, N2, N3], lists:sort(erpc:call(N3, mria_mnesia, running_nodes, []))),
ok
after
%% Always tear the cluster down, whether the scenario passed or failed:
ok = mria_ct:teardown_cluster(Cluster)
end,
%% NOTE(review): the empty list presumably means no extra trace checks -- confirm.
[]).

%% Stand-in for `mria_app:start/2`, installed through `meck:expect/3` by
%% `t_cluster_down_after_join/1`: halts the Erlang runtime system of the node
%% that invokes it. Both arguments are ignored and the function never returns.
suicide(_StartType, _StartArgs) ->
    halt().

t_diagnosis_tab(_)->
TestTab = test_tab_1,
Cluster = [NS1, NS2] = mria_ct:cluster([core, core], []),
Expand Down

0 comments on commit 4cab8da

Please sign in to comment.