From d97cce45c76e1fff6327bcf1969d89233a3faf8f Mon Sep 17 00:00:00 2001
From: Andrew Mayorov
Date: Fri, 18 Oct 2024 21:36:22 +0200
Subject: [PATCH 1/3] fix(bootstrap): wait for core tables to be ready before copying

In specific circumstances `mria_mnesia:copy_table/2` may fail with a
`{system_limit, '$mria_rlog_sync', {Node, none_active}}` error, which
crashes the node.

Consider the following scenario:
1. Node `N1` starts up and bootstraps Mria.
2. Node `N2` starts up and bootstraps Mria.
3. Node `N2` joins the cluster consisting of node `N1`.
4. Node `N2` runs `mria_mnesia:join_cluster/1` and starts Mria again.
5. At the exact same time node `N1` decides to restart for some reason.
6. During bootstrap, node `N2` tries to copy the `$mria_rlog_sync` table.
7. Mnesia sees there is nowhere to copy from and aborts the operation.
8. Mria fails to start.

While unlikely, in practice this can happen when the operator performs
unusual maintenance operations, e.g. simultaneously requests a version
upgrade and scales the cluster up.
---
 src/mria_schema.erl | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/mria_schema.erl b/src/mria_schema.erl
index 46e9261..5ccb12e 100644
--- a/src/mria_schema.erl
+++ b/src/mria_schema.erl
@@ -317,19 +317,20 @@ converge_schema(Entries, InitialState) ->
 %% is needed so we can replicate schema updates just like regular
 %% transactions.
 bootstrap() ->
-    Storage = ram_copies,
     Opts = [{type, ordered_set},
             {record_name, ?schema},
             {attributes, record_info(fields, ?schema)}
            ],
     MetaSpec = #?schema{ mnesia_table = ?schema
                        , shard = ?mria_meta_shard
-                       , storage = Storage
+                       , storage = ram_copies
                        , config = Opts
                        },
     %% Create (or copy) the mnesia table and wait for it:
     ok = create_table(MetaSpec),
-    ok = mria_mnesia:copy_table(?schema, Storage),
+    %% Ensure replicas are available before starting copy:
+    ok = mria_mnesia:wait_for_tables([?schema]),
+    ok = mria_mnesia:copy_table(?schema, ram_copies),
     RlogSyncOpts = [{record_name, ?rlog_sync},
                     {attributes, record_info(fields, ?rlog_sync)}
                    ],
@@ -339,6 +340,10 @@ bootstrap() ->
                            , config = RlogSyncOpts
                            },
     ok = create_table(RlogSyncSpec),
+    %% Ensure replicas are available before starting copy:
+    %% If we've managed to sync only mnesia schema up to this point, `copy_table/2` may
+    %% fail if other nodes suddenly become unavailable.
+    ok = mria_mnesia:wait_for_tables([?rlog_sync]),
     ok = mria_mnesia:copy_table(?rlog_sync, null_copies),
     mria_mnesia:wait_for_tables([?schema, ?rlog_sync]),
     %% Seed the table with the metadata:

From 25c7fb1ca2061ccd5a97142fb4aaa4eb9261cdf2 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov
Date: Fri, 18 Oct 2024 21:54:12 +0200
Subject: [PATCH 2/3] chore: ensure Erlang/OTP 27 compat

Silence the "expression updates a literal" compiler lint recently
introduced in erlang/otp#8069.
---
 src/mria_rlog.hrl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mria_rlog.hrl b/src/mria_rlog.hrl
index f1df041..a7ee43a 100644
--- a/src/mria_rlog.hrl
+++ b/src/mria_rlog.hrl
@@ -25,9 +25,9 @@
 -define(unexpected_event_kind, "Mria worker received unexpected event").
 -define(unexpected_event_tp(Params),
         ?tp(warning, ?unexpected_event_kind,
-            Params#{ process => ?MODULE
-                   , callback => ?FUNCTION_NAME
-                   })).
+            (begin Params end)#{ process => ?MODULE
+                               , callback => ?FUNCTION_NAME
+                               })).
 
 -define(terminate_tp,
         ?tp(debug, mria_worker_terminate, #{process => ?MODULE, callback => terminate})).
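
For context on the second patch: Erlang/OTP 27 warns with "expression updates a
literal" when the map or record being updated is syntactically a literal term. In
`?unexpected_event_tp/1` this fires whenever the macro is called with a literal map,
since after expansion `Params#{...}` updates that literal directly; wrapping the
argument in `begin ... end` turns it into an ordinary block expression, which the
patch relies on to silence the warning. Below is a minimal sketch of both forms; the
module and function names are invented for illustration and are not part of the
patch series.

%% Illustration only; not part of the patches above. Compile with OTP 27+.
-module(literal_update_demo).
-export([warns/0, silenced/0]).

%% Updating a literal map triggers the OTP 27 warning
%% "expression updates a literal":
warns() ->
    #{process => demo}#{callback => init}.

%% The same update wrapped in `begin ... end` (the workaround used in
%% mria_rlog.hrl above) is no longer a syntactic literal, so the compiler
%% does not flag it:
silenced() ->
    (begin #{process => demo} end)#{callback => init}.
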
From 60843469ccc381fe399df2ed68694b9e63777b69 Mon Sep 17 00:00:00 2001
From: Andrew Mayorov
Date: Mon, 21 Oct 2024 11:13:43 +0200
Subject: [PATCH 3/3] test(bootstrap): test mria starts if nodes are unavailable during bootstrap

---
 test/mria_mnesia_SUITE.erl | 47 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/test/mria_mnesia_SUITE.erl b/test/mria_mnesia_SUITE.erl
index 73d0f4c..36dfaa4 100644
--- a/test/mria_mnesia_SUITE.erl
+++ b/test/mria_mnesia_SUITE.erl
@@ -99,6 +99,53 @@ t_join_after_node_down(_) ->
     end,
     []).
 
+%% Start a cluster of two nodes, then join the third, and simulate the two going down
+%% right after the third one joined. Restore them and verify the third one is healthy.
+t_cluster_down_after_join(_) ->
+    ClusterEnv = mria_mnesia_test_util:common_env(),
+    Cluster = [C1, C2, C3] = mria_ct:cluster([core, core, core], ClusterEnv),
+    ?check_trace(
+        #{timetrap => 10_000},
+        try
+            %% Prepare cluster with 3 nodes:
+            Ns = [N1, N2, N3] = mria_ct:start_cluster(node, Cluster),
+            ?assertEqual([{ok, ok} || _ <- Ns], erpc:multicall(Ns, mria, start, [])),
+            %% Join together the first 2 nodes:
+            ?assertEqual(ok, erpc:call(N1, mria, join, [N2])),
+            ?assertEqual([N1, N2], lists:sort(erpc:call(N1, mria_mnesia, running_nodes, []))),
+            ?assertEqual(ok, erpc:call(N1, mria_transaction_gen, init, [])),
+            %% Tell N3 to join, but simulate it going down after joining and before bootstrap:
+            ?assertEqual(ok, erpc:call(N3, meck, new, [mria_app, [no_link, passthrough]])),
+            ?assertEqual(ok, erpc:call(N3, meck, expect, [mria_app, start, fun ?MODULE:suicide/2])),
+            %% Node N3 expectedly dies:
+            ?assertError({erpc, _}, erpc:call(N3, mria, join, [N1])),
+            ?assertError({erpc, _}, erpc:call(N3, mria_mnesia, running_nodes, [])),
+            %% Tell N1 and N2 to stop:
+            ?assertEqual(ok, erpc:call(N1, mria, stop, [])),
+            ?assertEqual(ok, erpc:call(N2, mria, stop, [])),
+            ?assertEqual([ok, ok], [slave:stop(N) || N <- [N1, N2]]),
+            %% Restart N3 and tell Mria to start:
+            N3 = mria_ct:start_slave(node, C3),
+            %% This will hang waiting for N1 or N2 to go online, thus `cast/4`:
+            ?assertEqual(ok, erpc:cast(N3, mria, start, [])),
+            %% Tell N1 and N2 to get back up:
+            [N1, N2] = [mria_ct:start_slave(node, C) || C <- [C1, C2]],
+            %% Again, use `cast/4` to avoid hanging waiting for another node:
+            ?assertEqual(ok, erpc:cast(N1, mria, start, [])),
+            ?assertEqual(ok, erpc:cast(N2, mria, start, [])),
+            ?assertEqual(ok, erpc:call(N3, mria, start, [])),
+            %% Verify that the bootstrap process has finished and the node is alive:
+            _ = erpc:call(N3, sys, get_state, [mria_schema]),
+            ?assertEqual([N1, N2, N3], lists:sort(erpc:call(N3, mria_mnesia, running_nodes, []))),
+            ok
+        after
+            ok = mria_ct:teardown_cluster(Cluster)
+        end,
+        []).
+
+suicide(_Type, _Args) ->
+    erlang:halt().
+
 t_diagnosis_tab(_)->
     TestTab = test_tab_1,
     Cluster = [NS1, NS2] = mria_ct:cluster([core, core], []),
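
A note on the `cast/4` comments in the test above: `erpc:call/4` blocks the calling
process until the remote function returns, and, as the test comments state,
`mria:start/0` on a node whose peer core nodes are down hangs until one of them comes
back online, so a call would deadlock the test. `erpc:cast/4` only sends the request
and returns `ok` immediately. A minimal sketch of that pattern, with an invented
module name, follows; it is not part of the patch series.

%% Illustration only; not part of the patches above.
-module(erpc_cast_demo).
-export([start_mria_async/1]).

%% Kick off mria:start/0 on Node without waiting for it to finish.
%% erpc:cast/4 always returns ok right away; the remote call keeps
%% running (here: blocking until peer nodes show up) in the background.
start_mria_async(Node) ->
    ok = erpc:cast(Node, mria, start, []).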