Parcourir la source

feat: add graceful shutdown

prior to this change, emqx node shutdown was done by init:stop,
which might stop the applications in an undesired order

in this change, emqx_machine_terminator is added to stop apps
in a defined order and then terminate the node in an infinite loop
Zaiming Shi il y a 4 ans
Parent
commit
bc23ff5e47

+ 1 - 1
.ci/build_packages/tests.sh

@@ -138,7 +138,7 @@ EOF
         exit 1
     fi
     IDLE_TIME=0
-   while ! curl http://localhost:8081/api/v5/status >/dev/null 2>&1; do
+    while ! curl http://localhost:8081/api/v5/status >/dev/null 2>&1; do
         if [ $IDLE_TIME -gt 10 ]
         then
             echo "emqx running error"

+ 27 - 0
apps/emqx_machine/src/emqx_machine.erl

@@ -0,0 +1,27 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_machine).
+
+-export([start/0,
+         graceful_shutdown/0
+        ]).
+
+start() ->
+    ok = emqx_machine_terminator:start().
+
+graceful_shutdown() ->
+    emqx_machine_terminator:graceful().

+ 5 - 9
apps/emqx_machine/src/emqx_machine_app.erl

@@ -18,11 +18,9 @@
 
 -export([ start/2
         , stop/1
-        , prep_stop/1
         ]).
 
-%% Shutdown and reboot
--export([ shutdown/1
+-export([ stop_apps/1
         , ensure_apps_started/0
         ]).
 
@@ -50,11 +48,9 @@ start(_Type, _Args) ->
     ok = print_vsn(),
 
     ok = start_autocluster(),
+    ok = emqx_machine:start(),
     {ok, RootSupPid}.
 
-prep_stop(_State) ->
-    application:stop(emqx).
-
 stop(_State) ->
     ok.
 
@@ -96,13 +92,13 @@ load_config_files() ->
     ok = emqx_app:set_init_config_load_done().
 
 start_autocluster() ->
-    ekka:callback(prepare, fun ?MODULE:shutdown/1),
+    ekka:callback(prepare, fun ?MODULE:stop_apps/1),
     ekka:callback(reboot,  fun ?MODULE:ensure_apps_started/0),
     _ = ekka:autocluster(emqx), %% returns 'ok' or a pid or 'any()' as in spec
     ok.
 
-shutdown(Reason) ->
-    ?SLOG(critical, #{msg => "stopping_apps", reason => Reason}),
+stop_apps(Reason) ->
+    ?SLOG(info, #{msg => "stopping_apps", reason => Reason}),
     _ = emqx_alarm_handler:unload(),
     lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())).
 

+ 67 - 0
apps/emqx_machine/src/emqx_machine_terminator.erl

@@ -0,0 +1,67 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2021 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_machine_terminator).
+
+-export([ start/0
+        , graceful/0
+        , terminator_loop/0
+        ]).
+
+-define(TERMINATOR, ?MODULE).
+
+%% @doc This API is called to shutdown the Erlang VM by RPC call from remote shell node.
+%% The shutdown of apps is delegated to a process instead of doing it in the RPC spawned
+%% process which has a remote group leader.
+start() ->
+    _ = spawn_link(
+          fun() ->
+                  register(?TERMINATOR, self()),
+                  terminator_loop()
+          end),
+    ok.
+
+%% internal use
+terminator_loop() ->
+    receive
+        graceful_shutdown ->
+            ok = emqx_machine_app:stop_apps(normal),
+            exit_loop()
+    after
+        1000 ->
+            %% keep looping for beam reload
+            ?MODULE:terminator_loop()
+    end.
+
+%% @doc Shutdown the Erlang VM.
+graceful() ->
+    case whereis(?TERMINATOR) of
+        undefined ->
+            exit(emqx_machine_not_started);
+        Pid ->
+            Pid ! graceful_shutdown,
+            Ref = monitor(process, Pid),
+            %% NOTE: not exactly sure, but maybe there is a chance that
+            %% Erlang VM goes down before this receive.
+            %% In which case, the remote caller will get {badrpc, nodedown}
+            receive {'DOWN', Ref, process, Pid, _} -> ok end
+    end.
+
+%% Loop until Erlang VM exits
+exit_loop() ->
+    init:stop(),
+    timer:sleep(100),
+    exit_loop().

+ 2 - 2
apps/emqx_machine/test/emqx_machine_app_SUITE.erl

@@ -33,9 +33,9 @@ end_per_suite(_Config) ->
     emqx_ct_helpers:stop_apps([]).
 
 t_shutdown_reboot(_Config) ->
-    emqx_machine_app:shutdown(normal),
+    emqx_machine_app:stop_apps(normal),
     false = emqx:is_running(node()),
     emqx_machine_app:ensure_apps_started(),
     true = emqx:is_running(node()),
-    ok = emqx_machine_app:shutdown(for_test),
+    ok = emqx_machine_app:stop_apps(for_test),
     false = emqx:is_running(node()).

+ 1 - 1
bin/emqx

@@ -99,7 +99,7 @@ relx_usage() {
             echo "                      don't make it permanent"
             ;;
         *)
-            echo "Usage: $REL_NAME {start|start_boot <file>|ertspath|foreground|stop|restart|reboot|pid|ping|console|console_clean|console_boot <file>|attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}"
+            echo "Usage: $REL_NAME {start|start_boot <file>|ertspath|foreground|stop|pid|ping|console|console_clean|console_boot <file>|attach|remote_console|upgrade|downgrade|install|uninstall|versions|escript|ctl|rpc|rpcterms|eval|root_dir}"
             ;;
     esac
 }

+ 10 - 4
bin/nodetool

@@ -72,9 +72,15 @@ do(Args) ->
             %% a "pong"
             io:format("pong\n");
         ["stop"] ->
-            io:format("~p\n", [rpc:call(TargetNode, init, stop, [], 60000)]);
-        ["restart", "-config", ConfigFile | _RestArgs1] ->
-            io:format("~p\n", [rpc:call(TargetNode, emqx, restart, [ConfigFile], 60000)]);
+            case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of
+                ok ->
+                    ok;
+                {badrpc, nodedown} ->
+                    %% nodetool commands are always executed after a ping,
+                    %% so if the code gets here, it's because the target node
+                    %% has shut down before the RPC returned.
+                    ok
+            end;
         ["rpc", Module, Function | RpcArgs] ->
             case rpc:call(TargetNode, list_to_atom(Module), list_to_atom(Function),
                           [RpcArgs], 60000) of
@@ -141,7 +147,7 @@ do(Args) ->
             end;
         Other ->
             io:format("Other: ~p\n", [Other]),
-            io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|restart|reboot|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n")
+            io:format("Usage: nodetool {genconfig, chkconfig|getpid|ping|stop|rpc|rpc_infinity|rpcterms|eval [Terms]} [RPC]\n")
     end,
     net_kernel:stop().