Przeglądaj źródła

Merge pull request #11567 from SergeTupchiy/EMQX-10835-increase-graceful-stop-timeout

fix(nodetool): increase graceful stop timeout
SergeTupchiy 2 lat temu
rodzic
commit
cf334d5542
3 zmienionych plików z 29 dodań i 3 usunięć
  1. 2 2
      bin/emqx
  2. 23 1
      bin/nodetool
  3. 4 0
      changes/ce/fix-11567.en.md

+ 2 - 2
bin/emqx

@@ -812,7 +812,7 @@ is_down() {
     if ps -p "$PID" >/dev/null; then
         # still around
         # shellcheck disable=SC2009 # this grep pattern is not a part of the program names
-        if ps -efp "$PID" | $GREP -q 'defunct'; then
+        if ps -fp "$PID" | $GREP -q 'defunct'; then
             # zombie state, print parent pid
             parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
             logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")"
@@ -831,7 +831,7 @@ wait_for() {
     shift
     CMD="$*"
     while true; do
-        if $CMD >/dev/null 2>&1; then
+        if $CMD; then
             return 0
         fi
         if [ "$WAIT_TIME" -le 0 ]; then

+ 23 - 1
bin/nodetool

@@ -8,6 +8,8 @@
 %% -------------------------------------------------------------------
 -mode(compile).
 
+-define(SHUTDOWN_TIMEOUT_MS, 120_000).
+
 main(Args) ->
     case os:type() of
         {win32, nt} -> ok;
@@ -85,9 +87,17 @@ do(Args) ->
             %% a "pong"
             io:format("pong\n");
         ["stop"] ->
-            case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of
+            Pid = start_shutdown_status(),
+            Res = rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS),
+            true = stop_shutdown_status(Pid),
+            case Res of
                 ok ->
                     ok;
+                {badrpc, timeout} ->
+                    io:format("EMQX is still shutting down, it failed to stop gracefully "
+                              "within the configured timeout of: ~ps\n",
+                              [erlang:convert_time_unit(?SHUTDOWN_TIMEOUT_MS, millisecond, second)]),
+                    halt(1);
                 {badrpc, nodedown} ->
                     %% nodetool commands are always executed after a ping
                     %% which if the code gets here, it's because the target node
@@ -145,6 +155,18 @@ do(Args) ->
     end,
     net_kernel:stop().
 
+start_shutdown_status() ->
+    spawn_link(fun shutdown_status_loop/0).
+
+stop_shutdown_status(Pid) ->
+    true = unlink(Pid),
+    true = exit(Pid, stop).
+
+shutdown_status_loop() ->
+    timer:sleep(10_000),
+    io:format("EMQX is shutting down, please wait...\n", []),
+    shutdown_status_loop().
+
 parse_eval_args(Args) ->
     % shells may process args into more than one, and end up stripping
     % spaces, so this converts all of that to a single string to parse

+ 4 - 0
changes/ce/fix-11567.en.md

@@ -0,0 +1,4 @@
+Improve EMQX graceful shutdown (`emqx stop` command):
+- increase timeout from 1 to 2 minutes
+- print an error message if EMQX can't stop gracefully within the configured timeout
+- print periodic status messages while EMQX is shutting down