Explorar o código

Merge pull request #10052 from zmstone/0301-optimize-boot-error-log

0301 optimize boot failure logs
Zaiming (Stone) Shi %!s(int64=3) %!d(string=hai) anos
pai
achega
a4aece396a

+ 5 - 0
apps/emqx_machine/src/emqx_machine.erl

@@ -19,6 +19,7 @@
 -export([
 -export([
     start/0,
     start/0,
     graceful_shutdown/0,
     graceful_shutdown/0,
+    brutal_shutdown/0,
     is_ready/0,
     is_ready/0,
 
 
     node_status/0,
     node_status/0,
@@ -47,6 +48,10 @@ start() ->
 graceful_shutdown() ->
 graceful_shutdown() ->
     emqx_machine_terminator:graceful_wait().
     emqx_machine_terminator:graceful_wait().
 
 
+%% only used when failed to boot
+brutal_shutdown() ->
+    init:stop().
+
 set_backtrace_depth() ->
 set_backtrace_depth() ->
     {ok, Depth} = application:get_env(emqx_machine, backtrace_depth),
     {ok, Depth} = application:get_env(emqx_machine, backtrace_depth),
     _ = erlang:system_flag(backtrace_depth, Depth),
     _ = erlang:system_flag(backtrace_depth, Depth),

+ 73 - 36
bin/emqx

@@ -451,6 +451,20 @@ call_hocon() {
         || die "call_hocon_failed: $*" $?
         || die "call_hocon_failed: $*" $?
 }
 }
 
 
+find_emqx_process() {
+    ## Find the running node from 'ps -ef'
+    ##  * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
+    ##  * The running 'remsh' and 'nodetool' processes must be excluded
+    if [ -n "${EMQX_NODE__NAME:-}" ]; then
+        # if node name is provided, filter by node name
+        # shellcheck disable=SC2009
+        ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
+    else
+        # shellcheck disable=SC2009
+        ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
+    fi
+}
+
 ## Resolve boot configs in a batch
 ## Resolve boot configs in a batch
 ## This is because starting the Erlang beam with all modules loaded
 ## This is because starting the Erlang beam with all modules loaded
 ## and parsing HOCON config + environment variables is a non-trivial task
 ## and parsing HOCON config + environment variables is a non-trivial task
@@ -468,17 +482,7 @@ fi
 # Turn off debug as the ps output can be quite noisy
 # Turn off debug as the ps output can be quite noisy
 set +x
 set +x
 
 
-## Find the running node from 'ps -ef'
-##  * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
-##  * The running 'remsh' and 'nodetool' processes must be excluded
-if [ -n "${EMQX_NODE__NAME:-}" ]; then
-    # if node name is provided, filter by node name
-    # shellcheck disable=SC2009
-    PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
-else
-    # shellcheck disable=SC2009
-    PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
-fi
+PS_LINE="$(find_emqx_process)"
 logdebug "PS_LINE=$PS_LINE"
 logdebug "PS_LINE=$PS_LINE"
 RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)"
 RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)"
 [ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT"
 [ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT"
@@ -927,6 +931,7 @@ case "$NAME" in
         NAME_TYPE='-sname'
         NAME_TYPE='-sname'
 esac
 esac
 SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
 SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
+HOST_NAME="$(echo "$NAME" | awk -F'@' '{print $2}')"
 if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
 if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
     logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
     logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
     exit 1
     exit 1
@@ -969,6 +974,59 @@ if [[ "$IS_BOOT_COMMAND" == 'yes' && "$(get_boot_config 'node.db_backend')" == "
     fi
     fi
 fi
 fi
 
 
+diagnose_boot_failure_and_die() {
+    local ps_line
+    ps_line="$(find_emqx_process)"
+    if [ -z "$ps_line" ]; then
+        echo "Find more information in the latest log file: ${RUNNER_LOG_DIR}/erlang.log.*"
+        exit 1
+    fi
+    if ! relx_nodetool "ping" > /dev/null; then
+        logerr "$NAME seems to be running, but not responding to pings."
+        echo "Make sure '$HOST_NAME' is a resolvable and reachable hostname."
+        pipe_shutdown
+        exit 2
+    fi
+    if ! relx_nodetool 'eval' 'true = emqx:is_running()' > /dev/null; then
+        logerr "$NAME node is started, but failed to complete the boot sequence in time."
+        echo "Please collect the logs in ${RUNNER_LOG_DIR} and report a bug to EMQX team at https://github.com/emqx/emqx/issues/new/choose"
+        pipe_shutdown
+        exit 3
+    fi
+}
+
+## Only works when started in daemon mode
+pipe_shutdown() {
+    if [ -d "$PIPE_DIR" ]; then
+        echo "Shutting down $NAME from to_erl pipe."
+        ## can not evaluate init:stop() or erlang:halt() because the shell is restricted
+        echo 'emqx_machine:brutal_shutdown().' | "$BINDIR/to_erl" "$PIPE_DIR"
+    fi
+}
+
+## Call nodetool to stop EMQX
+nodetool_shutdown() {
+    # Wait for the node to completely stop...
+    PID="$(relx_get_pid)"
+    if ! relx_nodetool "stop"; then
+        die "Graceful shutdown failed PID=[$PID]"
+    fi
+    WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
+    if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
+        msg="dangling after ${WAIT_TIME} seconds"
+        # also log to syslog
+        logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
+        # log to user console
+        set +x
+        logerr "Stop failed, $msg"
+        echo "ERROR: $PID is still around"
+        ps -p "$PID"
+        exit 1
+    fi
+    echo "ok"
+    logger -t "${REL_NAME}[${PID}]" "STOP: OK"
+}
+
 cd "$RUNNER_ROOT_DIR"
 cd "$RUNNER_ROOT_DIR"
 
 
 case "${COMMAND}" in
 case "${COMMAND}" in
@@ -1014,33 +1072,15 @@ case "${COMMAND}" in
             echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
             echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
             exit 0
             exit 0
         else
         else
-            echo "$EMQX_DESCRIPTION $REL_VSN failed to start in ${WAIT_TIME} seconds."
-            echo "Please find more information in erlang.log.N"
-            echo "Or run 'env DEBUG=1 $0 console' to have logs printed to console."
-            exit 1
+            logerr "${EMQX_DESCRIPTION} ${REL_VSN} using node name '${NAME}' failed ${WAIT_TIME} probes."
+            diagnose_boot_failure_and_die
         fi
         fi
         ;;
         ;;
 
 
     stop)
     stop)
-        # Wait for the node to completely stop...
-        PID="$(relx_get_pid)"
-        if ! relx_nodetool "stop"; then
-            die "Graceful shutdown failed PID=[$PID]"
-        fi
-        WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
-        if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
-            msg="dangling after ${WAIT_TIME} seconds"
-            # also log to syslog
-            logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
-            # log to user console
-            set +x
-            logerr "Stop failed, $msg"
-            echo "ERROR: $PID is still around"
-            ps -p "$PID"
-            exit 1
+        if ! nodetool_shutdown; then
+            pipe_shutdown
         fi
         fi
-        echo "ok"
-        logger -t "${REL_NAME}[${PID}]" "STOP: OK"
         ;;
         ;;
 
 
     pid)
     pid)
@@ -1063,9 +1103,6 @@ case "${COMMAND}" in
         ;;
         ;;
 
 
     attach)
     attach)
-        assert_node_alive
-
-        shift
         exec "$BINDIR/to_erl" "$PIPE_DIR"
         exec "$BINDIR/to_erl" "$PIPE_DIR"
         ;;
         ;;
 
 

+ 12 - 0
changes/ce/fix-10052.en.md

@@ -0,0 +1,12 @@
+Improve daemon mode startup failure logs.
+
+Before this change, it was difficult for users to understand the reason for EMQX 'start' command failed to boot the node.
+The only information they received was that the node did not start within the expected time frame,
+and they were instructed to boot the node with 'console' command in the hope of obtaining some logs.
+However, the node might actually be running, which could cause 'console' mode to fail for a different reason.
+
+With this new change, when daemon mode fails to boot, a diagnosis is issued. Here are the possible scenarios:
+
+* If the node cannot be found from `ps -ef`, the user is instructed to find information in log files `erlang.log.*`.
+* If the node is found to be running but not responding to pings, the user is advised to check if the host name is resolvable and reachable.
+* If the node is responding to pings, but the EMQX app is not running, it is likely a bug. In this case, the user is advised to report a Github issue.

+ 11 - 0
changes/ce/fix-10052.zh.md

@@ -0,0 +1,11 @@
+优化 EMQX daemon 模式启动启动失败的日志。
+
+在进行此更改之前,当 EMQX 用 `start` 命令启动失败时,用户很难理解出错的原因。
+所知道的仅仅是节点未能在预期时间内启动,然后被指示以 `console` 式引导节点以获取一些日志。
+然而,节点实际上可能正在运行,这可能会导致 `console` 模式因不同的原因而失败。
+
+此次修复后,启动脚本会发出诊断:
+
+* 如果无法从 `ps -ef` 中找到节点,则指示用户在 `erlang.log.*` 中查找信息。
+* 如果发现节点正在运行但不响应 ping,则建议用户检查节点主机名是否有效并可达。
+* 如果节点响应 ping 但 EMQX 应用程序未运行,则很可能是一个错误。在这种情况下,建议用户报告一个Github issue。

+ 1 - 0
deploy/docker/README.md

@@ -59,6 +59,7 @@ EMQX_DASHBOARD__DEFAULT_PASSWORD       <--> dashboard.default_password
 EMQX_NODE__COOKIE                      <--> node.cookie
 EMQX_NODE__COOKIE                      <--> node.cookie
 EMQX_LISTENERS__SSL__default__ENABLE   <--> listeners.ssl.default.enable
 EMQX_LISTENERS__SSL__default__ENABLE   <--> listeners.ssl.default.enable
 ```
 ```
+Note: The lowercase use of 'default' is not a typo. It is used to demonstrate that lowercase environment variables are equivalent.
 
 
 + Prefix `EMQX_` is removed
 + Prefix `EMQX_` is removed
 + All upper case letters is replaced with lower case letters
 + All upper case letters is replaced with lower case letters