Просмотр исходного кода

Merge pull request #5430 from zmstone/refactor-simplify-ci-scripts

refactor: simplify ci scripts
Zaiming (Stone) Shi 4 лет назад
Родитель
Сommit
672f94e118
3 измененных файлов с 96 добавлено и 102 удалено
  1. 11 5
      .ci/build_packages/tests.sh
  2. 7 15
      apps/emqx_machine/src/emqx_machine_terminator.erl
  3. 78 82
      bin/emqx

+ 11 - 5
.ci/build_packages/tests.sh

@@ -73,7 +73,7 @@ emqx_test(){
                 fi
 
                 echo "running ${packagename} start"
-                running_test
+                run_test
                 echo "running ${packagename} stop"
 
                 dpkg -r "${EMQX_NAME}"
@@ -99,7 +99,7 @@ emqx_test(){
                 fi
 
                 echo "running ${packagename} start"
-                running_test
+                run_test
                 echo "running ${packagename} stop"
 
                 rpm -e "${EMQX_NAME}"
@@ -113,7 +113,7 @@ emqx_test(){
     done
 }
 
-running_test(){
+run_test(){
     # sed -i '/emqx_telemetry/d' /var/lib/emqx/loaded_plugins
     emqx_env_vars=$(dirname "$(readlink "$(command -v emqx)")")/../releases/emqx_vars
 
@@ -132,7 +132,7 @@ EOF
         exit 1
     fi
 
-    if ! su - emqx -c "emqx start"; then
+    if ! emqx 'start'; then
         cat /var/log/emqx/erlang.log.1 || true
         cat /var/log/emqx/emqx.log.1 || true
         exit 1
@@ -149,7 +149,13 @@ EOF
     done
     pytest -v /paho-mqtt-testing/interoperability/test_client/V5/test_connect.py::test_basic
     # shellcheck disable=SC2009 # pgrep does not support Extended Regular Expressions
-    emqx stop || kill "$(ps -ef | grep -E '\-progname\s.+emqx\s' |awk '{print $2}')"
+    ps -ef | grep -E '\-progname\s.+emqx\s'
+    if ! emqx 'stop'; then
+        echo "ERROR: failed_to_stop_emqx_with_the_stop_command"
+        cat /var/log/emqx/erlang.log.1 || true
+        cat /var/log/emqx/emqx.log.1 || true
+        exit 1
+    fi
 
     if [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = ubuntu ] \
     || [ "$(sed -n '/^ID=/p' /etc/os-release | sed -r 's/ID=(.*)/\1/g' | sed 's/"//g')" = debian ] ;then

+ 7 - 15
apps/emqx_machine/src/emqx_machine_terminator.erl

@@ -41,7 +41,8 @@ start_link() ->
 
 is_running() -> is_pid(whereis(?TERMINATOR)).
 
-%% @doc Send a signal to activate the terminator.
+%% @doc Call `emqx_machine_terminator' to stop applications
+%% then call init:stop() stop beam.
 graceful() ->
     try
         _ = gen_server:call(?TERMINATOR, ?DO_IT, infinity)
@@ -52,28 +53,19 @@ graceful() ->
             %% should issue a shutdown to be sure
             %% NOTE: not exit_loop here because we do not want to
             %% block erl_signal_server
+            ?ELOG("Shutdown before node is ready?~n", []),
             init:stop()
     end,
     ok.
 
-%% @doc Shutdown the Erlang VM and wait until the terminator dies or the VM dies.
+%% @doc Shutdown the Erlang VM and wait indefinitely.
 graceful_wait() ->
-    case whereis(?TERMINATOR) of
-        undefined ->
-            ?SLOG(warning, #{msg => "shutdown_before_boot_is_complete"}),
-            exit_loop();
-        Pid ->
-            ok = graceful(),
-            Ref = monitor(process, Pid),
-            %% NOTE: not exactly sure, but maybe there is a chance that
-            %% Erlang VM goes down before this receive.
-            %% In which case, the remote caller will get {badrpc, nodedown}
-            receive {'DOWN', Ref, process, Pid, _} -> ok end
-    end.
+    ok = graceful(),
+    exit_loop().
 
 exit_loop() ->
-    init:stop(),
     timer:sleep(100),
+    init:stop(),
     exit_loop().
 
 init(_) ->

+ 78 - 82
bin/emqx

@@ -5,7 +5,8 @@
 set -e
 set -o pipefail
 
-if [ -n "$DEBUG" ]; then
+DEBUG="${DEBUG:-0}"
+if [ "$DEBUG" -eq 1 ]; then
     set -x
 fi
 
@@ -42,12 +43,21 @@ export LD_LIBRARY_PATH="$ERTS_DIR/lib:$LD_LIBRARY_PATH"
 export ERTS_LIB_DIR="$ERTS_DIR/../lib"
 MNESIA_DATA_DIR="$RUNNER_DATA_DIR/mnesia/$NAME"
 
+# Echo to stderr on errors
+echoerr() { echo "ERROR: $*" 1>&2; }
+
 die() {
-    echo >&2 "$1"
+    echoerr "ERROR: $1"
     errno=${2:-1}
     exit "$errno"
 }
 
+assert_node_alive() {
+    if ! relx_nodetool "ping" > /dev/null; then
+        die "node_is_not_running!" 1
+    fi
+}
+
 relx_usage() {
     command="$1"
 
@@ -113,7 +123,7 @@ check_user() {
             echo "You need to be root or use sudo to run this command"
             exit 1
         fi
-        CMD="\"$RUNNER_SCRIPT\" "
+        CMD="DEBUG=$DEBUG \"$RUNNER_SCRIPT\" "
         for ARG in "$@"; do
             CMD="${CMD} \"$ARG\""
         done
@@ -145,9 +155,6 @@ if [ "$ULIMIT_F" -lt 1024 ]; then
     echo "!!!!"
 fi
 
-# Echo to stderr on errors
-echoerr() { echo "$@" 1>&2; }
-
 SED_REPLACE="sed -i "
 case $(sed --help 2>&1) in
     *GNU*) SED_REPLACE="sed -i ";;
@@ -204,7 +211,7 @@ call_hocon() {
     export RUNNER_ETC_DIR
     export REL_VSN
     "$ERTS_DIR/bin/escript" "$ROOTDIR/bin/nodetool" hocon "$@" \
-        || die "ERROR: call_hocon failed: $*" $?
+        || die "call_hocon_failed: $*" $?
 }
 
 # Run an escript in the node's environment
@@ -281,7 +288,7 @@ generate_config() {
 
     # shellcheck disable=SC2086
     if ! relx_nodetool chkconfig $CONFIG_ARGS; then
-        die "Error reading $CONFIG_ARGS"
+        die "failed_to_check_config $CONFIG_ARGS"
     fi
 }
 
@@ -292,6 +299,33 @@ bootstrapd() {
     fi
 }
 
+# check if a PID is down
+is_down() {
+    PID="$1"
+    if kill -s 0 "$PID" 2>/dev/null; then
+        return 1
+    fi
+    return 0
+}
+
+wait_for() {
+    local WAIT_TIME
+    local CMD
+    WAIT_TIME="$1"
+    shift
+    CMD="$*"
+    while true; do
+        if $CMD >/dev/null 2>&1; then
+            return 0
+        fi
+        if [ "$WAIT_TIME" -le 0 ]; then
+            return 1
+        fi
+        WAIT_TIME=$((WAIT_TIME - 1))
+        sleep 1
+    done
+}
+
 # Use $CWD/etc/sys.config if exists
 if [ -z "$RELX_CONFIG_PATH" ]; then
     if [ -f "$RUNNER_ETC_DIR/sys.config" ]; then
@@ -326,7 +360,7 @@ if [ -z "$NAME" ]; then
         # shellcheck disable=SC2012,SC2086
         LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)"
         if [ -z "$LATEST_VM_ARGS" ]; then
-            echoerr "For command $1, there is no vm.*.args file found in $CONFIGS_DIR/"
+            echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/"
             exit 1
         fi
         NAME="$(grep -E '^-s?name' "$LATEST_VM_ARGS" | awk '{print $2}')"
@@ -359,7 +393,7 @@ if [ -z "$COOKIE" ]; then
         # shellcheck disable=SC2012,SC2086
         LATEST_VM_ARGS="$(ls -t $CONFIGS_DIR/vm.*.args | head -1)"
         if [ -z "$LATEST_VM_ARGS" ]; then
-            echo "For command $1, there is no vm.*.args config file found in $CONFIGS_DIR/"
+            echoerr "no_vm_arg_file_found_for $1 in $CONFIGS_DIR/"
             exit 1
         fi
         COOKIE="$(grep -E '^-setcookie' "$LATEST_VM_ARGS" | awk '{print $2}')"
@@ -384,8 +418,7 @@ case "$1" in
     start|start_boot)
         # Make sure a node IS not running
         if relx_nodetool "ping" >/dev/null 2>&1; then
-            echo "Node is already running!"
-            exit 1
+            die "node_is_already_running!"
         fi
         # Bootstrap daemon command (check perms & drop to $RUNNER_USER)
         bootstrapd
@@ -431,33 +464,35 @@ case "$1" in
                           "$(relx_start_command)"
 
         WAIT_TIME=${WAIT_FOR_ERLANG:-15}
-        while [ "$WAIT_TIME" -gt 0 ]; do
-            if ! relx_nodetool "ping" >/dev/null 2>&1; then
-                WAIT_TIME=$((WAIT_TIME - 1))
-                sleep 1
-                continue
-            fi
-            sleep 1
-            if relx_nodetool "ping" >/dev/null 2>&1; then
-                echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
-                exit 0
-            fi
-        done && echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_FOR_ERLANG:-15} seconds,"
-        echo "see the output of '$0 console' for more information."
-        echo "If you want to wait longer, set the environment variable"
-        echo "WAIT_FOR_ERLANG to the number of seconds to wait."
-        exit 1
+        if wait_for "$WAIT_TIME" 'relx_nodetool' 'ping'; then
+            echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
+            exit 0
+        else
+            echo "$EMQX_DESCRIPTION $REL_VSN failed to start within ${WAIT_TIME} seconds,"
+            echo "see the output of '$0 console' for more information."
+            echo "If you want to wait longer, set the environment variable"
+            echo "WAIT_FOR_ERLANG to the number of seconds to wait."
+            exit 1
+        fi
         ;;
 
     stop)
         # Wait for the node to completely stop...
         PID="$(relx_get_pid)"
         if ! relx_nodetool "stop"; then
+            echoerr "Graceful shutdown failed PID=[$PID]"
             exit 1
         fi
-        while kill -s 0 "$PID" 2>/dev/null; do
-            sleep 1
-        done
+        WAIT_TIME="${WAIT_FOR_ERLANG_STOP:-60}"
+        if ! wait_for "$WAIT_TIME" is_down "$PID"; then
+            msg="dangling after ${WAIT_TIME} seconds"
+            # also log to syslog
+            logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
+            # log to user console
+            echoerr "STOP: $msg"
+            exit 1
+        fi
+        logger -t "${REL_NAME}[${PID}]" "STOP: OK"
         ;;
 
     restart|reboot)
@@ -473,10 +508,7 @@ case "$1" in
         ;;
 
     ping)
-        ## See if the VM is alive
-        if ! relx_nodetool "ping"; then
-            exit 1
-        fi
+        assert_node_alive
         ;;
 
     escript)
@@ -487,11 +519,7 @@ case "$1" in
         ;;
 
     attach)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         # Bootstrap daemon command (check perms & drop to $RUNNER_USER)
         bootstrapd
@@ -501,11 +529,7 @@ case "$1" in
         ;;
 
     remote_console)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         # Bootstrap daemon command (check perms & drop to $RUNNER_USER)
         bootstrapd
@@ -523,11 +547,7 @@ case "$1" in
 
         COMMAND="$1"; shift
 
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         ERL_FLAGS="$ERL_FLAGS $EPMD_ARG" \
         exec "$BINDIR/escript" "$ROOTDIR/bin/install_upgrade.escript" \
@@ -535,11 +555,7 @@ case "$1" in
         ;;
 
     versions)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         COMMAND="$1"; shift
 
@@ -601,7 +617,7 @@ case "$1" in
             $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG
 
         # Log the startup
-        logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}"
+        logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}"
 
         # Start the VM
         exec "$@" -- ${1+$ARGS}
@@ -642,7 +658,7 @@ case "$1" in
             $RELX_CONFIG_PATH $CONFIG_ARGS $EPMD_ARG
 
         # Log the startup
-        logger -t "${REL_NAME}[$$]" "$* -- ${1+$ARGS}"
+        logger -t "${REL_NAME}[$$]" "EXEC: $* -- ${1+$ARGS}"
 
         # Start the VM
         exec "$@" -- ${1+$ARGS}
@@ -651,54 +667,34 @@ case "$1" in
         echo "$ERTS_PATH"
         ;;
     ctl)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         shift
 
         relx_nodetool rpc emqx_ctl run_command "$@"
         ;;
     rpc)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         shift
 
         relx_nodetool rpc "$@"
         ;;
     rpcterms)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         shift
 
         relx_nodetool rpcterms "$@"
         ;;
     root_dir)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         shift
         relx_nodetool "eval" 'code:root_dir()'
         ;;
     eval)
-        # Make sure a node IS running
-        if ! relx_nodetool "ping" > /dev/null; then
-            echo "Node is not running!"
-            exit 1
-        fi
+        assert_node_alive
 
         shift
         relx_nodetool "eval" "$@"