Browse Source

Merge pull request #10286 from zmstone/0330-exit-with-non-zero-code-if-conf-init-failed

0330 exit with non zero code if conf init failed
Zaiming (Stone) Shi 2 years ago
parent
commit
36043dd651

+ 4 - 0
apps/emqx/src/emqx_app.erl

@@ -72,9 +72,13 @@ set_init_config_load_done() ->
 get_init_config_load_done() ->
     application:get_env(emqx, init_config_load_done, false).
 
+%% @doc Set the transaction id from which this node should start applying after boot.
+%% The transaction ID is received from the core node from which this node
+%% has just copied the latest config.
 set_init_tnx_id(TnxId) ->
     application:set_env(emqx, cluster_rpc_init_tnx_id, TnxId).
 
+%% @doc Get the transaction id from which this node should start applying after boot.
 get_init_tnx_id() ->
     application:get_env(emqx, cluster_rpc_init_tnx_id, -1).
 

+ 6 - 0
apps/emqx_conf/src/emqx_cluster_rpc.erl

@@ -275,8 +275,13 @@ init([Node, RetryMs]) ->
     _ = mria:wait_for_tables([?CLUSTER_MFA, ?CLUSTER_COMMIT]),
     {ok, _} = mnesia:subscribe({table, ?CLUSTER_MFA, simple}),
     State = #{node => Node, retry_interval => RetryMs},
+    %% The init transaction ID is set in emqx_conf_app after
+    %% it has fetched the latest config from one of the core nodes
     TnxId = emqx_app:get_init_tnx_id(),
     ok = maybe_init_tnx_id(Node, TnxId),
+    %% Now continue with the normal catch-up process.
+    %% That is: apply the transactions that were committed between
+    %% the moment the config was copied and now.
     {ok, State, {continue, ?CATCH_UP}}.
 
 %% @private
@@ -396,6 +401,7 @@ get_cluster_tnx_id() ->
         Id -> Id
     end.
 
+%% The entry point of a config change transaction.
 init_mfa(Node, MFA) ->
     mnesia:write_lock_table(?CLUSTER_MFA),
     LatestId = get_cluster_tnx_id(),

+ 1 - 1
apps/emqx_conf/src/emqx_conf_app.erl

@@ -38,7 +38,7 @@ start(_StartType, _StartArgs) ->
                 reason => E,
                 stacktrace => St
             }),
-            init:stop()
+            init:stop(1)
     end,
     ok = emqx_config_logger:refresh_config(),
     emqx_conf_sup:start_link().

+ 1 - 1
bin/emqx

@@ -760,7 +760,7 @@ generate_config() {
     local node_name="$2"
     ## Delete the *.siz files first or it can't start after
     ## changing the config 'log.rotation.size'
-    rm -rf "${RUNNER_LOG_DIR}"/*.siz
+    rm -f "${RUNNER_LOG_DIR}"/*.siz
 
     ## timestamp for each generation
     local NOW_TIME

+ 2 - 0
changes/ce/fix-10286.en.md

@@ -0,0 +1,2 @@
+Enhance logging behaviour during boot failure.
+When EMQX fails to start due to corrupted configuration files, excessive logging is eliminated and no crash dump file is generated.

+ 2 - 0
changes/ce/fix-10286.zh.md

@@ -0,0 +1,2 @@
+优化启动失败的错误日志。
+如果 EMQX 因为损坏的配置文件无法启动时,不会再打印过多的错误日志,也不再生成 crash.dump 文件。

+ 21 - 0
scripts/test/emqx-boot.bats

@@ -0,0 +1,21 @@
+#!/usr/bin/env bats
+
+# https://github.com/bats-core/bats-core
+# env PROFILE=emqx bats -t -p --verbose-run scripts/test/emqx-boot.bats
+
+@test "PROFILE must be set" {
+    [[ -n "$PROFILE" ]]
+}
+
+@test "emqx boot with invalid node name" {
+    output="$(env EMQX_NODE_NAME="invliadename#" ./_build/$PROFILE/rel/emqx/bin/emqx console 2>&1|| true)"
+    [[ "$output" =~ "ERROR: Invalid node name,".+ ]]
+}
+
+@test "corrupted cluster config file" {
+    conffile="./_build/$PROFILE/rel/emqx/data/configs/cluster-override.conf"
+    echo "{" > $conffile
+    run ./_build/$PROFILE/rel/emqx/bin/emqx console
+    [[ $status -ne 0 ]]
+    rm -f $conffile
+}