소스 검색

fix: improve logs for emqx_cm:request_stepdown/3

Shawn 1 년 전
부모
커밋
64d2262080
2개의 변경된 파일59개의 추가작업 그리고 45개의 파일을 삭제
  1. 56 45
      apps/emqx/src/emqx_cm.erl
  2. 3 0
      changes/ce/fix-14061.en.md

+ 56 - 45
apps/emqx/src/emqx_cm.erl

@@ -145,6 +145,12 @@
     (is_binary(CLIENTID) orelse (is_atom(CLIENTID) andalso CLIENTID =/= undefined))
     (is_binary(CLIENTID) orelse (is_atom(CLIENTID) andalso CLIENTID =/= undefined))
 ).
 ).
 
 
+-define(REQ_DOWN_ERR(MOD, PID, ACTION), (#{conn_mod => MOD, stale_pid => PID, action => ACTION})).
+
+-define(REQ_DOWN_ERR_CHANNEL_INFO(MOD, PID, ACTION),
+    (?REQ_DOWN_ERR(MOD, PID, ACTION)#{stale_channel => stale_channel_info(PID)})
+).
+
 %% linting overrides
 %% linting overrides
 -elvis([
 -elvis([
     {elvis_style, invalid_dynamic_call, #{ignore => [emqx_cm]}},
     {elvis_style, invalid_dynamic_call, #{ignore => [emqx_cm]}},
@@ -402,12 +408,8 @@ takeover_session(ClientId, Pid) ->
     try
     try
         do_takeover_begin(ClientId, Pid)
         do_takeover_begin(ClientId, Pid)
     catch
     catch
-        _:R when
-            R == noproc;
-            R == timeout;
-            %% request_stepdown/3
-            R == unexpected_exception
-        ->
+        %% request_stepdown/3
+        _:R when R == noproc; R == timeout; R == unexpected_exception ->
             none;
             none;
         % rpc_call/3
         % rpc_call/3
         _:{'EXIT', {noproc, _}} ->
         _:{'EXIT', {noproc, _}} ->
@@ -461,55 +463,64 @@ request_stepdown(Action, ConnMod, Pid) ->
             _ -> ?T_TAKEOVER
             _ -> ?T_TAKEOVER
         end,
         end,
     Return =
     Return =
-        %% this is essentially a gen_server:call implemented in emqx_connection
-        %% and emqx_ws_connection.
-        %% the handle_call is implemented in emqx_channel
         try apply(ConnMod, call, [Pid, Action, Timeout]) of
         try apply(ConnMod, call, [Pid, Action, Timeout]) of
             ok -> ok;
             ok -> ok;
             Reply -> {ok, Reply}
             Reply -> {ok, Reply}
         catch
         catch
-            % emqx_ws_connection: call
-            _:noproc ->
-                ok = ?tp(debug, "session_already_gone", #{stale_pid => Pid, action => Action}),
-                {error, noproc};
-            % emqx_connection: gen_server:call
-            _:{noproc, _} ->
-                ok = ?tp(debug, "session_already_gone", #{stale_pid => Pid, action => Action}),
-                {error, noproc};
-            _:{shutdown, _} ->
-                ok = ?tp(debug, "session_already_shutdown", #{stale_pid => Pid, action => Action}),
-                {error, noproc};
-            _:{{shutdown, _}, _} ->
-                ok = ?tp(debug, "session_already_shutdown", #{stale_pid => Pid, action => Action}),
-                {error, noproc};
-            _:{timeout, {gen_server, call, _}} ->
-                ?tp(
-                    warning,
-                    "session_stepdown_request_timeout",
-                    #{stale_pid => Pid, action => Action, stale_channel => stale_channel_info(Pid)}
-                ),
-                ok = force_kill(Pid),
-                {error, timeout};
-            _:Error:St ->
-                ?tp(
-                    error,
-                    "session_stepdown_request_exception",
-                    #{
-                        stale_pid => Pid,
-                        action => Action,
-                        reason => Error,
-                        stacktrace => St,
-                        stale_channel => stale_channel_info(Pid)
-                    }
-                ),
-                ok = force_kill(Pid),
-                {error, unexpected_exception}
+            Err:Reason:St ->
+                handle_stepdown_exception(Err, Reason, St, ConnMod, Pid, Action)
         end,
         end,
     case Action == kick orelse Action == discard of
     case Action == kick orelse Action == discard of
         true -> ok;
         true -> ok;
         _ -> Return
         _ -> Return
     end.
     end.
 
 
+%% The emqx_connection returns `{Reason, {gen_server, call, _}}` on failure, but
+%% emqx_ws_connection returns `Reason`.
+handle_stepdown_exception(_Err, noproc, _St, ConnMod, Pid, Action) ->
+    ok = ?tp(debug, "ws_session_already_gone", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, {noproc, _}, _St, ConnMod, Pid, Action) ->
+    ok = ?tp(debug, "session_already_gone", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, {shutdown, _}, _St, ConnMod, Pid, Action) ->
+    ok = ?tp(debug, "ws_session_already_shutdown", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, {{shutdown, _}, _}, _St, ConnMod, Pid, Action) ->
+    ok = ?tp(debug, "session_already_shutdown", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, killed, _St, ConnMod, Pid, Action) ->
+    ?tp(debug, "ws_session_already_killed", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, {killed, {gen_server, call, _}}, _St, ConnMod, Pid, Action) ->
+    ?tp(debug, "session_already_killed", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, normal, _St, ConnMod, Pid, Action) ->
+    ?tp(debug, "ws_session_already_stopped_normally", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, {normal, {gen_server, call, _}}, _St, ConnMod, Pid, Action) ->
+    ?tp(debug, "session_already_stopped_normally", ?REQ_DOWN_ERR(ConnMod, Pid, Action)),
+    {error, noproc};
+handle_stepdown_exception(_Err, timeout, _St, ConnMod, Pid, Action) ->
+    ErrInfo = ?REQ_DOWN_ERR_CHANNEL_INFO(ConnMod, Pid, Action),
+    ?tp(warning, "ws_session_stepdown_request_timeout", ErrInfo),
+    ok = force_kill(Pid),
+    {error, timeout};
+handle_stepdown_exception(_Err, {timeout, {gen_server, call, _}}, _St, ConnMod, Pid, Action) ->
+    ErrInfo = ?REQ_DOWN_ERR_CHANNEL_INFO(ConnMod, Pid, Action),
+    ?tp(warning, "session_stepdown_request_timeout", ErrInfo),
+    ok = force_kill(Pid),
+    {error, timeout};
+handle_stepdown_exception(Err, Reason, St, ConnMod, Pid, Action) ->
+    ErroInfo = ?REQ_DOWN_ERR_CHANNEL_INFO(ConnMod, Pid, Action)#{
+        error => Err,
+        reason => Reason,
+        stacktrace => St
+    },
+    ?tp(error, "session_stepdown_request_exception", ErroInfo),
+    ok = force_kill(Pid),
+    {error, unexpected_exception}.
+
 force_kill(Pid) ->
 force_kill(Pid) ->
     exit(Pid, kill),
     exit(Pid, kill),
     ok.
     ok.

+ 3 - 0
changes/ce/fix-14061.en.md

@@ -0,0 +1,3 @@
+Improved log information when `emqx_cm:request_stepdown/3` fails.
+
+When a client channel needs to kill another channel with the same ClientID, it may encounter a race condition that the target channel has already been closed or killed. In this case, error logs and stack information is useless and will no longer be printed.