Просмотр исходного кода

Merge pull request #14353 from savonarola/1205-fix-session-evacution

Add a workaround for eviction of sessions lost from `emqx_cm_registry`
Ilia Averianov 1 год назад
Родитель
Сommit
0742137d70

+ 46 - 18
apps/emqx/src/emqx_cm.erl

@@ -25,7 +25,6 @@
 -include("emqx_mqtt.hrl").
 -include("emqx_external_trace.hrl").
 -include_lib("snabbkaffe/include/snabbkaffe.hrl").
--include_lib("stdlib/include/qlc.hrl").
 -include_lib("stdlib/include/ms_transform.hrl").
 
 -export([start_link/0]).
@@ -75,8 +74,8 @@
 
 %% Client management
 -export([
-    all_channels_table/1,
-    live_connection_table/1
+    all_channels_stream/1,
+    live_connection_stream/1
 ]).
 
 %% gen_server callbacks
@@ -140,6 +139,8 @@
 %% Batch drain
 -define(BATCH_SIZE, 100000).
 
+-define(CHAN_INFO_SELECT_LIMIT, 100).
+
 %% Server name
 -define(CM, ?MODULE).
 
@@ -650,30 +651,57 @@ all_channels() ->
     ets:select(?CHAN_TAB, Pat).
 
 %% @doc Get clientinfo for all clients
-all_channels_table(ConnModuleList) ->
+-spec all_channels_stream([module()]) ->
+    emqx_utils_stream:stream({
+        emqx_types:clientid(),
+        _ConnState :: atom(),
+        emqx_types:conninfo(),
+        emqx_types:clientinfo()
+    }).
+all_channels_stream(ConnModuleList) ->
     Ms = ets:fun2ms(
         fun({{ClientId, _ChanPid}, Info, _Stats}) ->
             {ClientId, Info}
         end
     ),
-    Table = ets:table(?CHAN_INFO_TAB, [{traverse, {select, Ms}}]),
     ConnModules = sets:from_list(ConnModuleList, [{version, 2}]),
-    qlc:q([
-        {ClientId, ConnState, ConnInfo, ClientInfo}
-     || {ClientId, #{
-            conn_state := ConnState,
-            clientinfo := ClientInfo,
-            conninfo := #{conn_mod := ConnModule} = ConnInfo
-        }} <-
-            Table,
-        sets:is_element(ConnModule, ConnModules)
-    ]).
+    AllChanInfoStream = emqx_utils_stream:ets(fun
+        (undefined) -> ets:select(?CHAN_INFO_TAB, Ms, ?CHAN_INFO_SELECT_LIMIT);
+        (Cont) -> ets:select(Cont)
+    end),
+    WithModulesFilteredStream = emqx_utils_stream:filter(
+        fun({_, #{conninfo := #{conn_mod := ConnModule}}}) ->
+            sets:is_element(ConnModule, ConnModules)
+        end,
+        AllChanInfoStream
+    ),
+    %% Map to the plain tuples
+    emqx_utils_stream:map(
+        fun(
+            {ClientId, #{
+                conn_state := ConnState,
+                clientinfo := ClientInfo,
+                conninfo := ConnInfo
+            }}
+        ) ->
+            {ClientId, ConnState, ConnInfo, ClientInfo}
+        end,
+        WithModulesFilteredStream
+    ).
 
 %% @doc Get all local connection query handle
-live_connection_table(ConnModules) ->
+-spec live_connection_stream([module()]) ->
+    emqx_utils_stream:stream({emqx_types:clientid(), pid()}).
+live_connection_stream(ConnModules) ->
     Ms = lists:map(fun live_connection_ms/1, ConnModules),
-    Table = ets:table(?CHAN_CONN_TAB, [{traverse, {select, Ms}}]),
-    qlc:q([{ClientId, ChanPid} || {ClientId, ChanPid} <- Table, is_channel_connected(ChanPid)]).
+    AllConnStream = emqx_utils_stream:ets(fun
+        (undefined) -> ets:select(?CHAN_CONN_TAB, Ms, ?CHAN_INFO_SELECT_LIMIT);
+        (Cont) -> ets:select(Cont)
+    end),
+    emqx_utils_stream:filter(
+        fun({_ClientId, ChanPid}) -> is_channel_connected(ChanPid) end,
+        AllConnStream
+    ).
 
 live_connection_ms(ConnModule) ->
     {{{'$1', '$2'}, ConnModule}, [], [{{'$1', '$2'}}]}.

+ 4 - 0
apps/emqx/src/emqx_cm_registry.erl

@@ -19,6 +19,8 @@
 
 -behaviour(gen_server).
 
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
 -export([start_link/0]).
 
 -export([is_enabled/0, is_hist_enabled/0]).
@@ -185,9 +187,11 @@ handle_cast(Msg, State) ->
     {noreply, State}.
 
 handle_info({membership, {mnesia, down, Node}}, State) ->
+    ?tp(warning, cm_registry_mnesia_down, #{node => Node}),
     cleanup_channels(Node),
     {noreply, State};
 handle_info({membership, {node, down, Node}}, State) ->
+    ?tp(warning, cm_registry_node_down, #{node => Node}),
     cleanup_channels(Node),
     {noreply, State};
 handle_info({membership, _Event}, State) ->

+ 2 - 2
apps/emqx/test/emqx_connection_conninfo_SUITE.erl

@@ -59,8 +59,8 @@ t_inconsistent_chan_info(_Config) ->
 
     ClientIds = [
         ClientId
-     || {ClientId, _ConnState, _ConnInfo, _ClientInfo} <- qlc:eval(
-            emqx_cm:all_channels_table([emqx_connection])
+     || {ClientId, _ConnState, _ConnInfo, _ClientInfo} <- emqx_utils_stream:consume(
+            emqx_cm:all_channels_stream([emqx_connection])
         )
     ],
 

+ 1 - 1
apps/emqx_eviction_agent/src/emqx_eviction_agent.app.src

@@ -1,6 +1,6 @@
 {application, emqx_eviction_agent, [
     {description, "EMQX Eviction Agent"},
-    {vsn, "5.1.8"},
+    {vsn, "5.1.9"},
     {registered, [
         emqx_eviction_agent_sup,
         emqx_eviction_agent,

+ 80 - 65
apps/emqx_eviction_agent/src/emqx_eviction_agent.erl

@@ -9,7 +9,6 @@
 -include_lib("emqx/include/types.hrl").
 -include_lib("emqx/include/emqx_hooks.hrl").
 
--include_lib("stdlib/include/qlc.hrl").
 -include_lib("snabbkaffe/include/snabbkaffe.hrl").
 
 -export([
@@ -122,7 +121,8 @@ enable_status() ->
 evict_connections(N) ->
     case enable_status() of
         {enabled, _Kind, ServerReference, _Options} ->
-            ok = do_evict_connections(N, ServerReference);
+            Stream = emqx_utils_stream:limit_length(N, connection_pid_stream()),
+            ok = do_evict_connections(Stream, ServerReference);
         disabled ->
             {error, disabled}
     end.
@@ -141,16 +141,18 @@ evict_sessions(N, Nodes, ConnState) when
 ->
     case enable_status() of
         {enabled, _Kind, _ServerReference, _Options} ->
-            ok = do_evict_sessions(N, Nodes, ConnState);
+            Stream = emqx_utils_stream:limit_length(N, channel_stream(ConnState)),
+            ok = do_evict_sessions(Nodes, Stream);
         disabled ->
             {error, disabled}
     end.
 
 -spec purge_sessions(non_neg_integer()) -> ok_or_error(disabled).
-purge_sessions(N) ->
+purge_sessions(N) when N > 0 ->
     case enable_status() of
         {enabled, _Kind, _ServerReference, _Options} ->
-            ok = do_purge_sessions(N);
+            Stream = emqx_utils_stream:limit_length(N, channel_stream(any)),
+            ok = do_purge_sessions(Stream);
         disabled ->
             {error, disabled}
     end.
@@ -267,25 +269,31 @@ stats() ->
         sessions => session_count()
     }.
 
-connection_table() ->
-    emqx_cm:live_connection_table(?CONN_MODULES).
+connection_stream() ->
+    emqx_cm:live_connection_stream(?CONN_MODULES).
 
 connection_count() ->
-    table_count(connection_table()).
-
-channel_table(any) ->
-    qlc:q([
-        {ClientId, ConnInfo, ClientInfo}
-     || {ClientId, _, ConnInfo, ClientInfo} <-
-            emqx_cm:all_channels_table(?CONN_MODULES)
-    ]);
-channel_table(RequiredConnState) ->
-    qlc:q([
-        {ClientId, ConnInfo, ClientInfo}
-     || {ClientId, ConnState, ConnInfo, ClientInfo} <-
-            emqx_cm:all_channels_table(?CONN_MODULES),
-        RequiredConnState =:= ConnState
-    ]).
+    stream_count(connection_stream()).
+
+channel_stream(any) ->
+    emqx_utils_stream:map(
+        fun({ClientId, _, ConnInfo, ClientInfo}) ->
+            {ClientId, ConnInfo, ClientInfo}
+        end,
+        emqx_cm:all_channels_stream(?CONN_MODULES)
+    );
+channel_stream(RequiredConnState) ->
+    WithRequiredConnStateStream =
+        emqx_utils_stream:filter(
+            fun({_, ConnState, _, _}) -> RequiredConnState =:= ConnState end,
+            emqx_cm:all_channels_stream(?CONN_MODULES)
+        ),
+    emqx_utils_stream:map(
+        fun({ClientId, _, ConnInfo, ClientInfo}) ->
+            {ClientId, ConnInfo, ClientInfo}
+        end,
+        WithRequiredConnStateStream
+    ).
 
 -spec all_channels_count() -> non_neg_integer().
 all_channels_count() ->
@@ -312,7 +320,7 @@ all_channels_count() ->
 
 -spec all_local_channels_count() -> non_neg_integer().
 all_local_channels_count() ->
-    table_count(channel_table(any)).
+    stream_count(channel_stream(any)).
 
 session_count() ->
     session_count(any) + durable_session_count().
@@ -321,51 +329,59 @@ durable_session_count() ->
     emqx_persistent_session_bookkeeper:get_disconnected_session_count().
 
 session_count(ConnState) ->
-    table_count(channel_table(ConnState)).
-
-table_count(QH) ->
-    qlc:fold(fun(_, Acc) -> Acc + 1 end, 0, QH).
-
-take_connections(N) ->
-    ChanQH = qlc:q([ChanPid || {_ClientId, ChanPid} <- connection_table()]),
-    ChanPidCursor = qlc:cursor(ChanQH),
-    ChanPids = qlc:next_answers(ChanPidCursor, N),
-    ok = qlc:delete_cursor(ChanPidCursor),
-    ChanPids.
-
-take_channels(N) ->
-    QH = qlc:q([
-        {ClientId, ConnInfo, ClientInfo}
-     || {ClientId, _, ConnInfo, ClientInfo} <-
-            emqx_cm:all_channels_table(?CONN_MODULES)
-    ]),
-    ChanPidCursor = qlc:cursor(QH),
-    Channels = qlc:next_answers(ChanPidCursor, N),
-    ok = qlc:delete_cursor(ChanPidCursor),
-    Channels.
-
-take_channels(N, ConnState) ->
-    ChanPidCursor = qlc:cursor(channel_table(ConnState)),
-    Channels = qlc:next_answers(ChanPidCursor, N),
-    ok = qlc:delete_cursor(ChanPidCursor),
-    Channels.
-
-do_evict_connections(N, ServerReference) when N > 0 ->
-    ChanPids = take_connections(N),
-    ok = lists:foreach(
+    stream_count(channel_stream(ConnState)).
+
+stream_count(Stream) ->
+    emqx_utils_stream:fold(fun(_, Acc) -> Acc + 1 end, 0, Stream).
+
+connection_pid_stream() ->
+    emqx_utils_stream:map(
+        fun({_ClientId, ChanPid}) -> ChanPid end,
+        connection_stream()
+    ).
+
+do_evict_connections(ChanPidStream, ServerReference) ->
+    ok = emqx_utils_stream:foreach(
         fun(ChanPid) ->
             disconnect_channel(ChanPid, ServerReference)
         end,
-        ChanPids
+        ChanPidStream
     ).
 
-do_evict_sessions(N, Nodes, ConnState) when N > 0 ->
-    Channels = take_channels(N, ConnState),
-    ok = lists:foreach(
+do_evict_sessions(Nodes, ChannelStream) ->
+    ok = emqx_utils_stream:foreach(
         fun({ClientId, ConnInfo, ClientInfo}) ->
-            evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo)
+            case is_session_evictable(ClientId) of
+                true ->
+                    evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo);
+                false ->
+                    %% This should not happen normally.
+                    %% But session may slip from the `emqx_cm_registry` due to cluster errors.
+                    %% Such sessions cannot be evicted to another node.
+                    %% If we do nothing here, we may enter a dead loop of evicting the same session.
+                    %%
+                    %% But
+                    %% * In case of node evacuation, the session is doomed anyway.
+                    %% * In case of node rebalance, we evict disconnected sessions only,
+                    %% and a disconnected session slipped from `emqx_cm_registry` cannot be
+                    %% taken over by a reconnecting client, so it is already lost.
+                    %%
+                    %% Therefore, it is safe to just discard the session here.
+                    discard_session_channel(ClientId)
+            end
+        end,
+        ChannelStream
+    ).
+
+is_session_evictable(ClientId) ->
+    emqx_cm_registry:lookup_channels(ClientId) =/= [].
+
+discard_session_channel(ClientId) ->
+    lists:foreach(
+        fun(ChanPid) ->
+            emqx_cm:discard_session(ClientId, ChanPid)
         end,
-        Channels
+        emqx_cm:lookup_channels(local, ClientId)
     ).
 
 evict_session_channel(Nodes, ClientId, ConnInfo, ClientInfo) ->
@@ -463,13 +479,12 @@ disconnect_channel(ChanPid, ServerReference) ->
             'Server-Reference' => ServerReference
         }}.
 
-do_purge_sessions(N) when N > 0 ->
-    Channels = take_channels(N),
-    ok = lists:foreach(
+do_purge_sessions(Stream) ->
+    ok = emqx_utils_stream:foreach(
         fun({ClientId, _ConnInfo, _ClientInfo}) ->
             emqx_cm:discard_session(ClientId)
         end,
-        Channels
+        Stream
     ).
 
 do_purge_durable_sessions(N) when N > 0 ->

+ 22 - 0
apps/emqx_eviction_agent/test/emqx_eviction_agent_SUITE.erl

@@ -306,6 +306,28 @@ t_explicit_session_takeover(Config) ->
     ),
     ok = emqtt:disconnect(C3).
 
+t_evict_lost_session(_Config) ->
+    _ = erlang:process_flag(trap_exit, true),
+    ok = restart_emqx(),
+
+    %% Make a session
+    {ok, C0} = emqtt_connect([
+        {clientid, <<"client_with_session">>},
+        {clean_start, false}
+    ]),
+    {ok, _, _} = emqtt:subscribe(C0, <<"t1">>),
+    ok = emqtt:disconnect(C0),
+    ok = emqx_eviction_agent:enable(test_eviction, undefined),
+    ?assertEqual(1, emqx_eviction_agent:session_count()),
+
+    %% Emulate lost session
+    [ChanPid] = emqx_cm:lookup_channels(<<"client_with_session">>),
+    emqx_cm_registry:unregister_channel({<<"client_with_session">>, ChanPid}),
+    %% unregister is async, wait for it
+    ct:sleep(100),
+    ok = emqx_eviction_agent:evict_sessions(1, node()),
+    ?retry(_Sleep = 10, _Retries = 20, ?assertEqual(0, emqx_eviction_agent:session_count())).
+
 t_disable_on_restart(_Config) ->
     ok = emqx_eviction_agent:enable(test_eviction, undefined),
 

+ 1 - 0
changes/ce/feat-14353.en.md

@@ -0,0 +1 @@
+Make session rebalance and evacuation more robust. Previously, session evacuation could enter a dead loop after some clustering errors.