瀏覽代碼

Merge pull request #14356 from zmstone/241206-sync-relese-584-to-release-58

241206 sync release 584 to release 58
zmstone 1 年之前
父節點
當前提交
16dfeb3010
共有 79 個文件被更改,包括 1770 次插入和 454 次刪除
  1. 5 0
      apps/emqx/include/emqx.hrl
  2. 1 0
      apps/emqx/priv/bpapi.versions
  3. 1 1
      apps/emqx/src/emqx.app.src
  4. 39 4
      apps/emqx/src/emqx_channel.erl
  5. 37 25
      apps/emqx/src/emqx_config.erl
  6. 6 2
      apps/emqx/src/emqx_connection.erl
  7. 6 2
      apps/emqx/src/emqx_logger.erl
  8. 3 1
      apps/emqx/src/emqx_logger_textfmt.erl
  9. 3 3
      apps/emqx/src/emqx_schema.erl
  10. 2 2
      apps/emqx/src/emqx_schema_secret.erl
  11. 7 4
      apps/emqx/src/emqx_secret_loader.erl
  12. 7 2
      apps/emqx/src/emqx_trace/emqx_trace_formatter.erl
  13. 2 0
      apps/emqx/src/emqx_types.erl
  14. 6 2
      apps/emqx/src/emqx_ws_connection.erl
  15. 110 0
      apps/emqx/test/emqx_channel_tests.erl
  16. 1 1
      apps/emqx/test/emqx_common_test_helpers.erl
  17. 27 1
      apps/emqx/test/emqx_config_SUITE.erl
  18. 3 3
      apps/emqx/test/emqx_release_tests.erl
  19. 29 6
      apps/emqx/test/emqx_secret_tests.erl
  20. 52 0
      apps/emqx/test/emqx_trace_formatter_tests.erl
  21. 1 1
      apps/emqx_auto_subscribe/src/emqx_auto_subscribe.app.src
  22. 19 18
      apps/emqx_auto_subscribe/src/emqx_auto_subscribe.erl
  23. 20 0
      apps/emqx_auto_subscribe/test/emqx_auto_subscribe_SUITE.erl
  24. 1 1
      apps/emqx_bridge/src/emqx_bridge.app.src
  25. 4 0
      apps/emqx_bridge/src/emqx_bridge_v2.erl
  26. 1 1
      apps/emqx_bridge_http/src/emqx_bridge_http.app.src
  27. 2 1
      apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl
  28. 4 1
      apps/emqx_bridge_kafka/mix.exs
  29. 3 2
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src
  30. 25 0
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_app.erl
  31. 0 21
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_impl_consumer.erl
  32. 46 0
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_sup.erl
  33. 2 1
      apps/emqx_bridge_kafka/test/emqx_bridge_v2_kafka_consumer_SUITE.erl
  34. 10 7
      apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl
  35. 21 1
      apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl
  36. 1 1
      apps/emqx_bridge_pulsar/mix.exs
  37. 1 1
      apps/emqx_bridge_pulsar/rebar.config
  38. 1 1
      apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar.app.src
  39. 216 82
      apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_connector.erl
  40. 1 1
      apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl
  41. 328 42
      apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_v2_SUITE.erl
  42. 24 0
      apps/emqx_conf/etc/base.hocon
  43. 5 27
      apps/emqx_conf/etc/emqx_conf.conf
  44. 48 0
      apps/emqx_connector/test/emqx_connector_SUITE.erl
  45. 1 1
      apps/emqx_dashboard/src/emqx_dashboard.app.src
  46. 7 4
      apps/emqx_dashboard/src/emqx_dashboard_monitor.erl
  47. 20 1
      apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl
  48. 5 1
      apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v1.erl
  49. 44 0
      apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v2.erl
  50. 17 7
      apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl
  51. 1 1
      apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
  52. 1 1
      apps/emqx_management/src/emqx_management.app.src
  53. 98 16
      apps/emqx_management/src/emqx_mgmt_api.erl
  54. 1 1
      apps/emqx_resource/src/emqx_resource.app.src
  55. 8 1
      apps/emqx_resource/src/emqx_resource_buffer_worker.erl
  56. 18 5
      apps/emqx_resource/src/emqx_resource_cache_cleaner.erl
  57. 236 116
      apps/emqx_resource/src/emqx_resource_manager.erl
  58. 6 1
      apps/emqx_resource/src/emqx_resource_manager_sup.erl
  59. 22 19
      apps/emqx_resource/test/emqx_resource_SUITE.erl
  60. 1 1
      apps/emqx_utils/src/emqx_utils.app.src
  61. 27 4
      apps/emqx_utils/src/emqx_utils_redact.erl
  62. 41 0
      apps/emqx_utils/test/emqx_utils_redact_tests.erl
  63. 6 3
      bin/emqx
  64. 3 0
      changes/ce/feat-13739.en.md
  65. 1 0
      changes/ce/feat-14247.en.md
  66. 20 0
      changes/ce/feat-14269.en.md
  67. 1 0
      changes/ce/fix-14267.en.md
  68. 1 0
      changes/ce/fix-14272.en.md
  69. 1 0
      changes/ce/fix-14317.en.md
  70. 7 0
      changes/ce/fix-14318.en.md
  71. 5 0
      changes/ce/fix-14319.en.md
  72. 1 0
      changes/ee/feat-14110.en.md
  73. 7 0
      changes/ee/fix-14291.en.md
  74. 7 0
      changes/ee/fix-14345.en.md
  75. 3 0
      dev
  76. 6 0
      mix.exs
  77. 3 2
      rebar.config.erl
  78. 5 0
      rel/i18n/emqx_dashboard_monitor_api.hocon
  79. 8 0
      scripts/test/emqx-boot.bats

+ 5 - 0
apps/emqx/include/emqx.hrl

@@ -112,4 +112,9 @@
 -define(KIND_REPLICATE, replicate).
 -define(KIND_INITIATE, initiate).
 
+%%--------------------------------------------------------------------
+%% Client Attributes
+%%--------------------------------------------------------------------
+-define(CLIENT_ATTR_NAME_TNS, <<"tns">>).
+
 -endif.

+ 1 - 0
apps/emqx/priv/bpapi.versions

@@ -20,6 +20,7 @@
 {emqx_conf,4}.
 {emqx_connector,1}.
 {emqx_dashboard,1}.
+{emqx_dashboard,2}.
 {emqx_delayed,1}.
 {emqx_delayed,2}.
 {emqx_delayed,3}.

+ 1 - 1
apps/emqx/src/emqx.app.src

@@ -2,7 +2,7 @@
 {application, emqx, [
     {id, "emqx"},
     {description, "EMQX Core"},
-    {vsn, "5.4.3"},
+    {vsn, "5.5.0"},
     {modules, []},
     {registered, []},
     {applications, [

+ 39 - 4
apps/emqx/src/emqx_channel.erl

@@ -72,8 +72,14 @@
     prepare_will_message_for_publishing/2
 ]).
 
-%% Exports for CT
--export([set_field/3]).
+%% Exports for tests
+-ifdef(TEST).
+-export([
+    dummy/0,
+    set_field/3,
+    set_log_meta/2
+]).
+-endif.
 
 -if(?EMQX_RELEASE_EDITION == ee).
 -export([basic_trace_attrs/1]).
@@ -2011,8 +2017,33 @@ fix_mountpoint(ClientInfo = #{mountpoint := MountPoint}) ->
 
 set_log_meta(_ConnPkt, #channel{clientinfo = #{clientid := ClientId} = ClientInfo}) ->
     Username = maps:get(username, ClientInfo, undefined),
-    emqx_logger:set_metadata_clientid(ClientId),
-    emqx_logger:set_metadata_username(Username).
+    Attrs = maps:get(client_attrs, ClientInfo, #{}),
+    Tns0 = maps:get(?CLIENT_ATTR_NAME_TNS, Attrs, undefined),
+    %% No need to add Tns to log metadata if it's already a prefix of the client ID
+    %% Or if it's the username.
+    Tns =
+        case is_clientid_namespaced(ClientId, Tns0) orelse Username =:= Tns0 of
+            true ->
+                undefined;
+            false ->
+                Tns0
+        end,
+    Meta0 = [{clientid, ClientId}, {username, Username}, {tns, Tns}],
+    %% Drop undefined or <<>>
+    Meta = lists:filter(fun({_, V}) -> V =/= undefined andalso V =/= <<>> end, Meta0),
+    emqx_logger:set_proc_metadata(maps:from_list(Meta)).
+
+%% clientid_override is an expression which is free to set tns as a prefix, suffix or whatsoever,
+%% but as a best-effort log metadata optimization, we only check for prefix
+is_clientid_namespaced(ClientId, Tns) when is_binary(Tns) andalso Tns =/= <<>> ->
+    case ClientId of
+        <<Tns:(size(Tns))/binary, _/binary>> ->
+            true;
+        _ ->
+            false
+    end;
+is_clientid_namespaced(_ClientId, _Tns) ->
+    false.
 
 %%--------------------------------------------------------------------
 %% Check banned
@@ -3231,6 +3262,10 @@ subscribe_authz_result_attrs(CheckResult) ->
 %% For CT tests
 %%--------------------------------------------------------------------
 
+-ifdef(TEST).
+dummy() -> #channel{}.
+
 set_field(Name, Value, Channel) ->
     Pos = emqx_utils:index_of(Name, record_info(fields, channel)),
     setelement(Pos + 1, Channel, Value).
+-endif.

+ 37 - 25
apps/emqx/src/emqx_config.erl

@@ -97,7 +97,7 @@
 -export([upgrade_raw_conf/2]).
 
 -ifdef(TEST).
--export([erase_all/0, backup_and_write/2]).
+-export([erase_all/0, backup_and_write/2, cluster_hocon_file/0, base_hocon_file/0]).
 -endif.
 
 -include("logger.hrl").
@@ -440,12 +440,13 @@ do_parse_hocon(true, Conf, IncDirs) ->
 do_parse_hocon(false, Conf, IncDirs) ->
     Opts = #{format => map, include_dirs => IncDirs},
     case is_binary(Conf) of
-        %% only use in test
         true ->
+            %% only used in test
             hocon:binary(Conf, Opts);
         false ->
+            BaseHocon = base_hocon_file(),
             ClusterFile = cluster_hocon_file(),
-            hocon:files([ClusterFile | Conf], Opts)
+            hocon:files([BaseHocon, ClusterFile | Conf], Opts)
     end.
 
 include_dirs() ->
@@ -541,12 +542,12 @@ ensure_file_deleted(F) ->
 
 -spec read_override_conf(map()) -> raw_config().
 read_override_conf(#{} = Opts) ->
-    File =
+    Files =
         case has_deprecated_file() of
-            true -> deprecated_conf_file(Opts);
-            false -> cluster_hocon_file()
+            true -> [deprecated_conf_file(Opts)];
+            false -> [base_hocon_file(), cluster_hocon_file()]
         end,
-    load_hocon_file(File, map).
+    load_hocon_files(Files, map).
 
 %% @doc Return `true' if this node is upgraded from older version which used cluster-override.conf for
 %% cluster-wide config persistence.
@@ -564,6 +565,9 @@ deprecated_conf_file(Opts) when is_map(Opts) ->
 deprecated_conf_file(Which) when is_atom(Which) ->
     application:get_env(emqx, Which, undefined).
 
+base_hocon_file() ->
+    emqx:etc_file("base.hocon").
+
 %% The newer version cluster-wide config persistence file.
 cluster_hocon_file() ->
     application:get_env(emqx, cluster_hocon_file, undefined).
@@ -633,16 +637,29 @@ save_to_override_conf(true = _HasDeprecatedFile, RawConf, Opts) ->
         undefined ->
             ok;
         FileName ->
-            backup_and_write(FileName, hocon_pp:do(RawConf, Opts))
+            backup_and_write(FileName, generate_hocon_content(RawConf, Opts))
     end;
 save_to_override_conf(false = _HasDeprecatedFile, RawConf, Opts) ->
     case cluster_hocon_file() of
         undefined ->
             ok;
         FileName ->
-            backup_and_write(FileName, hocon_pp:do(RawConf, Opts))
+            backup_and_write(FileName, generate_hocon_content(RawConf, Opts))
     end.
 
+generate_hocon_content(RawConf, Opts) ->
+    [
+        cluster_dot_hocon_header(),
+        hocon_pp:do(RawConf, Opts)
+    ].
+
+cluster_dot_hocon_header() ->
+    [
+        "# This file is generated. Do not edit.\n",
+        "# The configs are results of online config changes from UI/API/CLI.\n",
+        "# To persist configs in this file, copy the content to etc/base.hocon.\n"
+    ].
+
 %% @private This is the same human-readable timestamp format as
 %% hocon-cli generated app.<time>.config file name.
 now_time() ->
@@ -730,22 +747,17 @@ remove_handlers() ->
     emqx_sys_mon:remove_handler(),
     ok.
 
-load_hocon_file(FileName, LoadType) ->
-    case filelib:is_regular(FileName) of
-        true ->
-            Opts = #{include_dirs => include_dirs(), format => LoadType},
-            case hocon:load(FileName, Opts) of
-                {ok, Raw0} ->
-                    Raw0;
-                {error, Reason} ->
-                    throw(#{
-                        msg => failed_to_load_conf,
-                        reason => Reason,
-                        file => FileName
-                    })
-            end;
-        false ->
-            #{}
+load_hocon_files(FileNames, LoadType) ->
+    Opts = #{include_dirs => include_dirs(), format => LoadType},
+    case hocon:files(FileNames, Opts) of
+        {ok, Raw0} ->
+            Raw0;
+        {error, Reason} ->
+            throw(#{
+                msg => failed_to_load_conf,
+                reason => Reason,
+                files => FileNames
+            })
     end.
 
 do_get_raw(Path) ->

+ 6 - 2
apps/emqx/src/emqx_connection.erl

@@ -126,7 +126,10 @@
     limiter_timer :: undefined | reference(),
 
     %% QUIC conn shared state
-    quic_conn_ss :: option(map())
+    quic_conn_ss :: option(map()),
+
+    %% Extra field for future hot-upgrade support
+    extra = []
 }).
 
 -record(retry, {
@@ -366,7 +369,8 @@ init_state(
         limiter_buffer = queue:new(),
         limiter_timer = undefined,
         %% for quic streams to inherit
-        quic_conn_ss = maps:get(conn_shared_state, Opts, undefined)
+        quic_conn_ss = maps:get(conn_shared_state, Opts, undefined),
+        extra = []
     }.
 
 run_loop(

+ 6 - 2
apps/emqx/src/emqx_logger.erl

@@ -48,6 +48,7 @@
     set_primary_log_level/1,
     set_log_handler_level/2,
     set_log_level/1,
+    set_level/1,
     set_all_log_handlers_level/1
 ]).
 
@@ -244,13 +245,16 @@ set_log_handler_level(HandlerId, Level) ->
     end.
 
 %% @doc Set both the primary and all handlers level in one command
--spec set_log_level(logger:level()) -> ok | {error, term()}.
-set_log_level(Level) ->
+-spec set_level(logger:level()) -> ok | {error, term()}.
+set_level(Level) ->
     case set_primary_log_level(Level) of
         ok -> set_all_log_handlers_level(Level);
         {error, Error} -> {error, {primary_logger_level, Error}}
     end.
 
+set_log_level(Level) ->
+    set_level(Level).
+
 %%--------------------------------------------------------------------
 %% Internal Functions
 %%--------------------------------------------------------------------

+ 3 - 1
apps/emqx/src/emqx_logger_textfmt.erl

@@ -122,6 +122,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             undefined -> maps:get(username, ReportRaw, undefined);
             Username0 -> Username0
         end,
+    Tns = maps:get(tns, Meta, undefined),
     ClientId = maps:get(clientid, Meta, undefined),
     Peer = maps:get(peername, Meta, undefined),
     Msg = maps:get(msg, ReportRaw, undefined),
@@ -135,7 +136,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             ({_, undefined}, Acc) -> Acc;
             (Item, Acc) -> [Item | Acc]
         end,
-        maps:to_list(maps:without([topic, msg, clientid, username, tag], ReportRaw)),
+        maps:to_list(maps:without([topic, msg, tns, clientid, username, tag], ReportRaw)),
         [
             {topic, try_format_unicode(Topic)},
             {username, try_format_unicode(Username)},
@@ -143,6 +144,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             {mfa, try_format_unicode(MFA)},
             {msg, Msg},
             {clientid, try_format_unicode(ClientId)},
+            {tns, try_format_unicode(Tns)},
             {tag, Tag}
         ]
     ).

+ 3 - 3
apps/emqx/src/emqx_schema.erl

@@ -3422,7 +3422,8 @@ naive_env_interpolation(Other) ->
     Other.
 
 split_path(Path) ->
-    split_path(Path, []).
+    {Name0, Tail} = split_path(Path, []),
+    {string:trim(Name0, both, "{}"), Tail}.
 
 split_path([], Acc) ->
     {lists:reverse(Acc), []};
@@ -3431,8 +3432,7 @@ split_path([Char | Rest], Acc) when Char =:= $/ orelse Char =:= $\\ ->
 split_path([Char | Rest], Acc) ->
     split_path(Rest, [Char | Acc]).
 
-resolve_env(Name0) ->
-    Name = string:trim(Name0, both, "{}"),
+resolve_env(Name) ->
     Value = os:getenv(Name),
     case Value =/= false andalso Value =/= "" of
         true ->

+ 2 - 2
apps/emqx/src/emqx_schema_secret.erl

@@ -71,8 +71,8 @@ convert_secret(Secret, #{}) ->
     end.
 
 -spec wrap(source()) -> emqx_secret:t(t()).
-wrap(<<"file://", Filename/binary>>) ->
-    emqx_secret:wrap_load({file, Filename});
+wrap(<<"file://", _Filename/binary>> = Secret) ->
+    emqx_secret:wrap_load({file, Secret});
 wrap(Secret) ->
     emqx_secret:wrap(Secret).
 

+ 7 - 4
apps/emqx/src/emqx_secret_loader.erl

@@ -22,14 +22,17 @@
 
 -export_type([source/0]).
 
--type source() :: {file, file:filename_all()}.
+-type source() :: {file, string() | binary()}.
 
 -spec load(source()) -> binary() | no_return().
-load({file, Filename}) ->
-    file(Filename).
+load({file, <<"file://", Path/binary>>}) ->
+    file(Path);
+load({file, "file://" ++ Path}) ->
+    file(Path).
 
 -spec file(file:filename_all()) -> binary() | no_return().
-file(Filename) ->
+file(Filename0) ->
+    Filename = emqx_schema:naive_env_interpolation(Filename0),
     case file:read_file(Filename) of
         {ok, Secret} ->
             string:trim(Secret, trailing);

+ 7 - 2
apps/emqx/src/emqx_trace/emqx_trace_formatter.erl

@@ -33,12 +33,17 @@ format(
     #{level := debug, meta := Meta = #{trace_tag := Tag}, msg := Msg} =
         emqx_logger_textfmt:evaluate_lazy_values(Entry),
     Time = emqx_utils_calendar:now_to_rfc3339(microsecond),
+    Tns =
+        case to_iolist(maps:get(tns, Meta, "")) of
+            "" -> "";
+            X -> [" tns: ", X]
+        end,
     ClientId = to_iolist(maps:get(clientid, Meta, "")),
     Peername = maps:get(peername, Meta, ""),
     MetaBin = format_meta(Meta, PEncode),
     Msg1 = to_iolist(Msg),
     Tag1 = to_iolist(Tag),
-    [Time, " [", Tag1, "] ", ClientId, "@", Peername, " msg: ", Msg1, ", ", MetaBin, "\n"];
+    [Time, " [", Tag1, "] ", ClientId, "@", Peername, Tns, " msg: ", Msg1, ", ", MetaBin, "\n"];
 format(Event, Config) ->
     emqx_logger_textfmt:format(Event, Config).
 
@@ -79,7 +84,7 @@ format_meta_data(Meta, _Encode) ->
     Meta.
 
 format_meta(Meta0, Encode) ->
-    Meta1 = maps:without([msg, clientid, peername, trace_tag], Meta0),
+    Meta1 = maps:without([msg, tns, clientid, peername, trace_tag], Meta0),
     Meta2 = format_meta_data(Meta1, Encode),
     kvs_to_iolist(lists:sort(fun compare_meta_kvs/2, maps:to_list(Meta2))).
 

+ 2 - 0
apps/emqx/src/emqx_types.erl

@@ -49,6 +49,7 @@
     sockstate/0,
     conninfo/0,
     clientinfo/0,
+    tns/0,
     clientid/0,
     username/0,
     password/0,
@@ -195,6 +196,7 @@
     atom() => term()
 }.
 -type client_attrs() :: #{binary() => binary()}.
+-type tns() :: binary().
 -type clientid() :: binary() | atom().
 -type username() :: option(binary()).
 -type password() :: option(binary()).

+ 6 - 2
apps/emqx/src/emqx_ws_connection.erl

@@ -97,7 +97,10 @@
     limiter_buffer :: queue:queue(cache()),
 
     %% limiter timers
-    limiter_timer :: undefined | reference()
+    limiter_timer :: undefined | reference(),
+
+    %% Extra field for future hot-upgrade support
+    extra = []
 }).
 
 -record(retry, {
@@ -330,7 +333,8 @@ websocket_init([Req, Opts]) ->
                     zone = Zone,
                     listener = {Type, Listener},
                     limiter_timer = undefined,
-                    limiter_buffer = queue:new()
+                    limiter_buffer = queue:new(),
+                    extra = []
                 },
                 hibernate};
         {denny, Reason} ->

+ 110 - 0
apps/emqx/test/emqx_channel_tests.erl

@@ -0,0 +1,110 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_channel_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+set_tns_in_log_meta_test_() ->
+    PdKey = '$logger_metadata$',
+    Original = get(PdKey),
+    Set = fun(Cinfo) ->
+        Ch = emqx_channel:dummy(),
+        Ch1 = emqx_channel:set_field(clientinfo, Cinfo, Ch),
+        emqx_channel:set_log_meta(dummy, Ch1)
+    end,
+    Restore = fun() -> put(PdKey, Original) end,
+    NoTns = #{
+        clientid => <<"id1">>,
+        client_attrs => #{<<"not_tns">> => <<"tns1">>},
+        username => <<"user1">>
+    },
+    NoTnsFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"id1">>,
+                username := <<"user1">>
+            },
+            M
+        ),
+        ?assertNot(maps:is_key(tns, M))
+    end,
+    Prefixed = #{
+        clientid => <<"tns1-id1">>,
+        client_attrs => #{<<"tns">> => <<"tns1">>},
+        username => <<"user2">>
+    },
+    PrefixedFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"tns1-id1">>,
+                username := <<"user2">>
+            },
+            M
+        ),
+        ?assertNot(maps:is_key(tns, M))
+    end,
+
+    Username = #{
+        clientid => <<"id1">>,
+        client_attrs => #{<<"tns">> => <<"user3">>},
+        username => <<"user3">>
+    },
+    UsernameFn =
+        fun(M) ->
+            ?assertMatch(
+                #{
+                    clientid := <<"id1">>,
+                    username := <<"user3">>
+                },
+                M
+            ),
+            ?assertNot(maps:is_key(tns, M))
+        end,
+    TnsAdded = #{
+        clientid => <<"id4">>,
+        client_attrs => #{<<"tns">> => <<"tns1">>},
+        username => <<"user4">>
+    },
+    TnsAddedFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"id4">>,
+                username := <<"user4">>,
+                tns := <<"tns1">>
+            },
+            M
+        )
+    end,
+    Run = fun(Cinfo, CheckFn) ->
+        Set(Cinfo),
+        try
+            CheckFn(get(PdKey))
+        after
+            Restore()
+        end
+    end,
+    MakeTestFn = fun(Cinfo, CheckFn) ->
+        fun() ->
+            Run(Cinfo, CheckFn)
+        end
+    end,
+    [
+        {"tns-added", MakeTestFn(TnsAdded, TnsAddedFn)},
+        {"username as tns", MakeTestFn(Username, UsernameFn)},
+        {"tns prefixed clientid", MakeTestFn(Prefixed, PrefixedFn)},
+        {"no tns", MakeTestFn(NoTns, NoTnsFn)}
+    ].

+ 1 - 1
apps/emqx/test/emqx_common_test_helpers.erl

@@ -547,7 +547,7 @@ force_set_config_file_paths(emqx, Paths) ->
     %% we need init cluster conf, so we can save the cluster conf to the file
     application:set_env(emqx, local_override_conf_file, "local_override.conf"),
     application:set_env(emqx, cluster_override_conf_file, "cluster_override.conf"),
-    application:set_env(emqx, cluster_conf_file, "cluster.hocon"),
+    application:set_env(emqx, cluster_hocon_file, "cluster.hocon"),
     application:set_env(emqx, config_files, Paths);
 force_set_config_file_paths(_, _) ->
     ok.

+ 27 - 1
apps/emqx/test/emqx_config_SUITE.erl

@@ -92,7 +92,7 @@ t_init_load(C) when is_list(C) ->
     emqx_config:erase_all(),
     {ok, DeprecatedFile} = application:get_env(emqx, cluster_override_conf_file),
     ?assertEqual(false, filelib:is_regular(DeprecatedFile), DeprecatedFile),
-    %% Don't has deprecated file
+    %% Don't have deprecated file
     ok = emqx_config:init_load(emqx_schema, [ConfFile]),
     ?assertEqual(ExpectRootNames, lists:sort(emqx_config:get_root_names())),
     ?assertMatch({ok, #{raw_config := 256}}, emqx:update_config([mqtt, max_topic_levels], 256)),
@@ -104,6 +104,32 @@ t_init_load(C) when is_list(C) ->
     ?assertMatch({ok, #{raw_config := 128}}, emqx:update_config([mqtt, max_topic_levels], 128)),
     ok = file:delete(DeprecatedFile).
 
+t_init_load_with_base_hocon(C) when is_list(C) ->
+    BaseHocon = emqx_config:base_hocon_file(),
+    ClusterHocon = emqx_config:cluster_hocon_file(),
+    ConfFile = "./test_emqx_2.conf",
+    ok = filelib:ensure_dir(BaseHocon),
+    ok = file:write_file(
+        BaseHocon,
+        "mqtt.max_topic_levels = 123\n"
+        "mqtt.max_clientid_len=12\n"
+        "mqtt.max_inflight=12\n"
+    ),
+    ok = file:write_file(
+        ClusterHocon,
+        "mqtt.max_clientid_len = 123\n"
+        "mqtt.max_inflight=22\n"
+    ),
+    ok = file:write_file(ConfFile, "mqtt.max_inflight = 123\n"),
+    ok = emqx_config:init_load(emqx_schema, [ConfFile]),
+    ?assertEqual(123, emqx:get_config([mqtt, max_topic_levels])),
+    ?assertEqual(123, emqx:get_config([mqtt, max_clientid_len])),
+    ?assertEqual(123, emqx:get_config([mqtt, max_inflight])),
+    emqx_config:erase_all(),
+    ok = file:delete(BaseHocon),
+    ok = file:delete(ClusterHocon),
+    ok.
+
 t_unknown_root_keys(C) when is_list(C) ->
     ?check_trace(
         #{timetrap => 1000},

+ 3 - 3
apps/emqx/test/emqx_release_tests.erl

@@ -59,15 +59,15 @@ vsn_compre_test_() ->
         end}
     ].
 
-emqx_flavor_test(_Config) ->
+emqx_flavor_test() ->
     case emqx_release:edition() of
         ce ->
             ok;
         ee ->
             ?assertEqual(official, emqx_release:get_flavor()),
-            ?assertEqual("EMQX Enterprise", emqx:get_description()),
+            ?assertEqual("EMQX Enterprise", emqx_app:get_description()),
             emqx_release:set_flavor(marketplace),
             ?assertEqual(marketplace, emqx_release:get_flavor()),
-            ?assertEqual("EMQX Enterprise(marketplace)", emqx:get_description()),
+            ?assertEqual("EMQX Enterprise(marketplace)", emqx_app:get_description()),
             emqx_release:set_flavor(official)
     end.

+ 29 - 6
apps/emqx/test/emqx_secret_tests.erl

@@ -39,21 +39,41 @@ wrap_unwrap_load_test_() ->
         fun(Filename) ->
             ?_assertEqual(
                 Secret,
-                emqx_secret:unwrap(emqx_secret:wrap_load({file, Filename}))
+                emqx_secret:unwrap(emqx_secret:wrap_load(file_ref(Filename)))
             )
         end
     }.
 
+wrap_unwrap_load_path_env_interpolate_test_() ->
+    Secret = <<"111">>,
+    {
+        setup,
+        fun() -> write_temp_file(Secret) end,
+        fun(Filename) -> file:delete(Filename) end,
+        fun(Filename) ->
+            fun() ->
+                os:putenv("SECRETFILEPATH", Filename),
+                File = "file://${SECRETFILEPATH}",
+                try
+                    ?assertEqual(
+                        Secret,
+                        emqx_secret:unwrap(emqx_secret:wrap_load({file, File}))
+                    )
+                after
+                    os:unsetenv("SECRETFILEPATH")
+                end
+            end
+        end
+    }.
+
 wrap_load_term_test() ->
-    ?assertEqual(
-        {file, "no/such/file/i/swear"},
-        emqx_secret:term(emqx_secret:wrap_load({file, "no/such/file/i/swear"}))
-    ).
+    Ref = file_ref("no/such/file/i/swear"),
+    ?assertEqual(Ref, emqx_secret:term(emqx_secret:wrap_load(Ref))).
 
 wrap_unwrap_missing_file_test() ->
     ?assertThrow(
         #{msg := failed_to_read_secret_file, reason := "No such file or directory"},
-        emqx_secret:unwrap(emqx_secret:wrap_load({file, "no/such/file/i/swear"}))
+        emqx_secret:unwrap(emqx_secret:wrap_load(file_ref("no/such/file/i/swear")))
     ).
 
 wrap_term_test() ->
@@ -74,3 +94,6 @@ write_temp_file(Bytes) ->
     Filename = filename:join("/tmp", ?MODULE_STRING ++ integer_to_list(-Ts)),
     ok = file:write_file(Filename, Bytes),
     Filename.
+
+file_ref(Path) ->
+    {file, "file://" ++ Path}.

+ 52 - 0
apps/emqx/test/emqx_trace_formatter_tests.erl

@@ -0,0 +1,52 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_trace_formatter_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+format_no_tns_in_meta_test() ->
+    Meta = #{
+        clientid => <<"c">>,
+        trace_tag => tag
+    },
+    Event = #{
+        level => debug,
+        meta => Meta,
+        msg => <<"test_msg">>
+    },
+    Config = #{payload_encode => hidden},
+    Formatted = format(Event, Config),
+    ?assertMatch(nomatch, re:run(Formatted, "tns:")),
+    ok.
+
+format_tns_in_meta_test() ->
+    Meta = #{
+        tns => <<"a">>,
+        clientid => <<"c">>,
+        trace_tag => tag
+    },
+    Event = #{
+        level => debug,
+        meta => Meta,
+        msg => <<"test_msg">>
+    },
+    Config = #{payload_encode => hidden},
+    Formatted = format(Event, Config),
+    ?assertMatch({match, _}, re:run(Formatted, "\stns:\sa\s")),
+    ok.
+
+format(Event, Config) ->
+    unicode:characters_to_binary(emqx_trace_formatter:format(Event, Config)).

+ 1 - 1
apps/emqx_auto_subscribe/src/emqx_auto_subscribe.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 {application, emqx_auto_subscribe, [
     {description, "Auto subscribe Application"},
-    {vsn, "0.1.6"},
+    {vsn, "0.1.7"},
     {registered, []},
     {mod, {emqx_auto_subscribe_app, []}},
     {applications, [

+ 19 - 18
apps/emqx_auto_subscribe/src/emqx_auto_subscribe.erl

@@ -25,6 +25,7 @@
 -define(HOOK_POINT, 'client.connected').
 
 -define(MAX_AUTO_SUBSCRIBE, 20).
+-define(ROOT_KEY, auto_subscribe).
 
 -export([load/0, unload/0]).
 
@@ -47,27 +48,27 @@
 ]).
 
 load() ->
-    ok = emqx_conf:add_handler([auto_subscribe, topics], ?MODULE),
+    ok = emqx_conf:add_handler([?ROOT_KEY], ?MODULE),
     update_hook().
 
 unload() ->
-    emqx_conf:remove_handler([auto_subscribe, topics]).
+    emqx_conf:remove_handler([?ROOT_KEY]).
 
 max_limit() ->
     ?MAX_AUTO_SUBSCRIBE.
 
 list() ->
-    format(emqx_conf:get([auto_subscribe, topics], [])).
+    format(emqx_conf:get([?ROOT_KEY, topics], [])).
 
 update(Topics) when length(Topics) =< ?MAX_AUTO_SUBSCRIBE ->
     case
         emqx_conf:update(
-            [auto_subscribe, topics],
-            Topics,
+            [?ROOT_KEY],
+            #{<<"topics">> => Topics},
             #{rawconf_with_defaults => true, override_to => cluster}
         )
     of
-        {ok, #{raw_config := NewTopics}} ->
+        {ok, #{raw_config := #{<<"topics">> := NewTopics}}} ->
             {ok, NewTopics};
         {error, Reason} ->
             {error, Reason}
@@ -75,9 +76,8 @@ update(Topics) when length(Topics) =< ?MAX_AUTO_SUBSCRIBE ->
 update(_Topics) ->
     {error, quota_exceeded}.
 
-post_config_update(_KeyPath, _Req, NewTopics, _OldConf, _AppEnvs) ->
-    Config = emqx_conf:get([auto_subscribe], #{}),
-    update_hook(Config#{topics => NewTopics}).
+post_config_update([?ROOT_KEY], _Req, NewConf, _OldConf, _AppEnvs) ->
+    update_hook(NewConf).
 
 %%------------------------------------------------------------------------------
 %% hook
@@ -100,25 +100,26 @@ on_client_connected(_, _, _) ->
 
 -spec get_basic_usage_info() -> #{auto_subscribe_count => non_neg_integer()}.
 get_basic_usage_info() ->
-    AutoSubscribe = emqx_conf:get([auto_subscribe, topics], []),
+    AutoSubscribe = emqx_conf:get([?ROOT_KEY, topics], []),
     #{auto_subscribe_count => length(AutoSubscribe)}.
 
 %%------------------------------------------------------------------------------
 %% Data backup
 %%------------------------------------------------------------------------------
 
-import_config(#{<<"auto_subscribe">> := #{<<"topics">> := Topics}}) ->
-    ConfPath = [auto_subscribe, topics],
-    OldTopics = emqx:get_raw_config(ConfPath, []),
+import_config(#{<<"auto_subscribe">> := #{<<"topics">> := Topics} = AutoSubscribe}) ->
+    ConfPath = [?ROOT_KEY],
+    OldTopics = emqx:get_raw_config(ConfPath ++ [topics], []),
     KeyFun = fun(#{<<"topic">> := T}) -> T end,
     MergedTopics = emqx_utils:merge_lists(OldTopics, Topics, KeyFun),
-    case emqx_conf:update(ConfPath, MergedTopics, #{override_to => cluster}) of
-        {ok, #{raw_config := NewTopics}} ->
+    Conf = AutoSubscribe#{<<"topics">> => MergedTopics},
+    case emqx_conf:update(ConfPath, Conf, #{override_to => cluster}) of
+        {ok, #{raw_config := #{<<"topics">> := NewTopics}}} ->
             Changed = maps:get(changed, emqx_utils:diff_lists(NewTopics, OldTopics, KeyFun)),
             Changed1 = [ConfPath ++ [T] || {#{<<"topic">> := T}, _} <- Changed],
-            {ok, #{root_key => auto_subscribe, changed => Changed1}};
+            {ok, #{root_key => ?ROOT_KEY, changed => Changed1}};
         Error ->
-            {error, #{root_key => auto_subscribe, reason => Error}}
+            {error, #{root_key => ?ROOT_KEY, reason => Error}}
     end;
 import_config(_RawConf) ->
     {ok, #{root_key => auto_subscribe, changed => []}}.
@@ -139,7 +140,7 @@ format(Rule = #{topic := Topic}) when is_map(Rule) ->
     }.
 
 update_hook() ->
-    update_hook(emqx_conf:get([auto_subscribe], #{})).
+    update_hook(emqx_conf:get([?ROOT_KEY], #{topics => []})).
 
 update_hook(Config) ->
     {TopicHandler, Options} = emqx_auto_subscribe_handler:init(Config),

+ 20 - 0
apps/emqx_auto_subscribe/test/emqx_auto_subscribe_SUITE.erl

@@ -20,6 +20,7 @@
 
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("common_test/include/ct.hrl").
+-import(emqx_config_SUITE, [prepare_conf_file/3]).
 
 -define(TOPIC_C, <<"/c/${clientid}">>).
 -define(TOPIC_U, <<"/u/${username}">>).
@@ -100,12 +101,18 @@ init_per_suite(Config) ->
 init_per_testcase(t_get_basic_usage_info, Config) ->
     {ok, _} = emqx_auto_subscribe:update([]),
     Config;
+init_per_testcase(t_auto_subscribe_reload_from_file, Config) ->
+    {ok, _} = emqx_auto_subscribe:update([]),
+    Config;
 init_per_testcase(_TestCase, Config) ->
     Config.
 
 end_per_testcase(t_get_basic_usage_info, _Config) ->
     {ok, _} = emqx_auto_subscribe:update([]),
     ok;
+end_per_testcase(t_auto_subscribe_reload_from_file, _Config) ->
+    {ok, _} = emqx_auto_subscribe:update([]),
+    ok;
 end_per_testcase(_TestCase, _Config) ->
     ok.
 
@@ -131,6 +138,19 @@ t_auto_subscribe(_) ->
     ?assertEqual(check_subs(length(?TOPICS)), ok),
     emqtt:disconnect(Client),
     ok.
+t_auto_subscribe_reload_from_file(Config) ->
+    ConfBin = hocon_pp:do(
+        #{<<"auto_subscribe">> => #{<<"topics">> => [#{<<"topic">> => Topic} || Topic <- ?TOPICS]}},
+        #{}
+    ),
+    ConfFile = prepare_conf_file(?FUNCTION_NAME, ConfBin, Config),
+    ok = emqx_conf_cli:conf(["load", "--replace", ConfFile]),
+    {ok, Client} = emqtt:start_link(#{username => ?CLIENT_USERNAME, clientid => ?CLIENT_ID}),
+    {ok, _} = emqtt:connect(Client),
+    timer:sleep(200),
+    ?assertEqual(check_subs(length(?TOPICS)), ok),
+    emqtt:disconnect(Client),
+    ok.
 
 t_update(_) ->
     Path = emqx_mgmt_api_test_util:api_path(["mqtt", "auto_subscribe"]),

+ 1 - 1
apps/emqx_bridge/src/emqx_bridge.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 {application, emqx_bridge, [
     {description, "EMQX bridges"},
-    {vsn, "0.2.7"},
+    {vsn, "0.2.8"},
     {registered, [emqx_bridge_sup]},
     {mod, {emqx_bridge_app, []}},
     {applications, [

+ 4 - 0
apps/emqx_bridge/src/emqx_bridge_v2.erl

@@ -280,10 +280,14 @@ lookup(ConfRootName, Type, Name) ->
             ChannelStatus = maps:get(BridgeV2Id, Channels, undefined),
             {DisplayBridgeV2Status, ErrorMsg} =
                 case {ChannelStatus, ConnectorStatus} of
+                    {_, ?status_disconnected} ->
+                        {?status_disconnected, <<"Resource not operational">>};
                     {#{status := ?status_connected}, _} ->
                         {?status_connected, <<"">>};
                     {#{error := resource_not_operational}, ?status_connecting} ->
                         {?status_connecting, <<"Not installed">>};
+                    {#{error := not_added_yet}, _} ->
+                        {?status_connecting, <<"Not installed">>};
                     {#{status := Status, error := undefined}, _} ->
                         {Status, <<"Unknown reason">>};
                     {#{status := Status, error := Error}, _} ->

+ 1 - 1
apps/emqx_bridge_http/src/emqx_bridge_http.app.src

@@ -1,6 +1,6 @@
 {application, emqx_bridge_http, [
     {description, "EMQX HTTP Bridge and Connector Application"},
-    {vsn, "0.3.5"},
+    {vsn, "0.3.6"},
     {registered, []},
     {applications, [kernel, stdlib, emqx_resource, ehttpc]},
     {env, [

+ 2 - 1
apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl

@@ -236,7 +236,8 @@ on_start(
         port => Port,
         connect_timeout => ConnectTimeout,
         scheme => Scheme,
-        request => preprocess_request(maps:get(request, Config, undefined))
+        request => preprocess_request(maps:get(request, Config, undefined)),
+        installed_actions => #{}
     },
     case start_pool(InstId, PoolOpts) of
         ok ->

+ 4 - 1
apps/emqx_bridge_kafka/mix.exs

@@ -18,7 +18,10 @@ defmodule EMQXBridgeKafka.MixProject do
   end
 
   def application do
-    [extra_applications: UMP.extra_applications()]
+    [
+      extra_applications: UMP.extra_applications(),
+      mod: {:emqx_bridge_kafka_app, []}
+    ]
   end
 
   def deps() do

+ 3 - 2
apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src

@@ -1,8 +1,8 @@
 %% -*- mode: erlang -*-
 {application, emqx_bridge_kafka, [
     {description, "EMQX Enterprise Kafka Bridge"},
-    {vsn, "0.5.1"},
-    {registered, [emqx_bridge_kafka_consumer_sup]},
+    {vsn, "0.5.2"},
+    {registered, [emqx_bridge_kafka_sup, emqx_bridge_kafka_consumer_sup]},
     {applications, [
         kernel,
         stdlib,
@@ -12,6 +12,7 @@
         brod,
         brod_gssapi
     ]},
+    {mod, {emqx_bridge_kafka_app, []}},
     {env, [
         {emqx_action_info_modules, [
             emqx_bridge_kafka_producer_action_info,

+ 25 - 0
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_app.erl

@@ -0,0 +1,25 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_bridge_kafka_app).
+
+-behaviour(application).
+
+%% `application' API
+-export([start/2, stop/1]).
+
+%%------------------------------------------------------------------------------
+%% Type declarations
+%%------------------------------------------------------------------------------
+
+%%------------------------------------------------------------------------------
+%% `application' API
+%%------------------------------------------------------------------------------
+
+-spec start(application:start_type(), term()) -> {ok, pid()}.
+start(_Type, _Args) ->
+    emqx_bridge_kafka_sup:start_link().
+
+-spec stop(term()) -> ok.
+stop(_State) ->
+    ok.

+ 0 - 21
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_impl_consumer.erl

@@ -381,26 +381,6 @@ make_subscriber_id(BridgeName) ->
     BridgeNameBin = to_bin(BridgeName),
     <<"kafka_subscriber:", BridgeNameBin/binary>>.
 
-ensure_consumer_supervisor_started() ->
-    Mod = emqx_bridge_kafka_consumer_sup,
-    ChildSpec =
-        #{
-            id => Mod,
-            start => {Mod, start_link, []},
-            restart => permanent,
-            shutdown => infinity,
-            type => supervisor,
-            modules => [Mod]
-        },
-    case supervisor:start_child(emqx_bridge_sup, ChildSpec) of
-        {ok, _Pid} ->
-            ok;
-        {error, already_present} ->
-            ok;
-        {error, {already_started, _Pid}} ->
-            ok
-    end.
-
 -spec start_consumer(
     source_config(),
     connector_resource_id(),
@@ -424,7 +404,6 @@ start_consumer(Config, ConnectorResId, SourceResId, ClientID, ConnState) ->
             value_encoding_mode := ValueEncodingMode
         } = Params0
     } = Config,
-    ok = ensure_consumer_supervisor_started(),
     ?tp(kafka_consumer_sup_started, #{}),
     TopicMapping = ensure_topic_mapping(Params0),
     InitialState = #{

+ 46 - 0
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_sup.erl

@@ -0,0 +1,46 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_bridge_kafka_sup).
+
+-behaviour(supervisor).
+
+%% API
+-export([start_link/0]).
+
+%% `supervisor' API
+-export([init/1]).
+
+%%------------------------------------------------------------------------------
+%% API
+%%------------------------------------------------------------------------------
+
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%%------------------------------------------------------------------------------
+%% `supervisor' API
+%%------------------------------------------------------------------------------
+
+init([]) ->
+    SupFlags = #{
+        strategy => one_for_one,
+        intensity => 10,
+        period => 10
+    },
+    ConsumerSup = sup_spec(emqx_bridge_kafka_consumer_sup),
+    ChildSpecs = [ConsumerSup],
+    {ok, {SupFlags, ChildSpecs}}.
+
+%%------------------------------------------------------------------------------
+%% Internal fns
+%%------------------------------------------------------------------------------
+
+sup_spec(Mod) ->
+    #{
+        id => Mod,
+        start => {Mod, start_link, []},
+        restart => permanent,
+        shutdown => infinity,
+        type => supervisor
+    }.

+ 2 - 1
apps/emqx_bridge_kafka/test/emqx_bridge_v2_kafka_consumer_SUITE.erl

@@ -459,7 +459,8 @@ t_repeated_topics(Config) ->
                 emqx_bridge_v2_testlib:create_source_api([{source_name, Name2} | Config]),
             ?assertEqual(
                 match,
-                re:run(Error, <<"Topics .* already exist in other sources">>, [{capture, none}])
+                re:run(Error, <<"Topics .* already exist in other sources">>, [{capture, none}]),
+                #{error => Error}
             ),
             ok
         end,

+ 10 - 7
apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl

@@ -127,6 +127,8 @@ common_init_per_testcase(TestCase, Config0) ->
     ),
     ok = snabbkaffe:start_trace(),
     [
+        {bridge_type, ?BRIDGE_TYPE_BIN},
+        {bridge_name, Name},
         {oracle_name, Name},
         {oracle_config_string, ConfigString},
         {oracle_config, OracleConfig}
@@ -730,18 +732,20 @@ t_no_sid_nor_service_name(Config0) ->
     ok.
 
 t_missing_table(Config) ->
-    ResourceId = resource_id(Config),
+    Name = ?config(bridge_name, Config),
     ?check_trace(
         begin
             drop_table_if_exists(Config),
             ?assertMatch({ok, _}, create_bridge_api(Config)),
-            ActionId = emqx_bridge_v2:id(?BRIDGE_TYPE_BIN, ?config(oracle_name, Config)),
             ?retry(
                 _Sleep = 1_000,
                 _Attempts = 20,
                 ?assertMatch(
-                    {ok, Status} when Status =:= disconnected orelse Status =:= connecting,
-                    emqx_resource_manager:health_check(ResourceId)
+                    {ok, #{
+                        <<"status">> := <<"disconnected">>,
+                        <<"status_reason">> := <<"{unhealthy_target,", _/binary>>
+                    }},
+                    emqx_bridge_testlib:get_bridge_api(Config)
                 )
             ),
             ?block_until(#{?snk_kind := oracle_undefined_table}),
@@ -752,10 +756,9 @@ t_missing_table(Config) ->
                 payload => ?config(oracle_name, Config),
                 retain => true
             },
-            Message = {ActionId, Params},
             ?assertMatch(
-                {error, {resource_error, #{reason := not_connected}}},
-                emqx_resource:simple_sync_query(ResourceId, Message)
+                {error, {resource_error, #{reason := unhealthy_target}}},
+                emqx_bridge_v2:send_message(?BRIDGE_TYPE_BIN, Name, Params, _QueryOpts = #{})
             ),
             ok
         end,

+ 21 - 1
apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl

@@ -803,17 +803,37 @@ t_table_removed(Config) ->
     BridgeType = ?config(pgsql_bridge_type, Config),
     ?check_trace(
         begin
+            ct:pal("creating table"),
             connect_and_create_table(Config),
-            ?assertMatch({ok, _}, create_bridge(Config)),
+            ct:pal("creating bridge"),
+            ?assertMatch(
+                {ok, _},
+                create_bridge(Config, #{
+                    <<"resource_opts">> => #{
+                        <<"health_check_interval">> => <<"1s">>
+                    }
+                })
+            ),
+            ct:pal("checking bridge health"),
             ?retry(
                 _Sleep = 100,
                 _Attempts = 200,
                 ?assertMatch(#{status := connected}, emqx_bridge_v2:health_check(BridgeType, Name))
             ),
+            ct:pal("dropping table"),
             connect_and_drop_table(Config),
             Val = integer_to_binary(erlang:unique_integer()),
             SentData = #{payload => Val, timestamp => 1668602148000},
             ActionId = emqx_bridge_v2:id(BridgeType, Name),
+            ?retry(
+                _Sleep = 100,
+                _Attempts = 200,
+                ?assertMatch(
+                    #{error := {unhealthy_target, _}, status := disconnected},
+                    emqx_bridge_v2:health_check(BridgeType, Name)
+                )
+            ),
+            ct:pal("sending query"),
             case query_resource_sync(Config, {ActionId, SentData}) of
                 {error, {unrecoverable_error, _}} ->
                     ok;

+ 1 - 1
apps/emqx_bridge_pulsar/mix.exs

@@ -25,7 +25,7 @@ defmodule EMQXBridgePulsar.MixProject do
     [
       UMP.common_dep(:crc32cer),
       UMP.common_dep(:snappyer),
-      {:pulsar, github: "emqx/pulsar-client-erl", tag: "0.8.6"},
+      {:pulsar, github: "emqx/pulsar-client-erl", tag: "2.0.0"},
       {:emqx_connector, in_umbrella: true, runtime: false},
       {:emqx_resource, in_umbrella: true},
       {:emqx_bridge, in_umbrella: true, runtime: false}

+ 1 - 1
apps/emqx_bridge_pulsar/rebar.config

@@ -2,7 +2,7 @@
 
 {erl_opts, [debug_info]}.
 {deps, [
-    {pulsar, {git, "https://github.com/emqx/pulsar-client-erl.git", {tag, "0.8.6"}}},
+    {pulsar, {git, "https://github.com/emqx/pulsar-client-erl.git", {tag, "2.0.0"}}},
     {emqx_connector, {path, "../../apps/emqx_connector"}},
     {emqx_resource, {path, "../../apps/emqx_resource"}},
     {emqx_bridge, {path, "../../apps/emqx_bridge"}}

+ 1 - 1
apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar.app.src

@@ -1,6 +1,6 @@
 {application, emqx_bridge_pulsar, [
     {description, "EMQX Pulsar Bridge"},
-    {vsn, "0.2.6"},
+    {vsn, "0.2.7"},
     {registered, []},
     {applications, [
         kernel,

+ 216 - 82
apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_connector.erl

@@ -26,6 +26,9 @@
     on_format_query_result/1
 ]).
 
+-export([on_pulsar_ack/2]).
+-export([handle_telemetry_event/4]).
+
 -type pulsar_client_id() :: atom().
 -type state() :: #{
     client_id := pulsar_client_id(),
@@ -51,6 +54,7 @@
 %% Allocatable resources
 -define(pulsar_client_id, pulsar_client_id).
 -define(pulsar_producers, pulsar_producers).
+-define(telemetry_handler_id, telemetry_handler_id).
 
 -define(HEALTH_CHECK_RETRY_TIMEOUT, 4_000).
 
@@ -71,12 +75,12 @@ query_opts(#{resource_opts := #{query_mode := sync}, parameters := #{sync_timeou
 query_opts(_) ->
     #{}.
 
--spec on_start(resource_id(), config()) -> {ok, state()}.
-on_start(InstanceId, Config) ->
+-spec on_start(connector_resource_id(), config()) -> {ok, state()}.
+on_start(ConnResId, Config) ->
     #{servers := Servers0, ssl := SSL} = Config,
     Servers = format_servers(Servers0),
-    ClientId = make_client_id(InstanceId),
-    ok = emqx_resource:allocate_resource(InstanceId, ?pulsar_client_id, ClientId),
+    ClientId = make_client_id(ConnResId),
+    ok = emqx_resource:allocate_resource(ConnResId, ?pulsar_client_id, ClientId),
     SSLOpts = emqx_tls_lib:to_client_opts(SSL),
     ConnectTimeout = maps:get(connect_timeout, Config, timer:seconds(10)),
     ClientOpts = #{
@@ -85,12 +89,12 @@ on_start(InstanceId, Config) ->
         conn_opts => conn_opts(Config)
     },
     case pulsar:ensure_supervised_client(ClientId, Servers, ClientOpts) of
-        {ok, _Pid} ->
+        {ok, _} ->
             ?tp(
                 info,
                 "pulsar_client_started",
                 #{
-                    instance_id => InstanceId,
+                    instance_id => ConnResId,
                     pulsar_hosts => Servers
                 }
             );
@@ -98,7 +102,7 @@ on_start(InstanceId, Config) ->
             RedactedReason = emqx_utils:redact(Reason, fun is_sensitive_key/1),
             ?SLOG(error, #{
                 msg => "failed_to_start_pulsar_client",
-                instance_id => InstanceId,
+                instance_id => ConnResId,
                 pulsar_hosts => Servers,
                 reason => RedactedReason
             }),
@@ -112,84 +116,95 @@ on_start(InstanceId, Config) ->
     {ok, #{channels => #{}, client_id => ClientId, client_opts => ClientOpts}}.
 
 on_add_channel(
-    InstanceId,
+    ConnResId,
     #{channels := Channels, client_id := ClientId, client_opts := ClientOpts} = State,
-    ChannelId,
+    ActionResId,
     #{parameters := #{message := Message, sync_timeout := SyncTimeout} = Params}
 ) ->
-    case maps:is_key(ChannelId, Channels) of
+    case maps:is_key(ActionResId, Channels) of
         true ->
             {error, channel_already_exists};
         false ->
-            {ok, Producers} = start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params),
+            {ok, Producers} = start_producer(ConnResId, ActionResId, ClientId, ClientOpts, Params),
             Parameters = #{
                 message => compile_message_template(Message),
                 sync_timeout => SyncTimeout,
                 producers => Producers
             },
-            NewChannels = maps:put(ChannelId, Parameters, Channels),
+            NewChannels = maps:put(ActionResId, Parameters, Channels),
             {ok, State#{channels => NewChannels}}
     end.
 
-on_remove_channel(InstanceId, State, ChannelId) ->
-    #{channels := Channels, client_id := ClientId} = State,
-    case maps:find(ChannelId, Channels) of
+on_remove_channel(ConnResId, State, ActionResId) ->
+    #{channels := Channels} = State,
+    case maps:find(ActionResId, Channels) of
         {ok, #{producers := Producers}} ->
-            stop_producers(ClientId, Producers),
-            emqx_resource:deallocate_resource(InstanceId, {?pulsar_producers, ChannelId}),
-            {ok, State#{channels => maps:remove(ChannelId, Channels)}};
+            stop_producers(ActionResId, Producers),
+            emqx_resource:deallocate_resource(ConnResId, {?pulsar_producers, ActionResId}),
+            deallocate_telemetry_handlers(ConnResId, ActionResId),
+            {ok, State#{channels => maps:remove(ActionResId, Channels)}};
         error ->
             {ok, State}
     end.
 
-on_get_channels(InstanceId) ->
-    emqx_bridge_v2:get_channels_for_connector(InstanceId).
+on_get_channels(ConnResId) ->
+    emqx_bridge_v2:get_channels_for_connector(ConnResId).
 
 -spec on_stop(resource_id(), state()) -> ok.
-on_stop(InstanceId, _State) ->
-    Resources0 = emqx_resource:get_allocated_resources(InstanceId),
-    case maps:take(?pulsar_client_id, Resources0) of
-        {ClientId, Resources} ->
-            maps:foreach(
-                fun({?pulsar_producers, _BridgeV2Id}, Producers) ->
-                    stop_producers(ClientId, Producers)
-                end,
-                Resources
-            ),
-            stop_client(ClientId),
-            ?tp(pulsar_bridge_stopped, #{instance_id => InstanceId}),
-            ok;
-        error ->
-            ok
-    end.
+on_stop(ConnResId, _State) ->
+    Resources = emqx_resource:get_allocated_resources(ConnResId),
+    maps:foreach(
+        fun
+            ({?pulsar_producers, ActionResId}, Producers) ->
+                stop_producers(ActionResId, Producers);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    maps:foreach(
+        fun
+            ({?telemetry_handler_id, _ActionResId}, TelemetryId) ->
+                deallocate_telemetry_handlers(ConnResId, TelemetryId);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    maps:foreach(
+        fun
+            (?pulsar_client_id, ClientId) ->
+                stop_client(ClientId);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    ?tp(pulsar_bridge_stopped, #{instance_id => ConnResId}),
+    ok.
 
 %% Note: since Pulsar client has its own replayq that is not managed by
 %% `emqx_resource_buffer_worker', we must avoid returning `disconnected' here.  Otherwise,
 %% `emqx_resource_manager' will kill the Pulsar producers and messages might be lost.
 -spec on_get_status(resource_id(), state()) -> connected | connecting.
-on_get_status(_InstanceId, State = #{}) ->
+on_get_status(_ConnResId, State = #{}) ->
     #{client_id := ClientId} = State,
-    case pulsar_client_sup:find_client(ClientId) of
-        {ok, Pid} ->
-            try pulsar_client:get_status(Pid) of
-                true -> ?status_connected;
-                false -> ?status_connecting
-            catch
-                exit:{timeout, _} ->
-                    ?status_connecting;
-                exit:{noproc, _} ->
-                    ?status_connecting
-            end;
-        {error, _} ->
+    try pulsar_client_manager:get_status(ClientId, 5_000) of
+        true -> ?status_connected;
+        false -> ?status_connecting
+    catch
+        exit:{timeout, _} ->
+            ?status_connecting;
+        exit:{noproc, _} ->
             ?status_connecting
     end;
-on_get_status(_InstanceId, _State) ->
+on_get_status(_ConnResId, _State) ->
     %% If a health check happens just after a concurrent request to
     %% create the bridge is not quite finished, `State = undefined'.
     ?status_connecting.
 
-on_get_channel_status(_InstanceId, ChannelId, #{channels := Channels}) ->
-    case maps:find(ChannelId, Channels) of
+on_get_channel_status(_ConnResId, ActionResId, #{channels := Channels}) ->
+    case maps:find(ActionResId, Channels) of
         {ok, #{producers := Producers}} ->
             get_producer_status(Producers);
         error ->
@@ -200,21 +215,21 @@ on_get_channel_status(_InstanceId, ChannelId, #{channels := Channels}) ->
     {ok, term()}
     | {error, timeout}
     | {error, term()}.
-on_query(_InstanceId, {ChannelId, Message}, State) ->
+on_query(_ConnResId, {ActionResId, Message}, State) ->
     #{channels := Channels} = State,
-    case maps:find(ChannelId, Channels) of
+    case maps:find(ActionResId, Channels) of
         error ->
             {error, channel_not_found};
         {ok, #{message := MessageTmpl, sync_timeout := SyncTimeout, producers := Producers}} ->
             PulsarMessage = render_message(Message, MessageTmpl),
-            emqx_trace:rendered_action_template(ChannelId, #{
+            emqx_trace:rendered_action_template(ActionResId, #{
                 message => PulsarMessage,
                 sync_timeout => SyncTimeout,
                 is_async => false
             }),
             ?tp_span(
                 "pulsar_producer_query_enter",
-                #{instance_id => _InstanceId, message => Message, mode => sync},
+                #{instance_id => _ConnResId, message => Message, mode => sync},
                 try
                     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => sync}),
                     pulsar:send_sync(Producers, [PulsarMessage], SyncTimeout)
@@ -229,33 +244,97 @@ on_query(_InstanceId, {ChannelId, Message}, State) ->
     resource_id(), tuple(), {ReplyFun :: function(), Args :: list()}, state()
 ) ->
     {ok, pid()}.
-on_query_async(_InstanceId, {ChannelId, Message}, AsyncReplyFn, State) ->
+on_query_async(_ConnResId, {ActionResId, Message}, AsyncReplyFn, State) ->
     #{channels := Channels} = State,
-    case maps:find(ChannelId, Channels) of
+    case maps:find(ActionResId, Channels) of
         error ->
             {error, {unrecoverable_error, channel_not_found}};
         {ok, #{message := MessageTmpl, producers := Producers}} ->
             ?tp_span(
                 "pulsar_producer_query_enter",
-                #{instance_id => _InstanceId, message => Message, mode => async},
-                on_query_async2(ChannelId, Producers, Message, MessageTmpl, AsyncReplyFn)
+                #{instance_id => _ConnResId, message => Message, mode => async},
+                on_query_async2(ActionResId, Producers, Message, MessageTmpl, AsyncReplyFn)
             )
     end.
 
-on_query_async2(ChannelId, Producers, Message, MessageTmpl, AsyncReplyFn) ->
+on_query_async2(ActionResId, Producers, Message, MessageTmpl, AsyncReplyFn) ->
     PulsarMessage = render_message(Message, MessageTmpl),
-    emqx_trace:rendered_action_template(ChannelId, #{
+    emqx_trace:rendered_action_template(ActionResId, #{
         message => PulsarMessage,
         is_async => true
     }),
+    CallbackFn = {fun ?MODULE:on_pulsar_ack/2, [AsyncReplyFn]},
     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => async}),
-    pulsar:send(Producers, [PulsarMessage], #{callback_fn => AsyncReplyFn}).
+    pulsar:send(Producers, [PulsarMessage], #{callback_fn => CallbackFn}).
 
 on_format_query_result({ok, Info}) ->
     #{result => ok, info => Info};
 on_format_query_result(Result) ->
     Result.
 
+on_pulsar_ack(_ReplyFnAndArgs, {error, Reason}) when
+    Reason =:= expired;
+    Reason =:= overflow
+->
+    %% We already bumped the dropped counter in `handle_telemetry_event/4', so no need to
+    %% call the wrapping callback here (it would bump the failure counter).
+    ok;
+on_pulsar_ack(ReplyFnAndArgs, Result) ->
+    emqx_resource:apply_reply_fun(ReplyFnAndArgs, Result).
+
+%%-------------------------------------------------------------------------------------
+%% `telemetry' API
+%%-------------------------------------------------------------------------------------
+
+%% we *must* match the bridge id in the event metadata with that in
+%% the handler config; otherwise, multiple pulsar producer bridges will
+%% install multiple handlers to the same pulsar events, multiplying the metric counts
+handle_telemetry_event(
+    [pulsar, dropped],
+    #{counter_inc := Val, reason := queue_full},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:dropped_queue_full_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, dropped],
+    #{counter_inc := Val, reason := expired},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:dropped_expired_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, queuing],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:queuing_set(ID, PartitionTopic, Val);
+handle_telemetry_event(
+    [pulsar, queuing_bytes],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:queuing_bytes_set(ID, PartitionTopic, Val);
+handle_telemetry_event(
+    [pulsar, retried],
+    #{counter_inc := Val},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:retried_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, inflight],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:inflight_set(ID, PartitionTopic, Val);
+handle_telemetry_event(_EventId, _Metrics, _Metadata, _HandlerConfig) ->
+    %% Event that we do not handle
+    ok.
+
 %%-------------------------------------------------------------------------------------
 %% Internal fns
 %%-------------------------------------------------------------------------------------
@@ -270,12 +349,12 @@ format_servers(Servers0) ->
     ).
 
 -spec make_client_id(resource_id()) -> pulsar_client_id().
-make_client_id(InstanceId) ->
-    case emqx_resource:is_dry_run(InstanceId) of
+make_client_id(ConnResId) ->
+    case emqx_resource:is_dry_run(ConnResId) of
         true ->
             pulsar_producer_probe;
         false ->
-            {pulsar, Name} = emqx_connector_resource:parse_connector_id(InstanceId),
+            {pulsar, Name} = emqx_connector_resource:parse_connector_id(ConnResId),
             ClientIdBin = iolist_to_binary([
                 <<"pulsar:">>,
                 emqx_utils_conv:bin(Name),
@@ -304,22 +383,22 @@ conn_opts(#{authentication := #{jwt := JWT}}) ->
 replayq_dir(ClientId) ->
     filename:join([emqx:data_dir(), "pulsar", emqx_utils_conv:bin(ClientId)]).
 
-producer_name(InstanceId, ChannelId) ->
-    case emqx_resource:is_dry_run(InstanceId) of
+producer_name(ConnResId, ActionResId) ->
+    case emqx_resource:is_dry_run(ConnResId) of
         %% do not create more atom
         true ->
             pulsar_producer_probe_worker;
         false ->
-            ChannelIdBin = emqx_utils_conv:bin(ChannelId),
+            ActionResIdBin = emqx_utils_conv:bin(ActionResId),
             binary_to_atom(
                 iolist_to_binary([
                     <<"producer-">>,
-                    ChannelIdBin
+                    ActionResIdBin
                 ])
             )
     end.
 
-start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
+start_producer(ConnResId, ActionResId, ClientId, ClientOpts, Params) ->
     #{
         conn_opts := ConnOpts,
         ssl_opts := SSLOpts
@@ -342,8 +421,8 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
     {OffloadMode, ReplayQDir} =
         case BufferMode of
             memory -> {false, false};
-            disk -> {false, replayq_dir(ChannelId)};
-            hybrid -> {true, replayq_dir(ChannelId)}
+            disk -> {false, replayq_dir(ActionResId)};
+            hybrid -> {true, replayq_dir(ActionResId)}
         end,
     MemOLP =
         case os:type() of
@@ -357,7 +436,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
         replayq_seg_bytes => SegmentBytes,
         drop_if_highmem => MemOLP
     },
-    ProducerName = producer_name(InstanceId, ChannelId),
+    ProducerName = producer_name(ConnResId, ActionResId),
     ?tp(pulsar_producer_capture_name, #{producer_name => ProducerName}),
     ProducerOpts0 =
         #{
@@ -369,15 +448,22 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
             retention_period => RetentionPeriod,
             ssl_opts => SSLOpts,
             strategy => partition_strategy(Strategy),
-            tcp_opts => [{sndbuf, SendBuffer}]
+            tcp_opts => [{sndbuf, SendBuffer}],
+            telemetry_metadata => #{action_id => ActionResId}
         },
     ProducerOpts = maps:merge(ReplayQOpts, ProducerOpts0),
     ?tp(pulsar_producer_about_to_start_producers, #{producer_name => ProducerName}),
+    ok = emqx_resource:allocate_resource(
+        ConnResId,
+        {?telemetry_handler_id, ActionResId},
+        ActionResId
+    ),
+    _ = maybe_install_telemetry_handlers(ActionResId),
     try pulsar:ensure_supervised_producers(ClientId, PulsarTopic, ProducerOpts) of
         {ok, Producers} ->
             ok = emqx_resource:allocate_resource(
-                InstanceId,
-                {?pulsar_producers, ChannelId},
+                ConnResId,
+                {?pulsar_producers, ActionResId},
                 Producers
             ),
             ?tp(pulsar_producer_producers_allocated, #{}),
@@ -389,7 +475,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
                 error,
                 "failed_to_start_pulsar_producer",
                 #{
-                    instance_id => InstanceId,
+                    instance_id => ConnResId,
                     kind => Kind,
                     reason => emqx_utils:redact(Error, fun is_sensitive_key/1),
                     stacktrace => Stacktrace
@@ -399,6 +485,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
                 pulsar_client_id => ClientId,
                 producers => undefined
             }),
+            _ = uninstall_telemetry_handlers(ActionResId),
             throw(failed_to_start_pulsar_producer)
     end.
 
@@ -417,20 +504,20 @@ stop_client(ClientId) ->
     ),
     ok.
 
--spec stop_producers(pulsar_client_id(), pulsar_producers:producers()) -> ok.
-stop_producers(ClientId, Producers) ->
+-spec stop_producers(action_resource_id(), pulsar_producers:producers()) -> ok.
+stop_producers(ActionResId, Producers) ->
     _ = log_when_error(
         fun() ->
             ok = pulsar:stop_and_delete_supervised_producers(Producers),
             ?tp(pulsar_bridge_producer_stopped, #{
-                pulsar_client_id => ClientId,
+                action_id => ActionResId,
                 producers => Producers
             }),
             ok
         end,
         #{
             msg => "failed_to_delete_pulsar_producer",
-            pulsar_client_id => ClientId
+            action_id => ActionResId
         }
     ),
     ok.
@@ -516,3 +603,50 @@ do_get_error_message(Iterator) ->
         none ->
             error
     end.
+
+maybe_install_telemetry_handlers(ActionResId) ->
+    %% Attach event handlers for telemetry events. If a handler with the
+    %% handler id already exists, the attach_many function does nothing
+    telemetry:attach_many(
+        %% unique handler id
+        ActionResId,
+        [
+            [pulsar, dropped],
+            [pulsar, queuing],
+            [pulsar, queuing_bytes],
+            [pulsar, retried],
+            [pulsar, inflight]
+        ],
+        fun ?MODULE:handle_telemetry_event/4,
+        %% we *must* keep track of the same id that is handed down to
+        %% the pulsar producers; otherwise, multiple pulsar producer
+        %% bridges will install multiple handlers to the same pulsar
+        %% telemetry events, multiplying the metric counts...
+        #{action_id => ActionResId}
+    ).
+
+with_log_at_error(Fun, Log) ->
+    try
+        Fun()
+    catch
+        C:E ->
+            ?SLOG(error, Log#{
+                exception => C,
+                reason => E
+            })
+    end.
+
+uninstall_telemetry_handlers(TelemetryId) ->
+    telemetry:detach(TelemetryId).
+
+deallocate_telemetry_handlers(ConnResId, ActionResId) ->
+    _ = with_log_at_error(
+        fun() ->
+            _ = uninstall_telemetry_handlers(ActionResId),
+            emqx_resource:deallocate_resource(ConnResId, {?telemetry_handler_id, ActionResId})
+        end,
+        #{
+            msg => "failed_to_uninstall_telemetry_handlers",
+            action_id => ActionResId
+        }
+    ).

+ 1 - 1
apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl

@@ -400,7 +400,7 @@ start_consumer(TestCase, Config) ->
         cacertfile => filename:join([CertsPath, "cacert.pem"])
     },
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
-    {ok, _ClientPid} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
+    {ok, _} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
     ConsumerOpts = Opts#{
         cb_init_args => #{send_to => self()},
         cb_module => pulsar_echo_consumer,

+ 328 - 42
apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_v2_SUITE.erl

@@ -52,21 +52,18 @@ init_per_group(plain = Type, Config) ->
     PulsarHost = os:getenv("PULSAR_PLAIN_HOST", "toxiproxy"),
     PulsarPort = list_to_integer(os:getenv("PULSAR_PLAIN_PORT", "6652")),
     ProxyName = "pulsar_plain",
+    reset_proxy(),
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
         true ->
             Config1 = common_init_per_group(),
-            ConnectorName = ?MODULE,
-            NewConfig =
-                [
-                    {proxy_name, ProxyName},
-                    {pulsar_host, PulsarHost},
-                    {pulsar_port, PulsarPort},
-                    {pulsar_type, Type},
-                    {use_tls, false}
-                    | Config1 ++ Config
-                ],
-            create_connector(ConnectorName, NewConfig),
-            NewConfig;
+            [
+                {proxy_name, ProxyName},
+                {pulsar_host, PulsarHost},
+                {pulsar_port, PulsarPort},
+                {pulsar_type, Type},
+                {use_tls, false}
+                | Config1 ++ Config
+            ];
         false ->
             maybe_skip_without_ci()
     end;
@@ -74,21 +71,18 @@ init_per_group(tls = Type, Config) ->
     PulsarHost = os:getenv("PULSAR_TLS_HOST", "toxiproxy"),
     PulsarPort = list_to_integer(os:getenv("PULSAR_TLS_PORT", "6653")),
     ProxyName = "pulsar_tls",
+    reset_proxy(),
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
         true ->
             Config1 = common_init_per_group(),
-            ConnectorName = ?MODULE,
-            NewConfig =
-                [
-                    {proxy_name, ProxyName},
-                    {pulsar_host, PulsarHost},
-                    {pulsar_port, PulsarPort},
-                    {pulsar_type, Type},
-                    {use_tls, true}
-                    | Config1 ++ Config
-                ],
-            create_connector(ConnectorName, NewConfig),
-            NewConfig;
+            [
+                {proxy_name, ProxyName},
+                {pulsar_host, PulsarHost},
+                {pulsar_port, PulsarPort},
+                {pulsar_type, Type},
+                {use_tls, true}
+                | Config1 ++ Config
+            ];
         false ->
             maybe_skip_without_ci()
     end;
@@ -105,9 +99,9 @@ end_per_group(_Group, _Config) ->
     ok.
 
 common_init_per_group() ->
+    reset_proxy(),
     ProxyHost = os:getenv("PROXY_HOST", "toxiproxy"),
     ProxyPort = list_to_integer(os:getenv("PROXY_PORT", "8474")),
-    emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
     UniqueNum = integer_to_binary(erlang:unique_integer()),
     MQTTTopic = <<"mqtt/topic/", UniqueNum/binary>>,
     [
@@ -116,6 +110,12 @@ common_init_per_group() ->
         {mqtt_topic, MQTTTopic}
     ].
 
+reset_proxy() ->
+    ProxyHost = os:getenv("PROXY_HOST", "toxiproxy"),
+    ProxyPort = list_to_integer(os:getenv("PROXY_PORT", "8474")),
+    emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
+    ok.
+
 common_end_per_group(Config) ->
     ProxyHost = ?config(proxy_host, Config),
     ProxyPort = ?config(proxy_port, Config),
@@ -131,7 +131,7 @@ end_per_testcase(_Testcase, Config) ->
     ProxyHost = ?config(proxy_host, Config),
     ProxyPort = ?config(proxy_port, Config),
     emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
-    emqx_bridge_v2_testlib:delete_all_bridges(),
+    emqx_bridge_v2_testlib:delete_all_bridges_and_connectors(),
     stop_consumer(Config),
     %% in CI, apparently this needs more time since the
     %% machines struggle with all the containers running...
@@ -159,21 +159,25 @@ common_init_per_testcase(TestCase, Config0) ->
 %% Helper fns
 %%------------------------------------------------------------------------------
 
-create_connector(Name, Config) ->
-    Connector = pulsar_connector(Config),
-    {ok, _} = emqx_connector:create(?TYPE, Name, Connector).
+create_connector(Config) ->
+    {201, _} = create_connector_api([
+        {connector_type, ?TYPE},
+        {connector_name, ?MODULE},
+        {connector_config, connector_config(Config)}
+    ]),
+    ok.
 
 delete_connector(Name) ->
     ok = emqx_connector:remove(?TYPE, Name).
 
 create_action(Name, Config) ->
-    Action = pulsar_action(Config),
+    Action = action_config(Config),
     {ok, _} = emqx_bridge_v2:create(actions, ?TYPE, Name, Action).
 
 delete_action(Name) ->
     ok = emqx_bridge_v2:remove(actions, ?TYPE, Name).
 
-pulsar_connector(Config) ->
+connector_config(Config) ->
     PulsarHost = ?config(pulsar_host, Config),
     PulsarPort = ?config(pulsar_port, Config),
     UseTLS = proplists:get_value(use_tls, Config, false),
@@ -201,11 +205,13 @@ pulsar_connector(Config) ->
     },
     emqx_bridge_v2_testlib:parse_and_check_connector(?TYPE, Name, InnerConfigMap).
 
-pulsar_action(Config) ->
+action_config(Config) ->
+    action_config(atom_to_binary(?MODULE), Config).
+
+action_config(ConnectorName, Config) ->
     QueryMode = proplists:get_value(query_mode, Config, <<"sync">>),
-    Name = atom_to_binary(?MODULE),
     InnerConfigMap = #{
-        <<"connector">> => Name,
+        <<"connector">> => ConnectorName,
         <<"enable">> => true,
         <<"parameters">> => #{
             <<"retention_period">> => <<"infinity">>,
@@ -231,7 +237,7 @@ pulsar_action(Config) ->
             <<"metrics_flush_interval">> => <<"300ms">>
         }
     },
-    emqx_bridge_v2_testlib:parse_and_check(action, ?TYPE, Name, InnerConfigMap).
+    emqx_bridge_v2_testlib:parse_and_check(action, ?TYPE, <<"some_action">>, InnerConfigMap).
 
 instance_id(Type, Name) ->
     ConnectorId = emqx_bridge_resource:resource_id(Type, ?TYPE, Name),
@@ -269,7 +275,7 @@ start_consumer(TestCase, Config) ->
         cacertfile => filename:join([CertsPath, "cacert.pem"])
     },
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
-    {ok, _ClientPid} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
+    {ok, _} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
     ConsumerOpts = Opts#{
         cb_init_args => #{send_to => self()},
         cb_module => pulsar_echo_consumer,
@@ -398,6 +404,39 @@ group_path(Config) ->
             Path
     end.
 
+create_connector_api(Config) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:create_connector_api(Config)
+    ).
+
+create_action_api(Config) ->
+    create_action_api(Config, _Overrides = #{}).
+
+create_action_api(Config, Overrides) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:create_kind_api(Config, Overrides)
+    ).
+
+update_action_api(Config, Overrides) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:update_bridge_api(Config, Overrides)
+    ).
+
+get_combined_metrics(ActionResId, RuleId) ->
+    Metrics = emqx_resource:get_metrics(ActionResId),
+    RuleMetrics = emqx_metrics_worker:get_counters(rule_metrics, RuleId),
+    Metrics#{rule => RuleMetrics}.
+
+reset_combined_metrics(ActionResId, RuleId) ->
+    #{
+        kind := action,
+        type := Type,
+        name := Name
+    } = emqx_bridge_v2:parse_id(ActionResId),
+    ok = emqx_bridge_v2:reset_metrics(actions, Type, Name),
+    ok = emqx_rule_engine:reset_metrics_for_rule(RuleId),
+    ok.
+
 %%------------------------------------------------------------------------------
 %% Testcases
 %%------------------------------------------------------------------------------
@@ -406,7 +445,8 @@ t_action_probe(matrix) ->
     [[plain], [tls]];
 t_action_probe(Config) when is_list(Config) ->
     Name = atom_to_binary(?FUNCTION_NAME),
-    Action = pulsar_action(Config),
+    create_connector(Config),
+    Action = action_config(Config),
     {ok, Res0} = emqx_bridge_v2_testlib:probe_bridge_api(action, ?TYPE, Name, Action),
     ?assertMatch({{_, 204, _}, _, _}, Res0),
     ok.
@@ -424,6 +464,7 @@ t_action(Config) when is_list(Config) ->
             _ -> <<"async">>
         end,
     Name = atom_to_binary(?FUNCTION_NAME),
+    create_connector(Config),
     create_action(Name, [{query_mode, QueryMode} | Config]),
     Actions = emqx_bridge_v2:list(actions),
     Any = fun(#{name := BName}) -> BName =:= Name end,
@@ -477,8 +518,8 @@ t_multiple_actions_sharing_topic(matrix) ->
 t_multiple_actions_sharing_topic(Config) when is_list(Config) ->
     Type = ?TYPE,
     ConnectorName = <<"c">>,
-    ConnectorConfig = pulsar_connector(Config),
-    ActionConfig = pulsar_action(Config),
+    ConnectorConfig = connector_config(Config),
+    ActionConfig = action_config(ConnectorName, Config),
     ?check_trace(
         begin
             ConnectorParams = [
@@ -572,14 +613,259 @@ t_sync_query_down(Config0) when is_list(Config0) ->
         success_tp_filter =>
             ?match_event(#{?snk_kind := pulsar_echo_consumer_message})
     },
+    ConnectorName = atom_to_binary(?FUNCTION_NAME),
     Config = [
         {connector_type, ?TYPE},
-        {connector_name, ?FUNCTION_NAME},
-        {connector_config, pulsar_connector(Config0)},
+        {connector_name, ConnectorName},
+        {connector_config, connector_config(Config0)},
         {action_type, ?TYPE},
         {action_name, ?FUNCTION_NAME},
-        {action_config, pulsar_action(Config0)}
+        {action_config, action_config(ConnectorName, Config0)}
         | proplists_with([proxy_name, proxy_host, proxy_port], Config0)
     ],
     emqx_bridge_v2_testlib:t_sync_query_down(Config, Opts),
     ok.
+
+%% Checks that we correctly handle telemetry events emitted by pulsar.
+t_telemetry_metrics(matrix) ->
+    [[plain]];
+t_telemetry_metrics(Config) when is_list(Config) ->
+    ProxyName = ?config(proxy_name, Config),
+    ProxyHost = ?config(proxy_host, Config),
+    ProxyPort = ?config(proxy_port, Config),
+    Type = ?TYPE,
+    ConnectorName = <<"c">>,
+    ConnectorConfig = connector_config(Config),
+    ActionConfig = action_config(ConnectorName, Config),
+    ConnectorParams = [
+        {connector_config, ConnectorConfig},
+        {connector_name, ConnectorName},
+        {connector_type, Type}
+    ],
+    ActionName1 = <<"a1">>,
+    ActionParams1 = [
+        {action_config, ActionConfig},
+        {action_name, ActionName1},
+        {action_type, Type}
+    ],
+    ActionName2 = <<"a2">>,
+    ActionParams2 = [
+        {action_config, ActionConfig},
+        {action_name, ActionName2},
+        {action_type, Type}
+    ],
+    ?check_trace(
+        begin
+            {201, _} =
+                create_connector_api(ConnectorParams),
+            {201, _} =
+                create_action_api(
+                    ActionParams1,
+                    %% Initially, this will overflow on small messages
+                    #{
+                        <<"parameters">> => #{
+                            <<"buffer">> => #{
+                                <<"mode">> => <<"disk">>,
+                                <<"per_partition_limit">> => <<"2B">>,
+                                <<"segment_bytes">> => <<"1B">>
+                            }
+                        }
+                    }
+                ),
+            {201, _} =
+                create_action_api(ActionParams2),
+            RuleTopic = <<"t/a2">>,
+            {ok, #{<<"id">> := RuleId}} =
+                emqx_bridge_v2_testlib:create_rule_and_action_http(Type, RuleTopic, [
+                    {bridge_name, ActionName1}
+                ]),
+            {ok, C} = emqtt:start_link([]),
+            {ok, _} = emqtt:connect(C),
+            SendMessage = fun() ->
+                ReqPayload = payload(),
+                ReqPayloadBin = emqx_utils_json:encode(ReqPayload),
+                {ok, _} = emqtt:publish(C, RuleTopic, #{}, ReqPayloadBin, [
+                    {qos, 1}, {retain, false}
+                ]),
+                ok
+            end,
+            SendMessage(),
+            ActionResId1 = emqx_bridge_v2_testlib:bridge_id(ActionParams1),
+            ActionResId2 = emqx_bridge_v2_testlib:bridge_id(ActionParams2),
+            ?retry(
+                100,
+                10,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 1,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            %% todo: bump action failure count when dropped to mimic common
+                            %% buffer worker behavior.
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 0
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+            reset_combined_metrics(ActionResId1, RuleId),
+            %% Now to make it drop expired messages
+            {200, _} =
+                update_action_api(ActionParams1, #{
+                    <<"parameters">> => #{
+                        <<"retention_period">> => <<"10ms">>
+                    }
+                }),
+            emqx_common_test_helpers:with_failure(down, ProxyName, ProxyHost, ProxyPort, fun() ->
+                SendMessage(),
+                ?retry(
+                    100,
+                    10,
+                    ?assertMatch(
+                        #{
+                            counters := #{
+                                'dropped.queue_full' := 0,
+                                'dropped.expired' := 0,
+                                success := 0,
+                                matched := 1,
+                                failed := 0,
+                                received := 0
+                            },
+                            gauges := #{
+                                inflight := 0,
+                                queuing := 1,
+                                queuing_bytes := QueuingBytes1
+                            }
+                        } when QueuingBytes1 > 0,
+                        get_combined_metrics(ActionResId1, RuleId)
+                    )
+                ),
+                %% Other action is not affected by telemetry events for first action.
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 0,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        }
+                    },
+                    emqx_resource:get_metrics(ActionResId2)
+                ),
+                ct:sleep(20),
+                ok
+            end),
+            %% After connection is restored, the request is already expired
+            ?retry(
+                500,
+                20,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 1,
+                            success := 0,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            %% todo: bump action failure count when dropped to mimic common
+                            %% buffer worker behavior.
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 0
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+            reset_combined_metrics(ActionResId1, RuleId),
+
+            %% Now, a success.
+            SendMessage(),
+            ?retry(
+                500,
+                20,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 1,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 1
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+
+            %% Other action is not affected by telemetry events for first action.
+            ?retry(
+                100,
+                10,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 0,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        }
+                    },
+                    emqx_resource:get_metrics(ActionResId2)
+                )
+            ),
+
+            ok
+        end,
+        []
+    ),
+    ok.

+ 24 - 0
apps/emqx_conf/etc/base.hocon

@@ -0,0 +1,24 @@
+## Define configurations that can later be overridden through UI/API/CLI.
+##
+## Config precedence order:
+##   etc/base.hocon < cluster.hocon < emqx.conf < environment variables
+
+## Logging configs
+## EMQX provides support for two primary log handlers: `file` and `console`,
+## with an additional `audit` handler specifically designed to always direct logs to files.
+## The system's default log handling behavior can be configured via the environment
+## variable `EMQX_DEFAULT_LOG_HANDLER`, which accepts the following settings:
+##  - `file`: Directs log output exclusively to files.
+##  - `console`: Channels log output solely to the console.
+## It's noteworthy that `EMQX_DEFAULT_LOG_HANDLER` is set to `file`
+## when EMQX is initiated via systemd `emqx.service` file.
+## In scenarios outside systemd initiation, `console` serves as the default log handler.
+## Read more about configs here: {{ emqx_configuration_doc_log }}
+log {
+    file {
+        level = warning
+    }
+    console {
+        level = warning
+    }
+}

+ 5 - 27
apps/emqx_conf/etc/emqx_conf.conf

@@ -1,12 +1,10 @@
-## NOTE:
-## This config file overrides data/configs/cluster.hocon,
-## and is merged with environment variables which start with 'EMQX_' prefix.
+## Place read-only configurations in this file.
+## To define configurations that can later be overridden through UI/API/CLI, add them to `etc/base.hocon`.
 ##
-## Config changes made from EMQX dashboard UI, management HTTP API, or CLI
-## are stored in data/configs/cluster.hocon.
-## To avoid confusion, please do not store the same configs in both files.
+## Config precedence order:
+##   etc/base.hocon < cluster.hocon < emqx.conf < environment variables
 ##
-## See {{ emqx_configuration_doc }} for more details.
+## See {{ emqx_configuration_doc }} for more information.
 ## Configuration full example can be found in etc/examples
 
 node {
@@ -19,23 +17,3 @@ cluster {
   name = emqxcl
   discovery_strategy = manual
 }
-
-## EMQX provides support for two primary log handlers: `file` and `console`, with an additional `audit` handler specifically designed to always direct logs to files.
-## The system's default log handling behavior can be configured via the environment variable `EMQX_DEFAULT_LOG_HANDLER`, which accepts the following settings:
-##
-##   - `file`: Directs log output exclusively to files.
-##   - `console`: Channels log output solely to the console.
-##
-## It's noteworthy that `EMQX_DEFAULT_LOG_HANDLER` is set to `file` when EMQX is initiated via systemd `emqx.service` file.
-## In scenarios outside systemd initiation, `console` serves as the default log handler.
-
-## Read more about configs here: {{ emqx_configuration_doc_log }}
-
-# log {
-#     file {
-#         level = warning
-#     }
-#     console {
-#         level = warning
-#     }
-# }

+ 48 - 0
apps/emqx_connector/test/emqx_connector_SUITE.erl

@@ -20,6 +20,7 @@
 
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
 
 -define(CONNECTOR, emqx_connector_dummy_impl).
 
@@ -327,6 +328,53 @@ t_no_buffer_workers(Config) ->
     ?assertEqual([], supervisor:which_children(emqx_resource_buffer_worker_sup)),
     ok.
 
+%% Checks that the maximum timeout (currently derived from
+%% `resource_opts.health_check_interval') is respected when doing a dry run, even if the
+%% removal gets stuck because the resource process is unresponsive.
+t_dryrun_timeout({'init', Config}) ->
+    meck:new(emqx_connector_resource, [passthrough]),
+    meck:expect(emqx_connector_resource, connector_to_resource_type, 1, ?CONNECTOR),
+    meck:new(?CONNECTOR, [non_strict]),
+    meck:expect(?CONNECTOR, resource_type, 0, dummy),
+    meck:expect(?CONNECTOR, callback_mode, 0, async_if_possible),
+    %% hang forever
+    meck:expect(?CONNECTOR, on_start, fun(_ConnResId, _Opts) ->
+        receive
+            go -> ok
+        end
+    end),
+    meck:expect(?CONNECTOR, on_get_channels, 1, []),
+    meck:expect(?CONNECTOR, on_add_channel, 4, {ok, connector_state}),
+    meck:expect(?CONNECTOR, on_stop, 2, ok),
+    meck:expect(?CONNECTOR, on_get_status, 2, connected),
+    meck:expect(?CONNECTOR, query_mode, 1, sync),
+    Config;
+t_dryrun_timeout({'end', _Config}) ->
+    meck:unload(),
+    ok;
+t_dryrun_timeout(Config) when is_list(Config) ->
+    Type = kafka_producer,
+    Conf0 = connector_config(),
+    Timeout = 100,
+    Conf = Conf0#{<<"resource_opts">> => #{<<"health_check_interval">> => Timeout}},
+    %% Minimum timeout is capped at 5 s in `emqx_resource_manager'...  Plus, we need to
+    %% wait for removal of stuck process, which itself has another 5 s timeout.
+    ct:timetrap(15_000),
+    %% Cache cleaner is triggered when the process initiating the dry run dies.
+    Pid = spawn_link(fun() ->
+        Res = emqx_connector_resource:create_dry_run(Type, Conf),
+        ?assertEqual({error, timeout}, Res),
+        ok
+    end),
+    MRef = monitor(process, Pid),
+    receive
+        {'DOWN', MRef, _, _, _} ->
+            ok
+    end,
+    %% Should be removed asynchronously by cache cleaner.
+    ?retry(1_000, 7, ?assertEqual([], emqx_resource:list_instances())),
+    ok.
+
 %% helpers
 
 connector_config() ->

+ 1 - 1
apps/emqx_dashboard/src/emqx_dashboard.app.src

@@ -2,7 +2,7 @@
 {application, emqx_dashboard, [
     {description, "EMQX Web Dashboard"},
     % strict semver, bump manually!
-    {vsn, "5.1.7"},
+    {vsn, "5.2.0"},
     {modules, []},
     {registered, [emqx_dashboard_sup]},
     {applications, [

+ 7 - 4
apps/emqx_dashboard/src/emqx_dashboard_monitor.erl

@@ -24,7 +24,7 @@
 
 -behaviour(gen_server).
 
--export([create_tables/0]).
+-export([create_tables/0, clear_table/0]).
 -export([start_link/0]).
 
 -export([
@@ -98,6 +98,9 @@ create_tables() ->
     ]),
     [?TAB].
 
+clear_table() ->
+    mria:clear_table(?TAB).
+
 %% -------------------------------------------------------------------------------------------------
 %% API
 
@@ -133,7 +136,7 @@ current_rate(Node) when Node == node() ->
             {ok, maps:merge(maps:from_list(Rate0), non_rate_value())}
     end;
 current_rate(Node) ->
-    case emqx_dashboard_proto_v1:current_rate(Node) of
+    case emqx_dashboard_proto_v2:current_rate(Node) of
         {badrpc, Reason} ->
             {badrpc, #{node => Node, reason => Reason}};
         {ok, Rate} ->
@@ -316,7 +319,7 @@ do_sample(all, Time) when is_integer(Time) ->
 do_sample(Node, Time) when Node == node() andalso is_integer(Time) ->
     do_sample_local(Time);
 do_sample(Node, Time) when is_integer(Time) ->
-    case emqx_dashboard_proto_v1:do_sample(Node, Time) of
+    case emqx_dashboard_proto_v2:do_sample(Node, Time) of
         {badrpc, Reason} ->
             {badrpc, #{node => Node, reason => Reason}};
         Res ->
@@ -348,7 +351,7 @@ sample_nodes(Nodes, Time) ->
     lists:foldl(fun(I, B) -> merge_samplers(Time, I, B) end, #{}, Success).
 
 concurrently_sample_nodes(Nodes, Time) ->
-    %% emqx_dashboard_proto_v1:do_sample has a timeout (5s),
+    %% emqx_dashboard_proto_v2:do_sample has a timeout (5s),
     %% call emqx_utils:pmap here instead of a rpc multicall
     %% to avoid having to introduce a new bpapi proto version
     emqx_utils:pmap(fun(Node) -> do_sample(Node, Time) end, Nodes, infinity).

+ 20 - 1
apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl

@@ -20,6 +20,7 @@
 -include_lib("typerefl/include/types.hrl").
 -include_lib("hocon/include/hocon_types.hrl").
 -include_lib("emqx_utils/include/emqx_utils_api.hrl").
+-include_lib("emqx/include/logger.hrl").
 
 -behaviour(minirest_api).
 
@@ -61,6 +62,13 @@ schema("/monitor") ->
                 200 => hoconsc:mk(hoconsc:array(hoconsc:ref(sampler)), #{}),
                 400 => emqx_dashboard_swagger:error_codes(['BAD_RPC'], <<"Bad RPC">>)
             }
+        },
+        delete => #{
+            tags => [<<"Metrics">>],
+            description => ?DESC(clear_monitor),
+            responses => #{
+                204 => <<"Metrics deleted">>
+            }
         }
     };
 schema("/monitor/nodes/:node") ->
@@ -148,7 +156,18 @@ fields_current(Names) ->
 monitor(get, #{query_string := QS, bindings := Bindings}) ->
     Latest = maps:get(<<"latest">>, QS, infinity),
     RawNode = maps:get(node, Bindings, <<"all">>),
-    emqx_utils_api:with_node_or_cluster(RawNode, dashboard_samplers_fun(Latest)).
+    emqx_utils_api:with_node_or_cluster(RawNode, dashboard_samplers_fun(Latest));
+monitor(delete, _) ->
+    Nodes = emqx:running_nodes(),
+    Results = emqx_dashboard_proto_v2:clear_table(Nodes),
+    NodeResults = lists:zip(Nodes, Results),
+    NodeErrors = [Result || Result = {_Node, NOk} <- NodeResults, NOk =/= {atomic, ok}],
+    NodeErrors == [] orelse
+        ?SLOG(warning, #{
+            msg => "clear_monitor_metrics_rpc_errors",
+            errors => NodeErrors
+        }),
+    ?NO_CONTENT.
 
 dashboard_samplers_fun(Latest) ->
     fun(NodeOrCluster) ->

+ 5 - 1
apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v1.erl

@@ -21,7 +21,8 @@
 -export([
     introduced_in/0,
     do_sample/2,
-    current_rate/1
+    current_rate/1,
+    deprecated_since/0
 ]).
 
 -include("emqx_dashboard.hrl").
@@ -30,6 +31,9 @@
 introduced_in() ->
     "5.0.0".
 
+deprecated_since() ->
+    "5.8.4".
+
 -spec do_sample(node(), Latest :: pos_integer() | infinity) -> list(map()) | emqx_rpc:badrpc().
 do_sample(Node, Latest) ->
     rpc:call(Node, emqx_dashboard_monitor, do_sample, [Node, Latest], ?RPC_TIMEOUT).

+ 44 - 0
apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v2.erl

@@ -0,0 +1,44 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_dashboard_proto_v2).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+    do_sample/2,
+    clear_table/1,
+    current_rate/1
+]).
+
+-include("emqx_dashboard.hrl").
+-include_lib("emqx/include/bpapi.hrl").
+
+introduced_in() ->
+    "5.8.4".
+
+-spec do_sample(node(), Latest :: pos_integer() | infinity) -> list(map()) | emqx_rpc:badrpc().
+do_sample(Node, Latest) ->
+    erpc:call(Node, emqx_dashboard_monitor, do_sample, [Node, Latest], ?RPC_TIMEOUT).
+
+-spec clear_table(Nodes :: [node()]) -> emqx_rpc:erpc_multicall(ok).
+clear_table(Nodes) ->
+    erpc:multicall(Nodes, emqx_dashboard_monitor, clear_table, [], ?RPC_TIMEOUT).
+
+-spec current_rate(node()) -> {ok, map()} | emqx_rpc:badrpc().
+current_rate(Node) ->
+    erpc:call(Node, emqx_dashboard_monitor, current_rate, [Node], ?RPC_TIMEOUT).

+ 17 - 7
apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl

@@ -405,8 +405,8 @@ t_handle_old_monitor_data(_Config) ->
 
     ok = meck:new(emqx, [passthrough, no_history]),
     ok = meck:expect(emqx, running_nodes, fun() -> [node(), 'other@node'] end),
-    ok = meck:new(emqx_dashboard_proto_v1, [passthrough, no_history]),
-    ok = meck:expect(emqx_dashboard_proto_v1, do_sample, fun('other@node', _Time) ->
+    ok = meck:new(emqx_dashboard_proto_v2, [passthrough, no_history]),
+    ok = meck:expect(emqx_dashboard_proto_v2, do_sample, fun('other@node', _Time) ->
         Self ! sample_called,
         FakeOldData
     end),
@@ -421,7 +421,7 @@ t_handle_old_monitor_data(_Config) ->
         hd(emqx_dashboard_monitor:samplers())
     ),
     ?assertReceive(sample_called, 1_000),
-    ok = meck:unload([emqx, emqx_dashboard_proto_v1]),
+    ok = meck:unload([emqx, emqx_dashboard_proto_v2]),
     ok.
 
 t_monitor_api(_) ->
@@ -583,6 +583,8 @@ t_monitor_reset(_) ->
         ),
     {ok, Samplers} = request(["monitor"], "latest=1"),
     ?assertEqual(1, erlang:length(Samplers)),
+    ok = delete(["monitor"]),
+    ?assertMatch({ok, []}, request(["monitor"], "latest=1")),
     ok.
 
 t_monitor_api_error(_) ->
@@ -666,7 +668,7 @@ t_persistent_session_stats(Config) ->
                 <<"connections">> := 3,
                 <<"disconnected_durable_sessions">> := 1,
                 %% N.B.: we currently don't perform any deduplication between persistent
-                %% and non-persistent routes, so we count `commont/topic' twice and get 8
+                %% and non-persistent routes, so we count `common/topic' twice and get 8
                 %% instead of 6 here.
                 <<"topics">> := 8,
                 <<"subscriptions">> := 8,
@@ -702,7 +704,7 @@ t_persistent_session_stats(Config) ->
                 <<"connections">> := 3,
                 <<"disconnected_durable_sessions">> := 2,
                 %% N.B.: we currently don't perform any deduplication between persistent
-                %% and non-persistent routes, so we count `commont/topic' twice and get 8
+                %% and non-persistent routes, so we count `common/topic' twice and get 8
                 %% instead of 6 here.
                 <<"topics">> := 8,
                 <<"subscriptions">> := 8,
@@ -712,7 +714,9 @@ t_persistent_session_stats(Config) ->
             ?ON(N1, request(["monitor_current"]))
         )
     end),
-
+    ?assertNotMatch({ok, []}, ?ON(N1, request(["monitor"]))),
+    ?assertMatch(ok, ?ON(N1, delete(["monitor"]))),
+    ?assertMatch({ok, []}, ?ON(N1, request(["monitor"]))),
     ok.
 
 %% Checks that we get consistent data when changing the requested time window for
@@ -842,6 +846,10 @@ get_req_cluster(Config, Path, QS) ->
 host(Port) ->
     "http://127.0.0.1:" ++ integer_to_list(Port).
 
+delete(Path) ->
+    Url = url(Path, ""),
+    do_request_api(delete, {Url, [auth_header_()]}).
+
 url(Parts, QS) ->
     url(?SERVER, Parts, QS).
 
@@ -858,6 +866,8 @@ do_request_api(Method, Request) ->
     case httpc:request(Method, Request, [], []) of
         {error, socket_closed_remotely} ->
             {error, socket_closed_remotely};
+        {ok, {{"HTTP/1.1", 204, _}, _, _}} ->
+            ok;
         {ok, {{"HTTP/1.1", Code, _}, _, Return}} when
             Code >= 200 andalso Code =< 299
         ->
@@ -960,4 +970,4 @@ cluster_node_appspec(Enable, Port0) ->
     ].
 
 clean_data() ->
-    ok = emqx_dashboard_monitor:clean(-1).
+    ok = emqx_dashboard_monitor:clean(-100000).

+ 1 - 1
apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl

@@ -239,7 +239,7 @@ transitions(Node, DB) ->
 %% Try to eliminate any ambiguity in the message representation.
 message_canonical_form(Msg0 = #message{}) ->
     message_canonical_form(emqx_message:to_map(Msg0));
-message_canonical_form(#{flags := Flags0, headers := Headers0, payload := Payload0} = Msg) ->
+message_canonical_form(#{flags := Flags0, headers := _Headers0, payload := Payload0} = Msg) ->
     %% Remove flags that are false:
     Flags = maps:filter(
         fun(_Key, Val) -> Val end,

+ 1 - 1
apps/emqx_management/src/emqx_management.app.src

@@ -2,7 +2,7 @@
 {application, emqx_management, [
     {description, "EMQX Management API and CLI"},
     % strict semver, bump manually!
-    {vsn, "5.3.3"},
+    {vsn, "5.3.4"},
     {modules, []},
     {registered, [emqx_management_sup]},
     {applications, [

+ 98 - 16
apps/emqx_management/src/emqx_mgmt_api.erl

@@ -54,7 +54,11 @@
 ]).
 
 -ifdef(TEST).
+-include_lib("proper/include/proper.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
 -export([paginate_test_format/1]).
+
 -endif.
 
 -export_type([
@@ -557,18 +561,23 @@ accumulate_query_rows(
     Len = length(Rows),
     case Cursor + Len of
         NCursor when NCursor < PageStart ->
+            %% Haven't reached the required page.
             {more, ResultAcc#{cursor => NCursor}};
         NCursor when NCursor < PageEnd ->
+            %% Rows overlap with the page start
+            %% Throw away rows in the beginning belonging to the previous page(s).
             SubRows = lists:nthtail(max(0, PageStart - Cursor - 1), Rows),
             {more, ResultAcc#{
                 cursor => NCursor,
                 count => Count + length(SubRows),
                 rows => [{Node, SubRows} | RowsAcc]
             }};
-        NCursor when NCursor >= PageEnd + Limit ->
-            {enough, ResultAcc#{cursor => NCursor}};
         NCursor when NCursor >= PageEnd ->
-            SubRows = lists:sublist(Rows, Limit - Count),
+            %% Rows overlap with the page end (and potentially with the page start).
+            %% Throw away rows in the beginning belonging to the previous page(s).
+            %% Then throw away rows in the tail belonging to the next page(s).
+            PageRows = lists:nthtail(max(0, PageStart - Cursor - 1), Rows),
+            SubRows = lists:sublist(PageRows, Limit - Count),
             {enough, ResultAcc#{
                 cursor => NCursor,
                 count => Count + length(SubRows),
@@ -707,20 +716,19 @@ format_query_result(
         end,
     #{
         meta => Meta,
-        data => lists:flatten(
-            lists:foldl(
-                fun({Node, Rows}, Acc) ->
-                    [
-                        lists:map(fun(Row) -> exec_format_fun(FmtFun, Node, Row, Opts) end, Rows)
-                        | Acc
-                    ]
-                end,
-                [],
-                RowsAcc
-            )
-        )
+        data => format_query_data(FmtFun, RowsAcc, Opts)
     }.
 
+format_query_data(FmtFun, RowsAcc, Opts) ->
+    %% NOTE: `RowsAcc` is reversed in the node-order, `lists:foldl/3` is correct here.
+    lists:foldl(
+        fun({Node, Rows}, Acc) ->
+            [exec_format_fun(FmtFun, Node, R, Opts) || R <- Rows] ++ Acc
+        end,
+        [],
+        RowsAcc
+    ).
+
 exec_format_fun(FmtFun, Node, Row, Opts) ->
     case erlang:fun_info(FmtFun, arity) of
         {arity, 1} -> FmtFun(Row);
@@ -813,7 +821,6 @@ b2i(Any) ->
 %%--------------------------------------------------------------------
 
 -ifdef(TEST).
--include_lib("eunit/include/eunit.hrl").
 
 params2qs_test_() ->
     QSchema = [
@@ -926,4 +933,79 @@ assert_paginate_results(Results, Size, Limit) ->
             ?_assertEqual(Size, length(AllData)),
             ?_assertEqual(Size, sets:size(sets:from_list(AllData)))
         ].
+
+accumulate_prop_test() ->
+    ?assert(proper:quickcheck(accumulate_prop(), [{numtests, 1000}])).
+
+accumulate_prop() ->
+    ?FORALL(
+        #{page := Page, limit := Limit, noderows := NodeRows},
+        emqx_proper_types:fixedmap(#{
+            page => page_t(),
+            limit => limit_t(),
+            noderows => noderows_t()
+        }),
+        begin
+            {Status, QRows} = accumulate_page_rows(Page, Limit, NodeRows),
+            {_Status, QRowsNext} = accumulate_page_rows(Page + 1, Limit, NodeRows),
+            measure(
+                #{
+                    "Limit" => Limit,
+                    "Page" => Page,
+                    "NRows" => length(QRows),
+                    "Complete" => emqx_utils_conv:int(Status == enough)
+                },
+                %% Verify page is non-empty if accumulation is complete.
+                accumulate_assert_nonempty(Status, Limit, QRows) and
+                    %% Verify rows across 2 consecutive pages form continuous sequence.
+                    accumulate_assert_continuous(QRows ++ QRowsNext)
+            )
+        end
+    ).
+
+accumulate_page_rows(Page, Limit, NodeRows) ->
+    QState = #{page => Page, limit => Limit},
+    {Status, #{rows := QRowsAcc}} = lists:foldl(
+        fun
+            ({Node, Rows}, {more, QRAcc}) ->
+                accumulate_query_rows(Node, Rows, QState, QRAcc);
+            (_NodeRows, {enough, QRAcc}) ->
+                {enough, QRAcc}
+        end,
+        {more, init_query_result()},
+        NodeRows
+    ),
+    QRows = format_query_data(fun(N, R) -> {N, R} end, QRowsAcc, #{}),
+    {Status, QRows}.
+
+accumulate_assert_nonempty(enough, Limit, QRows) ->
+    length(QRows) =:= Limit;
+accumulate_assert_nonempty(more, _Limit, _QRows) ->
+    true.
+
+accumulate_assert_continuous([{N, R1} | Rest = [{N, R2} | _]]) ->
+    (R2 - R1 =:= 1) andalso accumulate_assert_continuous(Rest);
+accumulate_assert_continuous([{_N1, _} | Rest = [{_N2, R} | _]]) ->
+    (R =:= 1) andalso accumulate_assert_continuous(Rest);
+accumulate_assert_continuous([_]) ->
+    true;
+accumulate_assert_continuous([]) ->
+    true.
+
+page_t() ->
+    pos_integer().
+
+limit_t() ->
+    emqx_proper_types:scaled(0.6, pos_integer()).
+
+noderows_t() ->
+    ?LET(
+        {Nodes, PageSize},
+        {pos_integer(), limit_t()},
+        [{N, lists:seq(1, PageSize)} || N <- lists:seq(1, Nodes)]
+    ).
+
+measure(NamedSamples, Test) ->
+    maps:fold(fun(Name, Sample, Acc) -> measure(Name, Sample, Acc) end, Test, NamedSamples).
+
 -endif.

+ 1 - 1
apps/emqx_resource/src/emqx_resource.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 {application, emqx_resource, [
     {description, "Manager for all external resources"},
-    {vsn, "0.1.36"},
+    {vsn, "0.1.37"},
     {registered, []},
     {mod, {emqx_resource_app, []}},
     {applications, [

+ 8 - 1
apps/emqx_resource/src/emqx_resource_buffer_worker.erl

@@ -1321,7 +1321,7 @@ extract_connector_id(Id) when is_binary(Id) ->
 %% There is no need to query the connector if the channel is not
 %% installed as the query will fail anyway.
 pre_query_channel_check(Id, {Id, _} = _Request, ChanSt, IsSimpleQuery) ->
-    case emqx_resource_manager:channel_status_is_channel_added(ChanSt) of
+    case is_channel_apt_for_queries(ChanSt) of
         true ->
             ok;
         false ->
@@ -2365,6 +2365,13 @@ buffer_worker(_Tid) ->
 is_simple_query(#{simple_query := Bool}) ->
     Bool.
 
+is_channel_apt_for_queries(?status_connected) ->
+    true;
+is_channel_apt_for_queries(?status_connecting) ->
+    true;
+is_channel_apt_for_queries(_) ->
+    false.
+
 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").
 adjust_batch_time_test_() ->

+ 18 - 5
apps/emqx_resource/src/emqx_resource_cache_cleaner.erl

@@ -79,7 +79,7 @@ handle_call(_Request, _From, State) ->
     {reply, ok, State}.
 
 handle_cast(#add_dry_run{id = ID, pid = Pid}, #{dry_run_pmon := Pmon0} = State0) ->
-    Pmon = emqx_pmon:monitor(Pid, ID, Pmon0),
+    Pmon = append_monitor(Pmon0, Pid, ID),
     State = State0#{dry_run_pmon := Pmon},
     {noreply, State};
 handle_cast(_Msg, State) ->
@@ -108,8 +108,8 @@ handle_down(Pid, State0) ->
             handle_down_cache(ID, Pid, State0);
         error ->
             case emqx_pmon:find(Pid, DryrunPmon) of
-                {ok, ID} ->
-                    handle_down_dry_run(ID, Pid, State0);
+                {ok, IDs} ->
+                    handle_down_dry_run(IDs, Pid, State0);
                 error ->
                     State0
             end
@@ -121,16 +121,20 @@ handle_down_cache(ID, Pid, State0) ->
     Pmon = emqx_pmon:erase(Pid, Pmon0),
     State0#{cache_pmon := Pmon}.
 
-handle_down_dry_run(ID, Pid, State0) ->
+handle_down_dry_run([ID | Rest], Pid, State0) ->
     #{dry_run_pmon := Pmon0} = State0,
     %% No need to wait here: since it's a dry run resource, it won't be recreated,
     %% assuming the ID is random enough.
     spawn(fun() ->
+        _ = emqx_resource_manager:remove(ID),
         emqx_resource_manager_sup:delete_child(ID),
         ?tp("resource_cache_cleaner_deleted_child", #{id => ID})
     end),
     Pmon = emqx_pmon:erase(Pid, Pmon0),
-    State0#{dry_run_pmon := Pmon}.
+    State = State0#{dry_run_pmon := Pmon},
+    handle_down_dry_run(Rest, Pid, State);
+handle_down_dry_run([], _Pid, State) ->
+    State.
 
 maybe_erase_cache(DownManager, ID) ->
     case emqx_resource_cache:read_manager_pid(ID) =:= DownManager of
@@ -141,3 +145,12 @@ maybe_erase_cache(DownManager, ID) ->
             %% restart by supervisor
             ok
     end.
+
+append_monitor(Pmon0, Pid, Value) ->
+    case emqx_pmon:find(Pid, Pmon0) of
+        error ->
+            emqx_pmon:monitor(Pid, [Value], Pmon0);
+        {ok, Values} ->
+            Pmon = emqx_pmon:demonitor(Pid, Pmon0),
+            emqx_pmon:monitor(Pid, [Value | Values], Pmon)
+    end.

+ 236 - 116
apps/emqx_resource/src/emqx_resource_manager.erl

@@ -50,12 +50,12 @@
     is_exist/1,
     get_metrics/1,
     reset_metrics/1,
-    channel_status_is_channel_added/1,
     get_query_mode_and_last_error/2
 ]).
 
 -export([
-    set_resource_status_connecting/1
+    set_resource_status_connecting/1,
+    external_error/1
 ]).
 
 % Server
@@ -71,6 +71,13 @@
 -export([stop/2]).
 -endif.
 
+%%------------------------------------------------------------------------------
+%% Type definitions
+%%------------------------------------------------------------------------------
+
+-define(not_added_yet, {?MODULE, not_added_yet}).
+-define(add_channel_failed(REASON), {?MODULE, add_channel_failed, REASON}).
+
 % State record
 -record(data, {
     id,
@@ -101,8 +108,8 @@
     },
     %% Callers waiting on health check
     hc_pending_callers = #{resource => [], channel => #{}} :: #{
-        resource := [gen_server:from()],
-        channel := #{channel_id() => [gen_server:from()]}
+        resource := [gen_statem:from()],
+        channel := #{channel_id() => [gen_statem:from()]}
     },
     extra
 }).
@@ -146,11 +153,15 @@
 %% calls/casts/generic timeouts
 -record(add_channel, {channel_id :: channel_id(), config :: map()}).
 -record(start_channel_health_check, {channel_id :: channel_id()}).
+-record(retry_add_channel, {channel_id :: channel_id()}).
 
 -type generic_timeout(Id, Content) :: {{timeout, Id}, timeout(), Content}.
 -type start_channel_health_check_action() :: generic_timeout(
     #start_channel_health_check{}, #start_channel_health_check{}
 ).
+-type retry_add_channel_action() :: generic_timeout(
+    #retry_add_channel{}, #retry_add_channel{}
+).
 
 %%------------------------------------------------------------------------------
 %% API
@@ -273,10 +284,13 @@ create_dry_run(ResId, ResourceType, Config, OnReadyCallback) ->
                     Error
             end;
         {error, Reason} ->
-            _ = remove(ResId),
+            %% Removal is done asynchronously.  See comment below.
             {error, Reason};
         timeout ->
-            _ = remove(ResId),
+            %% Removal is done asynchronously by the cache cleaner.  If the resource
+            %% process is stuck and not responding to calls, doing the removal
+            %% synchronously here would take more time than the defined timeout, possibly
+            %% timing out HTTP API requests.
             {error, timeout}
     end.
 
@@ -620,9 +634,10 @@ handle_event({call, From}, {channel_health_check, ChannelId}, _State, Data) ->
 %%--------------------------
 %% State: CONNECTING
 %%--------------------------
-handle_event(enter, _OldState, ?state_connecting = State, Data) ->
+handle_event(enter, _OldState, ?state_connecting = State, Data0) ->
+    Data = abort_all_channel_health_checks(Data0),
     ok = log_status_consistency(State, Data),
-    {keep_state_and_data, [{state_timeout, 0, health_check}]};
+    {keep_state, Data, [{state_timeout, 0, health_check}]};
 handle_event(internal, start_resource, ?state_connecting, Data) ->
     start_resource(Data, undefined);
 handle_event(state_timeout, health_check, ?state_connecting, Data) ->
@@ -640,7 +655,7 @@ handle_event(enter, _OldState, ?state_connected = State, Data) ->
     ok = log_status_consistency(State, Data),
     _ = emqx_alarm:safe_deactivate(Data#data.id),
     ?tp(resource_connected_enter, #{}),
-    {keep_state_and_data, resource_health_check_actions(Data)};
+    {keep_state, Data, resource_health_check_actions(Data)};
 handle_event(state_timeout, health_check, ?state_connected, Data) ->
     start_resource_health_check(Data);
 handle_event(
@@ -661,13 +676,17 @@ handle_event(
     Data
 ) ->
     handle_start_channel_health_check(Data, ChannelId);
+handle_event(
+    {timeout, #retry_add_channel{channel_id = ChannelId}}, _, ?state_connected = _State, Data
+) ->
+    handle_retry_add_channel(Data, ChannelId);
 %%--------------------------
 %% State: DISCONNECTED
 %%--------------------------
 handle_event(enter, _OldState, ?state_disconnected = State, Data0) ->
     ok = log_status_consistency(State, Data0),
     ?tp(resource_disconnected_enter, #{}),
-    Data = handle_abort_all_channel_health_checks(Data0),
+    Data = abort_all_channel_health_checks(Data0),
     {keep_state, Data, retry_actions(Data)};
 handle_event(state_timeout, auto_retry, ?state_disconnected, Data) ->
     ?tp(resource_auto_reconnect, #{}),
@@ -676,9 +695,10 @@ handle_event(state_timeout, auto_retry, ?state_disconnected, Data) ->
 %% State: STOPPED
 %% The stopped state is entered after the resource has been explicitly stopped
 %%--------------------------
-handle_event(enter, _OldState, ?state_stopped = State, Data) ->
+handle_event(enter, _OldState, ?state_stopped = State, Data0) ->
+    Data = abort_all_channel_health_checks(Data0),
     ok = log_status_consistency(State, Data),
-    {keep_state_and_data, []};
+    {keep_state, Data};
 %%--------------------------
 %% The following events can be handled in any other state
 %%--------------------------
@@ -713,6 +733,9 @@ handle_event(
     is_map_key(Pid, CHCWorkers)
 ->
     handle_channel_health_check_worker_down(Data0, Pid, Res);
+handle_event({timeout, #retry_add_channel{channel_id = _}}, _, _State, _Data) ->
+    %% We only add channels to the resource state in the connected state.
+    {keep_state_and_data, [postpone]};
 handle_event({timeout, #start_channel_health_check{channel_id = _}}, _, _State, _Data) ->
     %% Stale health check action; currently, we only probe channel health when connected.
     keep_state_and_data;
@@ -811,11 +834,12 @@ start_resource(Data, From) ->
             ),
             _ = maybe_alarm(?status_disconnected, IsDryRun, ResId, Err, Data#data.error),
             %% Add channels and raise alarms
-            NewData1 = channels_health_check(?status_disconnected, add_channels(Data)),
+            {Actions0, NewData1} = channels_health_check(?status_disconnected, add_channels(Data)),
             %% Keep track of the error reason why the connection did not work
             %% so that the Reason can be returned when the verification call is made.
             NewData2 = NewData1#data{status = ?status_disconnected, error = Err},
-            Actions = maybe_reply(retry_actions(NewData2), From, Err),
+            Actions1 = maybe_reply(retry_actions(NewData2), From, Err),
+            Actions = Actions1 ++ Actions0,
             {next_state, ?state_disconnected, update_state(NewData2), Actions}
     end.
 
@@ -845,9 +869,12 @@ maybe_update_callback_mode(Data = #data{mod = ResourceType, state = ResourceStat
             Data#data{callback_mode = CallMode}
     end.
 
-add_channels_in_list([], Data) ->
-    Data;
-add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
+add_channels_in_list(ChannelsWithConfigs, Data) ->
+    add_channels_in_list(ChannelsWithConfigs, Data, _Actions = []).
+
+add_channels_in_list([], Data, Actions) ->
+    {Actions, Data};
+add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data, Actions) ->
     #data{
         id = ResId,
         mod = Mod,
@@ -869,6 +896,7 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
                 channel_status_new_waiting_for_health_check(ChannelConfig),
                 AddedChannelsMap
             ),
+            NewActions = Actions,
             NewData = Data#data{
                 state = NewState,
                 added_channels = NewAddedChannelsMap
@@ -887,16 +915,17 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
             ),
             NewAddedChannelsMap = maps:put(
                 ChannelID,
-                channel_status(Error, ChannelConfig),
+                channel_status(?add_channel_failed(Reason), ChannelConfig),
                 AddedChannelsMap
             ),
+            NewActions = [retry_add_channel_action(ChannelID, ChannelConfig, Data) | Actions],
             NewData = Data#data{
                 added_channels = NewAddedChannelsMap
             },
             %% Raise an alarm since the channel could not be added
             _ = maybe_alarm(?status_disconnected, IsDryRun, ChannelID, Error, no_prev_error)
     end,
-    add_channels_in_list(Rest, NewData).
+    add_channels_in_list(Rest, NewData, NewActions).
 
 maybe_stop_resource(#data{status = Status} = Data) when Status =/= ?rm_status_stopped ->
     stop_resource(Data);
@@ -925,11 +954,11 @@ stop_resource(#data{id = ResId} = Data) ->
 
 remove_channels(Data) ->
     Channels = maps:keys(Data#data.added_channels),
-    remove_channels_in_list(Channels, Data, false).
+    remove_channels_in_list(Channels, Data).
 
-remove_channels_in_list([], Data, _KeepInChannelMap) ->
+remove_channels_in_list([], Data) ->
     Data;
-remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
+remove_channels_in_list([ChannelID | Rest], Data) ->
     #data{
         id = ResId,
         added_channels = AddedChannelsMap,
@@ -939,14 +968,8 @@ remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
         type = Type
     } = Data,
     IsDryRun = emqx_resource:is_dry_run(ResId),
-    NewAddedChannelsMap =
-        case KeepInChannelMap of
-            true ->
-                AddedChannelsMap;
-            false ->
-                _ = maybe_clear_alarm(IsDryRun, ChannelID),
-                maps:remove(ChannelID, AddedChannelsMap)
-        end,
+    _ = maybe_clear_alarm(IsDryRun, ChannelID),
+    NewAddedChannelsMap = maps:remove(ChannelID, AddedChannelsMap),
     case safe_call_remove_channel(ResId, Mod, State, ChannelID) of
         {ok, NewState} ->
             NewData = Data#data{
@@ -971,7 +994,7 @@ remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
                 added_channels = NewAddedChannelsMap
             }
     end,
-    remove_channels_in_list(Rest, NewData, KeepInChannelMap).
+    remove_channels_in_list(Rest, NewData).
 
 safe_call_remove_channel(_ResId, _Mod, undefined = State, _ChannelID) ->
     {ok, State};
@@ -1039,7 +1062,8 @@ handle_not_connected_add_channel(From, ChannelId, ChannelConfig, State, Data) ->
     NewData = add_or_update_channel_status(Data, ChannelId, ChannelConfig, State),
     {keep_state, update_state(NewData), [{reply, From, ok}]}.
 
-handle_remove_channel(From, ChannelId, Data) ->
+handle_remove_channel(From, ChannelId, Data0) ->
+    Data = abort_health_checks_for_channel(Data0, ChannelId),
     Channels = Data#data.added_channels,
     IsDryRun = emqx_resource:is_dry_run(Data#data.id),
     _ = maybe_clear_alarm(IsDryRun, ChannelId),
@@ -1194,9 +1218,9 @@ continue_resource_health_check_connected(NewStatus, Data0) ->
     case NewStatus of
         ?status_connected ->
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
-            Data2 = channels_health_check(?status_connected, Data1),
+            {Actions0, Data2} = channels_health_check(?status_connected, Data1),
             Data = update_state(Data2),
-            Actions = Replies ++ resource_health_check_actions(Data),
+            Actions = Replies ++ Actions0 ++ resource_health_check_actions(Data),
             {keep_state, Data, Actions};
         _ ->
             #data{id = ResId, group = Group, type = Type} = Data0,
@@ -1215,8 +1239,8 @@ continue_resource_health_check_connected(NewStatus, Data0) ->
             %% between the two here, as resource manager also has `stopped', which is
             %% not a valid status at the time of writing.
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
-            Data = channels_health_check(NewStatus, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(NewStatus, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, NewStatus, Data, Actions}
     end.
 
@@ -1225,16 +1249,16 @@ continue_resource_health_check_not_connected(NewStatus, Data0) ->
     {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
     case NewStatus of
         ?status_connected ->
-            Data = channels_health_check(?status_connected, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(?status_connected, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, ?state_connected, Data, Actions};
         ?status_connecting ->
-            Data = channels_health_check(?status_connecting, Data1),
-            Actions = Replies ++ resource_health_check_actions(Data),
+            {Actions0, Data} = channels_health_check(?status_connecting, Data1),
+            Actions = Replies ++ Actions0 ++ resource_health_check_actions(Data),
             {next_state, ?status_connecting, Data, Actions};
         ?status_disconnected ->
-            Data = channels_health_check(?status_disconnected, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(?status_disconnected, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, ?state_disconnected, Data, Actions}
     end.
 
@@ -1274,7 +1298,9 @@ handle_manual_channel_health_check(
     is_map_key(ChannelId, Channels)
 ->
     %% No ongoing health check: reply with current status.
-    {keep_state_and_data, [{reply, From, without_channel_config(maps:get(ChannelId, Channels))}]};
+    {keep_state_and_data, [
+        {reply, From, to_external_channel_status(maps:get(ChannelId, Channels))}
+    ]};
 handle_manual_channel_health_check(
     From,
     _Data,
@@ -1284,22 +1310,21 @@ handle_manual_channel_health_check(
         {reply, From, channel_error_status(channel_not_found)}
     ]}.
 
--spec channels_health_check(resource_status(), data()) -> data().
+-spec channels_health_check(resource_status(), data()) -> {[gen_statem:action()], data()}.
 channels_health_check(?status_connected = _ConnectorStatus, Data0) ->
     Channels = maps:to_list(Data0#data.added_channels),
-    %% All channels with a status different from connected or connecting are
-    %% not added
     ChannelsNotAdded = [
         ChannelId
      || {ChannelId, Status} <- Channels,
         not channel_status_is_channel_added(Status)
     ],
-    %% Attempt to add channels that are not added
+    %% Attempt to add channels to resource state that are not added yet
     ChannelsNotAddedWithConfigs = get_config_for_channels(Data0, ChannelsNotAdded),
-    Data1 = add_channels_in_list(ChannelsNotAddedWithConfigs, Data0),
-    %% Now that we have done the adding, we can get the status of all channels
+    {Actions, Data1} = add_channels_in_list(ChannelsNotAddedWithConfigs, Data0),
+    %% Now that we have done the adding, we can get the status of all channels (except
+    %% unhealthy ones)
     Data2 = trigger_health_check_for_added_channels(Data1),
-    update_state(Data2);
+    {Actions, update_state(Data2)};
 channels_health_check(?status_connecting = _ConnectorStatus, Data0) ->
     %% Whenever the resource is connecting:
     %% 1. Change the status of all added channels to connecting
@@ -1337,33 +1362,35 @@ channels_health_check(?status_connecting = _ConnectorStatus, Data0) ->
         ChannelsWithNewAndPrevErrorStatuses
     ),
     Data1 = Data0#data{added_channels = NewChannels},
-    update_state(Data1);
-channels_health_check(ConnectorStatus, Data0) ->
-    %% Whenever the resource is not connected and not connecting:
-    %% 1. Remove all added channels
-    %% 2. Change the status to an error status
-    %% 3. Raise alarms
-    Channels = Data0#data.added_channels,
-    ChannelsToRemove = [
-        ChannelId
-     || {ChannelId, Status} <- maps:to_list(Channels),
-        channel_status_is_channel_added(Status)
-    ],
-    Data1 = remove_channels_in_list(ChannelsToRemove, Data0, true),
+    {_Actions = [], update_state(Data1)};
+channels_health_check(?status_disconnected = ConnectorStatus, Data1) ->
+    %% Whenever the resource is disconnected:
+    %% 1. Change the status of channels to an error status
+    %%    - Except for channels yet to be added to the resource state.  Those need to keep
+    %%    those special errors so they are added or retried.
+    %% 2. Raise alarms
+    Channels = Data1#data.added_channels,
     ChannelsWithNewAndOldStatuses =
-        [
-            {ChannelId, OldStatus,
-                channel_status(
-                    {error,
-                        resource_not_connected_channel_error_msg(
-                            ConnectorStatus,
-                            ChannelId,
-                            Data1
-                        )},
-                    Config
-                )}
-         || {ChannelId, #{config := Config} = OldStatus} <- maps:to_list(Data1#data.added_channels)
-        ],
+        lists:map(
+            fun
+                ({ChannelId, #{error := ?not_added_yet} = OldStatus}) ->
+                    {ChannelId, OldStatus, OldStatus};
+                ({ChannelId, #{error := ?add_channel_failed(_)} = OldStatus}) ->
+                    {ChannelId, OldStatus, OldStatus};
+                ({ChannelId, #{config := Config} = OldStatus}) ->
+                    {ChannelId, OldStatus,
+                        channel_status(
+                            {error,
+                                resource_not_connected_channel_error_msg(
+                                    ConnectorStatus,
+                                    ChannelId,
+                                    Data1
+                                )},
+                            Config
+                        )}
+            end,
+            maps:to_list(Data1#data.added_channels)
+        ),
     %% Raise alarms
     IsDryRun = emqx_resource:is_dry_run(Data1#data.id),
     _ = lists:foreach(
@@ -1381,7 +1408,7 @@ channels_health_check(ConnectorStatus, Data0) ->
         ChannelsWithNewAndOldStatuses
     ),
     Data2 = Data1#data{added_channels = NewChannels},
-    update_state(Data2).
+    {_Actions = [], update_state(Data2)}.
 
 resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) ->
     ResourceId = Data1#data.id,
@@ -1401,27 +1428,40 @@ resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) ->
 generic_timeout_action(Id, Timeout, Content) ->
     {{timeout, Id}, Timeout, Content}.
 
--spec start_channel_health_check_action(channel_id(), map(), map(), data() | timeout()) ->
+-spec start_channel_health_check_action(channel_id(), map(), map(), data()) ->
     [start_channel_health_check_action()].
 start_channel_health_check_action(ChannelId, NewChanStatus, PreviousChanStatus, Data = #data{}) ->
-    Timeout = get_channel_health_check_interval(ChannelId, NewChanStatus, PreviousChanStatus, Data),
+    ConfigSources =
+        lists:map(
+            fun
+                (#{config := Config}) ->
+                    Config;
+                (_) ->
+                    #{}
+            end,
+            [NewChanStatus, PreviousChanStatus]
+        ),
+    Timeout = get_channel_health_check_interval(ChannelId, ConfigSources, Data),
     Event = #start_channel_health_check{channel_id = ChannelId},
     [generic_timeout_action(Event, Timeout, Event)].
 
-get_channel_health_check_interval(ChannelId, NewChanStatus, PreviousChanStatus, Data) ->
+-spec retry_add_channel_action(channel_id(), map(), data()) -> retry_add_channel_action().
+retry_add_channel_action(ChannelId, ChannelConfig, Data) ->
+    Timeout = get_channel_health_check_interval(ChannelId, [ChannelConfig], Data),
+    Event = #retry_add_channel{channel_id = ChannelId},
+    generic_timeout_action(Event, Timeout, Event).
+
+get_channel_health_check_interval(ChannelId, ConfigSources, Data) ->
     emqx_utils:foldl_while(
         fun
-            (#{config := #{resource_opts := #{health_check_interval := HCInterval}}}, _Acc) ->
+            (#{resource_opts := #{health_check_interval := HCInterval}}, _Acc) ->
                 {halt, HCInterval};
             (_, Acc) ->
                 {cont, Acc}
         end,
         ?HEALTHCHECK_INTERVAL,
-        [
-            NewChanStatus,
-            PreviousChanStatus,
-            maps:get(ChannelId, Data#data.added_channels, #{})
-        ]
+        ConfigSources ++
+            [emqx_utils_maps:deep_get([ChannelId, config], Data#data.added_channels, #{})]
     ).
 
 %% Currently, we only call resource channel health checks when the underlying resource is
@@ -1434,7 +1474,7 @@ trigger_health_check_for_added_channels(Data0 = #data{hc_workers = HCWorkers0})
     NewOngoing = maps:filter(
         fun(ChannelId, OldStatus) ->
             (not is_map_key(ChannelId, Ongoing0)) andalso
-                channel_status_is_channel_added(OldStatus)
+                is_channel_apt_for_health_check(OldStatus)
         end,
         Data0#data.added_channels
     ),
@@ -1467,12 +1507,10 @@ continue_channel_health_check_connected(ChannelId, OldStatus, CurrentStatus, Dat
             Data1
     end.
 
-continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data1) ->
+continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data) ->
     %% Remove the added channels with a status different from connected or connecting
-    NewStatus = maps:get(ChannelId, Data1#data.added_channels),
-    ChannelsToRemove = [ChannelId || not channel_status_is_channel_added(NewStatus)],
-    Data = remove_channels_in_list(ChannelsToRemove, Data1, true),
-    IsDryRun = emqx_resource:is_dry_run(Data1#data.id),
+    NewStatus = maps:get(ChannelId, Data#data.added_channels),
+    IsDryRun = emqx_resource:is_dry_run(Data#data.id),
     %% Raise/clear alarms
     case NewStatus of
         #{status := ?status_connected} ->
@@ -1582,7 +1620,7 @@ handle_channel_health_check_worker_down_new_channels_and_status(
 reply_pending_channel_health_check_callers(
     ChannelId, Status0, Data0 = #data{hc_pending_callers = Pending0}
 ) ->
-    Status = without_channel_config(Status0),
+    Status = to_external_channel_status(Status0),
     #{channel := CPending0} = Pending0,
     Pending = maps:get(ChannelId, CPending0, []),
     Actions = [{reply, From, Status} || From <- Pending],
@@ -1590,6 +1628,21 @@ reply_pending_channel_health_check_callers(
     Data = Data0#data{hc_pending_callers = Pending0#{channel := CPending}},
     {Actions, Data}.
 
+handle_retry_add_channel(Data0, ChannelId) ->
+    ?tp(retry_add_channel, #{channel_id => ChannelId}),
+    maybe
+        {ok, StatusMap} ?= maps:find(ChannelId, Data0#data.added_channels),
+        %% Must contain config map if in data.
+        #{config := #{} = ChannelConfig} = StatusMap,
+        {Actions, Data1} = add_channels_in_list([{ChannelId, ChannelConfig}], Data0),
+        Data = trigger_health_check_for_added_channels(Data1),
+        {keep_state, Data, Actions}
+    else
+        error ->
+            %% Channel has been removed since timer was set?
+            keep_state_and_data
+    end.
+
 get_config_for_channels(Data0, ChannelsWithoutConfig) ->
     ResId = Data0#data.id,
     Mod = Data0#data.mod,
@@ -1648,7 +1701,7 @@ maybe_alarm(_Status, false, ResId, Error, _PrevError) ->
             {error, Reason} ->
                 emqx_utils:readable_error_msg(Reason);
             _ ->
-                Error1 = without_channel_config(Error),
+                Error1 = to_external_channel_status(Error),
                 emqx_utils:readable_error_msg(Error1)
         end,
     emqx_alarm:safe_activate(
@@ -1656,11 +1709,15 @@ maybe_alarm(_Status, false, ResId, Error, _PrevError) ->
         #{resource_id => ResId, reason => resource_down},
         <<"resource down: ", HrError/binary>>
     ),
-    ?tp(resource_activate_alarm, #{resource_id => ResId}).
+    ?tp(resource_activate_alarm, #{resource_id => ResId, error => HrError}).
 
 without_channel_config(Map) ->
     maps:without([config], Map).
 
+to_external_channel_status(StatusMap0) ->
+    StatusMap = without_channel_config(StatusMap0),
+    maps:update_with(error, fun external_error/1, StatusMap).
+
 -spec maybe_resume_resource_workers(resource_id(), resource_status()) -> ok.
 maybe_resume_resource_workers(ResId, ?status_connected) ->
     lists:foreach(
@@ -1701,6 +1758,8 @@ status_to_error(_) ->
     {error, undefined}.
 
 %% Compatibility
+external_error(?not_added_yet) -> not_added_yet;
+external_error(?add_channel_failed(Reason)) -> external_error(Reason);
 external_error({error, Reason}) -> Reason;
 external_error(Other) -> Other.
 
@@ -1713,7 +1772,9 @@ maybe_reply(Actions, From, Reply) ->
 data_record_to_external_map(Data) ->
     AddedChannelsWithoutConfigs =
         maps:map(
-            fun(_ChanID, Status) -> without_channel_config(Status) end,
+            fun(_ChanID, Status) ->
+                to_external_channel_status(Status)
+            end,
             Data#data.added_channels
         ),
     #{
@@ -1755,7 +1816,9 @@ safe_call(ResId, Message, Timeout) ->
         exit:{R, _} when R == noproc; R == normal; R == shutdown ->
             {error, not_found};
         exit:{timeout, _} ->
-            {error, timeout}
+            {error, timeout};
+        exit:{{shutdown, removed}, _} ->
+            {error, not_found}
     end.
 
 %% Helper functions for channel status data
@@ -1771,7 +1834,7 @@ channel_status_not_added(ChannelConfig) ->
         %%                 connected and the on_channel_get_status callback has returned
         %%                 connected. The error field should be undefined.
         status => ?status_disconnected,
-        error => not_added_yet,
+        error => ?not_added_yet,
         config => ChannelConfig
     }.
 
@@ -1820,6 +1883,12 @@ channel_status({?status_connected, Error}, ChannelConfig) ->
         error => Error,
         config => ChannelConfig
     };
+channel_status(?add_channel_failed(_Reason) = Error, ChannelConfig) ->
+    #{
+        status => ?status_disconnected,
+        error => Error,
+        config => ChannelConfig
+    };
 channel_status({error, Reason}, ChannelConfig) ->
     S = channel_error_status(Reason),
     S#{config => ChannelConfig}.
@@ -1830,19 +1899,24 @@ channel_error_status(Reason) ->
         error => Reason
     }.
 
-channel_status_is_channel_added(#{status := St}) ->
-    channel_status_is_channel_added(St);
-channel_status_is_channel_added(?status_connected) ->
-    true;
-channel_status_is_channel_added(?status_connecting) ->
-    true;
-channel_status_is_channel_added(_Status) ->
-    false.
+is_channel_apt_for_health_check(#{error := {unhealthy_target, _}}) ->
+    false;
+is_channel_apt_for_health_check(#{error := unhealthy_target}) ->
+    false;
+is_channel_apt_for_health_check(StatusMap) ->
+    channel_status_is_channel_added(StatusMap).
+
+channel_status_is_channel_added(#{error := ?not_added_yet}) ->
+    false;
+channel_status_is_channel_added(#{error := ?add_channel_failed(_)}) ->
+    false;
+channel_status_is_channel_added(_StatusMap) ->
+    true.
 
 -spec add_or_update_channel_status(data(), channel_id(), map(), resource_state()) -> data().
 add_or_update_channel_status(Data, ChannelId, ChannelConfig, State) ->
     Channels = Data#data.added_channels,
-    ChannelStatus = channel_status({error, resource_not_operational}, ChannelConfig),
+    ChannelStatus = channel_status_not_added(ChannelConfig),
     NewChannels = maps:put(ChannelId, ChannelStatus, Channels),
     ResStatus = state_to_status(State),
     IsDryRun = emqx_resource:is_dry_run(ChannelId),
@@ -1861,10 +1935,18 @@ tag(Group, Type) ->
     Str = emqx_utils_conv:str(Group) ++ "/" ++ emqx_utils_conv:str(Type),
     string:uppercase(Str).
 
+%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
+%% might be lost for a running resource, and future attempts to bump them result in
+%% errors.  As mitigation, we ensure such metrics are created here so that restarting
+%% the resource or resetting its metrics can recreate them.
+ensure_metrics(ResId) ->
+    {ok, _} = emqx_resource:ensure_metrics(ResId),
+    ok.
+
 %% When a resource enters a `?status_disconnected' state, late channel health check
 %% replies are useless and could corrupt state.
--spec handle_abort_all_channel_health_checks(data()) -> data().
-handle_abort_all_channel_health_checks(Data0) ->
+-spec abort_all_channel_health_checks(data()) -> data().
+abort_all_channel_health_checks(Data0) ->
     #data{
         hc_workers = #{channel := CHCWorkers} = HCWorkers0,
         hc_pending_callers = #{channel := CPending} = Pending0
@@ -1893,17 +1975,55 @@ handle_abort_all_channel_health_checks(Data0) ->
 
 abort_channel_health_check(Pid) ->
     %% We're already linked to the worker pids due to `spawn_link'.
+    MRef = monitor(process, Pid),
     exit(Pid, kill),
+    receive
+        {'DOWN', MRef, process, Pid, _} ->
+            ok
+    end,
     %% Clean the exit signal so it doesn't contaminate state handling.
     receive
         {'EXIT', Pid, _} ->
             ok
+    after 0 -> ok
     end.
 
-%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
-%% might be lost for a running resource, and future attempts to bump them result in
-%% errors.  As mitigation, we ensure such metrics are created here so that restarting
-%% the resource or resetting its metrics can recreate them.
-ensure_metrics(ResId) ->
-    {ok, _} = emqx_resource:ensure_metrics(ResId),
-    ok.
+map_take_or(Map, Key, Default) ->
+    maybe
+        error ?= maps:take(Key, Map),
+        {Default, Map}
+    end.
+
+abort_health_checks_for_channel(Data0, ChannelId) ->
+    #data{
+        hc_workers = #{channel := #{ongoing := Ongoing0} = CHCWorkers0} = HCWorkers0,
+        hc_pending_callers = #{channel := CPending0} = Pending0
+    } = Data0,
+    Ongoing = maps:remove(ChannelId, Ongoing0),
+    {Callers, CPending} = map_take_or(CPending0, ChannelId, []),
+    lists:foreach(
+        fun(From) ->
+            gen_statem:reply(From, {error, resource_disconnected})
+        end,
+        Callers
+    ),
+    CHCWorkers = maps:fold(
+        fun
+            (Pid, ChannelId0, Acc) when is_pid(Pid), ChannelId0 == ChannelId ->
+                ?tp(warning, "aborting_channel_hc", #{channel_id => ChannelId, pid => Pid}),
+                abort_channel_health_check(Pid),
+                maps:remove(Pid, Acc);
+            (ChannelId0, _Config, Acc) when ChannelId0 == ChannelId ->
+                maps:remove(ChannelId0, Acc);
+            (_, _, Acc) ->
+                Acc
+        end,
+        CHCWorkers0,
+        CHCWorkers0
+    ),
+    HCWorkers = HCWorkers0#{channel := CHCWorkers#{ongoing := Ongoing}},
+    Pending = Pending0#{channel := CPending},
+    Data0#data{
+        hc_workers = HCWorkers,
+        hc_pending_callers = Pending
+    }.

+ 6 - 1
apps/emqx_resource/src/emqx_resource_manager_sup.erl

@@ -56,11 +56,16 @@ init([]) ->
     {ok, {SupFlags, ChildSpecs}}.
 
 child_spec(ResId, Group, ResourceType, Config, Opts) ->
+    RestartType =
+        case emqx_resource:is_dry_run(ResId) of
+            true -> temporary;
+            false -> transient
+        end,
     #{
         id => ResId,
         start =>
             {emqx_resource_manager, start_link, [ResId, Group, ResourceType, Config, Opts]},
-        restart => transient,
+        restart => RestartType,
         %% never force kill a resource manager.
         %% because otherwise it may lead to resource leak,
         %% resource_manager's terminate callback calls resource on_stop

+ 22 - 19
apps/emqx_resource/test/emqx_resource_SUITE.erl

@@ -1120,29 +1120,32 @@ create_dry_run_local_succ() ->
 
 t_create_dry_run_local_failed(_) ->
     ct:timetrap({seconds, 120}),
-    ct:pal("creating with creation error"),
-    Res1 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{create_error => true}
-    ),
-    ?assertMatch({error, _}, Res1),
+    emqx_utils:nolink_apply(fun() ->
+        ct:pal("creating with creation error"),
+        Res1 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{create_error => true}
+        ),
+        ?assertMatch({error, _}, Res1),
 
-    ct:pal("creating with health check error"),
-    Res2 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{name => test_resource, health_check_error => true}
-    ),
-    ?assertMatch({error, _}, Res2),
+        ct:pal("creating with health check error"),
+        Res2 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{name => test_resource, health_check_error => true}
+        ),
+        ?assertMatch({error, _}, Res2),
 
-    ct:pal("creating with stop error"),
-    Res3 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{name => test_resource, stop_error => true}
-    ),
-    ?assertEqual(ok, Res3),
+        ct:pal("creating with stop error"),
+        Res3 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{name => test_resource, stop_error => true}
+        ),
+        ?assertEqual(ok, Res3),
+        ok
+    end),
     ?retry(
         100,
-        5,
+        50,
         ?assertEqual(
             [],
             emqx_resource:list_instances_verbose()

+ 1 - 1
apps/emqx_utils/src/emqx_utils.app.src

@@ -2,7 +2,7 @@
 {application, emqx_utils, [
     {description, "Miscellaneous utilities for EMQX apps"},
     % strict semver, bump manually!
-    {vsn, "5.4.2"},
+    {vsn, "5.4.3"},
     {modules, [
         emqx_utils,
         emqx_utils_api,

+ 27 - 4
apps/emqx_utils/src/emqx_utils_redact.erl

@@ -155,13 +155,36 @@ redact_v(V) when is_binary(V) ->
         [{var, _}] ->
             V;
         _ ->
-            <<?REDACT_VAL>>
+            do_redact_v(V)
     end;
 redact_v([{str, Bin}]) when is_binary(Bin) ->
     %% The HOCON schema system may generate sensitive values with this format
-    [{str, <<?REDACT_VAL>>}];
-redact_v(_V) ->
-    ?REDACT_VAL.
+    [{str, do_redact_v(Bin)}];
+redact_v(V) ->
+    do_redact_v(V).
+
+do_redact_v(<<"file://", _/binary>> = V) ->
+    V;
+do_redact_v("file://" ++ _ = V) ->
+    V;
+do_redact_v(B) when is_binary(B) ->
+    <<?REDACT_VAL>>;
+do_redact_v(L) when is_list(L) ->
+    ?REDACT_VAL;
+do_redact_v(F) ->
+    try
+        %% this can happen in logs
+        case emqx_secret:term(F) of
+            {file, File} ->
+                File;
+            V ->
+                do_redact_v(V)
+        end
+    catch
+        _:_ ->
+            %% most of the time
+            ?REDACT_VAL
+    end.
 
 deobfuscate(NewConf, OldConf) ->
     deobfuscate(NewConf, OldConf, fun(_) -> false end).

+ 41 - 0
apps/emqx_utils/test/emqx_utils_redact_tests.erl

@@ -45,6 +45,47 @@ no_redact_template_var_test() ->
         })
     ).
 
+no_redact_file_paths_test() ->
+    ?assertEqual(
+        #{
+            password => <<"file:///abs/path/a">>,
+            <<"secret">> => <<"file://relative/path/b">>,
+            account_key => "file://string/path/x"
+        },
+        redact(#{
+            password => <<"file:///abs/path/a">>,
+            <<"secret">> => <<"file://relative/path/b">>,
+            account_key => "file://string/path/x"
+        })
+    ).
+
+no_redact_wrapped_file_paths_test() ->
+    ?assertEqual(
+        #{password => <<"file:///abs/path/a">>},
+        redact(#{
+            password => emqx_secret:wrap_load({file, <<"file:///abs/path/a">>})
+        })
+    ).
+
+redact_wrapped_secret_test() ->
+    ?assertEqual(
+        #{password => <<"******">>},
+        redact(#{
+            password => emqx_secret:wrap(<<"aaa">>)
+        })
+    ).
+
+deobfuscate_file_path_secrets_test_() ->
+    Original1 = #{foo => #{bar => #{headers => #{"authorization" => "file://a"}}}},
+    Original2 = #{foo => #{bar => #{headers => #{"authorization" => "a"}}}},
+    Redacted2 = #{foo => #{bar => #{headers => #{"authorization" => "******"}}}},
+    [
+        ?_assertEqual(Original1, redact(Original1)),
+        ?_assertEqual(Original1, emqx_utils_redact:deobfuscate(Original1, Original1)),
+        ?_assertEqual(Redacted2, redact(Original2)),
+        ?_assertEqual(Original2, emqx_utils_redact:deobfuscate(Redacted2, Original2))
+    ].
+
 redact(X) -> emqx_utils:redact(X).
 
 is_redacted(Key, Value) ->

+ 6 - 3
bin/emqx

@@ -784,7 +784,8 @@ check_config() {
     ## this command checks the configs without generating any files
     call_hocon -v \
         -s "$SCHEMA_MOD" \
-        -c "$DATA_DIR"/configs/cluster.hocon \
+        -c "$EMQX_ETC_DIR"/base.hocon \
+        -c "$CONFIGS_DIR"/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         check_schema
 }
@@ -804,7 +805,8 @@ generate_config() {
 
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## It takes input sources and overlays values in below order:
-    ##   - $DATA_DIR/cluster.hocon (if exists)
+    ##   - etc/base.hocon
+    ##   - $CONFIGS_DIR/cluster.hocon
     ##   - etc/emqx.conf
     ##   - environment variables starting with EMQX_ e.g. EMQX_NODE__ROLE
     ##
@@ -812,7 +814,8 @@ generate_config() {
     ##       because it has to sync cluster.hocon from other nodes.
     call_hocon -v -t "$NOW_TIME" \
         -s "$SCHEMA_MOD" \
-        -c "$DATA_DIR"/configs/cluster.hocon \
+        -c "$EMQX_ETC_DIR"/base.hocon \
+        -c "$CONFIGS_DIR"/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -d "$DATA_DIR"/configs generate
 

+ 3 - 0
changes/ce/feat-13739.en.md

@@ -0,0 +1,3 @@
+Support clear monitor (statistics) data for the whole cluster.
+
+Send `DELETE` request to endpoint `api/v5/monitor` to clear all collected monitoring metrics.

+ 1 - 0
changes/ce/feat-14247.en.md

@@ -0,0 +1 @@
+Write client attribute named `tns` to log messages if such client attribute exists.

+ 20 - 0
changes/ce/feat-14269.en.md

@@ -0,0 +1,20 @@
+Added `etc/base.hocon` config file.
+
+In this release, we introduced a new configuration file, `etc/base.hocon`, to enhance configuration management and clarity.
+
+Previously, since emqx.conf was the only place for manually crafted configurations, and because it sits at the top-most layer
+of the configuration override system, it caused some confusion.
+While mutable (not read-only) configurations set in `emqx.conf` could be changed through the UI, API, or CLI and take effect immediately,
+those changes would not persist after a node restart — leading to inconsistent behavior.
+
+To address this, we’ve added etc/base.hocon as a foundational configuration layer.
+The updated configuration precedence order, from top to bottom, is as follows:
+
+1. Environment variables
+2. `etc/emqx.conf`
+3. `data/configs/cluster.hocon`
+4. `etc/base.hocon`
+
+The `etc/base.hocon` file serves as the base layer for configurations.
+While settings defined here can still be modified after the node starts,
+this layer ensures consistent override behavior.

+ 1 - 0
changes/ce/fix-14267.en.md

@@ -0,0 +1 @@
+Do not redact secrets in logs and HTTP responses when the secret string is a file path (`file:///path/to/the/secret`).

+ 1 - 0
changes/ce/fix-14272.en.md

@@ -0,0 +1 @@
+`auto_subscribe` configuration loaded via CLI shows success but fails to take effect.

+ 1 - 0
changes/ce/fix-14317.en.md

@@ -0,0 +1 @@
+Prevent potential issues where APIs involving paging could return empty pages, in case the internal APIs are subtly misused in the future.

+ 7 - 0
changes/ce/fix-14318.en.md

@@ -0,0 +1,7 @@
+Fixed the initialization of the HTTP connector state.  When there was incoming traffic being handled by an HTTP action and its underlying connector restarted, cryptic crashes could be seen in the logs mentioning `function_clause`.
+
+Example:
+
+```
+20:42:36.850 [error] msg: "resource_exception", info: #{error => {error, function_clause}, id => <<"action:http:a:connector:http:a">>, name => call_query, ...
+```

+ 5 - 0
changes/ce/fix-14319.en.md

@@ -0,0 +1,5 @@
+Refactored the resource management internal state machine.  As a consequence, some race condition bugs have been eliminated.  One such example is the HTTP action, which, when under incoming traffic and when its health checks flap, may produce errors like the following:
+
+```
+2024-11-29T14:58:17.994119+00:00 [error] msg: action_not_found, connector: <<"connector:http:a">>, action_id: <<"action:http:a:connector:http:a">>
+```

+ 1 - 0
changes/ee/feat-14110.en.md

@@ -0,0 +1 @@
+Added support for Pulsar driver to report metrics.  Now, it will report metrics such as queuing, inflight and dropped message count for better observability.

+ 7 - 0
changes/ee/fix-14291.en.md

@@ -0,0 +1,7 @@
+Upgraded Pulsar producer driver to fix handling of `Redirect` `LookupType` responses when looking up a topic in Pulsar.
+
+Before this fix, if the `LookupType` response type was `Redirect` when (re)starting a producer, it would incorrectly attempt to connect to the returned broker and fail to publish any messages.  Example logs under such condition:
+
+```
+2024-11-25T20:40:54.140659+00:00 [error] [pulsar-producer][persistent://public/default/p3-partition-0] Response error:'ServiceNotReady', msg:"Namespace bundle for topic (persistent://public/default/p3-partition-0) not served by this instance. Please redo the lookup. Request is denied: namespace=public/default"
+```

File diff suppressed because it is too large
+ 7 - 0
changes/ee/fix-14345.en.md


+ 3 - 0
dev

@@ -318,6 +318,7 @@ generate_app_conf() {
 
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## It takes input sources and overlays values in below order:
+    ##   - etc/base.hocon
     ##   - $DATA_DIR/cluster.hocon (if exists)
     ##   - etc/emqx.conf
     ##   - environment variables starts with EMQX_ e.g. EMQX_NODE__ROLE
@@ -325,6 +326,7 @@ generate_app_conf() {
     ## NOTE: it's a known issue that cluster.hocon may change right after the node boots up
     ##       because it has to sync cluster.hocon from other nodes.
     call_hocon -v -t "$NOW_TIME" -s "$SCHEMA_MOD" \
+        -c "$EMQX_ETC_DIR"/base.hocon \
         -c "$EMQX_DATA_DIR"/configs/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -d "$EMQX_DATA_DIR"/configs generate
@@ -358,6 +360,7 @@ EOF
 # copy cert files and acl.conf to etc
 copy_other_conf_files() {
     cp -r apps/emqx/etc/certs "$EMQX_ETC_DIR"/
+    cp -r apps/emqx_conf/etc/base.hocon "$EMQX_ETC_DIR"/
     cp apps/emqx_auth/etc/acl.conf "$EMQX_ETC_DIR"/
 }
 

+ 6 - 0
mix.exs

@@ -947,6 +947,12 @@ defmodule EMQXUmbrella.MixProject do
       Path.join(etc, "emqx.conf")
     )
 
+    render_template(
+      "apps/emqx_conf/etc/base.hocon",
+      assigns,
+      Path.join(etc, "base.hocon")
+    )
+
     render_template(
       "rel/emqx_vars",
       assigns,

+ 3 - 2
rebar.config.erl

@@ -198,7 +198,7 @@ plugins() ->
 test_plugins() ->
     [
         {rebar3_proper, "0.12.1"},
-        {coveralls, {git, "https://github.com/emqx/coveralls-erl", {tag, "v2.2.0-emqx-3"}}}
+        {coveralls, {git, "https://github.com/emqx/coveralls-erl", {tag, "v2.2.0-emqx-4"}}}
     ].
 
 test_deps() ->
@@ -541,7 +541,8 @@ emqx_etc_overlay_per_rel(_RelType) ->
 emqx_etc_overlay() ->
     [
         {"{{base_dir}}/lib/emqx/etc/ssl_dist.conf", "etc/ssl_dist.conf"},
-        {"{{base_dir}}/lib/emqx_conf/etc/emqx.conf.all", "etc/emqx.conf"}
+        {"{{base_dir}}/lib/emqx_conf/etc/emqx.conf.all", "etc/emqx.conf"},
+        {"{{base_dir}}/lib/emqx_conf/etc/base.hocon", "etc/base.hocon"}
     ].
 
 get_vsn(Profile) ->

+ 5 - 0
rel/i18n/emqx_dashboard_monitor_api.hocon

@@ -5,6 +5,11 @@ list_monitor.desc:
 list_monitor.label:
 """List cluster stats data"""
 
+clear_monitor.desc:
+"""Clear monitor (statistics) data for the whole cluster."""
+clear_monitor.label:
+"""Clear cluster stats data"""
+
 list_monitor_node.desc:
 """List the monitor (statistics) data on the specified node."""
 list_monitor_node.label:

+ 8 - 0
scripts/test/emqx-boot.bats

@@ -27,3 +27,11 @@
     [[ $status -ne 0 ]]
     rm -f $conffile
 }
+
+@test "corrupted base.hocon" {
+    conffile="./_build/$PROFILE/rel/emqx/etc/base.hocon"
+    echo "{" > $conffile
+    run ./_build/$PROFILE/rel/emqx/bin/emqx console
+    [[ $status -ne 0 ]]
+    rm -f $conffile
+}