Просмотр исходного кода

Merge pull request #14356 from zmstone/241206-sync-relese-584-to-release-58

241206 sync release 584 to release 58
zmstone 1 год назад
Родитель
Commit
16dfeb3010
79 измененных файлов с 1770 добавлено и 454 удалено
  1. 5 0
      apps/emqx/include/emqx.hrl
  2. 1 0
      apps/emqx/priv/bpapi.versions
  3. 1 1
      apps/emqx/src/emqx.app.src
  4. 39 4
      apps/emqx/src/emqx_channel.erl
  5. 37 25
      apps/emqx/src/emqx_config.erl
  6. 6 2
      apps/emqx/src/emqx_connection.erl
  7. 6 2
      apps/emqx/src/emqx_logger.erl
  8. 3 1
      apps/emqx/src/emqx_logger_textfmt.erl
  9. 3 3
      apps/emqx/src/emqx_schema.erl
  10. 2 2
      apps/emqx/src/emqx_schema_secret.erl
  11. 7 4
      apps/emqx/src/emqx_secret_loader.erl
  12. 7 2
      apps/emqx/src/emqx_trace/emqx_trace_formatter.erl
  13. 2 0
      apps/emqx/src/emqx_types.erl
  14. 6 2
      apps/emqx/src/emqx_ws_connection.erl
  15. 110 0
      apps/emqx/test/emqx_channel_tests.erl
  16. 1 1
      apps/emqx/test/emqx_common_test_helpers.erl
  17. 27 1
      apps/emqx/test/emqx_config_SUITE.erl
  18. 3 3
      apps/emqx/test/emqx_release_tests.erl
  19. 29 6
      apps/emqx/test/emqx_secret_tests.erl
  20. 52 0
      apps/emqx/test/emqx_trace_formatter_tests.erl
  21. 1 1
      apps/emqx_auto_subscribe/src/emqx_auto_subscribe.app.src
  22. 19 18
      apps/emqx_auto_subscribe/src/emqx_auto_subscribe.erl
  23. 20 0
      apps/emqx_auto_subscribe/test/emqx_auto_subscribe_SUITE.erl
  24. 1 1
      apps/emqx_bridge/src/emqx_bridge.app.src
  25. 4 0
      apps/emqx_bridge/src/emqx_bridge_v2.erl
  26. 1 1
      apps/emqx_bridge_http/src/emqx_bridge_http.app.src
  27. 2 1
      apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl
  28. 4 1
      apps/emqx_bridge_kafka/mix.exs
  29. 3 2
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src
  30. 25 0
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_app.erl
  31. 0 21
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_impl_consumer.erl
  32. 46 0
      apps/emqx_bridge_kafka/src/emqx_bridge_kafka_sup.erl
  33. 2 1
      apps/emqx_bridge_kafka/test/emqx_bridge_v2_kafka_consumer_SUITE.erl
  34. 10 7
      apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl
  35. 21 1
      apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl
  36. 1 1
      apps/emqx_bridge_pulsar/mix.exs
  37. 1 1
      apps/emqx_bridge_pulsar/rebar.config
  38. 1 1
      apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar.app.src
  39. 216 82
      apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_connector.erl
  40. 1 1
      apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl
  41. 328 42
      apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_v2_SUITE.erl
  42. 24 0
      apps/emqx_conf/etc/base.hocon
  43. 5 27
      apps/emqx_conf/etc/emqx_conf.conf
  44. 48 0
      apps/emqx_connector/test/emqx_connector_SUITE.erl
  45. 1 1
      apps/emqx_dashboard/src/emqx_dashboard.app.src
  46. 7 4
      apps/emqx_dashboard/src/emqx_dashboard_monitor.erl
  47. 20 1
      apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl
  48. 5 1
      apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v1.erl
  49. 44 0
      apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v2.erl
  50. 17 7
      apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl
  51. 1 1
      apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
  52. 1 1
      apps/emqx_management/src/emqx_management.app.src
  53. 98 16
      apps/emqx_management/src/emqx_mgmt_api.erl
  54. 1 1
      apps/emqx_resource/src/emqx_resource.app.src
  55. 8 1
      apps/emqx_resource/src/emqx_resource_buffer_worker.erl
  56. 18 5
      apps/emqx_resource/src/emqx_resource_cache_cleaner.erl
  57. 236 116
      apps/emqx_resource/src/emqx_resource_manager.erl
  58. 6 1
      apps/emqx_resource/src/emqx_resource_manager_sup.erl
  59. 22 19
      apps/emqx_resource/test/emqx_resource_SUITE.erl
  60. 1 1
      apps/emqx_utils/src/emqx_utils.app.src
  61. 27 4
      apps/emqx_utils/src/emqx_utils_redact.erl
  62. 41 0
      apps/emqx_utils/test/emqx_utils_redact_tests.erl
  63. 6 3
      bin/emqx
  64. 3 0
      changes/ce/feat-13739.en.md
  65. 1 0
      changes/ce/feat-14247.en.md
  66. 20 0
      changes/ce/feat-14269.en.md
  67. 1 0
      changes/ce/fix-14267.en.md
  68. 1 0
      changes/ce/fix-14272.en.md
  69. 1 0
      changes/ce/fix-14317.en.md
  70. 7 0
      changes/ce/fix-14318.en.md
  71. 5 0
      changes/ce/fix-14319.en.md
  72. 1 0
      changes/ee/feat-14110.en.md
  73. 7 0
      changes/ee/fix-14291.en.md
  74. 7 0
      changes/ee/fix-14345.en.md
  75. 3 0
      dev
  76. 6 0
      mix.exs
  77. 3 2
      rebar.config.erl
  78. 5 0
      rel/i18n/emqx_dashboard_monitor_api.hocon
  79. 8 0
      scripts/test/emqx-boot.bats

+ 5 - 0
apps/emqx/include/emqx.hrl

@@ -112,4 +112,9 @@
 -define(KIND_REPLICATE, replicate).
 -define(KIND_REPLICATE, replicate).
 -define(KIND_INITIATE, initiate).
 -define(KIND_INITIATE, initiate).
 
 
+%%--------------------------------------------------------------------
+%% Client Attributes
+%%--------------------------------------------------------------------
+-define(CLIENT_ATTR_NAME_TNS, <<"tns">>).
+
 -endif.
 -endif.

+ 1 - 0
apps/emqx/priv/bpapi.versions

@@ -20,6 +20,7 @@
 {emqx_conf,4}.
 {emqx_conf,4}.
 {emqx_connector,1}.
 {emqx_connector,1}.
 {emqx_dashboard,1}.
 {emqx_dashboard,1}.
+{emqx_dashboard,2}.
 {emqx_delayed,1}.
 {emqx_delayed,1}.
 {emqx_delayed,2}.
 {emqx_delayed,2}.
 {emqx_delayed,3}.
 {emqx_delayed,3}.

+ 1 - 1
apps/emqx/src/emqx.app.src

@@ -2,7 +2,7 @@
 {application, emqx, [
 {application, emqx, [
     {id, "emqx"},
     {id, "emqx"},
     {description, "EMQX Core"},
     {description, "EMQX Core"},
-    {vsn, "5.4.3"},
+    {vsn, "5.5.0"},
     {modules, []},
     {modules, []},
     {registered, []},
     {registered, []},
     {applications, [
     {applications, [

+ 39 - 4
apps/emqx/src/emqx_channel.erl

@@ -72,8 +72,14 @@
     prepare_will_message_for_publishing/2
     prepare_will_message_for_publishing/2
 ]).
 ]).
 
 
-%% Exports for CT
--export([set_field/3]).
+%% Exports for tests
+-ifdef(TEST).
+-export([
+    dummy/0,
+    set_field/3,
+    set_log_meta/2
+]).
+-endif.
 
 
 -if(?EMQX_RELEASE_EDITION == ee).
 -if(?EMQX_RELEASE_EDITION == ee).
 -export([basic_trace_attrs/1]).
 -export([basic_trace_attrs/1]).
@@ -2011,8 +2017,33 @@ fix_mountpoint(ClientInfo = #{mountpoint := MountPoint}) ->
 
 
 set_log_meta(_ConnPkt, #channel{clientinfo = #{clientid := ClientId} = ClientInfo}) ->
 set_log_meta(_ConnPkt, #channel{clientinfo = #{clientid := ClientId} = ClientInfo}) ->
     Username = maps:get(username, ClientInfo, undefined),
     Username = maps:get(username, ClientInfo, undefined),
-    emqx_logger:set_metadata_clientid(ClientId),
-    emqx_logger:set_metadata_username(Username).
+    Attrs = maps:get(client_attrs, ClientInfo, #{}),
+    Tns0 = maps:get(?CLIENT_ATTR_NAME_TNS, Attrs, undefined),
+    %% No need to add Tns to log metadata if it's already a prefix of the client ID
+    %% Or if it's the username.
+    Tns =
+        case is_clientid_namespaced(ClientId, Tns0) orelse Username =:= Tns0 of
+            true ->
+                undefined;
+            false ->
+                Tns0
+        end,
+    Meta0 = [{clientid, ClientId}, {username, Username}, {tns, Tns}],
+    %% Drop undefined or <<>>
+    Meta = lists:filter(fun({_, V}) -> V =/= undefined andalso V =/= <<>> end, Meta0),
+    emqx_logger:set_proc_metadata(maps:from_list(Meta)).
+
+%% clientid_override is an expression which is free to set tns as a prefix, suffix, or any other part,
+%% but as a best-effort log metadata optimization, we only check for prefix
+is_clientid_namespaced(ClientId, Tns) when is_binary(Tns) andalso Tns =/= <<>> ->
+    case ClientId of
+        <<Tns:(size(Tns))/binary, _/binary>> ->
+            true;
+        _ ->
+            false
+    end;
+is_clientid_namespaced(_ClientId, _Tns) ->
+    false.
 
 
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------
 %% Check banned
 %% Check banned
@@ -3231,6 +3262,10 @@ subscribe_authz_result_attrs(CheckResult) ->
 %% For CT tests
 %% For CT tests
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------
 
 
+-ifdef(TEST).
+dummy() -> #channel{}.
+
 set_field(Name, Value, Channel) ->
 set_field(Name, Value, Channel) ->
     Pos = emqx_utils:index_of(Name, record_info(fields, channel)),
     Pos = emqx_utils:index_of(Name, record_info(fields, channel)),
     setelement(Pos + 1, Channel, Value).
     setelement(Pos + 1, Channel, Value).
+-endif.

+ 37 - 25
apps/emqx/src/emqx_config.erl

@@ -97,7 +97,7 @@
 -export([upgrade_raw_conf/2]).
 -export([upgrade_raw_conf/2]).
 
 
 -ifdef(TEST).
 -ifdef(TEST).
--export([erase_all/0, backup_and_write/2]).
+-export([erase_all/0, backup_and_write/2, cluster_hocon_file/0, base_hocon_file/0]).
 -endif.
 -endif.
 
 
 -include("logger.hrl").
 -include("logger.hrl").
@@ -440,12 +440,13 @@ do_parse_hocon(true, Conf, IncDirs) ->
 do_parse_hocon(false, Conf, IncDirs) ->
 do_parse_hocon(false, Conf, IncDirs) ->
     Opts = #{format => map, include_dirs => IncDirs},
     Opts = #{format => map, include_dirs => IncDirs},
     case is_binary(Conf) of
     case is_binary(Conf) of
-        %% only use in test
         true ->
         true ->
+            %% only used in test
             hocon:binary(Conf, Opts);
             hocon:binary(Conf, Opts);
         false ->
         false ->
+            BaseHocon = base_hocon_file(),
             ClusterFile = cluster_hocon_file(),
             ClusterFile = cluster_hocon_file(),
-            hocon:files([ClusterFile | Conf], Opts)
+            hocon:files([BaseHocon, ClusterFile | Conf], Opts)
     end.
     end.
 
 
 include_dirs() ->
 include_dirs() ->
@@ -541,12 +542,12 @@ ensure_file_deleted(F) ->
 
 
 -spec read_override_conf(map()) -> raw_config().
 -spec read_override_conf(map()) -> raw_config().
 read_override_conf(#{} = Opts) ->
 read_override_conf(#{} = Opts) ->
-    File =
+    Files =
         case has_deprecated_file() of
         case has_deprecated_file() of
-            true -> deprecated_conf_file(Opts);
-            false -> cluster_hocon_file()
+            true -> [deprecated_conf_file(Opts)];
+            false -> [base_hocon_file(), cluster_hocon_file()]
         end,
         end,
-    load_hocon_file(File, map).
+    load_hocon_files(Files, map).
 
 
 %% @doc Return `true' if this node is upgraded from older version which used cluster-override.conf for
 %% @doc Return `true' if this node is upgraded from older version which used cluster-override.conf for
 %% cluster-wide config persistence.
 %% cluster-wide config persistence.
@@ -564,6 +565,9 @@ deprecated_conf_file(Opts) when is_map(Opts) ->
 deprecated_conf_file(Which) when is_atom(Which) ->
 deprecated_conf_file(Which) when is_atom(Which) ->
     application:get_env(emqx, Which, undefined).
     application:get_env(emqx, Which, undefined).
 
 
+base_hocon_file() ->
+    emqx:etc_file("base.hocon").
+
 %% The newer version cluster-wide config persistence file.
 %% The newer version cluster-wide config persistence file.
 cluster_hocon_file() ->
 cluster_hocon_file() ->
     application:get_env(emqx, cluster_hocon_file, undefined).
     application:get_env(emqx, cluster_hocon_file, undefined).
@@ -633,16 +637,29 @@ save_to_override_conf(true = _HasDeprecatedFile, RawConf, Opts) ->
         undefined ->
         undefined ->
             ok;
             ok;
         FileName ->
         FileName ->
-            backup_and_write(FileName, hocon_pp:do(RawConf, Opts))
+            backup_and_write(FileName, generate_hocon_content(RawConf, Opts))
     end;
     end;
 save_to_override_conf(false = _HasDeprecatedFile, RawConf, Opts) ->
 save_to_override_conf(false = _HasDeprecatedFile, RawConf, Opts) ->
     case cluster_hocon_file() of
     case cluster_hocon_file() of
         undefined ->
         undefined ->
             ok;
             ok;
         FileName ->
         FileName ->
-            backup_and_write(FileName, hocon_pp:do(RawConf, Opts))
+            backup_and_write(FileName, generate_hocon_content(RawConf, Opts))
     end.
     end.
 
 
+generate_hocon_content(RawConf, Opts) ->
+    [
+        cluster_dot_hocon_header(),
+        hocon_pp:do(RawConf, Opts)
+    ].
+
+cluster_dot_hocon_header() ->
+    [
+        "# This file is generated. Do not edit.\n",
+        "# The configs are results of online config changes from UI/API/CLI.\n",
+        "# To persist configs in this file, copy the content to etc/base.hocon.\n"
+    ].
+
 %% @private This is the same human-readable timestamp format as
 %% @private This is the same human-readable timestamp format as
 %% hocon-cli generated app.<time>.config file name.
 %% hocon-cli generated app.<time>.config file name.
 now_time() ->
 now_time() ->
@@ -730,22 +747,17 @@ remove_handlers() ->
     emqx_sys_mon:remove_handler(),
     emqx_sys_mon:remove_handler(),
     ok.
     ok.
 
 
-load_hocon_file(FileName, LoadType) ->
-    case filelib:is_regular(FileName) of
-        true ->
-            Opts = #{include_dirs => include_dirs(), format => LoadType},
-            case hocon:load(FileName, Opts) of
-                {ok, Raw0} ->
-                    Raw0;
-                {error, Reason} ->
-                    throw(#{
-                        msg => failed_to_load_conf,
-                        reason => Reason,
-                        file => FileName
-                    })
-            end;
-        false ->
-            #{}
+load_hocon_files(FileNames, LoadType) ->
+    Opts = #{include_dirs => include_dirs(), format => LoadType},
+    case hocon:files(FileNames, Opts) of
+        {ok, Raw0} ->
+            Raw0;
+        {error, Reason} ->
+            throw(#{
+                msg => failed_to_load_conf,
+                reason => Reason,
+                files => FileNames
+            })
     end.
     end.
 
 
 do_get_raw(Path) ->
 do_get_raw(Path) ->

+ 6 - 2
apps/emqx/src/emqx_connection.erl

@@ -126,7 +126,10 @@
     limiter_timer :: undefined | reference(),
     limiter_timer :: undefined | reference(),
 
 
     %% QUIC conn shared state
     %% QUIC conn shared state
-    quic_conn_ss :: option(map())
+    quic_conn_ss :: option(map()),
+
+    %% Extra field for future hot-upgrade support
+    extra = []
 }).
 }).
 
 
 -record(retry, {
 -record(retry, {
@@ -366,7 +369,8 @@ init_state(
         limiter_buffer = queue:new(),
         limiter_buffer = queue:new(),
         limiter_timer = undefined,
         limiter_timer = undefined,
         %% for quic streams to inherit
         %% for quic streams to inherit
-        quic_conn_ss = maps:get(conn_shared_state, Opts, undefined)
+        quic_conn_ss = maps:get(conn_shared_state, Opts, undefined),
+        extra = []
     }.
     }.
 
 
 run_loop(
 run_loop(

+ 6 - 2
apps/emqx/src/emqx_logger.erl

@@ -48,6 +48,7 @@
     set_primary_log_level/1,
     set_primary_log_level/1,
     set_log_handler_level/2,
     set_log_handler_level/2,
     set_log_level/1,
     set_log_level/1,
+    set_level/1,
     set_all_log_handlers_level/1
     set_all_log_handlers_level/1
 ]).
 ]).
 
 
@@ -244,13 +245,16 @@ set_log_handler_level(HandlerId, Level) ->
     end.
     end.
 
 
 %% @doc Set both the primary and all handlers level in one command
 %% @doc Set both the primary and all handlers level in one command
--spec set_log_level(logger:level()) -> ok | {error, term()}.
-set_log_level(Level) ->
+-spec set_level(logger:level()) -> ok | {error, term()}.
+set_level(Level) ->
     case set_primary_log_level(Level) of
     case set_primary_log_level(Level) of
         ok -> set_all_log_handlers_level(Level);
         ok -> set_all_log_handlers_level(Level);
         {error, Error} -> {error, {primary_logger_level, Error}}
         {error, Error} -> {error, {primary_logger_level, Error}}
     end.
     end.
 
 
+set_log_level(Level) ->
+    set_level(Level).
+
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------
 %% Internal Functions
 %% Internal Functions
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------

+ 3 - 1
apps/emqx/src/emqx_logger_textfmt.erl

@@ -122,6 +122,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             undefined -> maps:get(username, ReportRaw, undefined);
             undefined -> maps:get(username, ReportRaw, undefined);
             Username0 -> Username0
             Username0 -> Username0
         end,
         end,
+    Tns = maps:get(tns, Meta, undefined),
     ClientId = maps:get(clientid, Meta, undefined),
     ClientId = maps:get(clientid, Meta, undefined),
     Peer = maps:get(peername, Meta, undefined),
     Peer = maps:get(peername, Meta, undefined),
     Msg = maps:get(msg, ReportRaw, undefined),
     Msg = maps:get(msg, ReportRaw, undefined),
@@ -135,7 +136,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             ({_, undefined}, Acc) -> Acc;
             ({_, undefined}, Acc) -> Acc;
             (Item, Acc) -> [Item | Acc]
             (Item, Acc) -> [Item | Acc]
         end,
         end,
-        maps:to_list(maps:without([topic, msg, clientid, username, tag], ReportRaw)),
+        maps:to_list(maps:without([topic, msg, tns, clientid, username, tag], ReportRaw)),
         [
         [
             {topic, try_format_unicode(Topic)},
             {topic, try_format_unicode(Topic)},
             {username, try_format_unicode(Username)},
             {username, try_format_unicode(Username)},
@@ -143,6 +144,7 @@ enrich_report(ReportRaw0, Meta, Config) ->
             {mfa, try_format_unicode(MFA)},
             {mfa, try_format_unicode(MFA)},
             {msg, Msg},
             {msg, Msg},
             {clientid, try_format_unicode(ClientId)},
             {clientid, try_format_unicode(ClientId)},
+            {tns, try_format_unicode(Tns)},
             {tag, Tag}
             {tag, Tag}
         ]
         ]
     ).
     ).

+ 3 - 3
apps/emqx/src/emqx_schema.erl

@@ -3422,7 +3422,8 @@ naive_env_interpolation(Other) ->
     Other.
     Other.
 
 
 split_path(Path) ->
 split_path(Path) ->
-    split_path(Path, []).
+    {Name0, Tail} = split_path(Path, []),
+    {string:trim(Name0, both, "{}"), Tail}.
 
 
 split_path([], Acc) ->
 split_path([], Acc) ->
     {lists:reverse(Acc), []};
     {lists:reverse(Acc), []};
@@ -3431,8 +3432,7 @@ split_path([Char | Rest], Acc) when Char =:= $/ orelse Char =:= $\\ ->
 split_path([Char | Rest], Acc) ->
 split_path([Char | Rest], Acc) ->
     split_path(Rest, [Char | Acc]).
     split_path(Rest, [Char | Acc]).
 
 
-resolve_env(Name0) ->
-    Name = string:trim(Name0, both, "{}"),
+resolve_env(Name) ->
     Value = os:getenv(Name),
     Value = os:getenv(Name),
     case Value =/= false andalso Value =/= "" of
     case Value =/= false andalso Value =/= "" of
         true ->
         true ->

+ 2 - 2
apps/emqx/src/emqx_schema_secret.erl

@@ -71,8 +71,8 @@ convert_secret(Secret, #{}) ->
     end.
     end.
 
 
 -spec wrap(source()) -> emqx_secret:t(t()).
 -spec wrap(source()) -> emqx_secret:t(t()).
-wrap(<<"file://", Filename/binary>>) ->
-    emqx_secret:wrap_load({file, Filename});
+wrap(<<"file://", _Filename/binary>> = Secret) ->
+    emqx_secret:wrap_load({file, Secret});
 wrap(Secret) ->
 wrap(Secret) ->
     emqx_secret:wrap(Secret).
     emqx_secret:wrap(Secret).
 
 

+ 7 - 4
apps/emqx/src/emqx_secret_loader.erl

@@ -22,14 +22,17 @@
 
 
 -export_type([source/0]).
 -export_type([source/0]).
 
 
--type source() :: {file, file:filename_all()}.
+-type source() :: {file, string() | binary()}.
 
 
 -spec load(source()) -> binary() | no_return().
 -spec load(source()) -> binary() | no_return().
-load({file, Filename}) ->
-    file(Filename).
+load({file, <<"file://", Path/binary>>}) ->
+    file(Path);
+load({file, "file://" ++ Path}) ->
+    file(Path).
 
 
 -spec file(file:filename_all()) -> binary() | no_return().
 -spec file(file:filename_all()) -> binary() | no_return().
-file(Filename) ->
+file(Filename0) ->
+    Filename = emqx_schema:naive_env_interpolation(Filename0),
     case file:read_file(Filename) of
     case file:read_file(Filename) of
         {ok, Secret} ->
         {ok, Secret} ->
             string:trim(Secret, trailing);
             string:trim(Secret, trailing);

+ 7 - 2
apps/emqx/src/emqx_trace/emqx_trace_formatter.erl

@@ -33,12 +33,17 @@ format(
     #{level := debug, meta := Meta = #{trace_tag := Tag}, msg := Msg} =
     #{level := debug, meta := Meta = #{trace_tag := Tag}, msg := Msg} =
         emqx_logger_textfmt:evaluate_lazy_values(Entry),
         emqx_logger_textfmt:evaluate_lazy_values(Entry),
     Time = emqx_utils_calendar:now_to_rfc3339(microsecond),
     Time = emqx_utils_calendar:now_to_rfc3339(microsecond),
+    Tns =
+        case to_iolist(maps:get(tns, Meta, "")) of
+            "" -> "";
+            X -> [" tns: ", X]
+        end,
     ClientId = to_iolist(maps:get(clientid, Meta, "")),
     ClientId = to_iolist(maps:get(clientid, Meta, "")),
     Peername = maps:get(peername, Meta, ""),
     Peername = maps:get(peername, Meta, ""),
     MetaBin = format_meta(Meta, PEncode),
     MetaBin = format_meta(Meta, PEncode),
     Msg1 = to_iolist(Msg),
     Msg1 = to_iolist(Msg),
     Tag1 = to_iolist(Tag),
     Tag1 = to_iolist(Tag),
-    [Time, " [", Tag1, "] ", ClientId, "@", Peername, " msg: ", Msg1, ", ", MetaBin, "\n"];
+    [Time, " [", Tag1, "] ", ClientId, "@", Peername, Tns, " msg: ", Msg1, ", ", MetaBin, "\n"];
 format(Event, Config) ->
 format(Event, Config) ->
     emqx_logger_textfmt:format(Event, Config).
     emqx_logger_textfmt:format(Event, Config).
 
 
@@ -79,7 +84,7 @@ format_meta_data(Meta, _Encode) ->
     Meta.
     Meta.
 
 
 format_meta(Meta0, Encode) ->
 format_meta(Meta0, Encode) ->
-    Meta1 = maps:without([msg, clientid, peername, trace_tag], Meta0),
+    Meta1 = maps:without([msg, tns, clientid, peername, trace_tag], Meta0),
     Meta2 = format_meta_data(Meta1, Encode),
     Meta2 = format_meta_data(Meta1, Encode),
     kvs_to_iolist(lists:sort(fun compare_meta_kvs/2, maps:to_list(Meta2))).
     kvs_to_iolist(lists:sort(fun compare_meta_kvs/2, maps:to_list(Meta2))).
 
 

+ 2 - 0
apps/emqx/src/emqx_types.erl

@@ -49,6 +49,7 @@
     sockstate/0,
     sockstate/0,
     conninfo/0,
     conninfo/0,
     clientinfo/0,
     clientinfo/0,
+    tns/0,
     clientid/0,
     clientid/0,
     username/0,
     username/0,
     password/0,
     password/0,
@@ -195,6 +196,7 @@
     atom() => term()
     atom() => term()
 }.
 }.
 -type client_attrs() :: #{binary() => binary()}.
 -type client_attrs() :: #{binary() => binary()}.
+-type tns() :: binary().
 -type clientid() :: binary() | atom().
 -type clientid() :: binary() | atom().
 -type username() :: option(binary()).
 -type username() :: option(binary()).
 -type password() :: option(binary()).
 -type password() :: option(binary()).

+ 6 - 2
apps/emqx/src/emqx_ws_connection.erl

@@ -97,7 +97,10 @@
     limiter_buffer :: queue:queue(cache()),
     limiter_buffer :: queue:queue(cache()),
 
 
     %% limiter timers
     %% limiter timers
-    limiter_timer :: undefined | reference()
+    limiter_timer :: undefined | reference(),
+
+    %% Extra field for future hot-upgrade support
+    extra = []
 }).
 }).
 
 
 -record(retry, {
 -record(retry, {
@@ -330,7 +333,8 @@ websocket_init([Req, Opts]) ->
                     zone = Zone,
                     zone = Zone,
                     listener = {Type, Listener},
                     listener = {Type, Listener},
                     limiter_timer = undefined,
                     limiter_timer = undefined,
-                    limiter_buffer = queue:new()
+                    limiter_buffer = queue:new(),
+                    extra = []
                 },
                 },
                 hibernate};
                 hibernate};
         {denny, Reason} ->
         {denny, Reason} ->

+ 110 - 0
apps/emqx/test/emqx_channel_tests.erl

@@ -0,0 +1,110 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_channel_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+set_tns_in_log_meta_test_() ->
+    PdKey = '$logger_metadata$',
+    Original = get(PdKey),
+    Set = fun(Cinfo) ->
+        Ch = emqx_channel:dummy(),
+        Ch1 = emqx_channel:set_field(clientinfo, Cinfo, Ch),
+        emqx_channel:set_log_meta(dummy, Ch1)
+    end,
+    Restore = fun() -> put(PdKey, Original) end,
+    NoTns = #{
+        clientid => <<"id1">>,
+        client_attrs => #{<<"not_tns">> => <<"tns1">>},
+        username => <<"user1">>
+    },
+    NoTnsFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"id1">>,
+                username := <<"user1">>
+            },
+            M
+        ),
+        ?assertNot(maps:is_key(tns, M))
+    end,
+    Prefixed = #{
+        clientid => <<"tns1-id1">>,
+        client_attrs => #{<<"tns">> => <<"tns1">>},
+        username => <<"user2">>
+    },
+    PrefixedFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"tns1-id1">>,
+                username := <<"user2">>
+            },
+            M
+        ),
+        ?assertNot(maps:is_key(tns, M))
+    end,
+
+    Username = #{
+        clientid => <<"id1">>,
+        client_attrs => #{<<"tns">> => <<"user3">>},
+        username => <<"user3">>
+    },
+    UsernameFn =
+        fun(M) ->
+            ?assertMatch(
+                #{
+                    clientid := <<"id1">>,
+                    username := <<"user3">>
+                },
+                M
+            ),
+            ?assertNot(maps:is_key(tns, M))
+        end,
+    TnsAdded = #{
+        clientid => <<"id4">>,
+        client_attrs => #{<<"tns">> => <<"tns1">>},
+        username => <<"user4">>
+    },
+    TnsAddedFn = fun(M) ->
+        ?assertMatch(
+            #{
+                clientid := <<"id4">>,
+                username := <<"user4">>,
+                tns := <<"tns1">>
+            },
+            M
+        )
+    end,
+    Run = fun(Cinfo, CheckFn) ->
+        Set(Cinfo),
+        try
+            CheckFn(get(PdKey))
+        after
+            Restore()
+        end
+    end,
+    MakeTestFn = fun(Cinfo, CheckFn) ->
+        fun() ->
+            Run(Cinfo, CheckFn)
+        end
+    end,
+    [
+        {"tns-added", MakeTestFn(TnsAdded, TnsAddedFn)},
+        {"username as tns", MakeTestFn(Username, UsernameFn)},
+        {"tns prefixed clientid", MakeTestFn(Prefixed, PrefixedFn)},
+        {"no tns", MakeTestFn(NoTns, NoTnsFn)}
+    ].

+ 1 - 1
apps/emqx/test/emqx_common_test_helpers.erl

@@ -547,7 +547,7 @@ force_set_config_file_paths(emqx, Paths) ->
     %% we need init cluster conf, so we can save the cluster conf to the file
     %% we need init cluster conf, so we can save the cluster conf to the file
     application:set_env(emqx, local_override_conf_file, "local_override.conf"),
     application:set_env(emqx, local_override_conf_file, "local_override.conf"),
     application:set_env(emqx, cluster_override_conf_file, "cluster_override.conf"),
     application:set_env(emqx, cluster_override_conf_file, "cluster_override.conf"),
-    application:set_env(emqx, cluster_conf_file, "cluster.hocon"),
+    application:set_env(emqx, cluster_hocon_file, "cluster.hocon"),
     application:set_env(emqx, config_files, Paths);
     application:set_env(emqx, config_files, Paths);
 force_set_config_file_paths(_, _) ->
 force_set_config_file_paths(_, _) ->
     ok.
     ok.

+ 27 - 1
apps/emqx/test/emqx_config_SUITE.erl

@@ -92,7 +92,7 @@ t_init_load(C) when is_list(C) ->
     emqx_config:erase_all(),
     emqx_config:erase_all(),
     {ok, DeprecatedFile} = application:get_env(emqx, cluster_override_conf_file),
     {ok, DeprecatedFile} = application:get_env(emqx, cluster_override_conf_file),
     ?assertEqual(false, filelib:is_regular(DeprecatedFile), DeprecatedFile),
     ?assertEqual(false, filelib:is_regular(DeprecatedFile), DeprecatedFile),
-    %% Don't has deprecated file
+    %% Don't have deprecated file
     ok = emqx_config:init_load(emqx_schema, [ConfFile]),
     ok = emqx_config:init_load(emqx_schema, [ConfFile]),
     ?assertEqual(ExpectRootNames, lists:sort(emqx_config:get_root_names())),
     ?assertEqual(ExpectRootNames, lists:sort(emqx_config:get_root_names())),
     ?assertMatch({ok, #{raw_config := 256}}, emqx:update_config([mqtt, max_topic_levels], 256)),
     ?assertMatch({ok, #{raw_config := 256}}, emqx:update_config([mqtt, max_topic_levels], 256)),
@@ -104,6 +104,32 @@ t_init_load(C) when is_list(C) ->
     ?assertMatch({ok, #{raw_config := 128}}, emqx:update_config([mqtt, max_topic_levels], 128)),
     ?assertMatch({ok, #{raw_config := 128}}, emqx:update_config([mqtt, max_topic_levels], 128)),
     ok = file:delete(DeprecatedFile).
     ok = file:delete(DeprecatedFile).
 
 
+t_init_load_with_base_hocon(C) when is_list(C) ->
+    BaseHocon = emqx_config:base_hocon_file(),
+    ClusterHocon = emqx_config:cluster_hocon_file(),
+    ConfFile = "./test_emqx_2.conf",
+    ok = filelib:ensure_dir(BaseHocon),
+    ok = file:write_file(
+        BaseHocon,
+        "mqtt.max_topic_levels = 123\n"
+        "mqtt.max_clientid_len=12\n"
+        "mqtt.max_inflight=12\n"
+    ),
+    ok = file:write_file(
+        ClusterHocon,
+        "mqtt.max_clientid_len = 123\n"
+        "mqtt.max_inflight=22\n"
+    ),
+    ok = file:write_file(ConfFile, "mqtt.max_inflight = 123\n"),
+    ok = emqx_config:init_load(emqx_schema, [ConfFile]),
+    ?assertEqual(123, emqx:get_config([mqtt, max_topic_levels])),
+    ?assertEqual(123, emqx:get_config([mqtt, max_clientid_len])),
+    ?assertEqual(123, emqx:get_config([mqtt, max_inflight])),
+    emqx_config:erase_all(),
+    ok = file:delete(BaseHocon),
+    ok = file:delete(ClusterHocon),
+    ok.
+
 t_unknown_root_keys(C) when is_list(C) ->
 t_unknown_root_keys(C) when is_list(C) ->
     ?check_trace(
     ?check_trace(
         #{timetrap => 1000},
         #{timetrap => 1000},

+ 3 - 3
apps/emqx/test/emqx_release_tests.erl

@@ -59,15 +59,15 @@ vsn_compre_test_() ->
         end}
         end}
     ].
     ].
 
 
-emqx_flavor_test(_Config) ->
+emqx_flavor_test() ->
     case emqx_release:edition() of
     case emqx_release:edition() of
         ce ->
         ce ->
             ok;
             ok;
         ee ->
         ee ->
             ?assertEqual(official, emqx_release:get_flavor()),
             ?assertEqual(official, emqx_release:get_flavor()),
-            ?assertEqual("EMQX Enterprise", emqx:get_description()),
+            ?assertEqual("EMQX Enterprise", emqx_app:get_description()),
             emqx_release:set_flavor(marketplace),
             emqx_release:set_flavor(marketplace),
             ?assertEqual(marketplace, emqx_release:get_flavor()),
             ?assertEqual(marketplace, emqx_release:get_flavor()),
-            ?assertEqual("EMQX Enterprise(marketplace)", emqx:get_description()),
+            ?assertEqual("EMQX Enterprise(marketplace)", emqx_app:get_description()),
             emqx_release:set_flavor(official)
             emqx_release:set_flavor(official)
     end.
     end.

+ 29 - 6
apps/emqx/test/emqx_secret_tests.erl

@@ -39,21 +39,41 @@ wrap_unwrap_load_test_() ->
         fun(Filename) ->
         fun(Filename) ->
             ?_assertEqual(
             ?_assertEqual(
                 Secret,
                 Secret,
-                emqx_secret:unwrap(emqx_secret:wrap_load({file, Filename}))
+                emqx_secret:unwrap(emqx_secret:wrap_load(file_ref(Filename)))
             )
             )
         end
         end
     }.
     }.
 
 
+wrap_unwrap_load_path_env_interpolate_test_() ->
+    Secret = <<"111">>,
+    {
+        setup,
+        fun() -> write_temp_file(Secret) end,
+        fun(Filename) -> file:delete(Filename) end,
+        fun(Filename) ->
+            fun() ->
+                os:putenv("SECRETFILEPATH", Filename),
+                File = "file://${SECRETFILEPATH}",
+                try
+                    ?assertEqual(
+                        Secret,
+                        emqx_secret:unwrap(emqx_secret:wrap_load({file, File}))
+                    )
+                after
+                    os:unsetenv("SECRETFILEPATH")
+                end
+            end
+        end
+    }.
+
 wrap_load_term_test() ->
 wrap_load_term_test() ->
-    ?assertEqual(
-        {file, "no/such/file/i/swear"},
-        emqx_secret:term(emqx_secret:wrap_load({file, "no/such/file/i/swear"}))
-    ).
+    Ref = file_ref("no/such/file/i/swear"),
+    ?assertEqual(Ref, emqx_secret:term(emqx_secret:wrap_load(Ref))).
 
 
 wrap_unwrap_missing_file_test() ->
 wrap_unwrap_missing_file_test() ->
     ?assertThrow(
     ?assertThrow(
         #{msg := failed_to_read_secret_file, reason := "No such file or directory"},
         #{msg := failed_to_read_secret_file, reason := "No such file or directory"},
-        emqx_secret:unwrap(emqx_secret:wrap_load({file, "no/such/file/i/swear"}))
+        emqx_secret:unwrap(emqx_secret:wrap_load(file_ref("no/such/file/i/swear")))
     ).
     ).
 
 
 wrap_term_test() ->
 wrap_term_test() ->
@@ -74,3 +94,6 @@ write_temp_file(Bytes) ->
     Filename = filename:join("/tmp", ?MODULE_STRING ++ integer_to_list(-Ts)),
     Filename = filename:join("/tmp", ?MODULE_STRING ++ integer_to_list(-Ts)),
     ok = file:write_file(Filename, Bytes),
     ok = file:write_file(Filename, Bytes),
     Filename.
     Filename.
+
+file_ref(Path) ->
+    {file, "file://" ++ Path}.

+ 52 - 0
apps/emqx/test/emqx_trace_formatter_tests.erl

@@ -0,0 +1,52 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_trace_formatter_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+format_no_tns_in_meta_test() ->
+    Meta = #{
+        clientid => <<"c">>,
+        trace_tag => tag
+    },
+    Event = #{
+        level => debug,
+        meta => Meta,
+        msg => <<"test_msg">>
+    },
+    Config = #{payload_encode => hidden},
+    Formatted = format(Event, Config),
+    ?assertMatch(nomatch, re:run(Formatted, "tns:")),
+    ok.
+
+format_tns_in_meta_test() ->
+    Meta = #{
+        tns => <<"a">>,
+        clientid => <<"c">>,
+        trace_tag => tag
+    },
+    Event = #{
+        level => debug,
+        meta => Meta,
+        msg => <<"test_msg">>
+    },
+    Config = #{payload_encode => hidden},
+    Formatted = format(Event, Config),
+    ?assertMatch({match, _}, re:run(Formatted, "\stns:\sa\s")),
+    ok.
+
+format(Event, Config) ->
+    unicode:characters_to_binary(emqx_trace_formatter:format(Event, Config)).

+ 1 - 1
apps/emqx_auto_subscribe/src/emqx_auto_subscribe.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 %% -*- mode: erlang -*-
 {application, emqx_auto_subscribe, [
 {application, emqx_auto_subscribe, [
     {description, "Auto subscribe Application"},
     {description, "Auto subscribe Application"},
-    {vsn, "0.1.6"},
+    {vsn, "0.1.7"},
     {registered, []},
     {registered, []},
     {mod, {emqx_auto_subscribe_app, []}},
     {mod, {emqx_auto_subscribe_app, []}},
     {applications, [
     {applications, [

+ 19 - 18
apps/emqx_auto_subscribe/src/emqx_auto_subscribe.erl

@@ -25,6 +25,7 @@
 -define(HOOK_POINT, 'client.connected').
 -define(HOOK_POINT, 'client.connected').
 
 
 -define(MAX_AUTO_SUBSCRIBE, 20).
 -define(MAX_AUTO_SUBSCRIBE, 20).
+-define(ROOT_KEY, auto_subscribe).
 
 
 -export([load/0, unload/0]).
 -export([load/0, unload/0]).
 
 
@@ -47,27 +48,27 @@
 ]).
 ]).
 
 
 load() ->
 load() ->
-    ok = emqx_conf:add_handler([auto_subscribe, topics], ?MODULE),
+    ok = emqx_conf:add_handler([?ROOT_KEY], ?MODULE),
     update_hook().
     update_hook().
 
 
 unload() ->
 unload() ->
-    emqx_conf:remove_handler([auto_subscribe, topics]).
+    emqx_conf:remove_handler([?ROOT_KEY]).
 
 
 max_limit() ->
 max_limit() ->
     ?MAX_AUTO_SUBSCRIBE.
     ?MAX_AUTO_SUBSCRIBE.
 
 
 list() ->
 list() ->
-    format(emqx_conf:get([auto_subscribe, topics], [])).
+    format(emqx_conf:get([?ROOT_KEY, topics], [])).
 
 
 update(Topics) when length(Topics) =< ?MAX_AUTO_SUBSCRIBE ->
 update(Topics) when length(Topics) =< ?MAX_AUTO_SUBSCRIBE ->
     case
     case
         emqx_conf:update(
         emqx_conf:update(
-            [auto_subscribe, topics],
-            Topics,
+            [?ROOT_KEY],
+            #{<<"topics">> => Topics},
             #{rawconf_with_defaults => true, override_to => cluster}
             #{rawconf_with_defaults => true, override_to => cluster}
         )
         )
     of
     of
-        {ok, #{raw_config := NewTopics}} ->
+        {ok, #{raw_config := #{<<"topics">> := NewTopics}}} ->
             {ok, NewTopics};
             {ok, NewTopics};
         {error, Reason} ->
         {error, Reason} ->
             {error, Reason}
             {error, Reason}
@@ -75,9 +76,8 @@ update(Topics) when length(Topics) =< ?MAX_AUTO_SUBSCRIBE ->
 update(_Topics) ->
 update(_Topics) ->
     {error, quota_exceeded}.
     {error, quota_exceeded}.
 
 
-post_config_update(_KeyPath, _Req, NewTopics, _OldConf, _AppEnvs) ->
-    Config = emqx_conf:get([auto_subscribe], #{}),
-    update_hook(Config#{topics => NewTopics}).
+post_config_update([?ROOT_KEY], _Req, NewConf, _OldConf, _AppEnvs) ->
+    update_hook(NewConf).
 
 
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 %% hook
 %% hook
@@ -100,25 +100,26 @@ on_client_connected(_, _, _) ->
 
 
 -spec get_basic_usage_info() -> #{auto_subscribe_count => non_neg_integer()}.
 -spec get_basic_usage_info() -> #{auto_subscribe_count => non_neg_integer()}.
 get_basic_usage_info() ->
 get_basic_usage_info() ->
-    AutoSubscribe = emqx_conf:get([auto_subscribe, topics], []),
+    AutoSubscribe = emqx_conf:get([?ROOT_KEY, topics], []),
     #{auto_subscribe_count => length(AutoSubscribe)}.
     #{auto_subscribe_count => length(AutoSubscribe)}.
 
 
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 %% Data backup
 %% Data backup
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 
 
-import_config(#{<<"auto_subscribe">> := #{<<"topics">> := Topics}}) ->
-    ConfPath = [auto_subscribe, topics],
-    OldTopics = emqx:get_raw_config(ConfPath, []),
+import_config(#{<<"auto_subscribe">> := #{<<"topics">> := Topics} = AutoSubscribe}) ->
+    ConfPath = [?ROOT_KEY],
+    OldTopics = emqx:get_raw_config(ConfPath ++ [topics], []),
     KeyFun = fun(#{<<"topic">> := T}) -> T end,
     KeyFun = fun(#{<<"topic">> := T}) -> T end,
     MergedTopics = emqx_utils:merge_lists(OldTopics, Topics, KeyFun),
     MergedTopics = emqx_utils:merge_lists(OldTopics, Topics, KeyFun),
-    case emqx_conf:update(ConfPath, MergedTopics, #{override_to => cluster}) of
-        {ok, #{raw_config := NewTopics}} ->
+    Conf = AutoSubscribe#{<<"topics">> => MergedTopics},
+    case emqx_conf:update(ConfPath, Conf, #{override_to => cluster}) of
+        {ok, #{raw_config := #{<<"topics">> := NewTopics}}} ->
             Changed = maps:get(changed, emqx_utils:diff_lists(NewTopics, OldTopics, KeyFun)),
             Changed = maps:get(changed, emqx_utils:diff_lists(NewTopics, OldTopics, KeyFun)),
             Changed1 = [ConfPath ++ [T] || {#{<<"topic">> := T}, _} <- Changed],
             Changed1 = [ConfPath ++ [T] || {#{<<"topic">> := T}, _} <- Changed],
-            {ok, #{root_key => auto_subscribe, changed => Changed1}};
+            {ok, #{root_key => ?ROOT_KEY, changed => Changed1}};
         Error ->
         Error ->
-            {error, #{root_key => auto_subscribe, reason => Error}}
+            {error, #{root_key => ?ROOT_KEY, reason => Error}}
     end;
     end;
 import_config(_RawConf) ->
 import_config(_RawConf) ->
     {ok, #{root_key => auto_subscribe, changed => []}}.
     {ok, #{root_key => auto_subscribe, changed => []}}.
@@ -139,7 +140,7 @@ format(Rule = #{topic := Topic}) when is_map(Rule) ->
     }.
     }.
 
 
 update_hook() ->
 update_hook() ->
-    update_hook(emqx_conf:get([auto_subscribe], #{})).
+    update_hook(emqx_conf:get([?ROOT_KEY], #{topics => []})).
 
 
 update_hook(Config) ->
 update_hook(Config) ->
     {TopicHandler, Options} = emqx_auto_subscribe_handler:init(Config),
     {TopicHandler, Options} = emqx_auto_subscribe_handler:init(Config),

+ 20 - 0
apps/emqx_auto_subscribe/test/emqx_auto_subscribe_SUITE.erl

@@ -20,6 +20,7 @@
 
 
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("common_test/include/ct.hrl").
 -include_lib("common_test/include/ct.hrl").
+-import(emqx_config_SUITE, [prepare_conf_file/3]).
 
 
 -define(TOPIC_C, <<"/c/${clientid}">>).
 -define(TOPIC_C, <<"/c/${clientid}">>).
 -define(TOPIC_U, <<"/u/${username}">>).
 -define(TOPIC_U, <<"/u/${username}">>).
@@ -100,12 +101,18 @@ init_per_suite(Config) ->
 init_per_testcase(t_get_basic_usage_info, Config) ->
 init_per_testcase(t_get_basic_usage_info, Config) ->
     {ok, _} = emqx_auto_subscribe:update([]),
     {ok, _} = emqx_auto_subscribe:update([]),
     Config;
     Config;
+init_per_testcase(t_auto_subscribe_reload_from_file, Config) ->
+    {ok, _} = emqx_auto_subscribe:update([]),
+    Config;
 init_per_testcase(_TestCase, Config) ->
 init_per_testcase(_TestCase, Config) ->
     Config.
     Config.
 
 
 end_per_testcase(t_get_basic_usage_info, _Config) ->
 end_per_testcase(t_get_basic_usage_info, _Config) ->
     {ok, _} = emqx_auto_subscribe:update([]),
     {ok, _} = emqx_auto_subscribe:update([]),
     ok;
     ok;
+end_per_testcase(t_auto_subscribe_reload_from_file, _Config) ->
+    {ok, _} = emqx_auto_subscribe:update([]),
+    ok;
 end_per_testcase(_TestCase, _Config) ->
 end_per_testcase(_TestCase, _Config) ->
     ok.
     ok.
 
 
@@ -131,6 +138,19 @@ t_auto_subscribe(_) ->
     ?assertEqual(check_subs(length(?TOPICS)), ok),
     ?assertEqual(check_subs(length(?TOPICS)), ok),
     emqtt:disconnect(Client),
     emqtt:disconnect(Client),
     ok.
     ok.
+t_auto_subscribe_reload_from_file(Config) ->
+    ConfBin = hocon_pp:do(
+        #{<<"auto_subscribe">> => #{<<"topics">> => [#{<<"topic">> => Topic} || Topic <- ?TOPICS]}},
+        #{}
+    ),
+    ConfFile = prepare_conf_file(?FUNCTION_NAME, ConfBin, Config),
+    ok = emqx_conf_cli:conf(["load", "--replace", ConfFile]),
+    {ok, Client} = emqtt:start_link(#{username => ?CLIENT_USERNAME, clientid => ?CLIENT_ID}),
+    {ok, _} = emqtt:connect(Client),
+    timer:sleep(200),
+    ?assertEqual(check_subs(length(?TOPICS)), ok),
+    emqtt:disconnect(Client),
+    ok.
 
 
 t_update(_) ->
 t_update(_) ->
     Path = emqx_mgmt_api_test_util:api_path(["mqtt", "auto_subscribe"]),
     Path = emqx_mgmt_api_test_util:api_path(["mqtt", "auto_subscribe"]),

+ 1 - 1
apps/emqx_bridge/src/emqx_bridge.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 %% -*- mode: erlang -*-
 {application, emqx_bridge, [
 {application, emqx_bridge, [
     {description, "EMQX bridges"},
     {description, "EMQX bridges"},
-    {vsn, "0.2.7"},
+    {vsn, "0.2.8"},
     {registered, [emqx_bridge_sup]},
     {registered, [emqx_bridge_sup]},
     {mod, {emqx_bridge_app, []}},
     {mod, {emqx_bridge_app, []}},
     {applications, [
     {applications, [

+ 4 - 0
apps/emqx_bridge/src/emqx_bridge_v2.erl

@@ -280,10 +280,14 @@ lookup(ConfRootName, Type, Name) ->
             ChannelStatus = maps:get(BridgeV2Id, Channels, undefined),
             ChannelStatus = maps:get(BridgeV2Id, Channels, undefined),
             {DisplayBridgeV2Status, ErrorMsg} =
             {DisplayBridgeV2Status, ErrorMsg} =
                 case {ChannelStatus, ConnectorStatus} of
                 case {ChannelStatus, ConnectorStatus} of
+                    {_, ?status_disconnected} ->
+                        {?status_disconnected, <<"Resource not operational">>};
                     {#{status := ?status_connected}, _} ->
                     {#{status := ?status_connected}, _} ->
                         {?status_connected, <<"">>};
                         {?status_connected, <<"">>};
                     {#{error := resource_not_operational}, ?status_connecting} ->
                     {#{error := resource_not_operational}, ?status_connecting} ->
                         {?status_connecting, <<"Not installed">>};
                         {?status_connecting, <<"Not installed">>};
+                    {#{error := not_added_yet}, _} ->
+                        {?status_connecting, <<"Not installed">>};
                     {#{status := Status, error := undefined}, _} ->
                     {#{status := Status, error := undefined}, _} ->
                         {Status, <<"Unknown reason">>};
                         {Status, <<"Unknown reason">>};
                     {#{status := Status, error := Error}, _} ->
                     {#{status := Status, error := Error}, _} ->

+ 1 - 1
apps/emqx_bridge_http/src/emqx_bridge_http.app.src

@@ -1,6 +1,6 @@
 {application, emqx_bridge_http, [
 {application, emqx_bridge_http, [
     {description, "EMQX HTTP Bridge and Connector Application"},
     {description, "EMQX HTTP Bridge and Connector Application"},
-    {vsn, "0.3.5"},
+    {vsn, "0.3.6"},
     {registered, []},
     {registered, []},
     {applications, [kernel, stdlib, emqx_resource, ehttpc]},
     {applications, [kernel, stdlib, emqx_resource, ehttpc]},
     {env, [
     {env, [

+ 2 - 1
apps/emqx_bridge_http/src/emqx_bridge_http_connector.erl

@@ -236,7 +236,8 @@ on_start(
         port => Port,
         port => Port,
         connect_timeout => ConnectTimeout,
         connect_timeout => ConnectTimeout,
         scheme => Scheme,
         scheme => Scheme,
-        request => preprocess_request(maps:get(request, Config, undefined))
+        request => preprocess_request(maps:get(request, Config, undefined)),
+        installed_actions => #{}
     },
     },
     case start_pool(InstId, PoolOpts) of
     case start_pool(InstId, PoolOpts) of
         ok ->
         ok ->

+ 4 - 1
apps/emqx_bridge_kafka/mix.exs

@@ -18,7 +18,10 @@ defmodule EMQXBridgeKafka.MixProject do
   end
   end
 
 
   def application do
   def application do
-    [extra_applications: UMP.extra_applications()]
+    [
+      extra_applications: UMP.extra_applications(),
+      mod: {:emqx_bridge_kafka_app, []}
+    ]
   end
   end
 
 
   def deps() do
   def deps() do

+ 3 - 2
apps/emqx_bridge_kafka/src/emqx_bridge_kafka.app.src

@@ -1,8 +1,8 @@
 %% -*- mode: erlang -*-
 %% -*- mode: erlang -*-
 {application, emqx_bridge_kafka, [
 {application, emqx_bridge_kafka, [
     {description, "EMQX Enterprise Kafka Bridge"},
     {description, "EMQX Enterprise Kafka Bridge"},
-    {vsn, "0.5.1"},
-    {registered, [emqx_bridge_kafka_consumer_sup]},
+    {vsn, "0.5.2"},
+    {registered, [emqx_bridge_kafka_sup, emqx_bridge_kafka_consumer_sup]},
     {applications, [
     {applications, [
         kernel,
         kernel,
         stdlib,
         stdlib,
@@ -12,6 +12,7 @@
         brod,
         brod,
         brod_gssapi
         brod_gssapi
     ]},
     ]},
+    {mod, {emqx_bridge_kafka_app, []}},
     {env, [
     {env, [
         {emqx_action_info_modules, [
         {emqx_action_info_modules, [
             emqx_bridge_kafka_producer_action_info,
             emqx_bridge_kafka_producer_action_info,

+ 25 - 0
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_app.erl

@@ -0,0 +1,25 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_bridge_kafka_app).
+
+-behaviour(application).
+
+%% `application' API
+-export([start/2, stop/1]).
+
+%%------------------------------------------------------------------------------
+%% Type declarations
+%%------------------------------------------------------------------------------
+
+%%------------------------------------------------------------------------------
+%% `application' API
+%%------------------------------------------------------------------------------
+
+-spec start(application:start_type(), term()) -> {ok, pid()}.
+start(_Type, _Args) ->
+    emqx_bridge_kafka_sup:start_link().
+
+-spec stop(term()) -> ok.
+stop(_State) ->
+    ok.

+ 0 - 21
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_impl_consumer.erl

@@ -381,26 +381,6 @@ make_subscriber_id(BridgeName) ->
     BridgeNameBin = to_bin(BridgeName),
     BridgeNameBin = to_bin(BridgeName),
     <<"kafka_subscriber:", BridgeNameBin/binary>>.
     <<"kafka_subscriber:", BridgeNameBin/binary>>.
 
 
-ensure_consumer_supervisor_started() ->
-    Mod = emqx_bridge_kafka_consumer_sup,
-    ChildSpec =
-        #{
-            id => Mod,
-            start => {Mod, start_link, []},
-            restart => permanent,
-            shutdown => infinity,
-            type => supervisor,
-            modules => [Mod]
-        },
-    case supervisor:start_child(emqx_bridge_sup, ChildSpec) of
-        {ok, _Pid} ->
-            ok;
-        {error, already_present} ->
-            ok;
-        {error, {already_started, _Pid}} ->
-            ok
-    end.
-
 -spec start_consumer(
 -spec start_consumer(
     source_config(),
     source_config(),
     connector_resource_id(),
     connector_resource_id(),
@@ -424,7 +404,6 @@ start_consumer(Config, ConnectorResId, SourceResId, ClientID, ConnState) ->
             value_encoding_mode := ValueEncodingMode
             value_encoding_mode := ValueEncodingMode
         } = Params0
         } = Params0
     } = Config,
     } = Config,
-    ok = ensure_consumer_supervisor_started(),
     ?tp(kafka_consumer_sup_started, #{}),
     ?tp(kafka_consumer_sup_started, #{}),
     TopicMapping = ensure_topic_mapping(Params0),
     TopicMapping = ensure_topic_mapping(Params0),
     InitialState = #{
     InitialState = #{

+ 46 - 0
apps/emqx_bridge_kafka/src/emqx_bridge_kafka_sup.erl

@@ -0,0 +1,46 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+-module(emqx_bridge_kafka_sup).
+
+-behaviour(supervisor).
+
+%% API
+-export([start_link/0]).
+
+%% `supervisor' API
+-export([init/1]).
+
+%%------------------------------------------------------------------------------
+%% API
+%%------------------------------------------------------------------------------
+
+start_link() ->
+    supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+%%------------------------------------------------------------------------------
+%% `supervisor' API
+%%------------------------------------------------------------------------------
+
+init([]) ->
+    SupFlags = #{
+        strategy => one_for_one,
+        intensity => 10,
+        period => 10
+    },
+    ConsumerSup = sup_spec(emqx_bridge_kafka_consumer_sup),
+    ChildSpecs = [ConsumerSup],
+    {ok, {SupFlags, ChildSpecs}}.
+
+%%------------------------------------------------------------------------------
+%% Internal fns
+%%------------------------------------------------------------------------------
+
+sup_spec(Mod) ->
+    #{
+        id => Mod,
+        start => {Mod, start_link, []},
+        restart => permanent,
+        shutdown => infinity,
+        type => supervisor
+    }.

+ 2 - 1
apps/emqx_bridge_kafka/test/emqx_bridge_v2_kafka_consumer_SUITE.erl

@@ -459,7 +459,8 @@ t_repeated_topics(Config) ->
                 emqx_bridge_v2_testlib:create_source_api([{source_name, Name2} | Config]),
                 emqx_bridge_v2_testlib:create_source_api([{source_name, Name2} | Config]),
             ?assertEqual(
             ?assertEqual(
                 match,
                 match,
-                re:run(Error, <<"Topics .* already exist in other sources">>, [{capture, none}])
+                re:run(Error, <<"Topics .* already exist in other sources">>, [{capture, none}]),
+                #{error => Error}
             ),
             ),
             ok
             ok
         end,
         end,

+ 10 - 7
apps/emqx_bridge_oracle/test/emqx_bridge_oracle_SUITE.erl

@@ -127,6 +127,8 @@ common_init_per_testcase(TestCase, Config0) ->
     ),
     ),
     ok = snabbkaffe:start_trace(),
     ok = snabbkaffe:start_trace(),
     [
     [
+        {bridge_type, ?BRIDGE_TYPE_BIN},
+        {bridge_name, Name},
         {oracle_name, Name},
         {oracle_name, Name},
         {oracle_config_string, ConfigString},
         {oracle_config_string, ConfigString},
         {oracle_config, OracleConfig}
         {oracle_config, OracleConfig}
@@ -730,18 +732,20 @@ t_no_sid_nor_service_name(Config0) ->
     ok.
     ok.
 
 
 t_missing_table(Config) ->
 t_missing_table(Config) ->
-    ResourceId = resource_id(Config),
+    Name = ?config(bridge_name, Config),
     ?check_trace(
     ?check_trace(
         begin
         begin
             drop_table_if_exists(Config),
             drop_table_if_exists(Config),
             ?assertMatch({ok, _}, create_bridge_api(Config)),
             ?assertMatch({ok, _}, create_bridge_api(Config)),
-            ActionId = emqx_bridge_v2:id(?BRIDGE_TYPE_BIN, ?config(oracle_name, Config)),
             ?retry(
             ?retry(
                 _Sleep = 1_000,
                 _Sleep = 1_000,
                 _Attempts = 20,
                 _Attempts = 20,
                 ?assertMatch(
                 ?assertMatch(
-                    {ok, Status} when Status =:= disconnected orelse Status =:= connecting,
-                    emqx_resource_manager:health_check(ResourceId)
+                    {ok, #{
+                        <<"status">> := <<"disconnected">>,
+                        <<"status_reason">> := <<"{unhealthy_target,", _/binary>>
+                    }},
+                    emqx_bridge_testlib:get_bridge_api(Config)
                 )
                 )
             ),
             ),
             ?block_until(#{?snk_kind := oracle_undefined_table}),
             ?block_until(#{?snk_kind := oracle_undefined_table}),
@@ -752,10 +756,9 @@ t_missing_table(Config) ->
                 payload => ?config(oracle_name, Config),
                 payload => ?config(oracle_name, Config),
                 retain => true
                 retain => true
             },
             },
-            Message = {ActionId, Params},
             ?assertMatch(
             ?assertMatch(
-                {error, {resource_error, #{reason := not_connected}}},
-                emqx_resource:simple_sync_query(ResourceId, Message)
+                {error, {resource_error, #{reason := unhealthy_target}}},
+                emqx_bridge_v2:send_message(?BRIDGE_TYPE_BIN, Name, Params, _QueryOpts = #{})
             ),
             ),
             ok
             ok
         end,
         end,

+ 21 - 1
apps/emqx_bridge_pgsql/test/emqx_bridge_pgsql_SUITE.erl

@@ -803,17 +803,37 @@ t_table_removed(Config) ->
     BridgeType = ?config(pgsql_bridge_type, Config),
     BridgeType = ?config(pgsql_bridge_type, Config),
     ?check_trace(
     ?check_trace(
         begin
         begin
+            ct:pal("creating table"),
             connect_and_create_table(Config),
             connect_and_create_table(Config),
-            ?assertMatch({ok, _}, create_bridge(Config)),
+            ct:pal("creating bridge"),
+            ?assertMatch(
+                {ok, _},
+                create_bridge(Config, #{
+                    <<"resource_opts">> => #{
+                        <<"health_check_interval">> => <<"1s">>
+                    }
+                })
+            ),
+            ct:pal("checking bridge health"),
             ?retry(
             ?retry(
                 _Sleep = 100,
                 _Sleep = 100,
                 _Attempts = 200,
                 _Attempts = 200,
                 ?assertMatch(#{status := connected}, emqx_bridge_v2:health_check(BridgeType, Name))
                 ?assertMatch(#{status := connected}, emqx_bridge_v2:health_check(BridgeType, Name))
             ),
             ),
+            ct:pal("dropping table"),
             connect_and_drop_table(Config),
             connect_and_drop_table(Config),
             Val = integer_to_binary(erlang:unique_integer()),
             Val = integer_to_binary(erlang:unique_integer()),
             SentData = #{payload => Val, timestamp => 1668602148000},
             SentData = #{payload => Val, timestamp => 1668602148000},
             ActionId = emqx_bridge_v2:id(BridgeType, Name),
             ActionId = emqx_bridge_v2:id(BridgeType, Name),
+            ?retry(
+                _Sleep = 100,
+                _Attempts = 200,
+                ?assertMatch(
+                    #{error := {unhealthy_target, _}, status := disconnected},
+                    emqx_bridge_v2:health_check(BridgeType, Name)
+                )
+            ),
+            ct:pal("sending query"),
             case query_resource_sync(Config, {ActionId, SentData}) of
             case query_resource_sync(Config, {ActionId, SentData}) of
                 {error, {unrecoverable_error, _}} ->
                 {error, {unrecoverable_error, _}} ->
                     ok;
                     ok;

+ 1 - 1
apps/emqx_bridge_pulsar/mix.exs

@@ -25,7 +25,7 @@ defmodule EMQXBridgePulsar.MixProject do
     [
     [
       UMP.common_dep(:crc32cer),
       UMP.common_dep(:crc32cer),
       UMP.common_dep(:snappyer),
       UMP.common_dep(:snappyer),
-      {:pulsar, github: "emqx/pulsar-client-erl", tag: "0.8.6"},
+      {:pulsar, github: "emqx/pulsar-client-erl", tag: "2.0.0"},
       {:emqx_connector, in_umbrella: true, runtime: false},
       {:emqx_connector, in_umbrella: true, runtime: false},
       {:emqx_resource, in_umbrella: true},
       {:emqx_resource, in_umbrella: true},
       {:emqx_bridge, in_umbrella: true, runtime: false}
       {:emqx_bridge, in_umbrella: true, runtime: false}

+ 1 - 1
apps/emqx_bridge_pulsar/rebar.config

@@ -2,7 +2,7 @@
 
 
 {erl_opts, [debug_info]}.
 {erl_opts, [debug_info]}.
 {deps, [
 {deps, [
-    {pulsar, {git, "https://github.com/emqx/pulsar-client-erl.git", {tag, "0.8.6"}}},
+    {pulsar, {git, "https://github.com/emqx/pulsar-client-erl.git", {tag, "2.0.0"}}},
     {emqx_connector, {path, "../../apps/emqx_connector"}},
     {emqx_connector, {path, "../../apps/emqx_connector"}},
     {emqx_resource, {path, "../../apps/emqx_resource"}},
     {emqx_resource, {path, "../../apps/emqx_resource"}},
     {emqx_bridge, {path, "../../apps/emqx_bridge"}}
     {emqx_bridge, {path, "../../apps/emqx_bridge"}}

+ 1 - 1
apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar.app.src

@@ -1,6 +1,6 @@
 {application, emqx_bridge_pulsar, [
 {application, emqx_bridge_pulsar, [
     {description, "EMQX Pulsar Bridge"},
     {description, "EMQX Pulsar Bridge"},
-    {vsn, "0.2.6"},
+    {vsn, "0.2.7"},
     {registered, []},
     {registered, []},
     {applications, [
     {applications, [
         kernel,
         kernel,

+ 216 - 82
apps/emqx_bridge_pulsar/src/emqx_bridge_pulsar_connector.erl

@@ -26,6 +26,9 @@
     on_format_query_result/1
     on_format_query_result/1
 ]).
 ]).
 
 
+-export([on_pulsar_ack/2]).
+-export([handle_telemetry_event/4]).
+
 -type pulsar_client_id() :: atom().
 -type pulsar_client_id() :: atom().
 -type state() :: #{
 -type state() :: #{
     client_id := pulsar_client_id(),
     client_id := pulsar_client_id(),
@@ -51,6 +54,7 @@
 %% Allocatable resources
 %% Allocatable resources
 -define(pulsar_client_id, pulsar_client_id).
 -define(pulsar_client_id, pulsar_client_id).
 -define(pulsar_producers, pulsar_producers).
 -define(pulsar_producers, pulsar_producers).
+-define(telemetry_handler_id, telemetry_handler_id).
 
 
 -define(HEALTH_CHECK_RETRY_TIMEOUT, 4_000).
 -define(HEALTH_CHECK_RETRY_TIMEOUT, 4_000).
 
 
@@ -71,12 +75,12 @@ query_opts(#{resource_opts := #{query_mode := sync}, parameters := #{sync_timeou
 query_opts(_) ->
 query_opts(_) ->
     #{}.
     #{}.
 
 
--spec on_start(resource_id(), config()) -> {ok, state()}.
-on_start(InstanceId, Config) ->
+-spec on_start(connector_resource_id(), config()) -> {ok, state()}.
+on_start(ConnResId, Config) ->
     #{servers := Servers0, ssl := SSL} = Config,
     #{servers := Servers0, ssl := SSL} = Config,
     Servers = format_servers(Servers0),
     Servers = format_servers(Servers0),
-    ClientId = make_client_id(InstanceId),
-    ok = emqx_resource:allocate_resource(InstanceId, ?pulsar_client_id, ClientId),
+    ClientId = make_client_id(ConnResId),
+    ok = emqx_resource:allocate_resource(ConnResId, ?pulsar_client_id, ClientId),
     SSLOpts = emqx_tls_lib:to_client_opts(SSL),
     SSLOpts = emqx_tls_lib:to_client_opts(SSL),
     ConnectTimeout = maps:get(connect_timeout, Config, timer:seconds(10)),
     ConnectTimeout = maps:get(connect_timeout, Config, timer:seconds(10)),
     ClientOpts = #{
     ClientOpts = #{
@@ -85,12 +89,12 @@ on_start(InstanceId, Config) ->
         conn_opts => conn_opts(Config)
         conn_opts => conn_opts(Config)
     },
     },
     case pulsar:ensure_supervised_client(ClientId, Servers, ClientOpts) of
     case pulsar:ensure_supervised_client(ClientId, Servers, ClientOpts) of
-        {ok, _Pid} ->
+        {ok, _} ->
             ?tp(
             ?tp(
                 info,
                 info,
                 "pulsar_client_started",
                 "pulsar_client_started",
                 #{
                 #{
-                    instance_id => InstanceId,
+                    instance_id => ConnResId,
                     pulsar_hosts => Servers
                     pulsar_hosts => Servers
                 }
                 }
             );
             );
@@ -98,7 +102,7 @@ on_start(InstanceId, Config) ->
             RedactedReason = emqx_utils:redact(Reason, fun is_sensitive_key/1),
             RedactedReason = emqx_utils:redact(Reason, fun is_sensitive_key/1),
             ?SLOG(error, #{
             ?SLOG(error, #{
                 msg => "failed_to_start_pulsar_client",
                 msg => "failed_to_start_pulsar_client",
-                instance_id => InstanceId,
+                instance_id => ConnResId,
                 pulsar_hosts => Servers,
                 pulsar_hosts => Servers,
                 reason => RedactedReason
                 reason => RedactedReason
             }),
             }),
@@ -112,84 +116,95 @@ on_start(InstanceId, Config) ->
     {ok, #{channels => #{}, client_id => ClientId, client_opts => ClientOpts}}.
     {ok, #{channels => #{}, client_id => ClientId, client_opts => ClientOpts}}.
 
 
 on_add_channel(
 on_add_channel(
-    InstanceId,
+    ConnResId,
     #{channels := Channels, client_id := ClientId, client_opts := ClientOpts} = State,
     #{channels := Channels, client_id := ClientId, client_opts := ClientOpts} = State,
-    ChannelId,
+    ActionResId,
     #{parameters := #{message := Message, sync_timeout := SyncTimeout} = Params}
     #{parameters := #{message := Message, sync_timeout := SyncTimeout} = Params}
 ) ->
 ) ->
-    case maps:is_key(ChannelId, Channels) of
+    case maps:is_key(ActionResId, Channels) of
         true ->
         true ->
             {error, channel_already_exists};
             {error, channel_already_exists};
         false ->
         false ->
-            {ok, Producers} = start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params),
+            {ok, Producers} = start_producer(ConnResId, ActionResId, ClientId, ClientOpts, Params),
             Parameters = #{
             Parameters = #{
                 message => compile_message_template(Message),
                 message => compile_message_template(Message),
                 sync_timeout => SyncTimeout,
                 sync_timeout => SyncTimeout,
                 producers => Producers
                 producers => Producers
             },
             },
-            NewChannels = maps:put(ChannelId, Parameters, Channels),
+            NewChannels = maps:put(ActionResId, Parameters, Channels),
             {ok, State#{channels => NewChannels}}
             {ok, State#{channels => NewChannels}}
     end.
     end.
 
 
-on_remove_channel(InstanceId, State, ChannelId) ->
-    #{channels := Channels, client_id := ClientId} = State,
-    case maps:find(ChannelId, Channels) of
+on_remove_channel(ConnResId, State, ActionResId) ->
+    #{channels := Channels} = State,
+    case maps:find(ActionResId, Channels) of
         {ok, #{producers := Producers}} ->
         {ok, #{producers := Producers}} ->
-            stop_producers(ClientId, Producers),
-            emqx_resource:deallocate_resource(InstanceId, {?pulsar_producers, ChannelId}),
-            {ok, State#{channels => maps:remove(ChannelId, Channels)}};
+            stop_producers(ActionResId, Producers),
+            emqx_resource:deallocate_resource(ConnResId, {?pulsar_producers, ActionResId}),
+            deallocate_telemetry_handlers(ConnResId, ActionResId),
+            {ok, State#{channels => maps:remove(ActionResId, Channels)}};
         error ->
         error ->
             {ok, State}
             {ok, State}
     end.
     end.
 
 
-on_get_channels(InstanceId) ->
-    emqx_bridge_v2:get_channels_for_connector(InstanceId).
+on_get_channels(ConnResId) ->
+    emqx_bridge_v2:get_channels_for_connector(ConnResId).
 
 
 -spec on_stop(resource_id(), state()) -> ok.
 -spec on_stop(resource_id(), state()) -> ok.
-on_stop(InstanceId, _State) ->
-    Resources0 = emqx_resource:get_allocated_resources(InstanceId),
-    case maps:take(?pulsar_client_id, Resources0) of
-        {ClientId, Resources} ->
-            maps:foreach(
-                fun({?pulsar_producers, _BridgeV2Id}, Producers) ->
-                    stop_producers(ClientId, Producers)
-                end,
-                Resources
-            ),
-            stop_client(ClientId),
-            ?tp(pulsar_bridge_stopped, #{instance_id => InstanceId}),
-            ok;
-        error ->
-            ok
-    end.
+on_stop(ConnResId, _State) ->
+    Resources = emqx_resource:get_allocated_resources(ConnResId),
+    maps:foreach(
+        fun
+            ({?pulsar_producers, ActionResId}, Producers) ->
+                stop_producers(ActionResId, Producers);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    maps:foreach(
+        fun
+            ({?telemetry_handler_id, _ActionResId}, TelemetryId) ->
+                deallocate_telemetry_handlers(ConnResId, TelemetryId);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    maps:foreach(
+        fun
+            (?pulsar_client_id, ClientId) ->
+                stop_client(ClientId);
+            (_, _) ->
+                ok
+        end,
+        Resources
+    ),
+    ?tp(pulsar_bridge_stopped, #{instance_id => ConnResId}),
+    ok.
 
 
 %% Note: since Pulsar client has its own replayq that is not managed by
 %% Note: since Pulsar client has its own replayq that is not managed by
 %% `emqx_resource_buffer_worker', we must avoid returning `disconnected' here.  Otherwise,
 %% `emqx_resource_buffer_worker', we must avoid returning `disconnected' here.  Otherwise,
 %% `emqx_resource_manager' will kill the Pulsar producers and messages might be lost.
 %% `emqx_resource_manager' will kill the Pulsar producers and messages might be lost.
 -spec on_get_status(resource_id(), state()) -> connected | connecting.
 -spec on_get_status(resource_id(), state()) -> connected | connecting.
-on_get_status(_InstanceId, State = #{}) ->
+on_get_status(_ConnResId, State = #{}) ->
     #{client_id := ClientId} = State,
     #{client_id := ClientId} = State,
-    case pulsar_client_sup:find_client(ClientId) of
-        {ok, Pid} ->
-            try pulsar_client:get_status(Pid) of
-                true -> ?status_connected;
-                false -> ?status_connecting
-            catch
-                exit:{timeout, _} ->
-                    ?status_connecting;
-                exit:{noproc, _} ->
-                    ?status_connecting
-            end;
-        {error, _} ->
+    try pulsar_client_manager:get_status(ClientId, 5_000) of
+        true -> ?status_connected;
+        false -> ?status_connecting
+    catch
+        exit:{timeout, _} ->
+            ?status_connecting;
+        exit:{noproc, _} ->
             ?status_connecting
             ?status_connecting
     end;
     end;
-on_get_status(_InstanceId, _State) ->
+on_get_status(_ConnResId, _State) ->
     %% If a health check happens just after a concurrent request to
     %% If a health check happens just after a concurrent request to
     %% create the bridge is not quite finished, `State = undefined'.
     %% create the bridge is not quite finished, `State = undefined'.
     ?status_connecting.
     ?status_connecting.
 
 
-on_get_channel_status(_InstanceId, ChannelId, #{channels := Channels}) ->
-    case maps:find(ChannelId, Channels) of
+on_get_channel_status(_ConnResId, ActionResId, #{channels := Channels}) ->
+    case maps:find(ActionResId, Channels) of
         {ok, #{producers := Producers}} ->
         {ok, #{producers := Producers}} ->
             get_producer_status(Producers);
             get_producer_status(Producers);
         error ->
         error ->
@@ -200,21 +215,21 @@ on_get_channel_status(_InstanceId, ChannelId, #{channels := Channels}) ->
     {ok, term()}
     {ok, term()}
     | {error, timeout}
     | {error, timeout}
     | {error, term()}.
     | {error, term()}.
-on_query(_InstanceId, {ChannelId, Message}, State) ->
+on_query(_ConnResId, {ActionResId, Message}, State) ->
     #{channels := Channels} = State,
     #{channels := Channels} = State,
-    case maps:find(ChannelId, Channels) of
+    case maps:find(ActionResId, Channels) of
         error ->
         error ->
             {error, channel_not_found};
             {error, channel_not_found};
         {ok, #{message := MessageTmpl, sync_timeout := SyncTimeout, producers := Producers}} ->
         {ok, #{message := MessageTmpl, sync_timeout := SyncTimeout, producers := Producers}} ->
             PulsarMessage = render_message(Message, MessageTmpl),
             PulsarMessage = render_message(Message, MessageTmpl),
-            emqx_trace:rendered_action_template(ChannelId, #{
+            emqx_trace:rendered_action_template(ActionResId, #{
                 message => PulsarMessage,
                 message => PulsarMessage,
                 sync_timeout => SyncTimeout,
                 sync_timeout => SyncTimeout,
                 is_async => false
                 is_async => false
             }),
             }),
             ?tp_span(
             ?tp_span(
                 "pulsar_producer_query_enter",
                 "pulsar_producer_query_enter",
-                #{instance_id => _InstanceId, message => Message, mode => sync},
+                #{instance_id => _ConnResId, message => Message, mode => sync},
                 try
                 try
                     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => sync}),
                     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => sync}),
                     pulsar:send_sync(Producers, [PulsarMessage], SyncTimeout)
                     pulsar:send_sync(Producers, [PulsarMessage], SyncTimeout)
@@ -229,33 +244,97 @@ on_query(_InstanceId, {ChannelId, Message}, State) ->
     resource_id(), tuple(), {ReplyFun :: function(), Args :: list()}, state()
     resource_id(), tuple(), {ReplyFun :: function(), Args :: list()}, state()
 ) ->
 ) ->
     {ok, pid()}.
     {ok, pid()}.
-on_query_async(_InstanceId, {ChannelId, Message}, AsyncReplyFn, State) ->
+on_query_async(_ConnResId, {ActionResId, Message}, AsyncReplyFn, State) ->
     #{channels := Channels} = State,
     #{channels := Channels} = State,
-    case maps:find(ChannelId, Channels) of
+    case maps:find(ActionResId, Channels) of
         error ->
         error ->
             {error, {unrecoverable_error, channel_not_found}};
             {error, {unrecoverable_error, channel_not_found}};
         {ok, #{message := MessageTmpl, producers := Producers}} ->
         {ok, #{message := MessageTmpl, producers := Producers}} ->
             ?tp_span(
             ?tp_span(
                 "pulsar_producer_query_enter",
                 "pulsar_producer_query_enter",
-                #{instance_id => _InstanceId, message => Message, mode => async},
-                on_query_async2(ChannelId, Producers, Message, MessageTmpl, AsyncReplyFn)
+                #{instance_id => _ConnResId, message => Message, mode => async},
+                on_query_async2(ActionResId, Producers, Message, MessageTmpl, AsyncReplyFn)
             )
             )
     end.
     end.
 
 
-on_query_async2(ChannelId, Producers, Message, MessageTmpl, AsyncReplyFn) ->
+on_query_async2(ActionResId, Producers, Message, MessageTmpl, AsyncReplyFn) ->
     PulsarMessage = render_message(Message, MessageTmpl),
     PulsarMessage = render_message(Message, MessageTmpl),
-    emqx_trace:rendered_action_template(ChannelId, #{
+    emqx_trace:rendered_action_template(ActionResId, #{
         message => PulsarMessage,
         message => PulsarMessage,
         is_async => true
         is_async => true
     }),
     }),
+    CallbackFn = {fun ?MODULE:on_pulsar_ack/2, [AsyncReplyFn]},
     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => async}),
     ?tp("pulsar_producer_send", #{msg => PulsarMessage, mode => async}),
-    pulsar:send(Producers, [PulsarMessage], #{callback_fn => AsyncReplyFn}).
+    pulsar:send(Producers, [PulsarMessage], #{callback_fn => CallbackFn}).
 
 
 on_format_query_result({ok, Info}) ->
 on_format_query_result({ok, Info}) ->
     #{result => ok, info => Info};
     #{result => ok, info => Info};
 on_format_query_result(Result) ->
 on_format_query_result(Result) ->
     Result.
     Result.
 
 
+on_pulsar_ack(_ReplyFnAndArgs, {error, Reason}) when
+    Reason =:= expired;
+    Reason =:= overflow
+->
+    %% We already bumped the dropped counter in `handle_telemetry_event/4', so no need to
+    %% call the wrapping callback here (it would bump the failure counter).
+    ok;
+on_pulsar_ack(ReplyFnAndArgs, Result) ->
+    emqx_resource:apply_reply_fun(ReplyFnAndArgs, Result).
+
+%%-------------------------------------------------------------------------------------
+%% `telemetry' API
+%%-------------------------------------------------------------------------------------
+
+%% we *must* match the bridge id in the event metadata with that in
+%% the handler config; otherwise, multiple pulsar producer bridges will
+%% install multiple handlers to the same pulsar events, multiplying the
+handle_telemetry_event(
+    [pulsar, dropped],
+    #{counter_inc := Val, reason := queue_full},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:dropped_queue_full_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, dropped],
+    #{counter_inc := Val, reason := expired},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:dropped_expired_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, queuing],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:queuing_set(ID, PartitionTopic, Val);
+handle_telemetry_event(
+    [pulsar, queuing_bytes],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:queuing_bytes_set(ID, PartitionTopic, Val);
+handle_telemetry_event(
+    [pulsar, retried],
+    #{counter_inc := Val},
+    #{action_id := ID},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:retried_inc(ID, Val);
+handle_telemetry_event(
+    [pulsar, inflight],
+    #{gauge_set := Val},
+    #{action_id := ID, partition_topic := PartitionTopic},
+    #{action_id := ID}
+) when is_integer(Val) ->
+    emqx_resource_metrics:inflight_set(ID, PartitionTopic, Val);
+handle_telemetry_event(_EventId, _Metrics, _Metadata, _HandlerConfig) ->
+    %% Event that we do not handle
+    ok.
+
 %%-------------------------------------------------------------------------------------
 %%-------------------------------------------------------------------------------------
 %% Internal fns
 %% Internal fns
 %%-------------------------------------------------------------------------------------
 %%-------------------------------------------------------------------------------------
@@ -270,12 +349,12 @@ format_servers(Servers0) ->
     ).
     ).
 
 
 -spec make_client_id(resource_id()) -> pulsar_client_id().
 -spec make_client_id(resource_id()) -> pulsar_client_id().
-make_client_id(InstanceId) ->
-    case emqx_resource:is_dry_run(InstanceId) of
+make_client_id(ConnResId) ->
+    case emqx_resource:is_dry_run(ConnResId) of
         true ->
         true ->
             pulsar_producer_probe;
             pulsar_producer_probe;
         false ->
         false ->
-            {pulsar, Name} = emqx_connector_resource:parse_connector_id(InstanceId),
+            {pulsar, Name} = emqx_connector_resource:parse_connector_id(ConnResId),
             ClientIdBin = iolist_to_binary([
             ClientIdBin = iolist_to_binary([
                 <<"pulsar:">>,
                 <<"pulsar:">>,
                 emqx_utils_conv:bin(Name),
                 emqx_utils_conv:bin(Name),
@@ -304,22 +383,22 @@ conn_opts(#{authentication := #{jwt := JWT}}) ->
 replayq_dir(ClientId) ->
 replayq_dir(ClientId) ->
     filename:join([emqx:data_dir(), "pulsar", emqx_utils_conv:bin(ClientId)]).
     filename:join([emqx:data_dir(), "pulsar", emqx_utils_conv:bin(ClientId)]).
 
 
-producer_name(InstanceId, ChannelId) ->
-    case emqx_resource:is_dry_run(InstanceId) of
+producer_name(ConnResId, ActionResId) ->
+    case emqx_resource:is_dry_run(ConnResId) of
         %% do not create more atom
         %% do not create more atom
         true ->
         true ->
             pulsar_producer_probe_worker;
             pulsar_producer_probe_worker;
         false ->
         false ->
-            ChannelIdBin = emqx_utils_conv:bin(ChannelId),
+            ActionResIdBin = emqx_utils_conv:bin(ActionResId),
             binary_to_atom(
             binary_to_atom(
                 iolist_to_binary([
                 iolist_to_binary([
                     <<"producer-">>,
                     <<"producer-">>,
-                    ChannelIdBin
+                    ActionResIdBin
                 ])
                 ])
             )
             )
     end.
     end.
 
 
-start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
+start_producer(ConnResId, ActionResId, ClientId, ClientOpts, Params) ->
     #{
     #{
         conn_opts := ConnOpts,
         conn_opts := ConnOpts,
         ssl_opts := SSLOpts
         ssl_opts := SSLOpts
@@ -342,8 +421,8 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
     {OffloadMode, ReplayQDir} =
     {OffloadMode, ReplayQDir} =
         case BufferMode of
         case BufferMode of
             memory -> {false, false};
             memory -> {false, false};
-            disk -> {false, replayq_dir(ChannelId)};
-            hybrid -> {true, replayq_dir(ChannelId)}
+            disk -> {false, replayq_dir(ActionResId)};
+            hybrid -> {true, replayq_dir(ActionResId)}
         end,
         end,
     MemOLP =
     MemOLP =
         case os:type() of
         case os:type() of
@@ -357,7 +436,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
         replayq_seg_bytes => SegmentBytes,
         replayq_seg_bytes => SegmentBytes,
         drop_if_highmem => MemOLP
         drop_if_highmem => MemOLP
     },
     },
-    ProducerName = producer_name(InstanceId, ChannelId),
+    ProducerName = producer_name(ConnResId, ActionResId),
     ?tp(pulsar_producer_capture_name, #{producer_name => ProducerName}),
     ?tp(pulsar_producer_capture_name, #{producer_name => ProducerName}),
     ProducerOpts0 =
     ProducerOpts0 =
         #{
         #{
@@ -369,15 +448,22 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
             retention_period => RetentionPeriod,
             retention_period => RetentionPeriod,
             ssl_opts => SSLOpts,
             ssl_opts => SSLOpts,
             strategy => partition_strategy(Strategy),
             strategy => partition_strategy(Strategy),
-            tcp_opts => [{sndbuf, SendBuffer}]
+            tcp_opts => [{sndbuf, SendBuffer}],
+            telemetry_metadata => #{action_id => ActionResId}
         },
         },
     ProducerOpts = maps:merge(ReplayQOpts, ProducerOpts0),
     ProducerOpts = maps:merge(ReplayQOpts, ProducerOpts0),
     ?tp(pulsar_producer_about_to_start_producers, #{producer_name => ProducerName}),
     ?tp(pulsar_producer_about_to_start_producers, #{producer_name => ProducerName}),
+    ok = emqx_resource:allocate_resource(
+        ConnResId,
+        {?telemetry_handler_id, ActionResId},
+        ActionResId
+    ),
+    _ = maybe_install_telemetry_handlers(ActionResId),
     try pulsar:ensure_supervised_producers(ClientId, PulsarTopic, ProducerOpts) of
     try pulsar:ensure_supervised_producers(ClientId, PulsarTopic, ProducerOpts) of
         {ok, Producers} ->
         {ok, Producers} ->
             ok = emqx_resource:allocate_resource(
             ok = emqx_resource:allocate_resource(
-                InstanceId,
-                {?pulsar_producers, ChannelId},
+                ConnResId,
+                {?pulsar_producers, ActionResId},
                 Producers
                 Producers
             ),
             ),
             ?tp(pulsar_producer_producers_allocated, #{}),
             ?tp(pulsar_producer_producers_allocated, #{}),
@@ -389,7 +475,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
                 error,
                 error,
                 "failed_to_start_pulsar_producer",
                 "failed_to_start_pulsar_producer",
                 #{
                 #{
-                    instance_id => InstanceId,
+                    instance_id => ConnResId,
                     kind => Kind,
                     kind => Kind,
                     reason => emqx_utils:redact(Error, fun is_sensitive_key/1),
                     reason => emqx_utils:redact(Error, fun is_sensitive_key/1),
                     stacktrace => Stacktrace
                     stacktrace => Stacktrace
@@ -399,6 +485,7 @@ start_producer(InstanceId, ChannelId, ClientId, ClientOpts, Params) ->
                 pulsar_client_id => ClientId,
                 pulsar_client_id => ClientId,
                 producers => undefined
                 producers => undefined
             }),
             }),
+            _ = uninstall_telemetry_handlers(ActionResId),
             throw(failed_to_start_pulsar_producer)
             throw(failed_to_start_pulsar_producer)
     end.
     end.
 
 
@@ -417,20 +504,20 @@ stop_client(ClientId) ->
     ),
     ),
     ok.
     ok.
 
 
--spec stop_producers(pulsar_client_id(), pulsar_producers:producers()) -> ok.
-stop_producers(ClientId, Producers) ->
+-spec stop_producers(action_resource_id(), pulsar_producers:producers()) -> ok.
+stop_producers(ActionResId, Producers) ->
     _ = log_when_error(
     _ = log_when_error(
         fun() ->
         fun() ->
             ok = pulsar:stop_and_delete_supervised_producers(Producers),
             ok = pulsar:stop_and_delete_supervised_producers(Producers),
             ?tp(pulsar_bridge_producer_stopped, #{
             ?tp(pulsar_bridge_producer_stopped, #{
-                pulsar_client_id => ClientId,
+                action_id => ActionResId,
                 producers => Producers
                 producers => Producers
             }),
             }),
             ok
             ok
         end,
         end,
         #{
         #{
             msg => "failed_to_delete_pulsar_producer",
             msg => "failed_to_delete_pulsar_producer",
-            pulsar_client_id => ClientId
+            action_id => ActionResId
         }
         }
     ),
     ),
     ok.
     ok.
@@ -516,3 +603,50 @@ do_get_error_message(Iterator) ->
         none ->
         none ->
             error
             error
     end.
     end.
+
+maybe_install_telemetry_handlers(ActionResId) ->
+    %% Attach event handlers for telemetry events. If a handler with the
+    %% handler id already exists, the attach_many function does nothing
+    telemetry:attach_many(
+        %% unique handler id
+        ActionResId,
+        [
+            [pulsar, dropped],
+            [pulsar, queuing],
+            [pulsar, queuing_bytes],
+            [pulsar, retried],
+            [pulsar, inflight]
+        ],
+        fun ?MODULE:handle_telemetry_event/4,
+        %% we *must* keep track of the same id that is handed down to
+        %% wolff producers; otherwise, multiple kafka producer bridges
+        %% will install multiple handlers to the same wolff events,
+        %% multiplying the metric counts...
+        #{action_id => ActionResId}
+    ).
+
+with_log_at_error(Fun, Log) ->
+    try
+        Fun()
+    catch
+        C:E ->
+            ?SLOG(error, Log#{
+                exception => C,
+                reason => E
+            })
+    end.
+
+uninstall_telemetry_handlers(TelemetryId) ->
+    telemetry:detach(TelemetryId).
+
+deallocate_telemetry_handlers(ConnResId, ActionResId) ->
+    _ = with_log_at_error(
+        fun() ->
+            _ = uninstall_telemetry_handlers(ActionResId),
+            emqx_resource:deallocate_resource(ConnResId, {?telemetry_handler_id, ActionResId})
+        end,
+        #{
+            msg => "failed_to_uninstall_telemetry_handlers",
+            action_id => ActionResId
+        }
+    ).

+ 1 - 1
apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_connector_SUITE.erl

@@ -400,7 +400,7 @@ start_consumer(TestCase, Config) ->
         cacertfile => filename:join([CertsPath, "cacert.pem"])
         cacertfile => filename:join([CertsPath, "cacert.pem"])
     },
     },
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
-    {ok, _ClientPid} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
+    {ok, _} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
     ConsumerOpts = Opts#{
     ConsumerOpts = Opts#{
         cb_init_args => #{send_to => self()},
         cb_init_args => #{send_to => self()},
         cb_module => pulsar_echo_consumer,
         cb_module => pulsar_echo_consumer,

+ 328 - 42
apps/emqx_bridge_pulsar/test/emqx_bridge_pulsar_v2_SUITE.erl

@@ -52,21 +52,18 @@ init_per_group(plain = Type, Config) ->
     PulsarHost = os:getenv("PULSAR_PLAIN_HOST", "toxiproxy"),
     PulsarHost = os:getenv("PULSAR_PLAIN_HOST", "toxiproxy"),
     PulsarPort = list_to_integer(os:getenv("PULSAR_PLAIN_PORT", "6652")),
     PulsarPort = list_to_integer(os:getenv("PULSAR_PLAIN_PORT", "6652")),
     ProxyName = "pulsar_plain",
     ProxyName = "pulsar_plain",
+    reset_proxy(),
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
         true ->
         true ->
             Config1 = common_init_per_group(),
             Config1 = common_init_per_group(),
-            ConnectorName = ?MODULE,
-            NewConfig =
-                [
-                    {proxy_name, ProxyName},
-                    {pulsar_host, PulsarHost},
-                    {pulsar_port, PulsarPort},
-                    {pulsar_type, Type},
-                    {use_tls, false}
-                    | Config1 ++ Config
-                ],
-            create_connector(ConnectorName, NewConfig),
-            NewConfig;
+            [
+                {proxy_name, ProxyName},
+                {pulsar_host, PulsarHost},
+                {pulsar_port, PulsarPort},
+                {pulsar_type, Type},
+                {use_tls, false}
+                | Config1 ++ Config
+            ];
         false ->
         false ->
             maybe_skip_without_ci()
             maybe_skip_without_ci()
     end;
     end;
@@ -74,21 +71,18 @@ init_per_group(tls = Type, Config) ->
     PulsarHost = os:getenv("PULSAR_TLS_HOST", "toxiproxy"),
     PulsarHost = os:getenv("PULSAR_TLS_HOST", "toxiproxy"),
     PulsarPort = list_to_integer(os:getenv("PULSAR_TLS_PORT", "6653")),
     PulsarPort = list_to_integer(os:getenv("PULSAR_TLS_PORT", "6653")),
     ProxyName = "pulsar_tls",
     ProxyName = "pulsar_tls",
+    reset_proxy(),
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
     case emqx_common_test_helpers:is_tcp_server_available(PulsarHost, PulsarPort) of
         true ->
         true ->
             Config1 = common_init_per_group(),
             Config1 = common_init_per_group(),
-            ConnectorName = ?MODULE,
-            NewConfig =
-                [
-                    {proxy_name, ProxyName},
-                    {pulsar_host, PulsarHost},
-                    {pulsar_port, PulsarPort},
-                    {pulsar_type, Type},
-                    {use_tls, true}
-                    | Config1 ++ Config
-                ],
-            create_connector(ConnectorName, NewConfig),
-            NewConfig;
+            [
+                {proxy_name, ProxyName},
+                {pulsar_host, PulsarHost},
+                {pulsar_port, PulsarPort},
+                {pulsar_type, Type},
+                {use_tls, true}
+                | Config1 ++ Config
+            ];
         false ->
         false ->
             maybe_skip_without_ci()
             maybe_skip_without_ci()
     end;
     end;
@@ -105,9 +99,9 @@ end_per_group(_Group, _Config) ->
     ok.
     ok.
 
 
 common_init_per_group() ->
 common_init_per_group() ->
+    reset_proxy(),
     ProxyHost = os:getenv("PROXY_HOST", "toxiproxy"),
     ProxyHost = os:getenv("PROXY_HOST", "toxiproxy"),
     ProxyPort = list_to_integer(os:getenv("PROXY_PORT", "8474")),
     ProxyPort = list_to_integer(os:getenv("PROXY_PORT", "8474")),
-    emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
     UniqueNum = integer_to_binary(erlang:unique_integer()),
     UniqueNum = integer_to_binary(erlang:unique_integer()),
     MQTTTopic = <<"mqtt/topic/", UniqueNum/binary>>,
     MQTTTopic = <<"mqtt/topic/", UniqueNum/binary>>,
     [
     [
@@ -116,6 +110,12 @@ common_init_per_group() ->
         {mqtt_topic, MQTTTopic}
         {mqtt_topic, MQTTTopic}
     ].
     ].
 
 
+reset_proxy() ->
+    ProxyHost = os:getenv("PROXY_HOST", "toxiproxy"),
+    ProxyPort = list_to_integer(os:getenv("PROXY_PORT", "8474")),
+    emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
+    ok.
+
 common_end_per_group(Config) ->
 common_end_per_group(Config) ->
     ProxyHost = ?config(proxy_host, Config),
     ProxyHost = ?config(proxy_host, Config),
     ProxyPort = ?config(proxy_port, Config),
     ProxyPort = ?config(proxy_port, Config),
@@ -131,7 +131,7 @@ end_per_testcase(_Testcase, Config) ->
     ProxyHost = ?config(proxy_host, Config),
     ProxyHost = ?config(proxy_host, Config),
     ProxyPort = ?config(proxy_port, Config),
     ProxyPort = ?config(proxy_port, Config),
     emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
     emqx_common_test_helpers:reset_proxy(ProxyHost, ProxyPort),
-    emqx_bridge_v2_testlib:delete_all_bridges(),
+    emqx_bridge_v2_testlib:delete_all_bridges_and_connectors(),
     stop_consumer(Config),
     stop_consumer(Config),
     %% in CI, apparently this needs more time since the
     %% in CI, apparently this needs more time since the
     %% machines struggle with all the containers running...
     %% machines struggle with all the containers running...
@@ -159,21 +159,25 @@ common_init_per_testcase(TestCase, Config0) ->
 %% Helper fns
 %% Helper fns
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 
 
-create_connector(Name, Config) ->
-    Connector = pulsar_connector(Config),
-    {ok, _} = emqx_connector:create(?TYPE, Name, Connector).
+create_connector(Config) ->
+    {201, _} = create_connector_api([
+        {connector_type, ?TYPE},
+        {connector_name, ?MODULE},
+        {connector_config, connector_config(Config)}
+    ]),
+    ok.
 
 
 delete_connector(Name) ->
 delete_connector(Name) ->
     ok = emqx_connector:remove(?TYPE, Name).
     ok = emqx_connector:remove(?TYPE, Name).
 
 
 create_action(Name, Config) ->
 create_action(Name, Config) ->
-    Action = pulsar_action(Config),
+    Action = action_config(Config),
     {ok, _} = emqx_bridge_v2:create(actions, ?TYPE, Name, Action).
     {ok, _} = emqx_bridge_v2:create(actions, ?TYPE, Name, Action).
 
 
 delete_action(Name) ->
 delete_action(Name) ->
     ok = emqx_bridge_v2:remove(actions, ?TYPE, Name).
     ok = emqx_bridge_v2:remove(actions, ?TYPE, Name).
 
 
-pulsar_connector(Config) ->
+connector_config(Config) ->
     PulsarHost = ?config(pulsar_host, Config),
     PulsarHost = ?config(pulsar_host, Config),
     PulsarPort = ?config(pulsar_port, Config),
     PulsarPort = ?config(pulsar_port, Config),
     UseTLS = proplists:get_value(use_tls, Config, false),
     UseTLS = proplists:get_value(use_tls, Config, false),
@@ -201,11 +205,13 @@ pulsar_connector(Config) ->
     },
     },
     emqx_bridge_v2_testlib:parse_and_check_connector(?TYPE, Name, InnerConfigMap).
     emqx_bridge_v2_testlib:parse_and_check_connector(?TYPE, Name, InnerConfigMap).
 
 
-pulsar_action(Config) ->
+action_config(Config) ->
+    action_config(atom_to_binary(?MODULE), Config).
+
+action_config(ConnectorName, Config) ->
     QueryMode = proplists:get_value(query_mode, Config, <<"sync">>),
     QueryMode = proplists:get_value(query_mode, Config, <<"sync">>),
-    Name = atom_to_binary(?MODULE),
     InnerConfigMap = #{
     InnerConfigMap = #{
-        <<"connector">> => Name,
+        <<"connector">> => ConnectorName,
         <<"enable">> => true,
         <<"enable">> => true,
         <<"parameters">> => #{
         <<"parameters">> => #{
             <<"retention_period">> => <<"infinity">>,
             <<"retention_period">> => <<"infinity">>,
@@ -231,7 +237,7 @@ pulsar_action(Config) ->
             <<"metrics_flush_interval">> => <<"300ms">>
             <<"metrics_flush_interval">> => <<"300ms">>
         }
         }
     },
     },
-    emqx_bridge_v2_testlib:parse_and_check(action, ?TYPE, Name, InnerConfigMap).
+    emqx_bridge_v2_testlib:parse_and_check(action, ?TYPE, <<"some_action">>, InnerConfigMap).
 
 
 instance_id(Type, Name) ->
 instance_id(Type, Name) ->
     ConnectorId = emqx_bridge_resource:resource_id(Type, ?TYPE, Name),
     ConnectorId = emqx_bridge_resource:resource_id(Type, ?TYPE, Name),
@@ -269,7 +275,7 @@ start_consumer(TestCase, Config) ->
         cacertfile => filename:join([CertsPath, "cacert.pem"])
         cacertfile => filename:join([CertsPath, "cacert.pem"])
     },
     },
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
     Opts = #{enable_ssl => UseTLS, ssl_opts => emqx_tls_lib:to_client_opts(SSLOpts)},
-    {ok, _ClientPid} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
+    {ok, _} = pulsar:ensure_supervised_client(ConsumerClientId, [URL], Opts),
     ConsumerOpts = Opts#{
     ConsumerOpts = Opts#{
         cb_init_args => #{send_to => self()},
         cb_init_args => #{send_to => self()},
         cb_module => pulsar_echo_consumer,
         cb_module => pulsar_echo_consumer,
@@ -398,6 +404,39 @@ group_path(Config) ->
             Path
             Path
     end.
     end.
 
 
+create_connector_api(Config) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:create_connector_api(Config)
+    ).
+
+create_action_api(Config) ->
+    create_action_api(Config, _Overrides = #{}).
+
+create_action_api(Config, Overrides) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:create_kind_api(Config, Overrides)
+    ).
+
+update_action_api(Config, Overrides) ->
+    emqx_bridge_v2_testlib:simplify_result(
+        emqx_bridge_v2_testlib:update_bridge_api(Config, Overrides)
+    ).
+
+get_combined_metrics(ActionResId, RuleId) ->
+    Metrics = emqx_resource:get_metrics(ActionResId),
+    RuleMetrics = emqx_metrics_worker:get_counters(rule_metrics, RuleId),
+    Metrics#{rule => RuleMetrics}.
+
+reset_combined_metrics(ActionResId, RuleId) ->
+    #{
+        kind := action,
+        type := Type,
+        name := Name
+    } = emqx_bridge_v2:parse_id(ActionResId),
+    ok = emqx_bridge_v2:reset_metrics(actions, Type, Name),
+    ok = emqx_rule_engine:reset_metrics_for_rule(RuleId),
+    ok.
+
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 %% Testcases
 %% Testcases
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
@@ -406,7 +445,8 @@ t_action_probe(matrix) ->
     [[plain], [tls]];
     [[plain], [tls]];
 t_action_probe(Config) when is_list(Config) ->
 t_action_probe(Config) when is_list(Config) ->
     Name = atom_to_binary(?FUNCTION_NAME),
     Name = atom_to_binary(?FUNCTION_NAME),
-    Action = pulsar_action(Config),
+    create_connector(Config),
+    Action = action_config(Config),
     {ok, Res0} = emqx_bridge_v2_testlib:probe_bridge_api(action, ?TYPE, Name, Action),
     {ok, Res0} = emqx_bridge_v2_testlib:probe_bridge_api(action, ?TYPE, Name, Action),
     ?assertMatch({{_, 204, _}, _, _}, Res0),
     ?assertMatch({{_, 204, _}, _, _}, Res0),
     ok.
     ok.
@@ -424,6 +464,7 @@ t_action(Config) when is_list(Config) ->
             _ -> <<"async">>
             _ -> <<"async">>
         end,
         end,
     Name = atom_to_binary(?FUNCTION_NAME),
     Name = atom_to_binary(?FUNCTION_NAME),
+    create_connector(Config),
     create_action(Name, [{query_mode, QueryMode} | Config]),
     create_action(Name, [{query_mode, QueryMode} | Config]),
     Actions = emqx_bridge_v2:list(actions),
     Actions = emqx_bridge_v2:list(actions),
     Any = fun(#{name := BName}) -> BName =:= Name end,
     Any = fun(#{name := BName}) -> BName =:= Name end,
@@ -477,8 +518,8 @@ t_multiple_actions_sharing_topic(matrix) ->
 t_multiple_actions_sharing_topic(Config) when is_list(Config) ->
 t_multiple_actions_sharing_topic(Config) when is_list(Config) ->
     Type = ?TYPE,
     Type = ?TYPE,
     ConnectorName = <<"c">>,
     ConnectorName = <<"c">>,
-    ConnectorConfig = pulsar_connector(Config),
-    ActionConfig = pulsar_action(Config),
+    ConnectorConfig = connector_config(Config),
+    ActionConfig = action_config(ConnectorName, Config),
     ?check_trace(
     ?check_trace(
         begin
         begin
             ConnectorParams = [
             ConnectorParams = [
@@ -572,14 +613,259 @@ t_sync_query_down(Config0) when is_list(Config0) ->
         success_tp_filter =>
         success_tp_filter =>
             ?match_event(#{?snk_kind := pulsar_echo_consumer_message})
             ?match_event(#{?snk_kind := pulsar_echo_consumer_message})
     },
     },
+    ConnectorName = atom_to_binary(?FUNCTION_NAME),
     Config = [
     Config = [
         {connector_type, ?TYPE},
         {connector_type, ?TYPE},
-        {connector_name, ?FUNCTION_NAME},
-        {connector_config, pulsar_connector(Config0)},
+        {connector_name, ConnectorName},
+        {connector_config, connector_config(Config0)},
         {action_type, ?TYPE},
         {action_type, ?TYPE},
         {action_name, ?FUNCTION_NAME},
         {action_name, ?FUNCTION_NAME},
-        {action_config, pulsar_action(Config0)}
+        {action_config, action_config(ConnectorName, Config0)}
         | proplists_with([proxy_name, proxy_host, proxy_port], Config0)
         | proplists_with([proxy_name, proxy_host, proxy_port], Config0)
     ],
     ],
     emqx_bridge_v2_testlib:t_sync_query_down(Config, Opts),
     emqx_bridge_v2_testlib:t_sync_query_down(Config, Opts),
     ok.
     ok.
+
+%% Checks that we correctly handle telemetry events emitted by pulsar.
+t_telemetry_metrics(matrix) ->
+    [[plain]];
+t_telemetry_metrics(Config) when is_list(Config) ->
+    ProxyName = ?config(proxy_name, Config),
+    ProxyHost = ?config(proxy_host, Config),
+    ProxyPort = ?config(proxy_port, Config),
+    Type = ?TYPE,
+    ConnectorName = <<"c">>,
+    ConnectorConfig = connector_config(Config),
+    ActionConfig = action_config(ConnectorName, Config),
+    ConnectorParams = [
+        {connector_config, ConnectorConfig},
+        {connector_name, ConnectorName},
+        {connector_type, Type}
+    ],
+    ActionName1 = <<"a1">>,
+    ActionParams1 = [
+        {action_config, ActionConfig},
+        {action_name, ActionName1},
+        {action_type, Type}
+    ],
+    ActionName2 = <<"a2">>,
+    ActionParams2 = [
+        {action_config, ActionConfig},
+        {action_name, ActionName2},
+        {action_type, Type}
+    ],
+    ?check_trace(
+        begin
+            {201, _} =
+                create_connector_api(ConnectorParams),
+            {201, _} =
+                create_action_api(
+                    ActionParams1,
+                    %% Initially, this will overflow on small messages
+                    #{
+                        <<"parameters">> => #{
+                            <<"buffer">> => #{
+                                <<"mode">> => <<"disk">>,
+                                <<"per_partition_limit">> => <<"2B">>,
+                                <<"segment_bytes">> => <<"1B">>
+                            }
+                        }
+                    }
+                ),
+            {201, _} =
+                create_action_api(ActionParams2),
+            RuleTopic = <<"t/a2">>,
+            {ok, #{<<"id">> := RuleId}} =
+                emqx_bridge_v2_testlib:create_rule_and_action_http(Type, RuleTopic, [
+                    {bridge_name, ActionName1}
+                ]),
+            {ok, C} = emqtt:start_link([]),
+            {ok, _} = emqtt:connect(C),
+            SendMessage = fun() ->
+                ReqPayload = payload(),
+                ReqPayloadBin = emqx_utils_json:encode(ReqPayload),
+                {ok, _} = emqtt:publish(C, RuleTopic, #{}, ReqPayloadBin, [
+                    {qos, 1}, {retain, false}
+                ]),
+                ok
+            end,
+            SendMessage(),
+            ActionResId1 = emqx_bridge_v2_testlib:bridge_id(ActionParams1),
+            ActionResId2 = emqx_bridge_v2_testlib:bridge_id(ActionParams2),
+            ?retry(
+                100,
+                10,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 1,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            %% todo: bump action failure count when dropped to mimic common
+                            %% buffer worker behavior.
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 0
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+            reset_combined_metrics(ActionResId1, RuleId),
+            %% Now to make it drop expired messages
+            {200, _} =
+                update_action_api(ActionParams1, #{
+                    <<"parameters">> => #{
+                        <<"retention_period">> => <<"10ms">>
+                    }
+                }),
+            emqx_common_test_helpers:with_failure(down, ProxyName, ProxyHost, ProxyPort, fun() ->
+                SendMessage(),
+                ?retry(
+                    100,
+                    10,
+                    ?assertMatch(
+                        #{
+                            counters := #{
+                                'dropped.queue_full' := 0,
+                                'dropped.expired' := 0,
+                                success := 0,
+                                matched := 1,
+                                failed := 0,
+                                received := 0
+                            },
+                            gauges := #{
+                                inflight := 0,
+                                queuing := 1,
+                                queuing_bytes := QueuingBytes1
+                            }
+                        } when QueuingBytes1 > 0,
+                        get_combined_metrics(ActionResId1, RuleId)
+                    )
+                ),
+                %% Other action is not affected by telemetry events for first action.
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 0,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        }
+                    },
+                    emqx_resource:get_metrics(ActionResId2)
+                ),
+                ct:sleep(20),
+                ok
+            end),
+            %% After connection is restored, the request is already expired
+            ?retry(
+                500,
+                20,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 1,
+                            success := 0,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            %% todo: bump action failure count when dropped to mimic common
+                            %% buffer worker behavior.
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 0
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+            reset_combined_metrics(ActionResId1, RuleId),
+
+            %% Now, a success.
+            SendMessage(),
+            ?retry(
+                500,
+                20,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 1,
+                            matched := 1,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        },
+                        rule := #{
+                            matched := 1,
+                            'actions.failed' := 0,
+                            'actions.failed.unknown' := 0,
+                            'actions.success' := 1
+                        }
+                    },
+                    get_combined_metrics(ActionResId1, RuleId)
+                )
+            ),
+
+            %% Other action is not affected by telemetry events for first action.
+            ?retry(
+                100,
+                10,
+                ?assertMatch(
+                    #{
+                        counters := #{
+                            'dropped.queue_full' := 0,
+                            'dropped.expired' := 0,
+                            success := 0,
+                            matched := 0,
+                            failed := 0,
+                            received := 0
+                        },
+                        gauges := #{
+                            inflight := 0,
+                            queuing := 0,
+                            queuing_bytes := 0
+                        }
+                    },
+                    emqx_resource:get_metrics(ActionResId2)
+                )
+            ),
+
+            ok
+        end,
+        []
+    ),
+    ok.

+ 24 - 0
apps/emqx_conf/etc/base.hocon

@@ -0,0 +1,24 @@
+## Define configurations that can later be overridden through UI/API/CLI.
+##
+## Config precedence order:
+##   etc/base.hocon < cluster.hocon < emqx.conf < environment variables
+
+## Logging configs
+## EMQX provides support for two primary log handlers: `file` and `console`,
+## with an additional `audit` handler specifically designed to always direct logs to files.
+## The system's default log handling behavior can be configured via the environment
+## variable `EMQX_DEFAULT_LOG_HANDLER`, which accepts the following settings:
+##  - `file`: Directs log output exclusively to files.
+##  - `console`: Channels log output solely to the console.
+## It's noteworthy that `EMQX_DEFAULT_LOG_HANDLER` is set to `file`
+## when EMQX is initiated via systemd `emqx.service` file.
+## In scenarios outside systemd initiation, `console` serves as the default log handler.
+## Read more about configs here: {{ emqx_configuration_doc_log }}
+log {
+    file {
+        level = warning
+    }
+    console {
+        level = warning
+    }
+}

+ 5 - 27
apps/emqx_conf/etc/emqx_conf.conf

@@ -1,12 +1,10 @@
-## NOTE:
-## This config file overrides data/configs/cluster.hocon,
-## and is merged with environment variables which start with 'EMQX_' prefix.
+## Place read-only configurations in this file.
+## To define configurations that can later be overridden through UI/API/CLI, add them to `etc/base.hocon`.
 ##
 ##
-## Config changes made from EMQX dashboard UI, management HTTP API, or CLI
-## are stored in data/configs/cluster.hocon.
-## To avoid confusion, please do not store the same configs in both files.
+## Config precedence order:
+##   etc/base.hocon < cluster.hocon < emqx.conf < environment variables
 ##
 ##
-## See {{ emqx_configuration_doc }} for more details.
+## See {{ emqx_configuration_doc }} for more information.
 ## Configuration full example can be found in etc/examples
 ## Configuration full example can be found in etc/examples
 
 
 node {
 node {
@@ -19,23 +17,3 @@ cluster {
   name = emqxcl
   name = emqxcl
   discovery_strategy = manual
   discovery_strategy = manual
 }
 }
-
-## EMQX provides support for two primary log handlers: `file` and `console`, with an additional `audit` handler specifically designed to always direct logs to files.
-## The system's default log handling behavior can be configured via the environment variable `EMQX_DEFAULT_LOG_HANDLER`, which accepts the following settings:
-##
-##   - `file`: Directs log output exclusively to files.
-##   - `console`: Channels log output solely to the console.
-##
-## It's noteworthy that `EMQX_DEFAULT_LOG_HANDLER` is set to `file` when EMQX is initiated via systemd `emqx.service` file.
-## In scenarios outside systemd initiation, `console` serves as the default log handler.
-
-## Read more about configs here: {{ emqx_configuration_doc_log }}
-
-# log {
-#     file {
-#         level = warning
-#     }
-#     console {
-#         level = warning
-#     }
-# }

+ 48 - 0
apps/emqx_connector/test/emqx_connector_SUITE.erl

@@ -20,6 +20,7 @@
 
 
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("common_test/include/ct.hrl").
 -include_lib("common_test/include/ct.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
 
 
 -define(CONNECTOR, emqx_connector_dummy_impl).
 -define(CONNECTOR, emqx_connector_dummy_impl).
 
 
@@ -327,6 +328,53 @@ t_no_buffer_workers(Config) ->
     ?assertEqual([], supervisor:which_children(emqx_resource_buffer_worker_sup)),
     ?assertEqual([], supervisor:which_children(emqx_resource_buffer_worker_sup)),
     ok.
     ok.
 
 
+%% Checks that the maximum timeout (currently) set by `resource_opts.health_check_timeout'
+%% is respected when doing a dry run, even if the removal gets stuck because the resource
+%% process is unresponsive.
+t_dryrun_timeout({'init', Config}) ->
+    meck:new(emqx_connector_resource, [passthrough]),
+    meck:expect(emqx_connector_resource, connector_to_resource_type, 1, ?CONNECTOR),
+    meck:new(?CONNECTOR, [non_strict]),
+    meck:expect(?CONNECTOR, resource_type, 0, dummy),
+    meck:expect(?CONNECTOR, callback_mode, 0, async_if_possible),
+    %% hang forever
+    meck:expect(?CONNECTOR, on_start, fun(_ConnResId, _Opts) ->
+        receive
+            go -> ok
+        end
+    end),
+    meck:expect(?CONNECTOR, on_get_channels, 1, []),
+    meck:expect(?CONNECTOR, on_add_channel, 4, {ok, connector_state}),
+    meck:expect(?CONNECTOR, on_stop, 2, ok),
+    meck:expect(?CONNECTOR, on_get_status, 2, connected),
+    meck:expect(?CONNECTOR, query_mode, 1, sync),
+    Config;
+t_dryrun_timeout({'end', _Config}) ->
+    meck:unload(),
+    ok;
+t_dryrun_timeout(Config) when is_list(Config) ->
+    Type = kafka_producer,
+    Conf0 = connector_config(),
+    Timeout = 100,
+    Conf = Conf0#{<<"resource_opts">> => #{<<"health_check_interval">> => Timeout}},
+    %% Minimum timeout is capped at 5 s in `emqx_resource_manager'...  Plus, we need to
+    %% wait for removal of stuck process, which itself has another 5 s timeout.
+    ct:timetrap(15_000),
+    %% Cache cleaner is triggered when the process initiating the dry run dies.
+    Pid = spawn_link(fun() ->
+        Res = emqx_connector_resource:create_dry_run(Type, Conf),
+        ?assertEqual({error, timeout}, Res),
+        ok
+    end),
+    MRef = monitor(process, Pid),
+    receive
+        {'DOWN', MRef, _, _, _} ->
+            ok
+    end,
+    %% Should be removed asynchronously by cache cleaner.
+    ?retry(1_000, 7, ?assertEqual([], emqx_resource:list_instances())),
+    ok.
+
 %% helpers
 %% helpers
 
 
 connector_config() ->
 connector_config() ->

+ 1 - 1
apps/emqx_dashboard/src/emqx_dashboard.app.src

@@ -2,7 +2,7 @@
 {application, emqx_dashboard, [
 {application, emqx_dashboard, [
     {description, "EMQX Web Dashboard"},
     {description, "EMQX Web Dashboard"},
     % strict semver, bump manually!
     % strict semver, bump manually!
-    {vsn, "5.1.7"},
+    {vsn, "5.2.0"},
     {modules, []},
     {modules, []},
     {registered, [emqx_dashboard_sup]},
     {registered, [emqx_dashboard_sup]},
     {applications, [
     {applications, [

+ 7 - 4
apps/emqx_dashboard/src/emqx_dashboard_monitor.erl

@@ -24,7 +24,7 @@
 
 
 -behaviour(gen_server).
 -behaviour(gen_server).
 
 
--export([create_tables/0]).
+-export([create_tables/0, clear_table/0]).
 -export([start_link/0]).
 -export([start_link/0]).
 
 
 -export([
 -export([
@@ -98,6 +98,9 @@ create_tables() ->
     ]),
     ]),
     [?TAB].
     [?TAB].
 
 
+clear_table() ->
+    mria:clear_table(?TAB).
+
 %% -------------------------------------------------------------------------------------------------
 %% -------------------------------------------------------------------------------------------------
 %% API
 %% API
 
 
@@ -133,7 +136,7 @@ current_rate(Node) when Node == node() ->
             {ok, maps:merge(maps:from_list(Rate0), non_rate_value())}
             {ok, maps:merge(maps:from_list(Rate0), non_rate_value())}
     end;
     end;
 current_rate(Node) ->
 current_rate(Node) ->
-    case emqx_dashboard_proto_v1:current_rate(Node) of
+    case emqx_dashboard_proto_v2:current_rate(Node) of
         {badrpc, Reason} ->
         {badrpc, Reason} ->
             {badrpc, #{node => Node, reason => Reason}};
             {badrpc, #{node => Node, reason => Reason}};
         {ok, Rate} ->
         {ok, Rate} ->
@@ -316,7 +319,7 @@ do_sample(all, Time) when is_integer(Time) ->
 do_sample(Node, Time) when Node == node() andalso is_integer(Time) ->
 do_sample(Node, Time) when Node == node() andalso is_integer(Time) ->
     do_sample_local(Time);
     do_sample_local(Time);
 do_sample(Node, Time) when is_integer(Time) ->
 do_sample(Node, Time) when is_integer(Time) ->
-    case emqx_dashboard_proto_v1:do_sample(Node, Time) of
+    case emqx_dashboard_proto_v2:do_sample(Node, Time) of
         {badrpc, Reason} ->
         {badrpc, Reason} ->
             {badrpc, #{node => Node, reason => Reason}};
             {badrpc, #{node => Node, reason => Reason}};
         Res ->
         Res ->
@@ -348,7 +351,7 @@ sample_nodes(Nodes, Time) ->
     lists:foldl(fun(I, B) -> merge_samplers(Time, I, B) end, #{}, Success).
     lists:foldl(fun(I, B) -> merge_samplers(Time, I, B) end, #{}, Success).
 
 
 concurrently_sample_nodes(Nodes, Time) ->
 concurrently_sample_nodes(Nodes, Time) ->
-    %% emqx_dashboard_proto_v1:do_sample has a timeout (5s),
+    %% emqx_dashboard_proto_v2:do_sample has a timeout (5s),
     %% call emqx_utils:pmap here instead of a rpc multicall
     %% call emqx_utils:pmap here instead of a rpc multicall
     %% to avoid having to introduce a new bpapi proto version
     %% to avoid having to introduce a new bpapi proto version
     emqx_utils:pmap(fun(Node) -> do_sample(Node, Time) end, Nodes, infinity).
     emqx_utils:pmap(fun(Node) -> do_sample(Node, Time) end, Nodes, infinity).

+ 20 - 1
apps/emqx_dashboard/src/emqx_dashboard_monitor_api.erl

@@ -20,6 +20,7 @@
 -include_lib("typerefl/include/types.hrl").
 -include_lib("typerefl/include/types.hrl").
 -include_lib("hocon/include/hocon_types.hrl").
 -include_lib("hocon/include/hocon_types.hrl").
 -include_lib("emqx_utils/include/emqx_utils_api.hrl").
 -include_lib("emqx_utils/include/emqx_utils_api.hrl").
+-include_lib("emqx/include/logger.hrl").
 
 
 -behaviour(minirest_api).
 -behaviour(minirest_api).
 
 
@@ -61,6 +62,13 @@ schema("/monitor") ->
                 200 => hoconsc:mk(hoconsc:array(hoconsc:ref(sampler)), #{}),
                 200 => hoconsc:mk(hoconsc:array(hoconsc:ref(sampler)), #{}),
                 400 => emqx_dashboard_swagger:error_codes(['BAD_RPC'], <<"Bad RPC">>)
                 400 => emqx_dashboard_swagger:error_codes(['BAD_RPC'], <<"Bad RPC">>)
             }
             }
+        },
+        delete => #{
+            tags => [<<"Metrics">>],
+            description => ?DESC(clear_monitor),
+            responses => #{
+                204 => <<"Metrics deleted">>
+            }
         }
         }
     };
     };
 schema("/monitor/nodes/:node") ->
 schema("/monitor/nodes/:node") ->
@@ -148,7 +156,18 @@ fields_current(Names) ->
 monitor(get, #{query_string := QS, bindings := Bindings}) ->
 monitor(get, #{query_string := QS, bindings := Bindings}) ->
     Latest = maps:get(<<"latest">>, QS, infinity),
     Latest = maps:get(<<"latest">>, QS, infinity),
     RawNode = maps:get(node, Bindings, <<"all">>),
     RawNode = maps:get(node, Bindings, <<"all">>),
-    emqx_utils_api:with_node_or_cluster(RawNode, dashboard_samplers_fun(Latest)).
+    emqx_utils_api:with_node_or_cluster(RawNode, dashboard_samplers_fun(Latest));
+monitor(delete, _) ->
+    Nodes = emqx:running_nodes(),
+    Results = emqx_dashboard_proto_v2:clear_table(Nodes),
+    NodeResults = lists:zip(Nodes, Results),
+    NodeErrors = [Result || Result = {_Node, NOk} <- NodeResults, NOk =/= {atomic, ok}],
+    NodeErrors == [] orelse
+        ?SLOG(warning, #{
+            msg => "clear_monitor_metrics_rpc_errors",
+            errors => NodeErrors
+        }),
+    ?NO_CONTENT.
 
 
 dashboard_samplers_fun(Latest) ->
 dashboard_samplers_fun(Latest) ->
     fun(NodeOrCluster) ->
     fun(NodeOrCluster) ->

+ 5 - 1
apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v1.erl

@@ -21,7 +21,8 @@
 -export([
 -export([
     introduced_in/0,
     introduced_in/0,
     do_sample/2,
     do_sample/2,
-    current_rate/1
+    current_rate/1,
+    deprecated_since/0
 ]).
 ]).
 
 
 -include("emqx_dashboard.hrl").
 -include("emqx_dashboard.hrl").
@@ -30,6 +31,9 @@
 introduced_in() ->
 introduced_in() ->
     "5.0.0".
     "5.0.0".
 
 
+deprecated_since() ->
+    "5.8.4".
+
 -spec do_sample(node(), Latest :: pos_integer() | infinity) -> list(map()) | emqx_rpc:badrpc().
 -spec do_sample(node(), Latest :: pos_integer() | infinity) -> list(map()) | emqx_rpc:badrpc().
 do_sample(Node, Latest) ->
 do_sample(Node, Latest) ->
     rpc:call(Node, emqx_dashboard_monitor, do_sample, [Node, Latest], ?RPC_TIMEOUT).
     rpc:call(Node, emqx_dashboard_monitor, do_sample, [Node, Latest], ?RPC_TIMEOUT).

+ 44 - 0
apps/emqx_dashboard/src/proto/emqx_dashboard_proto_v2.erl

@@ -0,0 +1,44 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_dashboard_proto_v2).
+
+-behaviour(emqx_bpapi).
+
+-export([
+    introduced_in/0,
+    do_sample/2,
+    clear_table/1,
+    current_rate/1
+]).
+
+-include("emqx_dashboard.hrl").
+-include_lib("emqx/include/bpapi.hrl").
+
+introduced_in() ->
+    "5.8.4".
+
+-spec do_sample(node(), Latest :: pos_integer() | infinity) -> list(map()) | emqx_rpc:badrpc().
+do_sample(Node, Latest) ->
+    erpc:call(Node, emqx_dashboard_monitor, do_sample, [Node, Latest], ?RPC_TIMEOUT).
+
+-spec clear_table(Nodes :: [node()]) -> emqx_rpc:erpc_multicall(ok).
+clear_table(Nodes) ->
+    erpc:multicall(Nodes, emqx_dashboard_monitor, clear_table, [], ?RPC_TIMEOUT).
+
+-spec current_rate(node()) -> {ok, map()} | emqx_rpc:badrpc().
+current_rate(Node) ->
+    erpc:call(Node, emqx_dashboard_monitor, current_rate, [Node], ?RPC_TIMEOUT).

+ 17 - 7
apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl

@@ -405,8 +405,8 @@ t_handle_old_monitor_data(_Config) ->
 
 
     ok = meck:new(emqx, [passthrough, no_history]),
     ok = meck:new(emqx, [passthrough, no_history]),
     ok = meck:expect(emqx, running_nodes, fun() -> [node(), 'other@node'] end),
     ok = meck:expect(emqx, running_nodes, fun() -> [node(), 'other@node'] end),
-    ok = meck:new(emqx_dashboard_proto_v1, [passthrough, no_history]),
-    ok = meck:expect(emqx_dashboard_proto_v1, do_sample, fun('other@node', _Time) ->
+    ok = meck:new(emqx_dashboard_proto_v2, [passthrough, no_history]),
+    ok = meck:expect(emqx_dashboard_proto_v2, do_sample, fun('other@node', _Time) ->
         Self ! sample_called,
         Self ! sample_called,
         FakeOldData
         FakeOldData
     end),
     end),
@@ -421,7 +421,7 @@ t_handle_old_monitor_data(_Config) ->
         hd(emqx_dashboard_monitor:samplers())
         hd(emqx_dashboard_monitor:samplers())
     ),
     ),
     ?assertReceive(sample_called, 1_000),
     ?assertReceive(sample_called, 1_000),
-    ok = meck:unload([emqx, emqx_dashboard_proto_v1]),
+    ok = meck:unload([emqx, emqx_dashboard_proto_v2]),
     ok.
     ok.
 
 
 t_monitor_api(_) ->
 t_monitor_api(_) ->
@@ -583,6 +583,8 @@ t_monitor_reset(_) ->
         ),
         ),
     {ok, Samplers} = request(["monitor"], "latest=1"),
     {ok, Samplers} = request(["monitor"], "latest=1"),
     ?assertEqual(1, erlang:length(Samplers)),
     ?assertEqual(1, erlang:length(Samplers)),
+    ok = delete(["monitor"]),
+    ?assertMatch({ok, []}, request(["monitor"], "latest=1")),
     ok.
     ok.
 
 
 t_monitor_api_error(_) ->
 t_monitor_api_error(_) ->
@@ -666,7 +668,7 @@ t_persistent_session_stats(Config) ->
                 <<"connections">> := 3,
                 <<"connections">> := 3,
                 <<"disconnected_durable_sessions">> := 1,
                 <<"disconnected_durable_sessions">> := 1,
                 %% N.B.: we currently don't perform any deduplication between persistent
                 %% N.B.: we currently don't perform any deduplication between persistent
-                %% and non-persistent routes, so we count `commont/topic' twice and get 8
+                %% and non-persistent routes, so we count `common/topic' twice and get 8
                 %% instead of 6 here.
                 %% instead of 6 here.
                 <<"topics">> := 8,
                 <<"topics">> := 8,
                 <<"subscriptions">> := 8,
                 <<"subscriptions">> := 8,
@@ -702,7 +704,7 @@ t_persistent_session_stats(Config) ->
                 <<"connections">> := 3,
                 <<"connections">> := 3,
                 <<"disconnected_durable_sessions">> := 2,
                 <<"disconnected_durable_sessions">> := 2,
                 %% N.B.: we currently don't perform any deduplication between persistent
                 %% N.B.: we currently don't perform any deduplication between persistent
-                %% and non-persistent routes, so we count `commont/topic' twice and get 8
+                %% and non-persistent routes, so we count `common/topic' twice and get 8
                 %% instead of 6 here.
                 %% instead of 6 here.
                 <<"topics">> := 8,
                 <<"topics">> := 8,
                 <<"subscriptions">> := 8,
                 <<"subscriptions">> := 8,
@@ -712,7 +714,9 @@ t_persistent_session_stats(Config) ->
             ?ON(N1, request(["monitor_current"]))
             ?ON(N1, request(["monitor_current"]))
         )
         )
     end),
     end),
-
+    ?assertNotMatch({ok, []}, ?ON(N1, request(["monitor"]))),
+    ?assertMatch(ok, ?ON(N1, delete(["monitor"]))),
+    ?assertMatch({ok, []}, ?ON(N1, request(["monitor"]))),
     ok.
     ok.
 
 
 %% Checks that we get consistent data when changing the requested time window for
 %% Checks that we get consistent data when changing the requested time window for
@@ -842,6 +846,10 @@ get_req_cluster(Config, Path, QS) ->
 host(Port) ->
 host(Port) ->
     "http://127.0.0.1:" ++ integer_to_list(Port).
     "http://127.0.0.1:" ++ integer_to_list(Port).
 
 
+delete(Path) ->
+    Url = url(Path, ""),
+    do_request_api(delete, {Url, [auth_header_()]}).
+
 url(Parts, QS) ->
 url(Parts, QS) ->
     url(?SERVER, Parts, QS).
     url(?SERVER, Parts, QS).
 
 
@@ -858,6 +866,8 @@ do_request_api(Method, Request) ->
     case httpc:request(Method, Request, [], []) of
     case httpc:request(Method, Request, [], []) of
         {error, socket_closed_remotely} ->
         {error, socket_closed_remotely} ->
             {error, socket_closed_remotely};
             {error, socket_closed_remotely};
+        {ok, {{"HTTP/1.1", 204, _}, _, _}} ->
+            ok;
         {ok, {{"HTTP/1.1", Code, _}, _, Return}} when
         {ok, {{"HTTP/1.1", Code, _}, _, Return}} when
             Code >= 200 andalso Code =< 299
             Code >= 200 andalso Code =< 299
         ->
         ->
@@ -960,4 +970,4 @@ cluster_node_appspec(Enable, Port0) ->
     ].
     ].
 
 
 clean_data() ->
 clean_data() ->
-    ok = emqx_dashboard_monitor:clean(-1).
+    ok = emqx_dashboard_monitor:clean(-100000).

+ 1 - 1
apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl

@@ -239,7 +239,7 @@ transitions(Node, DB) ->
 %% Try to eliminate any ambiguity in the message representation.
 %% Try to eliminate any ambiguity in the message representation.
 message_canonical_form(Msg0 = #message{}) ->
 message_canonical_form(Msg0 = #message{}) ->
     message_canonical_form(emqx_message:to_map(Msg0));
     message_canonical_form(emqx_message:to_map(Msg0));
-message_canonical_form(#{flags := Flags0, headers := Headers0, payload := Payload0} = Msg) ->
+message_canonical_form(#{flags := Flags0, headers := _Headers0, payload := Payload0} = Msg) ->
     %% Remove flags that are false:
     %% Remove flags that are false:
     Flags = maps:filter(
     Flags = maps:filter(
         fun(_Key, Val) -> Val end,
         fun(_Key, Val) -> Val end,

+ 1 - 1
apps/emqx_management/src/emqx_management.app.src

@@ -2,7 +2,7 @@
 {application, emqx_management, [
 {application, emqx_management, [
     {description, "EMQX Management API and CLI"},
     {description, "EMQX Management API and CLI"},
     % strict semver, bump manually!
     % strict semver, bump manually!
-    {vsn, "5.3.3"},
+    {vsn, "5.3.4"},
     {modules, []},
     {modules, []},
     {registered, [emqx_management_sup]},
     {registered, [emqx_management_sup]},
     {applications, [
     {applications, [

+ 98 - 16
apps/emqx_management/src/emqx_mgmt_api.erl

@@ -54,7 +54,11 @@
 ]).
 ]).
 
 
 -ifdef(TEST).
 -ifdef(TEST).
+-include_lib("proper/include/proper.hrl").
+-include_lib("eunit/include/eunit.hrl").
+
 -export([paginate_test_format/1]).
 -export([paginate_test_format/1]).
+
 -endif.
 -endif.
 
 
 -export_type([
 -export_type([
@@ -557,18 +561,23 @@ accumulate_query_rows(
     Len = length(Rows),
     Len = length(Rows),
     case Cursor + Len of
     case Cursor + Len of
         NCursor when NCursor < PageStart ->
         NCursor when NCursor < PageStart ->
+            %% Haven't reached the required page.
             {more, ResultAcc#{cursor => NCursor}};
             {more, ResultAcc#{cursor => NCursor}};
         NCursor when NCursor < PageEnd ->
         NCursor when NCursor < PageEnd ->
+            %% Rows overlap with the page start
+            %% Throw away rows in the beginning belonging to the previous page(s).
             SubRows = lists:nthtail(max(0, PageStart - Cursor - 1), Rows),
             SubRows = lists:nthtail(max(0, PageStart - Cursor - 1), Rows),
             {more, ResultAcc#{
             {more, ResultAcc#{
                 cursor => NCursor,
                 cursor => NCursor,
                 count => Count + length(SubRows),
                 count => Count + length(SubRows),
                 rows => [{Node, SubRows} | RowsAcc]
                 rows => [{Node, SubRows} | RowsAcc]
             }};
             }};
-        NCursor when NCursor >= PageEnd + Limit ->
-            {enough, ResultAcc#{cursor => NCursor}};
         NCursor when NCursor >= PageEnd ->
         NCursor when NCursor >= PageEnd ->
-            SubRows = lists:sublist(Rows, Limit - Count),
+            %% Rows overlap with the page end (and potentially with the page start).
+            %% Throw away rows in the beginning belonging to the previous page(s).
+            %% Then throw away rows in the tail belonging to the next page(s).
+            PageRows = lists:nthtail(max(0, PageStart - Cursor - 1), Rows),
+            SubRows = lists:sublist(PageRows, Limit - Count),
             {enough, ResultAcc#{
             {enough, ResultAcc#{
                 cursor => NCursor,
                 cursor => NCursor,
                 count => Count + length(SubRows),
                 count => Count + length(SubRows),
@@ -707,20 +716,19 @@ format_query_result(
         end,
         end,
     #{
     #{
         meta => Meta,
         meta => Meta,
-        data => lists:flatten(
-            lists:foldl(
-                fun({Node, Rows}, Acc) ->
-                    [
-                        lists:map(fun(Row) -> exec_format_fun(FmtFun, Node, Row, Opts) end, Rows)
-                        | Acc
-                    ]
-                end,
-                [],
-                RowsAcc
-            )
-        )
+        data => format_query_data(FmtFun, RowsAcc, Opts)
     }.
     }.
 
 
+format_query_data(FmtFun, RowsAcc, Opts) ->
+    %% NOTE: `RowsAcc` is reversed in the node-order, `lists:foldl/3` is correct here.
+    lists:foldl(
+        fun({Node, Rows}, Acc) ->
+            [exec_format_fun(FmtFun, Node, R, Opts) || R <- Rows] ++ Acc
+        end,
+        [],
+        RowsAcc
+    ).
+
 exec_format_fun(FmtFun, Node, Row, Opts) ->
 exec_format_fun(FmtFun, Node, Row, Opts) ->
     case erlang:fun_info(FmtFun, arity) of
     case erlang:fun_info(FmtFun, arity) of
         {arity, 1} -> FmtFun(Row);
         {arity, 1} -> FmtFun(Row);
@@ -813,7 +821,6 @@ b2i(Any) ->
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------
 
 
 -ifdef(TEST).
 -ifdef(TEST).
--include_lib("eunit/include/eunit.hrl").
 
 
 params2qs_test_() ->
 params2qs_test_() ->
     QSchema = [
     QSchema = [
@@ -926,4 +933,79 @@ assert_paginate_results(Results, Size, Limit) ->
             ?_assertEqual(Size, length(AllData)),
             ?_assertEqual(Size, length(AllData)),
             ?_assertEqual(Size, sets:size(sets:from_list(AllData)))
             ?_assertEqual(Size, sets:size(sets:from_list(AllData)))
         ].
         ].
+
+accumulate_prop_test() ->
+    ?assert(proper:quickcheck(accumulate_prop(), [{numtests, 1000}])).
+
+accumulate_prop() ->
+    ?FORALL(
+        #{page := Page, limit := Limit, noderows := NodeRows},
+        emqx_proper_types:fixedmap(#{
+            page => page_t(),
+            limit => limit_t(),
+            noderows => noderows_t()
+        }),
+        begin
+            {Status, QRows} = accumulate_page_rows(Page, Limit, NodeRows),
+            {_Status, QRowsNext} = accumulate_page_rows(Page + 1, Limit, NodeRows),
+            measure(
+                #{
+                    "Limit" => Limit,
+                    "Page" => Page,
+                    "NRows" => length(QRows),
+                    "Complete" => emqx_utils_conv:int(Status == enough)
+                },
+                %% Verify page is non-empty if accumulation is complete.
+                accumulate_assert_nonempty(Status, Limit, QRows) and
+                    %% Verify rows across 2 consective pages form continuous sequence.
+                    accumulate_assert_continuous(QRows ++ QRowsNext)
+            )
+        end
+    ).
+
+accumulate_page_rows(Page, Limit, NodeRows) ->
+    QState = #{page => Page, limit => Limit},
+    {Status, #{rows := QRowsAcc}} = lists:foldl(
+        fun
+            ({Node, Rows}, {more, QRAcc}) ->
+                accumulate_query_rows(Node, Rows, QState, QRAcc);
+            (_NodeRows, {enough, QRAcc}) ->
+                {enough, QRAcc}
+        end,
+        {more, init_query_result()},
+        NodeRows
+    ),
+    QRows = format_query_data(fun(N, R) -> {N, R} end, QRowsAcc, #{}),
+    {Status, QRows}.
+
+accumulate_assert_nonempty(enough, Limit, QRows) ->
+    length(QRows) =:= Limit;
+accumulate_assert_nonempty(more, _Limit, _QRows) ->
+    true.
+
+accumulate_assert_continuous([{N, R1} | Rest = [{N, R2} | _]]) ->
+    (R2 - R1 =:= 1) andalso accumulate_assert_continuous(Rest);
+accumulate_assert_continuous([{_N1, _} | Rest = [{_N2, R} | _]]) ->
+    (R =:= 1) andalso accumulate_assert_continuous(Rest);
+accumulate_assert_continuous([_]) ->
+    true;
+accumulate_assert_continuous([]) ->
+    true.
+
+page_t() ->
+    pos_integer().
+
+limit_t() ->
+    emqx_proper_types:scaled(0.6, pos_integer()).
+
+noderows_t() ->
+    ?LET(
+        {Nodes, PageSize},
+        {pos_integer(), limit_t()},
+        [{N, lists:seq(1, PageSize)} || N <- lists:seq(1, Nodes)]
+    ).
+
+measure(NamedSamples, Test) ->
+    maps:fold(fun(Name, Sample, Acc) -> measure(Name, Sample, Acc) end, Test, NamedSamples).
+
 -endif.
 -endif.

+ 1 - 1
apps/emqx_resource/src/emqx_resource.app.src

@@ -1,7 +1,7 @@
 %% -*- mode: erlang -*-
 %% -*- mode: erlang -*-
 {application, emqx_resource, [
 {application, emqx_resource, [
     {description, "Manager for all external resources"},
     {description, "Manager for all external resources"},
-    {vsn, "0.1.36"},
+    {vsn, "0.1.37"},
     {registered, []},
     {registered, []},
     {mod, {emqx_resource_app, []}},
     {mod, {emqx_resource_app, []}},
     {applications, [
     {applications, [

+ 8 - 1
apps/emqx_resource/src/emqx_resource_buffer_worker.erl

@@ -1321,7 +1321,7 @@ extract_connector_id(Id) when is_binary(Id) ->
 %% There is no need to query the conncector if the channel is not
 %% There is no need to query the conncector if the channel is not
 %% installed as the query will fail anyway.
 %% installed as the query will fail anyway.
 pre_query_channel_check(Id, {Id, _} = _Request, ChanSt, IsSimpleQuery) ->
 pre_query_channel_check(Id, {Id, _} = _Request, ChanSt, IsSimpleQuery) ->
-    case emqx_resource_manager:channel_status_is_channel_added(ChanSt) of
+    case is_channel_apt_for_queries(ChanSt) of
         true ->
         true ->
             ok;
             ok;
         false ->
         false ->
@@ -2365,6 +2365,13 @@ buffer_worker(_Tid) ->
 is_simple_query(#{simple_query := Bool}) ->
 is_simple_query(#{simple_query := Bool}) ->
     Bool.
     Bool.
 
 
+is_channel_apt_for_queries(?status_connected) ->
+    true;
+is_channel_apt_for_queries(?status_connecting) ->
+    true;
+is_channel_apt_for_queries(_) ->
+    false.
+
 -ifdef(TEST).
 -ifdef(TEST).
 -include_lib("eunit/include/eunit.hrl").
 -include_lib("eunit/include/eunit.hrl").
 adjust_batch_time_test_() ->
 adjust_batch_time_test_() ->

+ 18 - 5
apps/emqx_resource/src/emqx_resource_cache_cleaner.erl

@@ -79,7 +79,7 @@ handle_call(_Request, _From, State) ->
     {reply, ok, State}.
     {reply, ok, State}.
 
 
 handle_cast(#add_dry_run{id = ID, pid = Pid}, #{dry_run_pmon := Pmon0} = State0) ->
 handle_cast(#add_dry_run{id = ID, pid = Pid}, #{dry_run_pmon := Pmon0} = State0) ->
-    Pmon = emqx_pmon:monitor(Pid, ID, Pmon0),
+    Pmon = append_monitor(Pmon0, Pid, ID),
     State = State0#{dry_run_pmon := Pmon},
     State = State0#{dry_run_pmon := Pmon},
     {noreply, State};
     {noreply, State};
 handle_cast(_Msg, State) ->
 handle_cast(_Msg, State) ->
@@ -108,8 +108,8 @@ handle_down(Pid, State0) ->
             handle_down_cache(ID, Pid, State0);
             handle_down_cache(ID, Pid, State0);
         error ->
         error ->
             case emqx_pmon:find(Pid, DryrunPmon) of
             case emqx_pmon:find(Pid, DryrunPmon) of
-                {ok, ID} ->
-                    handle_down_dry_run(ID, Pid, State0);
+                {ok, IDs} ->
+                    handle_down_dry_run(IDs, Pid, State0);
                 error ->
                 error ->
                     State0
                     State0
             end
             end
@@ -121,16 +121,20 @@ handle_down_cache(ID, Pid, State0) ->
     Pmon = emqx_pmon:erase(Pid, Pmon0),
     Pmon = emqx_pmon:erase(Pid, Pmon0),
     State0#{cache_pmon := Pmon}.
     State0#{cache_pmon := Pmon}.
 
 
-handle_down_dry_run(ID, Pid, State0) ->
+handle_down_dry_run([ID | Rest], Pid, State0) ->
     #{dry_run_pmon := Pmon0} = State0,
     #{dry_run_pmon := Pmon0} = State0,
     %% No need to wait here: since it's a dry run resource, it won't be recreated,
     %% No need to wait here: since it's a dry run resource, it won't be recreated,
     %% assuming the ID is random enough.
     %% assuming the ID is random enough.
     spawn(fun() ->
     spawn(fun() ->
+        _ = emqx_resource_manager:remove(ID),
         emqx_resource_manager_sup:delete_child(ID),
         emqx_resource_manager_sup:delete_child(ID),
         ?tp("resource_cache_cleaner_deleted_child", #{id => ID})
         ?tp("resource_cache_cleaner_deleted_child", #{id => ID})
     end),
     end),
     Pmon = emqx_pmon:erase(Pid, Pmon0),
     Pmon = emqx_pmon:erase(Pid, Pmon0),
-    State0#{dry_run_pmon := Pmon}.
+    State = State0#{dry_run_pmon := Pmon},
+    handle_down_dry_run(Rest, Pid, State);
+handle_down_dry_run([], _Pid, State) ->
+    State.
 
 
 maybe_erase_cache(DownManager, ID) ->
 maybe_erase_cache(DownManager, ID) ->
     case emqx_resource_cache:read_manager_pid(ID) =:= DownManager of
     case emqx_resource_cache:read_manager_pid(ID) =:= DownManager of
@@ -141,3 +145,12 @@ maybe_erase_cache(DownManager, ID) ->
             %% restart by supervisor
             %% restart by supervisor
             ok
             ok
     end.
     end.
+
+append_monitor(Pmon0, Pid, Value) ->
+    case emqx_pmon:find(Pid, Pmon0) of
+        error ->
+            emqx_pmon:monitor(Pid, [Value], Pmon0);
+        {ok, Values} ->
+            Pmon = emqx_pmon:demonitor(Pid, Pmon0),
+            emqx_pmon:monitor(Pid, [Value | Values], Pmon)
+    end.

+ 236 - 116
apps/emqx_resource/src/emqx_resource_manager.erl

@@ -50,12 +50,12 @@
     is_exist/1,
     is_exist/1,
     get_metrics/1,
     get_metrics/1,
     reset_metrics/1,
     reset_metrics/1,
-    channel_status_is_channel_added/1,
     get_query_mode_and_last_error/2
     get_query_mode_and_last_error/2
 ]).
 ]).
 
 
 -export([
 -export([
-    set_resource_status_connecting/1
+    set_resource_status_connecting/1,
+    external_error/1
 ]).
 ]).
 
 
 % Server
 % Server
@@ -71,6 +71,13 @@
 -export([stop/2]).
 -export([stop/2]).
 -endif.
 -endif.
 
 
+%%------------------------------------------------------------------------------
+%% Type definitions
+%%------------------------------------------------------------------------------
+
+-define(not_added_yet, {?MODULE, not_added_yet}).
+-define(add_channel_failed(REASON), {?MODULE, add_channel_failed, REASON}).
+
 % State record
 % State record
 -record(data, {
 -record(data, {
     id,
     id,
@@ -101,8 +108,8 @@
     },
     },
     %% Callers waiting on health check
     %% Callers waiting on health check
     hc_pending_callers = #{resource => [], channel => #{}} :: #{
     hc_pending_callers = #{resource => [], channel => #{}} :: #{
-        resource := [gen_server:from()],
-        channel := #{channel_id() => [gen_server:from()]}
+        resource := [gen_statem:from()],
+        channel := #{channel_id() => [gen_statem:from()]}
     },
     },
     extra
     extra
 }).
 }).
@@ -146,11 +153,15 @@
 %% calls/casts/generic timeouts
 %% calls/casts/generic timeouts
 -record(add_channel, {channel_id :: channel_id(), config :: map()}).
 -record(add_channel, {channel_id :: channel_id(), config :: map()}).
 -record(start_channel_health_check, {channel_id :: channel_id()}).
 -record(start_channel_health_check, {channel_id :: channel_id()}).
+-record(retry_add_channel, {channel_id :: channel_id()}).
 
 
 -type generic_timeout(Id, Content) :: {{timeout, Id}, timeout(), Content}.
 -type generic_timeout(Id, Content) :: {{timeout, Id}, timeout(), Content}.
 -type start_channel_health_check_action() :: generic_timeout(
 -type start_channel_health_check_action() :: generic_timeout(
     #start_channel_health_check{}, #start_channel_health_check{}
     #start_channel_health_check{}, #start_channel_health_check{}
 ).
 ).
+-type retry_add_channel_action() :: generic_timeout(
+    #retry_add_channel{}, #retry_add_channel{}
+).
 
 
 %%------------------------------------------------------------------------------
 %%------------------------------------------------------------------------------
 %% API
 %% API
@@ -273,10 +284,13 @@ create_dry_run(ResId, ResourceType, Config, OnReadyCallback) ->
                     Error
                     Error
             end;
             end;
         {error, Reason} ->
         {error, Reason} ->
-            _ = remove(ResId),
+            %% Removal is done asynchronously.  See comment below.
             {error, Reason};
             {error, Reason};
         timeout ->
         timeout ->
-            _ = remove(ResId),
+            %% Removal is done asynchronously by the cache cleaner.  If the resource
+            %% process is stuck and not responding to calls, doing the removal
+            %% synchronously here would take more time than the defined timeout, possibly
+            %% timing out HTTP API requests.
             {error, timeout}
             {error, timeout}
     end.
     end.
 
 
@@ -620,9 +634,10 @@ handle_event({call, From}, {channel_health_check, ChannelId}, _State, Data) ->
 %%--------------------------
 %%--------------------------
 %% State: CONNECTING
 %% State: CONNECTING
 %%--------------------------
 %%--------------------------
-handle_event(enter, _OldState, ?state_connecting = State, Data) ->
+handle_event(enter, _OldState, ?state_connecting = State, Data0) ->
+    Data = abort_all_channel_health_checks(Data0),
     ok = log_status_consistency(State, Data),
     ok = log_status_consistency(State, Data),
-    {keep_state_and_data, [{state_timeout, 0, health_check}]};
+    {keep_state, Data, [{state_timeout, 0, health_check}]};
 handle_event(internal, start_resource, ?state_connecting, Data) ->
 handle_event(internal, start_resource, ?state_connecting, Data) ->
     start_resource(Data, undefined);
     start_resource(Data, undefined);
 handle_event(state_timeout, health_check, ?state_connecting, Data) ->
 handle_event(state_timeout, health_check, ?state_connecting, Data) ->
@@ -640,7 +655,7 @@ handle_event(enter, _OldState, ?state_connected = State, Data) ->
     ok = log_status_consistency(State, Data),
     ok = log_status_consistency(State, Data),
     _ = emqx_alarm:safe_deactivate(Data#data.id),
     _ = emqx_alarm:safe_deactivate(Data#data.id),
     ?tp(resource_connected_enter, #{}),
     ?tp(resource_connected_enter, #{}),
-    {keep_state_and_data, resource_health_check_actions(Data)};
+    {keep_state, Data, resource_health_check_actions(Data)};
 handle_event(state_timeout, health_check, ?state_connected, Data) ->
 handle_event(state_timeout, health_check, ?state_connected, Data) ->
     start_resource_health_check(Data);
     start_resource_health_check(Data);
 handle_event(
 handle_event(
@@ -661,13 +676,17 @@ handle_event(
     Data
     Data
 ) ->
 ) ->
     handle_start_channel_health_check(Data, ChannelId);
     handle_start_channel_health_check(Data, ChannelId);
+handle_event(
+    {timeout, #retry_add_channel{channel_id = ChannelId}}, _, ?state_connected = _State, Data
+) ->
+    handle_retry_add_channel(Data, ChannelId);
 %%--------------------------
 %%--------------------------
 %% State: DISCONNECTED
 %% State: DISCONNECTED
 %%--------------------------
 %%--------------------------
 handle_event(enter, _OldState, ?state_disconnected = State, Data0) ->
 handle_event(enter, _OldState, ?state_disconnected = State, Data0) ->
     ok = log_status_consistency(State, Data0),
     ok = log_status_consistency(State, Data0),
     ?tp(resource_disconnected_enter, #{}),
     ?tp(resource_disconnected_enter, #{}),
-    Data = handle_abort_all_channel_health_checks(Data0),
+    Data = abort_all_channel_health_checks(Data0),
     {keep_state, Data, retry_actions(Data)};
     {keep_state, Data, retry_actions(Data)};
 handle_event(state_timeout, auto_retry, ?state_disconnected, Data) ->
 handle_event(state_timeout, auto_retry, ?state_disconnected, Data) ->
     ?tp(resource_auto_reconnect, #{}),
     ?tp(resource_auto_reconnect, #{}),
@@ -676,9 +695,10 @@ handle_event(state_timeout, auto_retry, ?state_disconnected, Data) ->
 %% State: STOPPED
 %% State: STOPPED
 %% The stopped state is entered after the resource has been explicitly stopped
 %% The stopped state is entered after the resource has been explicitly stopped
 %%--------------------------
 %%--------------------------
-handle_event(enter, _OldState, ?state_stopped = State, Data) ->
+handle_event(enter, _OldState, ?state_stopped = State, Data0) ->
+    Data = abort_all_channel_health_checks(Data0),
     ok = log_status_consistency(State, Data),
     ok = log_status_consistency(State, Data),
-    {keep_state_and_data, []};
+    {keep_state, Data};
 %%--------------------------
 %%--------------------------
 %% The following events can be handled in any other state
 %% The following events can be handled in any other state
 %%--------------------------
 %%--------------------------
@@ -713,6 +733,9 @@ handle_event(
     is_map_key(Pid, CHCWorkers)
     is_map_key(Pid, CHCWorkers)
 ->
 ->
     handle_channel_health_check_worker_down(Data0, Pid, Res);
     handle_channel_health_check_worker_down(Data0, Pid, Res);
+handle_event({timeout, #retry_add_channel{channel_id = _}}, _, _State, _Data) ->
+    %% We only add channels to the resource state in the connected state.
+    {keep_state_and_data, [postpone]};
 handle_event({timeout, #start_channel_health_check{channel_id = _}}, _, _State, _Data) ->
 handle_event({timeout, #start_channel_health_check{channel_id = _}}, _, _State, _Data) ->
     %% Stale health check action; currently, we only probe channel health when connected.
     %% Stale health check action; currently, we only probe channel health when connected.
     keep_state_and_data;
     keep_state_and_data;
@@ -811,11 +834,12 @@ start_resource(Data, From) ->
             ),
             ),
             _ = maybe_alarm(?status_disconnected, IsDryRun, ResId, Err, Data#data.error),
             _ = maybe_alarm(?status_disconnected, IsDryRun, ResId, Err, Data#data.error),
             %% Add channels and raise alarms
             %% Add channels and raise alarms
-            NewData1 = channels_health_check(?status_disconnected, add_channels(Data)),
+            {Actions0, NewData1} = channels_health_check(?status_disconnected, add_channels(Data)),
             %% Keep track of the error reason why the connection did not work
             %% Keep track of the error reason why the connection did not work
             %% so that the Reason can be returned when the verification call is made.
             %% so that the Reason can be returned when the verification call is made.
             NewData2 = NewData1#data{status = ?status_disconnected, error = Err},
             NewData2 = NewData1#data{status = ?status_disconnected, error = Err},
-            Actions = maybe_reply(retry_actions(NewData2), From, Err),
+            Actions1 = maybe_reply(retry_actions(NewData2), From, Err),
+            Actions = Actions1 ++ Actions0,
             {next_state, ?state_disconnected, update_state(NewData2), Actions}
             {next_state, ?state_disconnected, update_state(NewData2), Actions}
     end.
     end.
 
 
@@ -845,9 +869,12 @@ maybe_update_callback_mode(Data = #data{mod = ResourceType, state = ResourceStat
             Data#data{callback_mode = CallMode}
             Data#data{callback_mode = CallMode}
     end.
     end.
 
 
-add_channels_in_list([], Data) ->
-    Data;
-add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
+add_channels_in_list(ChannelsWithConfigs, Data) ->
+    add_channels_in_list(ChannelsWithConfigs, Data, _Actions = []).
+
+add_channels_in_list([], Data, Actions) ->
+    {Actions, Data};
+add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data, Actions) ->
     #data{
     #data{
         id = ResId,
         id = ResId,
         mod = Mod,
         mod = Mod,
@@ -869,6 +896,7 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
                 channel_status_new_waiting_for_health_check(ChannelConfig),
                 channel_status_new_waiting_for_health_check(ChannelConfig),
                 AddedChannelsMap
                 AddedChannelsMap
             ),
             ),
+            NewActions = Actions,
             NewData = Data#data{
             NewData = Data#data{
                 state = NewState,
                 state = NewState,
                 added_channels = NewAddedChannelsMap
                 added_channels = NewAddedChannelsMap
@@ -887,16 +915,17 @@ add_channels_in_list([{ChannelID, ChannelConfig} | Rest], Data) ->
             ),
             ),
             NewAddedChannelsMap = maps:put(
             NewAddedChannelsMap = maps:put(
                 ChannelID,
                 ChannelID,
-                channel_status(Error, ChannelConfig),
+                channel_status(?add_channel_failed(Reason), ChannelConfig),
                 AddedChannelsMap
                 AddedChannelsMap
             ),
             ),
+            NewActions = [retry_add_channel_action(ChannelID, ChannelConfig, Data) | Actions],
             NewData = Data#data{
             NewData = Data#data{
                 added_channels = NewAddedChannelsMap
                 added_channels = NewAddedChannelsMap
             },
             },
             %% Raise an alarm since the channel could not be added
             %% Raise an alarm since the channel could not be added
             _ = maybe_alarm(?status_disconnected, IsDryRun, ChannelID, Error, no_prev_error)
             _ = maybe_alarm(?status_disconnected, IsDryRun, ChannelID, Error, no_prev_error)
     end,
     end,
-    add_channels_in_list(Rest, NewData).
+    add_channels_in_list(Rest, NewData, NewActions).
 
 
 maybe_stop_resource(#data{status = Status} = Data) when Status =/= ?rm_status_stopped ->
 maybe_stop_resource(#data{status = Status} = Data) when Status =/= ?rm_status_stopped ->
     stop_resource(Data);
     stop_resource(Data);
@@ -925,11 +954,11 @@ stop_resource(#data{id = ResId} = Data) ->
 
 
 remove_channels(Data) ->
 remove_channels(Data) ->
     Channels = maps:keys(Data#data.added_channels),
     Channels = maps:keys(Data#data.added_channels),
-    remove_channels_in_list(Channels, Data, false).
+    remove_channels_in_list(Channels, Data).
 
 
-remove_channels_in_list([], Data, _KeepInChannelMap) ->
+remove_channels_in_list([], Data) ->
     Data;
     Data;
-remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
+remove_channels_in_list([ChannelID | Rest], Data) ->
     #data{
     #data{
         id = ResId,
         id = ResId,
         added_channels = AddedChannelsMap,
         added_channels = AddedChannelsMap,
@@ -939,14 +968,8 @@ remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
         type = Type
         type = Type
     } = Data,
     } = Data,
     IsDryRun = emqx_resource:is_dry_run(ResId),
     IsDryRun = emqx_resource:is_dry_run(ResId),
-    NewAddedChannelsMap =
-        case KeepInChannelMap of
-            true ->
-                AddedChannelsMap;
-            false ->
-                _ = maybe_clear_alarm(IsDryRun, ChannelID),
-                maps:remove(ChannelID, AddedChannelsMap)
-        end,
+    _ = maybe_clear_alarm(IsDryRun, ChannelID),
+    NewAddedChannelsMap = maps:remove(ChannelID, AddedChannelsMap),
     case safe_call_remove_channel(ResId, Mod, State, ChannelID) of
     case safe_call_remove_channel(ResId, Mod, State, ChannelID) of
         {ok, NewState} ->
         {ok, NewState} ->
             NewData = Data#data{
             NewData = Data#data{
@@ -971,7 +994,7 @@ remove_channels_in_list([ChannelID | Rest], Data, KeepInChannelMap) ->
                 added_channels = NewAddedChannelsMap
                 added_channels = NewAddedChannelsMap
             }
             }
     end,
     end,
-    remove_channels_in_list(Rest, NewData, KeepInChannelMap).
+    remove_channels_in_list(Rest, NewData).
 
 
 safe_call_remove_channel(_ResId, _Mod, undefined = State, _ChannelID) ->
 safe_call_remove_channel(_ResId, _Mod, undefined = State, _ChannelID) ->
     {ok, State};
     {ok, State};
@@ -1039,7 +1062,8 @@ handle_not_connected_add_channel(From, ChannelId, ChannelConfig, State, Data) ->
     NewData = add_or_update_channel_status(Data, ChannelId, ChannelConfig, State),
     NewData = add_or_update_channel_status(Data, ChannelId, ChannelConfig, State),
     {keep_state, update_state(NewData), [{reply, From, ok}]}.
     {keep_state, update_state(NewData), [{reply, From, ok}]}.
 
 
-handle_remove_channel(From, ChannelId, Data) ->
+handle_remove_channel(From, ChannelId, Data0) ->
+    Data = abort_health_checks_for_channel(Data0, ChannelId),
     Channels = Data#data.added_channels,
     Channels = Data#data.added_channels,
     IsDryRun = emqx_resource:is_dry_run(Data#data.id),
     IsDryRun = emqx_resource:is_dry_run(Data#data.id),
     _ = maybe_clear_alarm(IsDryRun, ChannelId),
     _ = maybe_clear_alarm(IsDryRun, ChannelId),
@@ -1194,9 +1218,9 @@ continue_resource_health_check_connected(NewStatus, Data0) ->
     case NewStatus of
     case NewStatus of
         ?status_connected ->
         ?status_connected ->
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
-            Data2 = channels_health_check(?status_connected, Data1),
+            {Actions0, Data2} = channels_health_check(?status_connected, Data1),
             Data = update_state(Data2),
             Data = update_state(Data2),
-            Actions = Replies ++ resource_health_check_actions(Data),
+            Actions = Replies ++ Actions0 ++ resource_health_check_actions(Data),
             {keep_state, Data, Actions};
             {keep_state, Data, Actions};
         _ ->
         _ ->
             #data{id = ResId, group = Group, type = Type} = Data0,
             #data{id = ResId, group = Group, type = Type} = Data0,
@@ -1215,8 +1239,8 @@ continue_resource_health_check_connected(NewStatus, Data0) ->
             %% between the two here, as resource manager also has `stopped', which is
             %% between the two here, as resource manager also has `stopped', which is
             %% not a valid status at the time of writing.
             %% not a valid status at the time of writing.
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
             {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
-            Data = channels_health_check(NewStatus, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(NewStatus, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, NewStatus, Data, Actions}
             {next_state, NewStatus, Data, Actions}
     end.
     end.
 
 
@@ -1225,16 +1249,16 @@ continue_resource_health_check_not_connected(NewStatus, Data0) ->
     {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
     {Replies, Data1} = reply_pending_resource_health_check_callers(NewStatus, Data0),
     case NewStatus of
     case NewStatus of
         ?status_connected ->
         ?status_connected ->
-            Data = channels_health_check(?status_connected, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(?status_connected, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, ?state_connected, Data, Actions};
             {next_state, ?state_connected, Data, Actions};
         ?status_connecting ->
         ?status_connecting ->
-            Data = channels_health_check(?status_connecting, Data1),
-            Actions = Replies ++ resource_health_check_actions(Data),
+            {Actions0, Data} = channels_health_check(?status_connecting, Data1),
+            Actions = Replies ++ Actions0 ++ resource_health_check_actions(Data),
             {next_state, ?status_connecting, Data, Actions};
             {next_state, ?status_connecting, Data, Actions};
         ?status_disconnected ->
         ?status_disconnected ->
-            Data = channels_health_check(?status_disconnected, Data1),
-            Actions = Replies,
+            {Actions0, Data} = channels_health_check(?status_disconnected, Data1),
+            Actions = Replies ++ Actions0,
             {next_state, ?state_disconnected, Data, Actions}
             {next_state, ?state_disconnected, Data, Actions}
     end.
     end.
 
 
@@ -1274,7 +1298,9 @@ handle_manual_channel_health_check(
     is_map_key(ChannelId, Channels)
     is_map_key(ChannelId, Channels)
 ->
 ->
     %% No ongoing health check: reply with current status.
     %% No ongoing health check: reply with current status.
-    {keep_state_and_data, [{reply, From, without_channel_config(maps:get(ChannelId, Channels))}]};
+    {keep_state_and_data, [
+        {reply, From, to_external_channel_status(maps:get(ChannelId, Channels))}
+    ]};
 handle_manual_channel_health_check(
 handle_manual_channel_health_check(
     From,
     From,
     _Data,
     _Data,
@@ -1284,22 +1310,21 @@ handle_manual_channel_health_check(
         {reply, From, channel_error_status(channel_not_found)}
         {reply, From, channel_error_status(channel_not_found)}
     ]}.
     ]}.
 
 
--spec channels_health_check(resource_status(), data()) -> data().
+-spec channels_health_check(resource_status(), data()) -> {[gen_statem:action()], data()}.
 channels_health_check(?status_connected = _ConnectorStatus, Data0) ->
 channels_health_check(?status_connected = _ConnectorStatus, Data0) ->
     Channels = maps:to_list(Data0#data.added_channels),
     Channels = maps:to_list(Data0#data.added_channels),
-    %% All channels with a status different from connected or connecting are
-    %% not added
     ChannelsNotAdded = [
     ChannelsNotAdded = [
         ChannelId
         ChannelId
      || {ChannelId, Status} <- Channels,
      || {ChannelId, Status} <- Channels,
         not channel_status_is_channel_added(Status)
         not channel_status_is_channel_added(Status)
     ],
     ],
-    %% Attempt to add channels that are not added
+    %% Attempt to add channels to resource state that are not added yet
     ChannelsNotAddedWithConfigs = get_config_for_channels(Data0, ChannelsNotAdded),
     ChannelsNotAddedWithConfigs = get_config_for_channels(Data0, ChannelsNotAdded),
-    Data1 = add_channels_in_list(ChannelsNotAddedWithConfigs, Data0),
-    %% Now that we have done the adding, we can get the status of all channels
+    {Actions, Data1} = add_channels_in_list(ChannelsNotAddedWithConfigs, Data0),
+    %% Now that we have done the adding, we can get the status of all channels (execept
+    %% unhealthy ones)
     Data2 = trigger_health_check_for_added_channels(Data1),
     Data2 = trigger_health_check_for_added_channels(Data1),
-    update_state(Data2);
+    {Actions, update_state(Data2)};
 channels_health_check(?status_connecting = _ConnectorStatus, Data0) ->
 channels_health_check(?status_connecting = _ConnectorStatus, Data0) ->
     %% Whenever the resource is connecting:
     %% Whenever the resource is connecting:
     %% 1. Change the status of all added channels to connecting
     %% 1. Change the status of all added channels to connecting
@@ -1337,33 +1362,35 @@ channels_health_check(?status_connecting = _ConnectorStatus, Data0) ->
         ChannelsWithNewAndPrevErrorStatuses
         ChannelsWithNewAndPrevErrorStatuses
     ),
     ),
     Data1 = Data0#data{added_channels = NewChannels},
     Data1 = Data0#data{added_channels = NewChannels},
-    update_state(Data1);
-channels_health_check(ConnectorStatus, Data0) ->
-    %% Whenever the resource is not connected and not connecting:
-    %% 1. Remove all added channels
-    %% 2. Change the status to an error status
-    %% 3. Raise alarms
-    Channels = Data0#data.added_channels,
-    ChannelsToRemove = [
-        ChannelId
-     || {ChannelId, Status} <- maps:to_list(Channels),
-        channel_status_is_channel_added(Status)
-    ],
-    Data1 = remove_channels_in_list(ChannelsToRemove, Data0, true),
+    {_Actions = [], update_state(Data1)};
+channels_health_check(?status_disconnected = ConnectorStatus, Data1) ->
+    %% Whenever the resource is disconnected:
+    %% 1. Change the status of channels to an error status
+    %%    - Except for channels yet to be added to the resource state.  Those need to keep
+    %%    those special errors so they are added or retried.
+    %% 2. Raise alarms
+    Channels = Data1#data.added_channels,
     ChannelsWithNewAndOldStatuses =
     ChannelsWithNewAndOldStatuses =
-        [
-            {ChannelId, OldStatus,
-                channel_status(
-                    {error,
-                        resource_not_connected_channel_error_msg(
-                            ConnectorStatus,
-                            ChannelId,
-                            Data1
-                        )},
-                    Config
-                )}
-         || {ChannelId, #{config := Config} = OldStatus} <- maps:to_list(Data1#data.added_channels)
-        ],
+        lists:map(
+            fun
+                ({ChannelId, #{error := ?not_added_yet} = OldStatus}) ->
+                    {ChannelId, OldStatus, OldStatus};
+                ({ChannelId, #{error := ?add_channel_failed(_)} = OldStatus}) ->
+                    {ChannelId, OldStatus, OldStatus};
+                ({ChannelId, #{config := Config} = OldStatus}) ->
+                    {ChannelId, OldStatus,
+                        channel_status(
+                            {error,
+                                resource_not_connected_channel_error_msg(
+                                    ConnectorStatus,
+                                    ChannelId,
+                                    Data1
+                                )},
+                            Config
+                        )}
+            end,
+            maps:to_list(Data1#data.added_channels)
+        ),
     %% Raise alarms
     %% Raise alarms
     IsDryRun = emqx_resource:is_dry_run(Data1#data.id),
     IsDryRun = emqx_resource:is_dry_run(Data1#data.id),
     _ = lists:foreach(
     _ = lists:foreach(
@@ -1381,7 +1408,7 @@ channels_health_check(ConnectorStatus, Data0) ->
         ChannelsWithNewAndOldStatuses
         ChannelsWithNewAndOldStatuses
     ),
     ),
     Data2 = Data1#data{added_channels = NewChannels},
     Data2 = Data1#data{added_channels = NewChannels},
-    update_state(Data2).
+    {_Actions = [], update_state(Data2)}.
 
 
 resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) ->
 resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) ->
     ResourceId = Data1#data.id,
     ResourceId = Data1#data.id,
@@ -1401,27 +1428,40 @@ resource_not_connected_channel_error_msg(ResourceStatus, ChannelId, Data1) ->
 generic_timeout_action(Id, Timeout, Content) ->
 generic_timeout_action(Id, Timeout, Content) ->
     {{timeout, Id}, Timeout, Content}.
     {{timeout, Id}, Timeout, Content}.
 
 
--spec start_channel_health_check_action(channel_id(), map(), map(), data() | timeout()) ->
+-spec start_channel_health_check_action(channel_id(), map(), map(), data()) ->
     [start_channel_health_check_action()].
     [start_channel_health_check_action()].
 start_channel_health_check_action(ChannelId, NewChanStatus, PreviousChanStatus, Data = #data{}) ->
 start_channel_health_check_action(ChannelId, NewChanStatus, PreviousChanStatus, Data = #data{}) ->
-    Timeout = get_channel_health_check_interval(ChannelId, NewChanStatus, PreviousChanStatus, Data),
+    ConfigSources =
+        lists:map(
+            fun
+                (#{config := Config}) ->
+                    Config;
+                (_) ->
+                    #{}
+            end,
+            [NewChanStatus, PreviousChanStatus]
+        ),
+    Timeout = get_channel_health_check_interval(ChannelId, ConfigSources, Data),
     Event = #start_channel_health_check{channel_id = ChannelId},
     Event = #start_channel_health_check{channel_id = ChannelId},
     [generic_timeout_action(Event, Timeout, Event)].
     [generic_timeout_action(Event, Timeout, Event)].
 
 
-get_channel_health_check_interval(ChannelId, NewChanStatus, PreviousChanStatus, Data) ->
+-spec retry_add_channel_action(channel_id(), map(), data()) -> retry_add_channel_action().
+retry_add_channel_action(ChannelId, ChannelConfig, Data) ->
+    Timeout = get_channel_health_check_interval(ChannelId, [ChannelConfig], Data),
+    Event = #retry_add_channel{channel_id = ChannelId},
+    generic_timeout_action(Event, Timeout, Event).
+
+get_channel_health_check_interval(ChannelId, ConfigSources, Data) ->
     emqx_utils:foldl_while(
     emqx_utils:foldl_while(
         fun
         fun
-            (#{config := #{resource_opts := #{health_check_interval := HCInterval}}}, _Acc) ->
+            (#{resource_opts := #{health_check_interval := HCInterval}}, _Acc) ->
                 {halt, HCInterval};
                 {halt, HCInterval};
             (_, Acc) ->
             (_, Acc) ->
                 {cont, Acc}
                 {cont, Acc}
         end,
         end,
         ?HEALTHCHECK_INTERVAL,
         ?HEALTHCHECK_INTERVAL,
-        [
-            NewChanStatus,
-            PreviousChanStatus,
-            maps:get(ChannelId, Data#data.added_channels, #{})
-        ]
+        ConfigSources ++
+            [emqx_utils_maps:deep_get([ChannelId, config], Data#data.added_channels, #{})]
     ).
     ).
 
 
 %% Currently, we only call resource channel health checks when the underlying resource is
 %% Currently, we only call resource channel health checks when the underlying resource is
@@ -1434,7 +1474,7 @@ trigger_health_check_for_added_channels(Data0 = #data{hc_workers = HCWorkers0})
     NewOngoing = maps:filter(
     NewOngoing = maps:filter(
         fun(ChannelId, OldStatus) ->
         fun(ChannelId, OldStatus) ->
             (not is_map_key(ChannelId, Ongoing0)) andalso
             (not is_map_key(ChannelId, Ongoing0)) andalso
-                channel_status_is_channel_added(OldStatus)
+                is_channel_apt_for_health_check(OldStatus)
         end,
         end,
         Data0#data.added_channels
         Data0#data.added_channels
     ),
     ),
@@ -1467,12 +1507,10 @@ continue_channel_health_check_connected(ChannelId, OldStatus, CurrentStatus, Dat
             Data1
             Data1
     end.
     end.
 
 
-continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data1) ->
+continue_channel_health_check_connected_no_update_during_check(ChannelId, OldStatus, Data) ->
     %% Remove the added channels with a status different from connected or connecting
     %% Remove the added channels with a status different from connected or connecting
-    NewStatus = maps:get(ChannelId, Data1#data.added_channels),
-    ChannelsToRemove = [ChannelId || not channel_status_is_channel_added(NewStatus)],
-    Data = remove_channels_in_list(ChannelsToRemove, Data1, true),
-    IsDryRun = emqx_resource:is_dry_run(Data1#data.id),
+    NewStatus = maps:get(ChannelId, Data#data.added_channels),
+    IsDryRun = emqx_resource:is_dry_run(Data#data.id),
     %% Raise/clear alarms
     %% Raise/clear alarms
     case NewStatus of
     case NewStatus of
         #{status := ?status_connected} ->
         #{status := ?status_connected} ->
@@ -1582,7 +1620,7 @@ handle_channel_health_check_worker_down_new_channels_and_status(
 reply_pending_channel_health_check_callers(
 reply_pending_channel_health_check_callers(
     ChannelId, Status0, Data0 = #data{hc_pending_callers = Pending0}
     ChannelId, Status0, Data0 = #data{hc_pending_callers = Pending0}
 ) ->
 ) ->
-    Status = without_channel_config(Status0),
+    Status = to_external_channel_status(Status0),
     #{channel := CPending0} = Pending0,
     #{channel := CPending0} = Pending0,
     Pending = maps:get(ChannelId, CPending0, []),
     Pending = maps:get(ChannelId, CPending0, []),
     Actions = [{reply, From, Status} || From <- Pending],
     Actions = [{reply, From, Status} || From <- Pending],
@@ -1590,6 +1628,21 @@ reply_pending_channel_health_check_callers(
     Data = Data0#data{hc_pending_callers = Pending0#{channel := CPending}},
     Data = Data0#data{hc_pending_callers = Pending0#{channel := CPending}},
     {Actions, Data}.
     {Actions, Data}.
 
 
+handle_retry_add_channel(Data0, ChannelId) ->
+    ?tp(retry_add_channel, #{channel_id => ChannelId}),
+    maybe
+        {ok, StatusMap} ?= maps:find(ChannelId, Data0#data.added_channels),
+        %% Must contain config map if in data.
+        #{config := #{} = ChannelConfig} = StatusMap,
+        {Actions, Data1} = add_channels_in_list([{ChannelId, ChannelConfig}], Data0),
+        Data = trigger_health_check_for_added_channels(Data1),
+        {keep_state, Data, Actions}
+    else
+        error ->
+            %% Channel has been removed since timer was set?
+            keep_state_and_data
+    end.
+
 get_config_for_channels(Data0, ChannelsWithoutConfig) ->
 get_config_for_channels(Data0, ChannelsWithoutConfig) ->
     ResId = Data0#data.id,
     ResId = Data0#data.id,
     Mod = Data0#data.mod,
     Mod = Data0#data.mod,
@@ -1648,7 +1701,7 @@ maybe_alarm(_Status, false, ResId, Error, _PrevError) ->
             {error, Reason} ->
             {error, Reason} ->
                 emqx_utils:readable_error_msg(Reason);
                 emqx_utils:readable_error_msg(Reason);
             _ ->
             _ ->
-                Error1 = without_channel_config(Error),
+                Error1 = to_external_channel_status(Error),
                 emqx_utils:readable_error_msg(Error1)
                 emqx_utils:readable_error_msg(Error1)
         end,
         end,
     emqx_alarm:safe_activate(
     emqx_alarm:safe_activate(
@@ -1656,11 +1709,15 @@ maybe_alarm(_Status, false, ResId, Error, _PrevError) ->
         #{resource_id => ResId, reason => resource_down},
         #{resource_id => ResId, reason => resource_down},
         <<"resource down: ", HrError/binary>>
         <<"resource down: ", HrError/binary>>
     ),
     ),
-    ?tp(resource_activate_alarm, #{resource_id => ResId}).
+    ?tp(resource_activate_alarm, #{resource_id => ResId, error => HrError}).
 
 
 without_channel_config(Map) ->
 without_channel_config(Map) ->
     maps:without([config], Map).
     maps:without([config], Map).
 
 
+to_external_channel_status(StatusMap0) ->
+    StatusMap = without_channel_config(StatusMap0),
+    maps:update_with(error, fun external_error/1, StatusMap).
+
 -spec maybe_resume_resource_workers(resource_id(), resource_status()) -> ok.
 -spec maybe_resume_resource_workers(resource_id(), resource_status()) -> ok.
 maybe_resume_resource_workers(ResId, ?status_connected) ->
 maybe_resume_resource_workers(ResId, ?status_connected) ->
     lists:foreach(
     lists:foreach(
@@ -1701,6 +1758,8 @@ status_to_error(_) ->
     {error, undefined}.
     {error, undefined}.
 
 
 %% Compatibility
 %% Compatibility
+external_error(?not_added_yet) -> not_added_yet;
+external_error(?add_channel_failed(Reason)) -> external_error(Reason);
 external_error({error, Reason}) -> Reason;
 external_error({error, Reason}) -> Reason;
 external_error(Other) -> Other.
 external_error(Other) -> Other.
 
 
@@ -1713,7 +1772,9 @@ maybe_reply(Actions, From, Reply) ->
 data_record_to_external_map(Data) ->
 data_record_to_external_map(Data) ->
     AddedChannelsWithoutConfigs =
     AddedChannelsWithoutConfigs =
         maps:map(
         maps:map(
-            fun(_ChanID, Status) -> without_channel_config(Status) end,
+            fun(_ChanID, Status) ->
+                to_external_channel_status(Status)
+            end,
             Data#data.added_channels
             Data#data.added_channels
         ),
         ),
     #{
     #{
@@ -1755,7 +1816,9 @@ safe_call(ResId, Message, Timeout) ->
         exit:{R, _} when R == noproc; R == normal; R == shutdown ->
         exit:{R, _} when R == noproc; R == normal; R == shutdown ->
             {error, not_found};
             {error, not_found};
         exit:{timeout, _} ->
         exit:{timeout, _} ->
-            {error, timeout}
+            {error, timeout};
+        exit:{{shutdown, removed}, _} ->
+            {error, not_found}
     end.
     end.
 
 
 %% Helper functions for chanel status data
 %% Helper functions for chanel status data
@@ -1771,7 +1834,7 @@ channel_status_not_added(ChannelConfig) ->
         %%                 connected and the on_channel_get_status callback has returned
         %%                 connected and the on_channel_get_status callback has returned
         %%                 connected. The error field should be undefined.
         %%                 connected. The error field should be undefined.
         status => ?status_disconnected,
         status => ?status_disconnected,
-        error => not_added_yet,
+        error => ?not_added_yet,
         config => ChannelConfig
         config => ChannelConfig
     }.
     }.
 
 
@@ -1820,6 +1883,12 @@ channel_status({?status_connected, Error}, ChannelConfig) ->
         error => Error,
         error => Error,
         config => ChannelConfig
         config => ChannelConfig
     };
     };
+channel_status(?add_channel_failed(_Reason) = Error, ChannelConfig) ->
+    #{
+        status => ?status_disconnected,
+        error => Error,
+        config => ChannelConfig
+    };
 channel_status({error, Reason}, ChannelConfig) ->
 channel_status({error, Reason}, ChannelConfig) ->
     S = channel_error_status(Reason),
     S = channel_error_status(Reason),
     S#{config => ChannelConfig}.
     S#{config => ChannelConfig}.
@@ -1830,19 +1899,24 @@ channel_error_status(Reason) ->
         error => Reason
         error => Reason
     }.
     }.
 
 
-channel_status_is_channel_added(#{status := St}) ->
-    channel_status_is_channel_added(St);
-channel_status_is_channel_added(?status_connected) ->
-    true;
-channel_status_is_channel_added(?status_connecting) ->
-    true;
-channel_status_is_channel_added(_Status) ->
-    false.
+is_channel_apt_for_health_check(#{error := {unhealthy_target, _}}) ->
+    false;
+is_channel_apt_for_health_check(#{error := unhealthy_target}) ->
+    false;
+is_channel_apt_for_health_check(StatusMap) ->
+    channel_status_is_channel_added(StatusMap).
+
+channel_status_is_channel_added(#{error := ?not_added_yet}) ->
+    false;
+channel_status_is_channel_added(#{error := ?add_channel_failed(_)}) ->
+    false;
+channel_status_is_channel_added(_StatusMap) ->
+    true.
 
 
 -spec add_or_update_channel_status(data(), channel_id(), map(), resource_state()) -> data().
 -spec add_or_update_channel_status(data(), channel_id(), map(), resource_state()) -> data().
 add_or_update_channel_status(Data, ChannelId, ChannelConfig, State) ->
 add_or_update_channel_status(Data, ChannelId, ChannelConfig, State) ->
     Channels = Data#data.added_channels,
     Channels = Data#data.added_channels,
-    ChannelStatus = channel_status({error, resource_not_operational}, ChannelConfig),
+    ChannelStatus = channel_status_not_added(ChannelConfig),
     NewChannels = maps:put(ChannelId, ChannelStatus, Channels),
     NewChannels = maps:put(ChannelId, ChannelStatus, Channels),
     ResStatus = state_to_status(State),
     ResStatus = state_to_status(State),
     IsDryRun = emqx_resource:is_dry_run(ChannelId),
     IsDryRun = emqx_resource:is_dry_run(ChannelId),
@@ -1861,10 +1935,18 @@ tag(Group, Type) ->
     Str = emqx_utils_conv:str(Group) ++ "/" ++ emqx_utils_conv:str(Type),
     Str = emqx_utils_conv:str(Group) ++ "/" ++ emqx_utils_conv:str(Type),
     string:uppercase(Str).
     string:uppercase(Str).
 
 
+%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
+%% might be lost for a running resource, and future attempts to bump them result in
+%% errors.  As mitigation, we ensure such metrics are created here so that restarting
+%% the resource or resetting its metrics can recreate them.
+ensure_metrics(ResId) ->
+    {ok, _} = emqx_resource:ensure_metrics(ResId),
+    ok.
+
 %% When a resource enters a `?status_disconnected' state, late channel health check
 %% When a resource enters a `?status_disconnected' state, late channel health check
 %% replies are useless and could corrup state.
 %% replies are useless and could corrup state.
--spec handle_abort_all_channel_health_checks(data()) -> data().
-handle_abort_all_channel_health_checks(Data0) ->
+-spec abort_all_channel_health_checks(data()) -> data().
+abort_all_channel_health_checks(Data0) ->
     #data{
     #data{
         hc_workers = #{channel := CHCWorkers} = HCWorkers0,
         hc_workers = #{channel := CHCWorkers} = HCWorkers0,
         hc_pending_callers = #{channel := CPending} = Pending0
         hc_pending_callers = #{channel := CPending} = Pending0
@@ -1893,17 +1975,55 @@ handle_abort_all_channel_health_checks(Data0) ->
 
 
 abort_channel_health_check(Pid) ->
 abort_channel_health_check(Pid) ->
     %% We're already linked to the worker pids due to `spawn_link'.
     %% We're already linked to the worker pids due to `spawn_link'.
+    MRef = monitor(process, Pid),
     exit(Pid, kill),
     exit(Pid, kill),
+    receive
+        {'DOWN', MRef, process, Pid, _} ->
+            ok
+    end,
     %% Clean the exit signal so it doesn't contaminate state handling.
     %% Clean the exit signal so it doesn't contaminate state handling.
     receive
     receive
         {'EXIT', Pid, _} ->
         {'EXIT', Pid, _} ->
             ok
             ok
+    after 0 -> ok
     end.
     end.
 
 
-%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
-%% might be lost for a running resource, and future attempts to bump them result in
-%% errors.  As mitigation, we ensure such metrics are created here so that restarting
-%% the resource or resetting its metrics can recreate them.
-ensure_metrics(ResId) ->
-    {ok, _} = emqx_resource:ensure_metrics(ResId),
-    ok.
+map_take_or(Map, Key, Default) ->
+    maybe
+        error ?= maps:take(Key, Map),
+        {Default, Map}
+    end.
+
+abort_health_checks_for_channel(Data0, ChannelId) ->
+    #data{
+        hc_workers = #{channel := #{ongoing := Ongoing0} = CHCWorkers0} = HCWorkers0,
+        hc_pending_callers = #{channel := CPending0} = Pending0
+    } = Data0,
+    Ongoing = maps:remove(ChannelId, Ongoing0),
+    {Callers, CPending} = map_take_or(CPending0, ChannelId, []),
+    lists:foreach(
+        fun(From) ->
+            gen_statem:reply(From, {error, resource_disconnected})
+        end,
+        Callers
+    ),
+    CHCWorkers = maps:fold(
+        fun
+            (Pid, ChannelId0, Acc) when is_pid(Pid), ChannelId0 == ChannelId ->
+                ?tp(warning, "aborting_channel_hc", #{channel_id => ChannelId, pid => Pid}),
+                abort_channel_health_check(Pid),
+                maps:remove(Pid, Acc);
+            (ChannelId0, _Config, Acc) when ChannelId0 == ChannelId ->
+                maps:remove(ChannelId0, Acc);
+            (_, _, Acc) ->
+                Acc
+        end,
+        CHCWorkers0,
+        CHCWorkers0
+    ),
+    HCWorkers = HCWorkers0#{channel := CHCWorkers#{ongoing := Ongoing}},
+    Pending = Pending0#{channel := CPending},
+    Data0#data{
+        hc_workers = HCWorkers,
+        hc_pending_callers = Pending
+    }.

+ 6 - 1
apps/emqx_resource/src/emqx_resource_manager_sup.erl

@@ -56,11 +56,16 @@ init([]) ->
     {ok, {SupFlags, ChildSpecs}}.
     {ok, {SupFlags, ChildSpecs}}.
 
 
 child_spec(ResId, Group, ResourceType, Config, Opts) ->
 child_spec(ResId, Group, ResourceType, Config, Opts) ->
+    RestartType =
+        case emqx_resource:is_dry_run(ResId) of
+            true -> temporary;
+            false -> transient
+        end,
     #{
     #{
         id => ResId,
         id => ResId,
         start =>
         start =>
             {emqx_resource_manager, start_link, [ResId, Group, ResourceType, Config, Opts]},
             {emqx_resource_manager, start_link, [ResId, Group, ResourceType, Config, Opts]},
-        restart => transient,
+        restart => RestartType,
         %% never force kill a resource manager.
         %% never force kill a resource manager.
         %% because otherwise it may lead to release leak,
         %% because otherwise it may lead to release leak,
         %% resource_manager's terminate callback calls resource on_stop
         %% resource_manager's terminate callback calls resource on_stop

+ 22 - 19
apps/emqx_resource/test/emqx_resource_SUITE.erl

@@ -1120,29 +1120,32 @@ create_dry_run_local_succ() ->
 
 
 t_create_dry_run_local_failed(_) ->
 t_create_dry_run_local_failed(_) ->
     ct:timetrap({seconds, 120}),
     ct:timetrap({seconds, 120}),
-    ct:pal("creating with creation error"),
-    Res1 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{create_error => true}
-    ),
-    ?assertMatch({error, _}, Res1),
+    emqx_utils:nolink_apply(fun() ->
+        ct:pal("creating with creation error"),
+        Res1 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{create_error => true}
+        ),
+        ?assertMatch({error, _}, Res1),
 
 
-    ct:pal("creating with health check error"),
-    Res2 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{name => test_resource, health_check_error => true}
-    ),
-    ?assertMatch({error, _}, Res2),
+        ct:pal("creating with health check error"),
+        Res2 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{name => test_resource, health_check_error => true}
+        ),
+        ?assertMatch({error, _}, Res2),
 
 
-    ct:pal("creating with stop error"),
-    Res3 = emqx_resource:create_dry_run_local(
-        ?TEST_RESOURCE,
-        #{name => test_resource, stop_error => true}
-    ),
-    ?assertEqual(ok, Res3),
+        ct:pal("creating with stop error"),
+        Res3 = emqx_resource:create_dry_run_local(
+            ?TEST_RESOURCE,
+            #{name => test_resource, stop_error => true}
+        ),
+        ?assertEqual(ok, Res3),
+        ok
+    end),
     ?retry(
     ?retry(
         100,
         100,
-        5,
+        50,
         ?assertEqual(
         ?assertEqual(
             [],
             [],
             emqx_resource:list_instances_verbose()
             emqx_resource:list_instances_verbose()

+ 1 - 1
apps/emqx_utils/src/emqx_utils.app.src

@@ -2,7 +2,7 @@
 {application, emqx_utils, [
 {application, emqx_utils, [
     {description, "Miscellaneous utilities for EMQX apps"},
     {description, "Miscellaneous utilities for EMQX apps"},
     % strict semver, bump manually!
     % strict semver, bump manually!
-    {vsn, "5.4.2"},
+    {vsn, "5.4.3"},
     {modules, [
     {modules, [
         emqx_utils,
         emqx_utils,
         emqx_utils_api,
         emqx_utils_api,

+ 27 - 4
apps/emqx_utils/src/emqx_utils_redact.erl

@@ -155,13 +155,36 @@ redact_v(V) when is_binary(V) ->
         [{var, _}] ->
         [{var, _}] ->
             V;
             V;
         _ ->
         _ ->
-            <<?REDACT_VAL>>
+            do_redact_v(V)
     end;
     end;
 redact_v([{str, Bin}]) when is_binary(Bin) ->
 redact_v([{str, Bin}]) when is_binary(Bin) ->
     %% The HOCON schema system may generate sensitive values with this format
     %% The HOCON schema system may generate sensitive values with this format
-    [{str, <<?REDACT_VAL>>}];
-redact_v(_V) ->
-    ?REDACT_VAL.
+    [{str, do_redact_v(Bin)}];
+redact_v(V) ->
+    do_redact_v(V).
+
+do_redact_v(<<"file://", _/binary>> = V) ->
+    V;
+do_redact_v("file://" ++ _ = V) ->
+    V;
+do_redact_v(B) when is_binary(B) ->
+    <<?REDACT_VAL>>;
+do_redact_v(L) when is_list(L) ->
+    ?REDACT_VAL;
+do_redact_v(F) ->
+    try
+        %% this can happen in logs
+        case emqx_secret:term(F) of
+            {file, File} ->
+                File;
+            V ->
+                do_redact_v(V)
+        end
+    catch
+        _:_ ->
+            %% most of the time
+            ?REDACT_VAL
+    end.
 
 
 deobfuscate(NewConf, OldConf) ->
 deobfuscate(NewConf, OldConf) ->
     deobfuscate(NewConf, OldConf, fun(_) -> false end).
     deobfuscate(NewConf, OldConf, fun(_) -> false end).

+ 41 - 0
apps/emqx_utils/test/emqx_utils_redact_tests.erl

@@ -45,6 +45,47 @@ no_redact_template_var_test() ->
         })
         })
     ).
     ).
 
 
+no_redact_file_paths_test() ->
+    ?assertEqual(
+        #{
+            password => <<"file:///abs/path/a">>,
+            <<"secret">> => <<"file://relative/path/b">>,
+            account_key => "file://string/path/x"
+        },
+        redact(#{
+            password => <<"file:///abs/path/a">>,
+            <<"secret">> => <<"file://relative/path/b">>,
+            account_key => "file://string/path/x"
+        })
+    ).
+
+no_redact_wrapped_file_paths_test() ->
+    ?assertEqual(
+        #{password => <<"file:///abs/path/a">>},
+        redact(#{
+            password => emqx_secret:wrap_load({file, <<"file:///abs/path/a">>})
+        })
+    ).
+
+redact_wrapped_secret_test() ->
+    ?assertEqual(
+        #{password => <<"******">>},
+        redact(#{
+            password => emqx_secret:wrap(<<"aaa">>)
+        })
+    ).
+
+deobfuscate_file_path_secrets_test_() ->
+    Original1 = #{foo => #{bar => #{headers => #{"authorization" => "file://a"}}}},
+    Original2 = #{foo => #{bar => #{headers => #{"authorization" => "a"}}}},
+    Redacted2 = #{foo => #{bar => #{headers => #{"authorization" => "******"}}}},
+    [
+        ?_assertEqual(Original1, redact(Original1)),
+        ?_assertEqual(Original1, emqx_utils_redact:deobfuscate(Original1, Original1)),
+        ?_assertEqual(Redacted2, redact(Original2)),
+        ?_assertEqual(Original2, emqx_utils_redact:deobfuscate(Redacted2, Original2))
+    ].
+
 redact(X) -> emqx_utils:redact(X).
 redact(X) -> emqx_utils:redact(X).
 
 
 is_redacted(Key, Value) ->
 is_redacted(Key, Value) ->

+ 6 - 3
bin/emqx

@@ -784,7 +784,8 @@ check_config() {
     ## this command checks the configs without generating any files
     ## this command checks the configs without generating any files
     call_hocon -v \
     call_hocon -v \
         -s "$SCHEMA_MOD" \
         -s "$SCHEMA_MOD" \
-        -c "$DATA_DIR"/configs/cluster.hocon \
+        -c "$EMQX_ETC_DIR"/base.hocon \
+        -c "$CONFIGS_DIR"/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         check_schema
         check_schema
 }
 }
@@ -804,7 +805,8 @@ generate_config() {
 
 
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## It takes input sources and overlays values in below order:
     ## It takes input sources and overlays values in below order:
-    ##   - $DATA_DIR/cluster.hocon (if exists)
+    ##   - etc/base.hocon
+    ##   - $CONFIGS_DIR/cluster.hocon
     ##   - etc/emqx.conf
     ##   - etc/emqx.conf
     ##   - environment variables starts with EMQX_ e.g. EMQX_NODE__ROLE
     ##   - environment variables starts with EMQX_ e.g. EMQX_NODE__ROLE
     ##
     ##
@@ -812,7 +814,8 @@ generate_config() {
     ##       because it has to sync cluster.hocon from other nodes.
     ##       because it has to sync cluster.hocon from other nodes.
     call_hocon -v -t "$NOW_TIME" \
     call_hocon -v -t "$NOW_TIME" \
         -s "$SCHEMA_MOD" \
         -s "$SCHEMA_MOD" \
-        -c "$DATA_DIR"/configs/cluster.hocon \
+        -c "$EMQX_ETC_DIR"/base.hocon \
+        -c "$CONFIGS_DIR"/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -d "$DATA_DIR"/configs generate
         -d "$DATA_DIR"/configs generate
 
 

+ 3 - 0
changes/ce/feat-13739.en.md

@@ -0,0 +1,3 @@
+Support clearing monitor (statistics) data for the whole cluster.
+
+Send `DELETE` request to endpoint `api/v5/monitor` to clear all collected monitoring metrics.

+ 1 - 0
changes/ce/feat-14247.en.md

@@ -0,0 +1 @@
+Write the client attribute named `tns` to log messages if such a client attribute exists.

+ 20 - 0
changes/ce/feat-14269.en.md

@@ -0,0 +1,20 @@
+Added `etc/base.hocon` config file.
+
+In this release, we introduced a new configuration file, `etc/base.hocon`, to enhance configuration management and clarity.
+
+Previously, since emqx.conf was the only place for manually crafted configurations, and because it sits at the top-most layer
+of the configuration override system, it caused some confusion.
+While mutable (not read-only) configurations set in `emqx.conf` could be changed through the UI, API, or CLI and take effect immediately,
+those changes would not persist after a node restart — leading to inconsistent behavior.
+
+To address this, we’ve added etc/base.hocon as a foundational configuration layer.
+The updated configuration precedence order, from top to bottom, is as follows:
+
+1. Environment variables
+2. `etc/emqx.conf`
+3. `data/configs/cluster.hocon`
+4. `etc/base.hocon`
+
+The `etc/base.hocon` file serves as the base layer for configurations.
+While settings defined here can still be modified after the node starts,
+this layer ensures consistent override behavior.

+ 1 - 0
changes/ce/fix-14267.en.md

@@ -0,0 +1 @@
+Do not redact secrets in logs and HTTP responses when the secret string is a file path (`file:///path/to/the/secret`).

+ 1 - 0
changes/ce/fix-14272.en.md

@@ -0,0 +1 @@
+Fixed an issue where `auto_subscribe` configuration loaded via the CLI reported success but failed to take effect.

+ 1 - 0
changes/ce/fix-14317.en.md

@@ -0,0 +1 @@
+Prevent potential issues where APIs involving paging may return empty pages, in case the internal APIs are subtly misused in the future.

+ 7 - 0
changes/ce/fix-14318.en.md

@@ -0,0 +1,7 @@
+Fixed the initialization of the HTTP connector state.  When there was incoming traffic being handled by an HTTP action and its underlying connector restarted, cryptic crashes could be seen in the logs mentioning `function_clause`.
+
+Example:
+
+```
+20:42:36.850 [error] msg: "resource_exception", info: #{error => {error, function_clause}, id => <<"action:http:a:connector:http:a">>, name => call_query, ...
+```

+ 5 - 0
changes/ce/fix-14319.en.md

@@ -0,0 +1,5 @@
+Refactored the resource management internal state machine.  As a consequence, some race condition bugs have been eliminated.  One such example is the HTTP action, which, when under incoming traffic and when its health check flaps, may produce errors like the following:
+
+```
+2024-11-29T14:58:17.994119+00:00 [error] msg: action_not_found, connector: <<"connector:http:a">>, action_id: <<"action:http:a:connector:http:a">>
+```

+ 1 - 0
changes/ee/feat-14110.en.md

@@ -0,0 +1 @@
+Added support for Pulsar driver to report metrics.  Now, it will report metrics such as queuing, inflight and dropped message count for better observability.

+ 7 - 0
changes/ee/fix-14291.en.md

@@ -0,0 +1,7 @@
+Upgraded Pulsar producer driver to fix handling of `Redirect` `LookupType` responses when looking up a topic in Pulsar.
+
+Before this fix, if the `LookupType` response type was `Redirect` when (re)starting a producer, it would incorrectly attempt to connect to the returned broker and fail to publish any messages.  Example logs under such condition:
+
+```
+2024-11-25T20:40:54.140659+00:00 [error] [pulsar-producer][persistent://public/default/p3-partition-0] Response error:'ServiceNotReady', msg:"Namespace bundle for topic (persistent://public/default/p3-partition-0) not served by this instance. Please redo the lookup. Request is denied: namespace=public/default"
+```

Разница между файлами не показана из-за своего большого размера
+ 7 - 0
changes/ee/fix-14345.en.md


+ 3 - 0
dev

@@ -318,6 +318,7 @@ generate_app_conf() {
 
 
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## This command populates two files: app.<time>.config and vm.<time>.args
     ## It takes input sources and overlays values in below order:
     ## It takes input sources and overlays values in below order:
+    ##   - etc/base.hocon
     ##   - $DATA_DIR/cluster.hocon (if exists)
     ##   - $DATA_DIR/cluster.hocon (if exists)
     ##   - etc/emqx.conf
     ##   - etc/emqx.conf
     ##   - environment variables starts with EMQX_ e.g. EMQX_NODE__ROLE
     ##   - environment variables starts with EMQX_ e.g. EMQX_NODE__ROLE
@@ -325,6 +326,7 @@ generate_app_conf() {
     ## NOTE: it's a known issue that cluster.hocon may change right after the node boots up
     ## NOTE: it's a known issue that cluster.hocon may change right after the node boots up
     ##       because it has to sync cluster.hocon from other nodes.
     ##       because it has to sync cluster.hocon from other nodes.
     call_hocon -v -t "$NOW_TIME" -s "$SCHEMA_MOD" \
     call_hocon -v -t "$NOW_TIME" -s "$SCHEMA_MOD" \
+        -c "$EMQX_ETC_DIR"/base.hocon \
         -c "$EMQX_DATA_DIR"/configs/cluster.hocon \
         -c "$EMQX_DATA_DIR"/configs/cluster.hocon \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -c "$EMQX_ETC_DIR"/emqx.conf \
         -d "$EMQX_DATA_DIR"/configs generate
         -d "$EMQX_DATA_DIR"/configs generate
@@ -358,6 +360,7 @@ EOF
 # copy cert files and acl.conf to etc
 # copy cert files and acl.conf to etc
 copy_other_conf_files() {
 copy_other_conf_files() {
     cp -r apps/emqx/etc/certs "$EMQX_ETC_DIR"/
     cp -r apps/emqx/etc/certs "$EMQX_ETC_DIR"/
+    cp -r apps/emqx_conf/etc/base.hocon "$EMQX_ETC_DIR"/
     cp apps/emqx_auth/etc/acl.conf "$EMQX_ETC_DIR"/
     cp apps/emqx_auth/etc/acl.conf "$EMQX_ETC_DIR"/
 }
 }
 
 

+ 6 - 0
mix.exs

@@ -947,6 +947,12 @@ defmodule EMQXUmbrella.MixProject do
       Path.join(etc, "emqx.conf")
       Path.join(etc, "emqx.conf")
     )
     )
 
 
+    render_template(
+      "apps/emqx_conf/etc/base.hocon",
+      assigns,
+      Path.join(etc, "base.hocon")
+    )
+
     render_template(
     render_template(
       "rel/emqx_vars",
       "rel/emqx_vars",
       assigns,
       assigns,

+ 3 - 2
rebar.config.erl

@@ -198,7 +198,7 @@ plugins() ->
 test_plugins() ->
 test_plugins() ->
     [
     [
         {rebar3_proper, "0.12.1"},
         {rebar3_proper, "0.12.1"},
-        {coveralls, {git, "https://github.com/emqx/coveralls-erl", {tag, "v2.2.0-emqx-3"}}}
+        {coveralls, {git, "https://github.com/emqx/coveralls-erl", {tag, "v2.2.0-emqx-4"}}}
     ].
     ].
 
 
 test_deps() ->
 test_deps() ->
@@ -541,7 +541,8 @@ emqx_etc_overlay_per_rel(_RelType) ->
 emqx_etc_overlay() ->
 emqx_etc_overlay() ->
     [
     [
         {"{{base_dir}}/lib/emqx/etc/ssl_dist.conf", "etc/ssl_dist.conf"},
         {"{{base_dir}}/lib/emqx/etc/ssl_dist.conf", "etc/ssl_dist.conf"},
-        {"{{base_dir}}/lib/emqx_conf/etc/emqx.conf.all", "etc/emqx.conf"}
+        {"{{base_dir}}/lib/emqx_conf/etc/emqx.conf.all", "etc/emqx.conf"},
+        {"{{base_dir}}/lib/emqx_conf/etc/base.hocon", "etc/base.hocon"}
     ].
     ].
 
 
 get_vsn(Profile) ->
 get_vsn(Profile) ->

+ 5 - 0
rel/i18n/emqx_dashboard_monitor_api.hocon

@@ -5,6 +5,11 @@ list_monitor.desc:
 list_monitor.label:
 list_monitor.label:
 """List cluster stats data"""
 """List cluster stats data"""
 
 
+clear_monitor.desc:
+"""Clear monitor (statistics) data for the whole cluster."""
+clear_monitor.label:
+"""Clear cluster stats data"""
+
 list_monitor_node.desc:
 list_monitor_node.desc:
 """List the monitor (statistics) data on the specified node."""
 """List the monitor (statistics) data on the specified node."""
 list_monitor_node.label:
 list_monitor_node.label:

+ 8 - 0
scripts/test/emqx-boot.bats

@@ -27,3 +27,11 @@
     [[ $status -ne 0 ]]
     [[ $status -ne 0 ]]
     rm -f $conffile
     rm -f $conffile
 }
 }
+
+@test "corrupted base.hocon" {
+    conffile="./_build/$PROFILE/rel/emqx/etc/base.hocon"
+    echo "{" > $conffile
+    run ./_build/$PROFILE/rel/emqx/bin/emqx console
+    [[ $status -ne 0 ]]
+    rm -f $conffile
+}