Bläddra i källkod

Merge pull request #13248 from ieQu1/dev/EMQX-12491-local-backend

replace builtin DS backend with builtin_local and builtin_raft
ieQu1 1 år sedan
förälder
incheckning
9f30da334f
62 ändrade filer med 2587 tillägg och 889 borttagningar
  1. 10 5
      apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl
  2. 1 0
      apps/emqx/rebar.config
  3. 1 1
      apps/emqx/src/emqx.app.src
  4. 125 70
      apps/emqx/src/emqx_ds_schema.erl
  5. 16 2
      apps/emqx/test/emqx_persistent_messages_SUITE.erl
  6. 2 1
      apps/emqx/test/emqx_persistent_session_SUITE.erl
  7. 2 0
      apps/emqx_conf/src/emqx_conf_schema.erl
  8. 35 30
      apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl
  9. 32 0
      apps/emqx_ds_backends/README.md
  10. 26 0
      apps/emqx_ds_backends/rebar.config.script
  11. 26 0
      apps/emqx_ds_backends/src/emqx_ds_backends.app.src.script
  12. 115 328
      apps/emqx_durable_storage/test/emqx_ds_SUITE.erl
  13. 32 0
      apps/emqx_ds_builtin_local/README.md
  14. 5 0
      apps/emqx_ds_builtin_local/rebar.config
  15. 11 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.app.src
  16. 382 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl
  17. 38 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_app.erl
  18. 219 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_db_sup.erl
  19. 204 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_meta.erl
  20. 127 0
      apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_sup.erl
  21. 346 0
      apps/emqx_ds_builtin_local/test/emqx_ds_builtin_local_SUITE.erl
  22. 94 0
      apps/emqx_ds_builtin_raft/BSL.txt
  23. 3 0
      apps/emqx_ds_builtin_raft/README.md
  24. 6 0
      apps/emqx_ds_builtin_raft/rebar.config
  25. 11 0
      apps/emqx_ds_builtin_raft/src/emqx_ds_builtin_raft.app.src
  26. 11 0
      apps/emqx_ds_builtin_raft/src/emqx_ds_builtin_raft_app.erl
  27. 5 16
      apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl
  28. 9 28
      apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl
  29. 48 35
      apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl
  30. 0 12
      apps/emqx_durable_storage/src/emqx_ds_replication_layer.hrl
  31. 1 13
      apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl
  32. 0 12
      apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl
  33. 3 15
      apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl
  34. 2 2
      apps/emqx_durable_storage/src/emqx_ds_replication_snapshot.erl
  35. 0 12
      apps/emqx_durable_storage/src/proto/emqx_ds_proto_v1.erl
  36. 0 12
      apps/emqx_durable_storage/src/proto/emqx_ds_proto_v2.erl
  37. 0 12
      apps/emqx_durable_storage/src/proto/emqx_ds_proto_v3.erl
  38. 0 12
      apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl
  39. 279 40
      apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl
  40. 1 1
      apps/emqx_ds_shared_sub/test/emqx_ds_shared_sub_SUITE.erl
  41. 34 2
      apps/emqx_durable_storage/README.md
  42. 6 6
      apps/emqx_durable_storage/include/emqx_ds_metrics.hrl
  43. 31 24
      apps/emqx_durable_storage/src/emqx_ds.erl
  44. 63 37
      apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl
  45. 51 51
      apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl
  46. 15 9
      apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl
  47. 15 22
      apps/emqx_durable_storage/src/emqx_ds_sup.erl
  48. 2 2
      apps/emqx_durable_storage/src/emqx_durable_storage.app.src
  49. 11 15
      apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl
  50. 30 6
      apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl
  51. 3 1
      apps/emqx_machine/priv/reboot_lists.eterm
  52. 2 0
      apps/emqx_machine/src/emqx_machine_boot.erl
  53. 9 10
      apps/emqx_management/src/emqx_mgmt_api_ds.erl
  54. 5 0
      apps/emqx_management/src/emqx_mgmt_cli.erl
  55. 21 16
      apps/emqx_management/test/emqx_mgmt_SUITE.erl
  56. 1 0
      apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl
  57. 15 10
      apps/emqx_management/test/emqx_mgmt_api_ds_SUITE.erl
  58. 6 6
      apps/emqx_prometheus/src/emqx_prometheus.erl
  59. 16 0
      changes/ce/breaking-13248.en.md
  60. 4 1
      mix.exs
  61. 1 0
      rebar.config.erl
  62. 18 12
      rel/i18n/emqx_ds_schema.hocon

+ 10 - 5
apps/emqx/integration_test/emqx_persistent_session_ds_SUITE.erl

@@ -25,11 +25,16 @@ all() ->
     emqx_common_test_helpers:all(?MODULE).
 
 init_per_suite(Config) ->
-    TCApps = emqx_cth_suite:start(
-        app_specs(),
-        #{work_dir => emqx_cth_suite:work_dir(Config)}
-    ),
-    [{tc_apps, TCApps} | Config].
+    case emqx_ds_test_helpers:skip_if_norepl() of
+        false ->
+            TCApps = emqx_cth_suite:start(
+                app_specs(),
+                #{work_dir => emqx_cth_suite:work_dir(Config)}
+            ),
+            [{tc_apps, TCApps} | Config];
+        Yes ->
+            Yes
+    end.
 
 end_per_suite(Config) ->
     TCApps = ?config(tc_apps, Config),

+ 1 - 0
apps/emqx/rebar.config

@@ -24,6 +24,7 @@
 {deps, [
     {emqx_utils, {path, "../emqx_utils"}},
     {emqx_durable_storage, {path, "../emqx_durable_storage"}},
+    {emqx_ds_backends, {path, "../emqx_ds_backends"}},
     {lc, {git, "https://github.com/emqx/lc.git", {tag, "0.3.2"}}},
     {gproc, {git, "https://github.com/emqx/gproc", {tag, "0.9.0.1"}}},
     {cowboy, {git, "https://github.com/emqx/cowboy", {tag, "2.9.2"}}},

+ 1 - 1
apps/emqx/src/emqx.app.src

@@ -18,7 +18,7 @@
         sasl,
         lc,
         hocon,
-        emqx_durable_storage,
+        emqx_ds_backends,
         bcrypt,
         pbkdf2,
         emqx_http_lib,

+ 125 - 70
apps/emqx/src/emqx_ds_schema.erl

@@ -18,7 +18,7 @@
 -module(emqx_ds_schema).
 
 %% API:
--export([schema/0, translate_builtin/1]).
+-export([schema/0, translate_builtin_raft/1, translate_builtin_local/1]).
 
 %% Behavior callbacks:
 -export([fields/1, desc/1, namespace/0]).
@@ -32,42 +32,51 @@
 %% Type declarations
 %%================================================================================
 
+-ifndef(EMQX_RELEASE_EDITION).
+-define(EMQX_RELEASE_EDITION, ce).
+-endif.
+
+-if(?EMQX_RELEASE_EDITION == ee).
+-define(DEFAULT_BACKEND, builtin_raft).
+-define(BUILTIN_BACKENDS, [ref(builtin_raft), ref(builtin_local)]).
+-else.
+-define(DEFAULT_BACKEND, builtin_local).
+-define(BUILTIN_BACKENDS, [ref(builtin_local)]).
+-endif.
+
 %%================================================================================
 %% API
 %%================================================================================
 
-translate_builtin(
+translate_builtin_raft(
     Backend = #{
-        backend := builtin,
+        backend := builtin_raft,
         n_shards := NShards,
         n_sites := NSites,
         replication_factor := ReplFactor,
         layout := Layout
     }
 ) ->
-    Storage =
-        case Layout of
-            #{
-                type := wildcard_optimized,
-                bits_per_topic_level := BitsPerTopicLevel,
-                epoch_bits := EpochBits,
-                topic_index_bytes := TIBytes
-            } ->
-                {emqx_ds_storage_bitfield_lts, #{
-                    bits_per_topic_level => BitsPerTopicLevel,
-                    topic_index_bytes => TIBytes,
-                    epoch_bits => EpochBits
-                }};
-            #{type := reference} ->
-                {emqx_ds_storage_reference, #{}}
-        end,
     #{
-        backend => builtin,
+        backend => builtin_raft,
         n_shards => NShards,
         n_sites => NSites,
         replication_factor => ReplFactor,
         replication_options => maps:get(replication_options, Backend, #{}),
-        storage => Storage
+        storage => translate_layout(Layout)
+    }.
+
+translate_builtin_local(
+    #{
+        backend := builtin_local,
+        n_shards := NShards,
+        layout := Layout
+    }
+) ->
+    #{
+        backend => builtin_local,
+        n_shards => NShards,
+        storage => translate_layout(Layout)
     }.
 
 %%================================================================================
@@ -83,24 +92,24 @@ schema() ->
             ds_schema(#{
                 default =>
                     #{
-                        <<"backend">> => builtin
+                        <<"backend">> => ?DEFAULT_BACKEND
                     },
                 importance => ?IMPORTANCE_MEDIUM,
                 desc => ?DESC(messages)
             })}
     ].
 
-fields(builtin) ->
-    %% Schema for the builtin backend:
+fields(builtin_local) ->
+    %% Schema for the builtin_local backend:
     [
         {backend,
             sc(
-                builtin,
+                builtin_local,
                 #{
                     'readOnly' => true,
-                    default => builtin,
+                    default => builtin_local,
                     importance => ?IMPORTANCE_MEDIUM,
-                    desc => ?DESC(builtin_backend)
+                    desc => ?DESC(backend_type)
                 }
             )},
         {'_config_handler',
@@ -108,27 +117,32 @@ fields(builtin) ->
                 {module(), atom()},
                 #{
                     'readOnly' => true,
-                    default => {?MODULE, translate_builtin},
+                    default => {?MODULE, translate_builtin_local},
                     importance => ?IMPORTANCE_HIDDEN
                 }
-            )},
-        {data_dir,
+            )}
+        | common_builtin_fields()
+    ];
+fields(builtin_raft) ->
+    %% Schema for the builtin_raft backend:
+    [
+        {backend,
             sc(
-                string(),
+                builtin_raft,
                 #{
-                    mapping => "emqx_durable_storage.db_data_dir",
-                    required => false,
+                    'readOnly' => true,
+                    default => builtin_raft,
                     importance => ?IMPORTANCE_MEDIUM,
-                    desc => ?DESC(builtin_data_dir)
+                    desc => ?DESC(backend_type)
                 }
             )},
-        {n_shards,
+        {'_config_handler',
             sc(
-                pos_integer(),
+                {module(), atom()},
                 #{
-                    default => 12,
-                    importance => ?IMPORTANCE_MEDIUM,
-                    desc => ?DESC(builtin_n_shards)
+                    'readOnly' => true,
+                    default => {?MODULE, translate_builtin_raft},
+                    importance => ?IMPORTANCE_HIDDEN
                 }
             )},
         %% TODO: Deprecate once cluster management and rebalancing is implemented.
@@ -157,29 +171,10 @@ fields(builtin) ->
                     default => #{},
                     importance => ?IMPORTANCE_HIDDEN
                 }
-            )},
-        {local_write_buffer,
-            sc(
-                ref(builtin_local_write_buffer),
-                #{
-                    importance => ?IMPORTANCE_HIDDEN,
-                    desc => ?DESC(builtin_local_write_buffer)
-                }
-            )},
-        {layout,
-            sc(
-                hoconsc:union(builtin_layouts()),
-                #{
-                    desc => ?DESC(builtin_layout),
-                    importance => ?IMPORTANCE_MEDIUM,
-                    default =>
-                        #{
-                            <<"type">> => wildcard_optimized
-                        }
-                }
             )}
+        | common_builtin_fields()
     ];
-fields(builtin_local_write_buffer) ->
+fields(builtin_write_buffer) ->
     [
         {max_items,
             sc(
@@ -188,7 +183,7 @@ fields(builtin_local_write_buffer) ->
                     default => 1000,
                     mapping => "emqx_durable_storage.egress_batch_size",
                     importance => ?IMPORTANCE_HIDDEN,
-                    desc => ?DESC(builtin_local_write_buffer_max_items)
+                    desc => ?DESC(builtin_write_buffer_max_items)
                 }
             )},
         {flush_interval,
@@ -198,7 +193,7 @@ fields(builtin_local_write_buffer) ->
                     default => 100,
                     mapping => "emqx_durable_storage.egress_flush_interval",
                     importance => ?IMPORTANCE_HIDDEN,
-                    desc => ?DESC(builtin_local_write_buffer_flush_interval)
+                    desc => ?DESC(builtin_write_buffer_flush_interval)
                 }
             )}
     ];
@@ -252,10 +247,55 @@ fields(layout_builtin_reference) ->
             )}
     ].
 
-desc(builtin) ->
-    ?DESC(builtin);
-desc(builtin_local_write_buffer) ->
-    ?DESC(builtin_local_write_buffer);
+common_builtin_fields() ->
+    [
+        {data_dir,
+            sc(
+                string(),
+                #{
+                    mapping => "emqx_durable_storage.db_data_dir",
+                    required => false,
+                    importance => ?IMPORTANCE_MEDIUM,
+                    desc => ?DESC(builtin_data_dir)
+                }
+            )},
+        {n_shards,
+            sc(
+                pos_integer(),
+                #{
+                    default => 16,
+                    importance => ?IMPORTANCE_MEDIUM,
+                    desc => ?DESC(builtin_n_shards)
+                }
+            )},
+        {local_write_buffer,
+            sc(
+                ref(builtin_write_buffer),
+                #{
+                    importance => ?IMPORTANCE_HIDDEN,
+                    desc => ?DESC(builtin_write_buffer)
+                }
+            )},
+        {layout,
+            sc(
+                hoconsc:union(builtin_layouts()),
+                #{
+                    desc => ?DESC(builtin_layout),
+                    importance => ?IMPORTANCE_MEDIUM,
+                    default =>
+                        #{
+                            <<"type">> => wildcard_optimized
+                        }
+                }
+            )}
+    ].
+
+desc(builtin_raft) ->
+    ?DESC(builtin_raft);
+desc(builtin_local) ->
+    ?DESC(builtin_local);
+desc(builtin_write_buffer) ->
+    ?DESC(builtin_write_buffer);
 desc(layout_builtin_wildcard_optimized) ->
     ?DESC(layout_builtin_wildcard_optimized);
 desc(layout_builtin_reference) ->
@@ -267,12 +307,27 @@ desc(_) ->
 %% Internal functions
 %%================================================================================
 
+translate_layout(
+    #{
+        type := wildcard_optimized,
+        bits_per_topic_level := BitsPerTopicLevel,
+        epoch_bits := EpochBits,
+        topic_index_bytes := TIBytes
+    }
+) ->
+    {emqx_ds_storage_bitfield_lts, #{
+        bits_per_topic_level => BitsPerTopicLevel,
+        topic_index_bytes => TIBytes,
+        epoch_bits => EpochBits
+    }};
+translate_layout(#{type := reference}) ->
+    {emqx_ds_storage_reference, #{}}.
+
 ds_schema(Options) ->
     sc(
-        hoconsc:union([
-            ref(builtin)
-            | emqx_schema_hooks:injection_point('durable_storage.backends', [])
-        ]),
+        hoconsc:union(
+            ?BUILTIN_BACKENDS ++ emqx_schema_hooks:injection_point('durable_storage.backends', [])
+        ),
         Options
     ).
 

+ 16 - 2
apps/emqx/test/emqx_persistent_messages_SUITE.erl

@@ -32,9 +32,23 @@
 all() ->
     emqx_common_test_helpers:all(?MODULE).
 
+%% Needed for standalone mode:
+-ifndef(EMQX_RELEASE_EDITION).
+-define(EMQX_RELEASE_EDITION, ce).
+-endif.
+
+-if(?EMQX_RELEASE_EDITION == ee).
+
 init_per_suite(Config) ->
     Config.
 
+-else.
+
+init_per_suite(_Config) ->
+    {skip, no_replication}.
+
+-endif.
+
 end_per_suite(_Config) ->
     ok.
 
@@ -465,7 +479,7 @@ t_metrics_not_dropped(_Config) ->
 t_replication_options(_Config) ->
     ?assertMatch(
         #{
-            backend := builtin,
+            backend := builtin_raft,
             replication_options := #{
                 wal_max_size_bytes := 16000000,
                 wal_max_batch_size := 1024,
@@ -570,7 +584,7 @@ wait_shards_online(Nodes = [Node | _]) ->
     ?retry(500, 10, [?assertEqual(NShards, shards_online(N)) || N <- Nodes]).
 
 shards_online(Node) ->
-    length(erpc:call(Node, emqx_ds_builtin_db_sup, which_shards, [?PERSISTENT_MESSAGE_DB])).
+    length(erpc:call(Node, emqx_ds_builtin_raft_db_sup, which_shards, [?PERSISTENT_MESSAGE_DB])).
 
 get_mqtt_port(Node, Type) ->
     {_IP, Port} = erpc:call(Node, emqx_config, get, [[listeners, Type, default, bind]]),

+ 2 - 1
apps/emqx/test/emqx_persistent_session_SUITE.erl

@@ -81,7 +81,8 @@ init_per_group(persistence_enabled, Config) ->
                 "  heartbeat_interval = 100ms\n"
                 "  renew_streams_interval = 100ms\n"
                 "  session_gc_interval = 2s\n"
-                "}"},
+                "}\n"
+                "durable_storage.messages.backend = builtin_local"},
         {persistence, ds}
         | Config
     ];

+ 2 - 0
apps/emqx_conf/src/emqx_conf_schema.erl

@@ -1457,6 +1457,8 @@ cluster_options(k8s, Conf) ->
         {suffix, conf_get("cluster.k8s.suffix", Conf, "")}
     ];
 cluster_options(manual, _Conf) ->
+    [];
+cluster_options(singleton, _Conf) ->
     [].
 
 to_atom(Atom) when is_atom(Atom) ->

+ 35 - 30
apps/emqx_dashboard/test/emqx_dashboard_monitor_SUITE.erl

@@ -82,37 +82,42 @@ end_per_suite(_Config) ->
     ok.
 
 init_per_group(persistent_sessions = Group, Config) ->
-    AppSpecsFn = fun(Enable) ->
-        Port =
-            case Enable of
-                true -> "18083";
-                false -> "0"
+    case emqx_ds_test_helpers:skip_if_norepl() of
+        false ->
+            AppSpecsFn = fun(Enable) ->
+                Port =
+                    case Enable of
+                        true -> "18083";
+                        false -> "0"
+                    end,
+                [
+                    emqx_conf,
+                    {emqx, "durable_sessions {enable = true}"},
+                    {emqx_retainer, ?BASE_RETAINER_CONF},
+                    emqx_management,
+                    emqx_mgmt_api_test_util:emqx_dashboard(
+                        lists:concat([
+                            "dashboard.listeners.http { bind = " ++ Port ++ " }\n",
+                            "dashboard.sample_interval = 1s\n",
+                            "dashboard.listeners.http.enable = " ++ atom_to_list(Enable)
+                        ])
+                    )
+                ]
             end,
-        [
-            emqx_conf,
-            {emqx, "durable_sessions {enable = true}"},
-            {emqx_retainer, ?BASE_RETAINER_CONF},
-            emqx_management,
-            emqx_mgmt_api_test_util:emqx_dashboard(
-                lists:concat([
-                    "dashboard.listeners.http { bind = " ++ Port ++ " }\n",
-                    "dashboard.sample_interval = 1s\n",
-                    "dashboard.listeners.http.enable = " ++ atom_to_list(Enable)
-                ])
-            )
-        ]
-    end,
-    NodeSpecs = [
-        {dashboard_monitor1, #{apps => AppSpecsFn(true)}},
-        {dashboard_monitor2, #{apps => AppSpecsFn(false)}}
-    ],
-    Nodes =
-        [N1 | _] = emqx_cth_cluster:start(
-            NodeSpecs,
-            #{work_dir => emqx_cth_suite:work_dir(Group, Config)}
-        ),
-    ?ON(N1, {ok, _} = emqx_common_test_http:create_default_app()),
-    [{cluster, Nodes} | Config];
+            NodeSpecs = [
+                {dashboard_monitor1, #{apps => AppSpecsFn(true)}},
+                {dashboard_monitor2, #{apps => AppSpecsFn(false)}}
+            ],
+            Nodes =
+                [N1 | _] = emqx_cth_cluster:start(
+                    NodeSpecs,
+                    #{work_dir => emqx_cth_suite:work_dir(Group, Config)}
+                ),
+            ?ON(N1, {ok, _} = emqx_common_test_http:create_default_app()),
+            [{cluster, Nodes} | Config];
+        Yes ->
+            Yes
+    end;
 init_per_group(common = Group, Config) ->
     Apps = emqx_cth_suite:start(
         [

+ 32 - 0
apps/emqx_ds_backends/README.md

@@ -0,0 +1,32 @@
+# EMQX Durable Storage Backends
+
+This is a placeholder OTP application that depends on all durable storage backends available in the release.
+Starting it will ensure that all backends are properly loaded and registered.
+
+Consumers of `emqx_durable_storage` API should depend on this application instead of the parent `emqx_durable_storage`.
+
+# Features
+
+N/A
+
+# Limitation
+
+N/A
+
+# Documentation links
+
+N/A
+
+# Usage
+
+Any business application that creates DS databases should add this application as a dependency.
+
+# Configurations
+
+None
+
+# Other
+N/A
+
+# Contributing
+Please see our [contributing.md](../../CONTRIBUTING.md).

+ 26 - 0
apps/emqx_ds_backends/rebar.config.script

@@ -0,0 +1,26 @@
+%% -*- mode:erlang -*-
+Profile = case os:getenv("PROFILE") of
+            "emqx-enterprise" ++ _ ->
+              ee;
+            false ->
+              io:format(user, "WARN: environment variable PROFILE is not set, using 'emqx-enterprise'~n", []),
+              ee;
+            _ ->
+              ce
+          end,
+CEDeps =
+    [
+        {emqx_utils, {path, "../emqx_utils"}},
+        {emqx_durable_storage, {path, "../emqx_durable_storage"}},
+        {emqx_ds_builtin_local, {path, "../emqx_ds_builtin_local"}}
+    ],
+EEDeps =
+    [
+        {emqx_ds_builtin_raft, {path, "../emqx_ds_builtin_raft"}}
+    ],
+case Profile of
+  ee ->
+    [{deps, CEDeps ++ EEDeps}];
+  ce ->
+    [{deps, CEDeps}]
+end.

+ 26 - 0
apps/emqx_ds_backends/src/emqx_ds_backends.app.src.script

@@ -0,0 +1,26 @@
+%% -*- mode: erlang -*-
+Profile = case os:getenv("PROFILE") of
+            "emqx-enterprise" ++ _ ->
+              ee;
+            false ->
+              io:format(user, "WARN: environment variable PROFILE is not set, using 'emqx-enterprise'~n", []),
+              ee;
+            _ ->
+              ce
+          end,
+
+{application, emqx_ds_backends, [
+    {description, "A placeholder application that depends on all available DS backends"},
+    % strict semver, bump manually!
+    {vsn, "0.1.0"},
+    {modules, []},
+    {registered, []},
+    {applications, [kernel, stdlib, emqx_durable_storage, emqx_ds_builtin_local |
+                    case Profile of
+                      ee ->
+                          [emqx_ds_builtin_raft];
+                      ce ->
+                          []
+                    end]},
+    {env, []}
+]}.

+ 115 - 328
apps/emqx_durable_storage/test/emqx_ds_SUITE.erl

@@ -13,7 +13,7 @@
 %% See the License for the specific language governing permissions and
 %% limitations under the License.
 %%--------------------------------------------------------------------
--module(emqx_ds_SUITE).
+-module(emqx_ds_backends_SUITE).
 
 -compile(export_all).
 -compile(nowarn_export_all).
@@ -26,52 +26,27 @@
 
 -define(N_SHARDS, 1).
 
-opts() ->
-    #{
-        backend => builtin,
-        storage => {emqx_ds_storage_reference, #{}},
-        n_shards => ?N_SHARDS,
-        n_sites => 1,
-        replication_factor => 3,
-        replication_options => #{}
-    }.
+opts(Config) ->
+    proplists:get_value(ds_conf, Config).
 
 %% A simple smoke test that verifies that opening/closing the DB
 %% doesn't crash, and not much else
-t_00_smoke_open_drop(_Config) ->
+t_00_smoke_open_drop(Config) ->
     DB = 'DB',
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
-    %% Check metadata:
-    %%    We have only one site:
-    [Site] = emqx_ds_replication_layer_meta:sites(),
-    %%    Check all shards:
-    Shards = emqx_ds_replication_layer_meta:shards(DB),
-    %%    Since there is only one site all shards should be allocated
-    %%    to this site:
-    MyShards = emqx_ds_replication_layer_meta:my_shards(DB),
-    ?assertEqual(?N_SHARDS, length(Shards)),
-    lists:foreach(
-        fun(Shard) ->
-            ?assertEqual(
-                [Site], emqx_ds_replication_layer_meta:replica_set(DB, Shard)
-            )
-        end,
-        Shards
-    ),
-    ?assertEqual(lists:sort(Shards), lists:sort(MyShards)),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     %% Reopen the DB and make sure the operation is idempotent:
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     %% Drop the DB:
     ?assertMatch(ok, emqx_ds:drop_db(DB)).
 
 %% A simple smoke test that verifies that storing the messages doesn't
 %% crash
-t_01_smoke_store(_Config) ->
+t_01_smoke_store(Config) ->
     ?check_trace(
         #{timetrap => 10_000},
         begin
             DB = default,
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
             Msg = message(<<"foo/bar">>, <<"foo">>, 0),
             ?assertMatch(ok, emqx_ds:store_batch(DB, [Msg]))
         end,
@@ -80,9 +55,9 @@ t_01_smoke_store(_Config) ->
 
 %% A simple smoke test that verifies that getting the list of streams
 %% doesn't crash and that iterators can be opened.
-t_02_smoke_get_streams_start_iter(_Config) ->
+t_02_smoke_get_streams_start_iter(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     StartTime = 0,
     TopicFilter = ['#'],
     [{Rank, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
@@ -91,9 +66,9 @@ t_02_smoke_get_streams_start_iter(_Config) ->
 
 %% A simple smoke test that verifies that it's possible to iterate
 %% over messages.
-t_03_smoke_iterate(_Config) ->
+t_03_smoke_iterate(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     StartTime = 0,
     TopicFilter = ['#'],
     Msgs = [
@@ -101,7 +76,7 @@ t_03_smoke_iterate(_Config) ->
         message(<<"foo">>, <<"2">>, 1),
         message(<<"bar/bar">>, <<"3">>, 2)
     ],
-    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs, #{sync => true})),
     [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
     {ok, Iter0} = emqx_ds:make_iterator(DB, Stream, TopicFilter, StartTime),
     {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter0),
@@ -112,9 +87,9 @@ t_03_smoke_iterate(_Config) ->
 %% to the external resources, such as clients' sessions, and they
 %% should always be able to continue replaying the topics from where
 %% they are left off.
-t_04_restart(_Config) ->
+t_04_restart(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     TopicFilter = ['#'],
     StartTime = 0,
     Msgs = [
@@ -122,22 +97,22 @@ t_04_restart(_Config) ->
         message(<<"foo">>, <<"2">>, 1),
         message(<<"bar/bar">>, <<"3">>, 2)
     ],
-    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs, #{sync => true})),
     [{_, Stream}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
     {ok, Iter0} = emqx_ds:make_iterator(DB, Stream, TopicFilter, StartTime),
     %% Restart the application:
     ?tp(warning, emqx_ds_SUITE_restart_app, #{}),
     ok = application:stop(emqx_durable_storage),
     {ok, _} = application:ensure_all_started(emqx_durable_storage),
-    ok = emqx_ds:open_db(DB, opts()),
+    ok = emqx_ds:open_db(DB, opts(Config)),
     %% The old iterator should be still operational:
     {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter0),
     ?assertEqual(Msgs, Batch, {Iter0, Iter}).
 
 %% Check that we can create iterators directly from DS keys.
-t_05_update_iterator(_Config) ->
+t_05_update_iterator(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     TopicFilter = ['#'],
     StartTime = 0,
     Msgs = [
@@ -158,104 +133,49 @@ t_05_update_iterator(_Config) ->
     ?assertEqual(Msgs, [Msg0 | Batch], #{from_key => Iter1, final_iter => Iter}),
     ok.
 
-t_06_update_config(_Config) ->
+t_06_smoke_add_generation(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
-    TopicFilter = ['#'],
+    BeginTime = os:system_time(millisecond),
 
-    DataSet = update_data_set(),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    [{Gen1, #{created_at := Created1, since := Since1, until := undefined}}] = maps:to_list(
+        emqx_ds:list_generations_with_lifetimes(DB)
+    ),
 
-    ToMsgs = fun(Datas) ->
-        lists:map(
-            fun({Topic, Payload}) ->
-                message(Topic, Payload, emqx_message:timestamp_now())
-            end,
-            Datas
-        )
-    end,
-
-    {_, StartTimes, MsgsList} =
-        lists:foldl(
-            fun
-                (Datas, {true, TimeAcc, MsgAcc}) ->
-                    Msgs = ToMsgs(Datas),
-                    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
-                    {false, TimeAcc, [Msgs | MsgAcc]};
-                (Datas, {Any, TimeAcc, MsgAcc}) ->
-                    timer:sleep(500),
-                    ?assertMatch(ok, emqx_ds:update_db_config(DB, opts())),
-                    timer:sleep(500),
-                    StartTime = emqx_message:timestamp_now(),
-                    Msgs = ToMsgs(Datas),
-                    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
-                    {Any, [StartTime | TimeAcc], [Msgs | MsgAcc]}
-            end,
-            {true, [emqx_message:timestamp_now()], []},
-            DataSet
-        ),
-
-    Checker = fun({StartTime, Msgs0}, Acc) ->
-        Msgs = Acc ++ Msgs0,
-        Batch = emqx_ds_test_helpers:consume(DB, TopicFilter, StartTime),
-        ?assertEqual(Msgs, Batch, StartTime),
-        Msgs
-    end,
-    lists:foldl(Checker, [], lists:zip(StartTimes, MsgsList)).
-
-t_07_add_generation(_Config) ->
+    ?assertMatch(ok, emqx_ds:add_generation(DB)),
+    [
+        {Gen1, #{created_at := Created1, since := Since1, until := Until1}},
+        {Gen2, #{created_at := Created2, since := Since2, until := undefined}}
+    ] = maps:to_list(emqx_ds:list_generations_with_lifetimes(DB)),
+    %% Check units of the return values (+/- 10s from test begin time):
+    ?give_or_take(BeginTime, 10_000, Created1),
+    ?give_or_take(BeginTime, 10_000, Created2),
+    ?give_or_take(BeginTime, 10_000, Since2),
+    ?give_or_take(BeginTime, 10_000, Until1).
+
+t_07_smoke_update_config(Config) ->
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
-    TopicFilter = ['#'],
-
-    DataSet = update_data_set(),
-
-    ToMsgs = fun(Datas) ->
-        lists:map(
-            fun({Topic, Payload}) ->
-                message(Topic, Payload, emqx_message:timestamp_now())
-            end,
-            Datas
-        )
-    end,
-
-    {_, StartTimes, MsgsList} =
-        lists:foldl(
-            fun
-                (Datas, {true, TimeAcc, MsgAcc}) ->
-                    Msgs = ToMsgs(Datas),
-                    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
-                    {false, TimeAcc, [Msgs | MsgAcc]};
-                (Datas, {Any, TimeAcc, MsgAcc}) ->
-                    timer:sleep(500),
-                    ?assertMatch(ok, emqx_ds:add_generation(DB)),
-                    timer:sleep(500),
-                    StartTime = emqx_message:timestamp_now(),
-                    Msgs = ToMsgs(Datas),
-                    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
-                    {Any, [StartTime | TimeAcc], [Msgs | MsgAcc]}
-            end,
-            {true, [emqx_message:timestamp_now()], []},
-            DataSet
-        ),
-
-    Checker = fun({StartTime, Msgs0}, Acc) ->
-        Msgs = Acc ++ Msgs0,
-        Batch = emqx_ds_test_helpers:consume(DB, TopicFilter, StartTime),
-        ?assertEqual(Msgs, Batch, StartTime),
-        Msgs
-    end,
-    lists:foldl(Checker, [], lists:zip(StartTimes, MsgsList)).
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    ?assertMatch(
+        [{_, _}],
+        maps:to_list(emqx_ds:list_generations_with_lifetimes(DB))
+    ),
+    ?assertMatch(ok, emqx_ds:update_db_config(DB, opts(Config))),
+    ?assertMatch(
+        [{_, _}, {_, _}],
+        maps:to_list(emqx_ds:list_generations_with_lifetimes(DB))
+    ).
 
 %% Verifies the basic usage of `list_generations_with_lifetimes' and `drop_generation'...
 %%   1) Cannot drop current generation.
 %%   2) All existing generations are returned by `list_generations_with_lifetimes'.
 %%   3) Dropping a generation removes it from the list.
 %%   4) Dropped generations stay dropped even after restarting the application.
-t_08_smoke_list_drop_generation(_Config) ->
+t_08_smoke_list_drop_generation(Config) ->
     DB = ?FUNCTION_NAME,
     ?check_trace(
         begin
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
             %% Exactly one generation at first.
             Generations0 = emqx_ds:list_generations_with_lifetimes(DB),
             ?assertMatch(
@@ -295,7 +215,7 @@ t_08_smoke_list_drop_generation(_Config) ->
             %% Should persist surviving generation list
             ok = application:stop(emqx_durable_storage),
             {ok, _} = application:ensure_all_started(emqx_durable_storage),
-            ok = emqx_ds:open_db(DB, opts()),
+            ok = emqx_ds:open_db(DB, opts(Config)),
 
             Generations3 = emqx_ds:list_generations_with_lifetimes(DB),
             ?assertMatch(
@@ -310,12 +230,12 @@ t_08_smoke_list_drop_generation(_Config) ->
     ),
     ok.
 
-t_09_atomic_store_batch(_Config) ->
+t_09_atomic_store_batch(Config) ->
     DB = ?FUNCTION_NAME,
     ?check_trace(
         begin
             application:set_env(emqx_durable_storage, egress_batch_size, 1),
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
             Msgs = [
                 message(<<"1">>, <<"1">>, 0),
                 message(<<"2">>, <<"2">>, 1),
@@ -328,19 +248,19 @@ t_09_atomic_store_batch(_Config) ->
                     sync => true
                 })
             ),
-            {ok, Flush} = ?block_until(#{?snk_kind := emqx_ds_replication_layer_egress_flush}),
+            {ok, Flush} = ?block_until(#{?snk_kind := emqx_ds_buffer_flush}),
             ?assertMatch(#{batch := [_, _, _]}, Flush)
         end,
         []
     ),
     ok.
 
-t_10_non_atomic_store_batch(_Config) ->
+t_10_non_atomic_store_batch(Config) ->
     DB = ?FUNCTION_NAME,
     ?check_trace(
         begin
             application:set_env(emqx_durable_storage, egress_batch_size, 1),
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
             Msgs = [
                 message(<<"1">>, <<"1">>, 0),
                 message(<<"2">>, <<"2">>, 1),
@@ -358,7 +278,7 @@ t_10_non_atomic_store_batch(_Config) ->
         end,
         fun(Trace) ->
             %% Should contain one flush per message.
-            Batches = ?projection(batch, ?of_kind(emqx_ds_replication_layer_egress_flush, Trace)),
+            Batches = ?projection(batch, ?of_kind(emqx_ds_buffer_flush, Trace)),
             ?assertMatch([_], Batches),
             ?assertMatch(
                 [_, _, _],
@@ -369,11 +289,11 @@ t_10_non_atomic_store_batch(_Config) ->
     ),
     ok.
 
-t_smoke_delete_next(_Config) ->
+t_smoke_delete_next(Config) ->
     DB = ?FUNCTION_NAME,
     ?check_trace(
         begin
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
             StartTime = 0,
             TopicFilter = [<<"foo">>, '#'],
             Msgs =
@@ -410,7 +330,7 @@ t_smoke_delete_next(_Config) ->
     ),
     ok.
 
-t_drop_generation_with_never_used_iterator(_Config) ->
+t_drop_generation_with_never_used_iterator(Config) ->
     %% This test checks how the iterator behaves when:
     %%   1) it's created at generation 1 and not consumed from.
     %%   2) generation 2 is created and 1 dropped.
@@ -418,7 +338,7 @@ t_drop_generation_with_never_used_iterator(_Config) ->
     %% In this case, the iterator won't see any messages and the stream will end.
 
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
 
     TopicFilter = emqx_topic:words(<<"foo/+">>),
@@ -458,7 +378,7 @@ t_drop_generation_with_never_used_iterator(_Config) ->
 
     ok.
 
-t_drop_generation_with_used_once_iterator(_Config) ->
+t_drop_generation_with_used_once_iterator(Config) ->
     %% This test checks how the iterator behaves when:
     %%   1) it's created at generation 1 and consumes at least 1 message.
     %%   2) generation 2 is created and 1 dropped.
@@ -466,7 +386,7 @@ t_drop_generation_with_used_once_iterator(_Config) ->
     %% In this case, the iterator should see no more messages and the stream will end.
 
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
 
     TopicFilter = emqx_topic:words(<<"foo/+">>),
@@ -499,12 +419,12 @@ t_drop_generation_with_used_once_iterator(_Config) ->
         emqx_ds_test_helpers:consume_iter(DB, Iter1)
     ).
 
-t_drop_generation_update_iterator(_Config) ->
+t_drop_generation_update_iterator(Config) ->
     %% This checks the behavior of `emqx_ds:update_iterator' after the generation
     %% underlying the iterator has been dropped.
 
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
 
     TopicFilter = emqx_topic:words(<<"foo/+">>),
@@ -528,12 +448,12 @@ t_drop_generation_update_iterator(_Config) ->
         emqx_ds:update_iterator(DB, Iter1, Key2)
     ).
 
-t_make_iterator_stale_stream(_Config) ->
+t_make_iterator_stale_stream(Config) ->
     %% This checks the behavior of `emqx_ds:make_iterator' after the generation underlying
     %% the stream has been dropped.
 
     DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
     [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
 
     TopicFilter = emqx_topic:words(<<"foo/+">>),
@@ -556,7 +476,7 @@ t_make_iterator_stale_stream(_Config) ->
 
     ok.
 
-t_get_streams_concurrently_with_drop_generation(_Config) ->
+t_get_streams_concurrently_with_drop_generation(Config) ->
     %% This checks that we can get all streams while a generation is dropped
     %% mid-iteration.
 
@@ -564,7 +484,7 @@ t_get_streams_concurrently_with_drop_generation(_Config) ->
     ?check_trace(
         #{timetrap => 5_000},
         begin
-            ?assertMatch(ok, emqx_ds:open_db(DB, opts())),
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
 
             [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
             ok = emqx_ds:add_generation(DB),
@@ -593,171 +513,6 @@ t_get_streams_concurrently_with_drop_generation(_Config) ->
         []
     ).
 
-t_error_mapping_replication_layer(_Config) ->
-    %% This checks that the replication layer maps recoverable errors correctly.
-
-    ok = emqx_ds_test_helpers:mock_rpc(),
-    ok = snabbkaffe:start_trace(),
-
-    DB = ?FUNCTION_NAME,
-    ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
-    [Shard1, Shard2] = emqx_ds_replication_layer_meta:shards(DB),
-
-    TopicFilter = emqx_topic:words(<<"foo/#">>),
-    Msgs = [
-        message(<<"C1">>, <<"foo/bar">>, <<"1">>, 0),
-        message(<<"C1">>, <<"foo/baz">>, <<"2">>, 1),
-        message(<<"C2">>, <<"foo/foo">>, <<"3">>, 2),
-        message(<<"C3">>, <<"foo/xyz">>, <<"4">>, 3),
-        message(<<"C4">>, <<"foo/bar">>, <<"5">>, 4),
-        message(<<"C5">>, <<"foo/oof">>, <<"6">>, 5)
-    ],
-
-    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
-
-    ?block_until(#{?snk_kind := emqx_ds_replication_layer_egress_flush, shard := Shard1}),
-    ?block_until(#{?snk_kind := emqx_ds_replication_layer_egress_flush, shard := Shard2}),
-
-    Streams0 = emqx_ds:get_streams(DB, TopicFilter, 0),
-    Iterators0 = lists:map(
-        fun({_Rank, S}) ->
-            {ok, Iter} = emqx_ds:make_iterator(DB, S, TopicFilter, 0),
-            Iter
-        end,
-        Streams0
-    ),
-
-    %% Disrupt the link to the second shard.
-    ok = emqx_ds_test_helpers:mock_rpc_result(
-        fun(_Node, emqx_ds_replication_layer, _Function, Args) ->
-            case Args of
-                [DB, Shard1 | _] -> passthrough;
-                [DB, Shard2 | _] -> unavailable
-            end
-        end
-    ),
-
-    %% Result of `emqx_ds:get_streams/3` will just contain partial results, not an error.
-    Streams1 = emqx_ds:get_streams(DB, TopicFilter, 0),
-    ?assert(
-        length(Streams1) > 0 andalso length(Streams1) =< length(Streams0),
-        Streams1
-    ),
-
-    %% At least one of `emqx_ds:make_iterator/4` will end in an error.
-    Results1 = lists:map(
-        fun({_Rank, S}) ->
-            case emqx_ds:make_iterator(DB, S, TopicFilter, 0) of
-                Ok = {ok, _Iter} ->
-                    Ok;
-                Error = {error, recoverable, {erpc, _}} ->
-                    Error;
-                Other ->
-                    ct:fail({unexpected_result, Other})
-            end
-        end,
-        Streams0
-    ),
-    ?assert(
-        length([error || {error, _, _} <- Results1]) > 0,
-        Results1
-    ),
-
-    %% At least one of `emqx_ds:next/3` over initial set of iterators will end in an error.
-    Results2 = lists:map(
-        fun(Iter) ->
-            case emqx_ds:next(DB, Iter, _BatchSize = 42) of
-                Ok = {ok, _Iter, [_ | _]} ->
-                    Ok;
-                Error = {error, recoverable, {badrpc, _}} ->
-                    Error;
-                Other ->
-                    ct:fail({unexpected_result, Other})
-            end
-        end,
-        Iterators0
-    ),
-    ?assert(
-        length([error || {error, _, _} <- Results2]) > 0,
-        Results2
-    ),
-    meck:unload().
-
-%% This testcase verifies the behavior of `store_batch' operation
-%% when the underlying code experiences recoverable or unrecoverable
-%% problems.
-t_store_batch_fail(_Config) ->
-    ?check_trace(
-        #{timetrap => 15_000},
-        try
-            meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
-            DB = ?FUNCTION_NAME,
-            ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
-            %% Success:
-            Batch1 = [
-                message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1),
-                message(<<"C1">>, <<"foo/bar">>, <<"2">>, 1)
-            ],
-            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
-            %% Inject unrecoverable error:
-            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
-                {error, unrecoverable, mock}
-            end),
-            Batch2 = [
-                message(<<"C1">>, <<"foo/bar">>, <<"3">>, 1),
-                message(<<"C1">>, <<"foo/bar">>, <<"4">>, 1)
-            ],
-            ?assertMatch(
-                {error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
-            ),
-            meck:unload(emqx_ds_storage_layer),
-            %% Inject a recoveralbe error:
-            meck:new(ra, [passthrough, no_history]),
-            meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
-                ?tp(ra_command, #{servers => Servers, shard => Shard, command => Command}),
-                {timeout, mock}
-            end),
-            Batch3 = [
-                message(<<"C1">>, <<"foo/bar">>, <<"5">>, 2),
-                message(<<"C2">>, <<"foo/bar">>, <<"6">>, 2),
-                message(<<"C1">>, <<"foo/bar">>, <<"7">>, 3),
-                message(<<"C2">>, <<"foo/bar">>, <<"8">>, 3)
-            ],
-            %% Note: due to idempotency issues the number of retries
-            %% is currently set to 0:
-            ?assertMatch(
-                {error, recoverable, {timeout, mock}},
-                emqx_ds:store_batch(DB, Batch3, #{sync => true})
-            ),
-            meck:unload(ra),
-            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
-            lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1))
-        after
-            meck:unload()
-        end,
-        [
-            {"message ordering", fun(StoredMessages, _Trace) ->
-                [{_, Stream1}, {_, Stream2}] = StoredMessages,
-                ?assertMatch(
-                    [
-                        #message{payload = <<"1">>},
-                        #message{payload = <<"2">>},
-                        #message{payload = <<"5">>},
-                        #message{payload = <<"7">>}
-                    ],
-                    Stream1
-                ),
-                ?assertMatch(
-                    [
-                        #message{payload = <<"6">>},
-                        #message{payload = <<"8">>}
-                    ],
-                    Stream2
-                )
-            end}
-        ]
-    ).
-
 update_data_set() ->
     [
         [
@@ -802,27 +557,59 @@ delete(DB, It0, Selector, BatchSize, Acc) ->
 
 %% CT callbacks
 
-all() -> emqx_common_test_helpers:all(?MODULE).
+%% The suite consists of two groups, one per builtin backend; `groups/0'
+%% lists the testcases of each group.
+all() ->
+    [{group, builtin_local}, {group, builtin_raft}].
+
+%% Every testcase returned by `emqx_common_test_helpers:all/1' for this
+%% module is placed in both backend groups.
+groups() ->
+    TCs = emqx_common_test_helpers:all(?MODULE),
+    [{Group, TCs} || Group <- [builtin_local, builtin_raft]].
+
+%% Stores the backend-specific DB options under the `ds_conf' key of the
+%% CT config. For `builtin_raft' the result of
+%% `emqx_ds_test_helpers:skip_if_norepl/0' is returned as-is whenever it is
+%% anything other than `false'.
+init_per_group(builtin_local, Config) ->
+    DSConf = #{
+        backend => builtin_local,
+        n_shards => ?N_SHARDS,
+        storage => {emqx_ds_storage_reference, #{}}
+    },
+    [{ds_conf, DSConf} | Config];
+init_per_group(builtin_raft, Config) ->
+    case emqx_ds_test_helpers:skip_if_norepl() of
+        false ->
+            DSConf = #{
+                backend => builtin_raft,
+                n_shards => ?N_SHARDS,
+                n_sites => 1,
+                replication_factor => 3,
+                replication_options => #{},
+                storage => {emqx_ds_storage_reference, #{}}
+            },
+            [{ds_conf, DSConf} | Config];
+        Skip ->
+            Skip
+    end.
+
+%% No per-group teardown: the config is passed through unchanged.
+end_per_group(_Group, Config) ->
+    Config.
 
+%% No suite-wide setup: the config is returned unchanged.
 init_per_suite(Config) ->
-    emqx_common_test_helpers:clear_screen(),
+    Config.
+
+%% No suite-wide teardown.
+end_per_suite(_Config) ->
+    ok.
+
+%% Starts `emqx_durable_storage' and `emqx_ds_backends' for every testcase,
+%% using a work directory derived from the testcase name, and stores the
+%% value returned by `emqx_cth_suite:start/2' under the `apps' key so that
+%% `end_per_testcase/2' can stop the applications again.
+init_per_testcase(TC, Config) ->
     Apps = emqx_cth_suite:start(
-        [mria, emqx_durable_storage],
-        #{work_dir => ?config(priv_dir, Config)}
+        [emqx_durable_storage, emqx_ds_backends],
+        #{work_dir => emqx_cth_suite:work_dir(TC, Config)}
     ),
+    ct:pal("Apps: ~p", [Apps]),
     [{apps, Apps} | Config].
 
-end_per_suite(Config) ->
+%% Drops the DB named after the testcase (the testcases in this suite open
+%% their DB as `?FUNCTION_NAME'), stops the applications recorded under the
+%% `apps' config key, deletes the local mnesia schema and stops snabbkaffe.
+end_per_testcase(TC, Config) ->
+    ok = emqx_ds:drop_db(TC),
     ok = emqx_cth_suite:stop(?config(apps, Config)),
-    ok.
-
-init_per_testcase(_TC, Config) ->
-    application:ensure_all_started(emqx_durable_storage),
-    Config.
-
-end_per_testcase(_TC, _Config) ->
-    snabbkaffe:stop(),
-    ok = application:stop(emqx_durable_storage),
-    mria:stop(),
     _ = mnesia:delete_schema([node()]),
+    snabbkaffe:stop(),
     ok.

+ 32 - 0
apps/emqx_ds_builtin_local/README.md

@@ -0,0 +1,32 @@
+# Local Backend for EMQX Durable Storage
+
+# Features
+
+This backend uses a local RocksDB database to store data.
+
+# Limitation
+
+This backend cannot be used in a clustered EMQX setup.
+
+# Documentation links
+
+TBD
+
+# Usage
+
+TBD
+
+# Configurations
+
+TBD
+
+# HTTP APIs
+
+TBD
+
+# Other
+
+TBD
+
+# Contributing
+Please see our [contributing.md](../../CONTRIBUTING.md).

+ 5 - 0
apps/emqx_ds_builtin_local/rebar.config

@@ -0,0 +1,5 @@
+%% -*- mode:erlang -*-
+
+{deps, [
+    {emqx_durable_storage, {path, "../emqx_durable_storage"}}
+]}.

+ 11 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.app.src

@@ -0,0 +1,11 @@
+%% -*- mode: erlang -*-
+{application, emqx_ds_builtin_local, [
+    {description, "A DS backend that stores all data locally and thus doesn't support clustering."},
+    % strict semver, bump manually!
+    {vsn, "0.1.0"},
+    {modules, []},
+    {registered, []},
+    {applications, [kernel, stdlib, gproc, mria, rocksdb, emqx_durable_storage, emqx_utils]},
+    {mod, {emqx_ds_builtin_local_app, []}},
+    {env, []}
+]}.

+ 382 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local.erl

@@ -0,0 +1,382 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_builtin_local).
+
+-behaviour(emqx_ds).
+-behaviour(emqx_ds_buffer).
+
+%% API:
+-export([]).
+
+%% behavior callbacks:
+-export([
+    %% `emqx_ds':
+    open_db/2,
+    close_db/1,
+    add_generation/1,
+    update_db_config/2,
+    list_generations_with_lifetimes/1,
+    drop_generation/2,
+    drop_db/1,
+    store_batch/3,
+    get_streams/3,
+    get_delete_streams/3,
+    make_iterator/4,
+    make_delete_iterator/4,
+    update_iterator/3,
+    next/3,
+    delete_next/4,
+
+    %% `emqx_ds_buffer':
+    init_buffer/3,
+    flush_buffer/4,
+    shard_of_message/4
+]).
+
+-export_type([db_opts/0, shard/0, iterator/0, delete_iterator/0]).
+
+-include_lib("emqx_utils/include/emqx_message.hrl").
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-define(tag, 1).
+-define(shard, 2).
+-define(enc, 3).
+
+-define(IT, 61).
+-define(DELETE_IT, 62).
+
+-type shard() :: binary().
+
+-opaque iterator() ::
+    #{
+        ?tag := ?IT,
+        ?shard := shard(),
+        ?enc := term()
+    }.
+
+-opaque delete_iterator() ::
+    #{
+        ?tag := ?DELETE_IT,
+        ?shard := shard(),
+        ?enc := term()
+    }.
+
+-type db_opts() ::
+    #{
+        backend := builtin_local,
+        storage := emqx_ds_storage_layer:prototype(),
+        n_shards := pos_integer()
+    }.
+
+-type generation_rank() :: {shard(), emqx_ds_storage_layer:gen_id()}.
+
+-define(stream(SHARD, INNER), [2, SHARD | INNER]).
+-define(delete_stream(SHARD, INNER), [3, SHARD | INNER]).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+%% @doc Open database `DB' by calling
+%% `emqx_ds_builtin_local_sup:start_db/2'. An `already_started' error is
+%% reported as success; any other `{error, _}' is returned unchanged.
+-spec open_db(emqx_ds:db(), db_opts()) -> ok | {error, _}.
+open_db(DB, CreateOpts) ->
+    StartResult = emqx_ds_builtin_local_sup:start_db(DB, CreateOpts),
+    case StartResult of
+        {error, {already_started, _Pid}} ->
+            ok;
+        {error, _Reason} = Error ->
+            Error;
+        {ok, _Pid} ->
+            ok
+    end.
+
+%% @doc Close database `DB' by delegating to
+%% `emqx_ds_builtin_local_sup:stop_db/1'.
+-spec close_db(emqx_ds:db()) -> ok.
+close_db(DB) ->
+    emqx_ds_builtin_local_sup:stop_db(DB).
+
+%% @doc Add a new generation to every shard of `DB'.
+%%
+%% Each shard is asked to add a generation starting at the timestamp
+%% returned by `emqx_ds_builtin_local_meta:ensure_monotonic_timestamp/1'.
+%% Returns `ok' when every shard answered `ok'; otherwise returns
+%% `{error, [{Shard, Result}]}' listing each shard whose result was not `ok'.
+-spec add_generation(emqx_ds:db()) -> ok | {error, _}.
+add_generation(DB) ->
+    AddToShard = fun(Shard) ->
+        ShardId = {DB, Shard},
+        Since = emqx_ds_builtin_local_meta:ensure_monotonic_timestamp(ShardId),
+        emqx_ds_storage_layer:add_generation(ShardId, Since)
+    end,
+    Failures = [
+        {Shard, Outcome}
+     || Shard <- emqx_ds_builtin_local_meta:shards(DB),
+        %% Single-element generator: binds the per-shard result for filtering.
+        Outcome <- [AddToShard(Shard)],
+        Outcome =/= ok
+    ],
+    case Failures of
+        [] -> ok;
+        [_ | _] -> {error, Failures}
+    end.
+
+%% @doc Update the configuration of database `DB'.
+%%
+%% The options are first passed to
+%% `emqx_ds_builtin_local_meta:update_db_config/2'; the resulting map is then
+%% pushed to every shard together with the timestamp returned by
+%% `emqx_ds_builtin_local_meta:ensure_monotonic_timestamp/1'.
+%%
+%% Per-shard `{error, _}' / `{error, _, _}' results are no longer dropped:
+%% they are collected and returned as `{error, [{Shard, Error}]}', the same
+%% shape `add_generation/1' uses. Any other per-shard result is treated as
+%% success (NOTE(review): the success value of
+%% `emqx_ds_storage_layer:update_config/3' is not visible here).
+-spec update_db_config(emqx_ds:db(), db_opts()) -> ok | {error, _}.
+update_db_config(DB, CreateOpts) ->
+    Opts = #{} = emqx_ds_builtin_local_meta:update_db_config(DB, CreateOpts),
+    Errors = lists:filtermap(
+        fun(Shard) ->
+            ShardId = {DB, Shard},
+            Since = emqx_ds_builtin_local_meta:ensure_monotonic_timestamp(ShardId),
+            case emqx_ds_storage_layer:update_config(ShardId, Since, Opts) of
+                {error, _} = Error ->
+                    {true, {Shard, Error}};
+                {error, _, _} = Error ->
+                    {true, {Shard, Error}};
+                _Success ->
+                    false
+            end
+        end,
+        emqx_ds_builtin_local_meta:shards(DB)
+    ),
+    case Errors of
+        [] -> ok;
+        _ -> {error, Errors}
+    end.
+
+%% @doc List the generations of all shards of `DB'.
+%%
+%% The result is keyed by `{Shard, GenId}'. The `since' and `until' fields
+%% reported by the storage layer are converted with `timeus_to_timestamp/1'
+%% (microseconds to milliseconds; `undefined' is kept as is).
+-spec list_generations_with_lifetimes(emqx_ds:db()) ->
+    #{emqx_ds:generation_rank() => emqx_ds:generation_info()}.
+list_generations_with_lifetimes(DB) ->
+    ConvertTimes = fun(Info0) ->
+        Info1 = maps:update_with(since, fun timeus_to_timestamp/1, Info0),
+        maps:update_with(until, fun timeus_to_timestamp/1, Info1)
+    end,
+    maps:from_list([
+        {{Shard, GenId}, ConvertTimes(Info)}
+     || Shard <- emqx_ds_builtin_local_meta:shards(DB),
+        {GenId, Info} <- maps:to_list(
+            emqx_ds_storage_layer:list_generations_with_lifetimes({DB, Shard})
+        )
+    ]).
+
+%% @doc Drop generation `GenId' of shard `Shard' in `DB' by delegating to
+%% `emqx_ds_storage_layer:drop_generation/2'.
+-spec drop_generation(emqx_ds:db(), generation_rank()) -> ok | {error, _}.
+drop_generation(DB, {Shard, GenId}) ->
+    emqx_ds_storage_layer:drop_generation({DB, Shard}, GenId).
+
+%% @doc Drop database `DB': close it, drop every shard listed in the
+%% metadata, then drop the metadata itself. Results of the individual shard
+%% drops are ignored; the return value is that of
+%% `emqx_ds_builtin_local_meta:drop_db/1'.
+-spec drop_db(emqx_ds:db()) -> ok | {error, _}.
+drop_db(DB) ->
+    close_db(DB),
+    Shards = emqx_ds_builtin_local_meta:shards(DB),
+    _ = [emqx_ds_storage_layer:drop_shard({DB, Shard}) || Shard <- Shards],
+    emqx_ds_builtin_local_meta:drop_db(DB).
+
+%% @doc Store a batch of messages through `emqx_ds_buffer:store_batch/3'.
+%%
+%% A `{timeout, _}' or `{noproc, _}' failure raised by the buffer is turned
+%% into `{error, recoverable, Reason}'.
+%%
+%% Both the `error' and the `exit' exception classes are handled: failed
+%% `gen_server:call's are raised as exits, so matching on `error' alone
+%% would miss them. (NOTE(review): that the buffer is reached through
+%% `gen_server:call' is an assumption; `emqx_ds_buffer' is not visible
+%% from this module.)
+-spec store_batch(emqx_ds:db(), [emqx_types:message()], emqx_ds:message_store_opts()) ->
+    emqx_ds:store_batch_result().
+store_batch(DB, Messages, Opts) ->
+    try
+        emqx_ds_buffer:store_batch(DB, Messages, Opts)
+    catch
+        Class:{Reason, _Call} when
+            (Class =:= error orelse Class =:= exit),
+            (Reason =:= timeout orelse Reason =:= noproc)
+        ->
+            {error, recoverable, Reason}
+    end.
+
+-record(bs, {options :: term()}).
+-type buffer_state() :: #bs{}.
+
+%% @doc `emqx_ds_buffer' callback: initialise the buffer state of a shard.
+%%
+%% When no current timestamp is recorded for the shard yet, it is seeded
+%% with the system time in microseconds. The options are kept in the
+%% returned state.
+-spec init_buffer(emqx_ds:db(), shard(), _Options) -> {ok, buffer_state()}.
+init_buffer(DB, Shard, Options) ->
+    ShardId = {DB, Shard},
+    _ =
+        current_timestamp(ShardId) =:= undefined andalso
+            emqx_ds_builtin_local_meta:set_current_timestamp(
+                ShardId, erlang:system_time(microsecond)
+            ),
+    {ok, #bs{options = Options}}.
+
+%% @doc `emqx_ds_buffer' callback: write buffered messages to the shard.
+%%
+%% The messages are stamped by `assign_timestamps/2' starting from the
+%% shard's current timestamp, stored through the storage layer, and the
+%% newest assigned timestamp is recorded whatever the store result was.
+%% The buffer state is returned unchanged.
+-spec flush_buffer(emqx_ds:db(), shard(), [emqx_types:message()], buffer_state()) ->
+    {buffer_state(), emqx_ds:store_batch_result()}.
+flush_buffer(DB, Shard, Messages, State = #bs{options = Options}) ->
+    ShardId = {DB, Shard},
+    {NewLatest, Stamped} = assign_timestamps(current_timestamp(ShardId), Messages),
+    StoreResult = emqx_ds_storage_layer:store_batch(ShardId, Stamped, Options),
+    emqx_ds_builtin_local_meta:set_current_timestamp(ShardId, NewLatest),
+    {State, StoreResult}.
+
+%% Pairs every message with a strictly increasing microsecond timestamp.
+%%
+%% A message keeps its own timestamp when that is greater than the latest
+%% timestamp assigned so far; otherwise it receives the latest timestamp
+%% plus one. Returns `{NewLatest, [{TimestampUs, Message}]}' with the
+%% messages in their original order.
+assign_timestamps(Latest, Messages) ->
+    {Stamped, NewLatest} = lists:mapfoldl(
+        fun(Message, Prev) ->
+            TimestampUs =
+                case emqx_message:timestamp(Message, microsecond) of
+                    Own when Own > Prev -> Own;
+                    _NotLater -> Prev + 1
+                end,
+            {{TimestampUs, Message}, TimestampUs}
+        end,
+        Latest,
+        Messages
+    ),
+    {NewLatest, Stamped}.
+
+%% @doc `emqx_ds_buffer' callback: pick the shard for a message.
+%%
+%% Depending on `SerializeBy', either the sender (`clientid') or the topic
+%% (`topic') is hashed with `erlang:phash2/2' into the range given by the
+%% number of shards of `DB'; the result is rendered as a decimal binary.
+-spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic, _Options) -> shard().
+shard_of_message(DB, #message{from = From, topic = Topic}, SerializeBy, _Options) ->
+    NShards = emqx_ds_builtin_local_meta:n_shards(DB),
+    HashKey =
+        case SerializeBy of
+            clientid -> From;
+            topic -> Topic
+        end,
+    integer_to_binary(erlang:phash2(HashKey, NShards)).
+
+%% @doc Collect the streams matching `TopicFilter' from every shard of `DB'.
+%%
+%% `StartTime' is converted with `timestamp_to_timeus/1' before it is handed
+%% to the storage layer. Each storage-level stream is wrapped with its shard,
+%% and its rank becomes `{Shard, RankY}'.
+-spec get_streams(emqx_ds:db(), emqx_ds:topic_filter(), emqx_ds:time()) ->
+    [{emqx_ds:stream_rank(), emqx_ds:ds_specific_stream()}].
+get_streams(DB, TopicFilter, StartTime) ->
+    Wrap = fun(Shard, {RankY, InnerStream}) ->
+        {{Shard, RankY}, ?stream(Shard, InnerStream)}
+    end,
+    [
+        Wrap(Shard, RankedStream)
+     || Shard <- emqx_ds_builtin_local_meta:shards(DB),
+        RankedStream <- emqx_ds_storage_layer:get_streams(
+            {DB, Shard}, TopicFilter, timestamp_to_timeus(StartTime)
+        )
+    ].
+
+%% @doc Create an iterator over a stream returned by `get_streams/3'.
+%%
+%% The storage-level iterator is wrapped in a map that also carries the
+%% iterator tag and the shard; `{error, _, _}' results are passed through.
+-spec make_iterator(
+    emqx_ds:db(), emqx_ds:ds_specific_stream(), emqx_ds:topic_filter(), emqx_ds:time()
+) ->
+    emqx_ds:make_iterator_result(emqx_ds:ds_specific_iterator()).
+make_iterator(DB, ?stream(Shard, InnerStream), TopicFilter, StartTime) ->
+    StartTimeUs = timestamp_to_timeus(StartTime),
+    case emqx_ds_storage_layer:make_iterator({DB, Shard}, InnerStream, TopicFilter, StartTimeUs) of
+        {error, _, _} = Error ->
+            Error;
+        {ok, StorageIter} ->
+            {ok, #{?tag => ?IT, ?shard => Shard, ?enc => StorageIter}}
+    end.
+
+%% @doc Update an iterator for message key `Key'.
+%%
+%% The wrapped storage iterator is updated through
+%% `emqx_ds_storage_layer:update_iterator/3' and put back into the wrapper;
+%% `{error, _, _}' results are passed through.
+-spec update_iterator(emqx_ds:db(), emqx_ds:ds_specific_iterator(), emqx_ds:message_key()) ->
+    emqx_ds:make_iterator_result(iterator()).
+update_iterator(DB, Wrapper = #{?tag := ?IT, ?shard := Shard, ?enc := Inner0}, Key) ->
+    Updated = emqx_ds_storage_layer:update_iterator({DB, Shard}, Inner0, Key),
+    case Updated of
+        {error, _, _} ->
+            Updated;
+        {ok, Inner} ->
+            {ok, Wrapper#{?enc => Inner}}
+    end.
+
+%% @doc Fetch up to `N' messages from the iterator.
+%%
+%% The shard's current timestamp is passed to the storage layer. The time
+%% spent in the storage call is reported to `emqx_ds_builtin_metrics'. On
+%% `{ok, StorageIter, Batch}' the wrapper is updated with the new storage
+%% iterator; every other result is returned untouched.
+-spec next(emqx_ds:db(), iterator(), pos_integer()) -> emqx_ds:next_result(iterator()).
+next(DB, Wrapper = #{?tag := ?IT, ?shard := Shard, ?enc := Inner0}, N) ->
+    ShardId = {DB, Shard},
+    Started = erlang:monotonic_time(microsecond),
+    Result = emqx_ds_storage_layer:next(ShardId, Inner0, N, current_timestamp(ShardId)),
+    Elapsed = erlang:monotonic_time(microsecond) - Started,
+    emqx_ds_builtin_metrics:observe_next_time(DB, Elapsed),
+    case Result of
+        {ok, Inner, Batch} ->
+            {ok, Wrapper#{?enc := Inner}, Batch};
+        _ ->
+            Result
+    end.
+
+%% @doc Collect the delete streams matching `TopicFilter' from every shard
+%% of `DB', wrapping each storage-level stream with its shard.
+-spec get_delete_streams(emqx_ds:db(), emqx_ds:topic_filter(), emqx_ds:time()) ->
+    [emqx_ds:ds_specific_delete_stream()].
+get_delete_streams(DB, TopicFilter, StartTime) ->
+    [
+        ?delete_stream(Shard, InnerStream)
+     || Shard <- emqx_ds_builtin_local_meta:shards(DB),
+        InnerStream <- emqx_ds_storage_layer:get_delete_streams(
+            {DB, Shard}, TopicFilter, timestamp_to_timeus(StartTime)
+        )
+    ].
+
+%% @doc Create a delete iterator over a stream returned by
+%% `get_delete_streams/3'.
+%%
+%% The storage-level iterator is wrapped in a map carrying the
+%% delete-iterator tag and the shard; `{error, _, _}' results are passed
+%% through.
+-spec make_delete_iterator(
+    emqx_ds:db(), emqx_ds:ds_specific_delete_stream(), emqx_ds:topic_filter(), emqx_ds:time()
+) ->
+    emqx_ds:make_delete_iterator_result(delete_iterator()).
+make_delete_iterator(DB, ?delete_stream(Shard, InnerStream), TopicFilter, StartTime) ->
+    StartTimeUs = timestamp_to_timeus(StartTime),
+    Made = emqx_ds_storage_layer:make_delete_iterator(
+        {DB, Shard}, InnerStream, TopicFilter, StartTimeUs
+    ),
+    case Made of
+        {error, _, _} ->
+            Made;
+        {ok, StorageIter} ->
+            {ok, #{?tag => ?DELETE_IT, ?shard => Shard, ?enc => StorageIter}}
+    end.
+
+%% @doc Delete up to `N' messages accepted by `Selector' via the delete
+%% iterator.
+%%
+%% On `{ok, StorageIter, Ndeleted}' the wrapper is updated with the new
+%% storage iterator. Every other result, including `{ok, end_of_stream}',
+%% is returned as received from the storage layer.
+-spec delete_next(emqx_ds:db(), delete_iterator(), emqx_ds:delete_selector(), pos_integer()) ->
+    emqx_ds:delete_next_result(emqx_ds:delete_iterator()).
+delete_next(DB, Iter = #{?tag := ?DELETE_IT, ?shard := Shard, ?enc := Inner0}, Selector, N) ->
+    ShardId = {DB, Shard},
+    Now = current_timestamp(ShardId),
+    Result = emqx_ds_storage_layer:delete_next(ShardId, Inner0, Selector, N, Now),
+    case Result of
+        {ok, Inner, Ndeleted} ->
+            {ok, Iter#{?enc => Inner}, Ndeleted};
+        _ ->
+            Result
+    end.
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%% Returns the shard's current timestamp as recorded by
+%% `emqx_ds_builtin_local_meta'. `init_buffer/3' treats `undefined' as
+%% "not recorded yet".
+current_timestamp(ShardId) ->
+    emqx_ds_builtin_local_meta:current_timestamp(ShardId).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%% Converts a millisecond timestamp to microseconds.
+timestamp_to_timeus(TimestampMs) ->
+    TimestampMs * 1000.
+
+%% Converts a microsecond timestamp to milliseconds (integer division);
+%% `undefined' is returned unchanged.
+timeus_to_timestamp(TimestampUs) ->
+    case TimestampUs of
+        undefined ->
+            undefined;
+        _ ->
+            TimestampUs div 1000
+    end.

+ 38 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_app.erl

@@ -0,0 +1,38 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_builtin_local_app).
+
+%% API:
+-export([]).
+
+%% behavior callbacks:
+-export([start/2]).
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+%% Application start callback: registers `emqx_ds_builtin_local' as the
+%% `builtin_local' backend of `emqx_ds', then starts the top supervisor via
+%% `emqx_ds_builtin_local_sup:start_top/0' and returns its result.
+start(_StartType, _StartArgs) ->
+    emqx_ds:register_backend(builtin_local, emqx_ds_builtin_local),
+    emqx_ds_builtin_local_sup:start_top().
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%%================================================================================
+%% Internal functions
+%%================================================================================

+ 219 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_db_sup.erl

@@ -0,0 +1,219 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc Supervisor that contains all the processes that belong to a
+%% given builtin DS database.
+-module(emqx_ds_builtin_local_db_sup).
+
+-behaviour(supervisor).
+
+%% API:
+-export([
+    start_db/2,
+    start_shard/1,
+    stop_shard/1,
+    terminate_storage/1,
+    restart_storage/1,
+    ensure_shard/1
+]).
+-export([which_dbs/0, which_shards/1]).
+
+%% Debug:
+-export([
+    get_shard_workers/1
+]).
+
+%% behaviour callbacks:
+-export([init/1]).
+
+%% internal exports:
+-export([start_link_sup/2]).
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-define(via(REC), {via, gproc, {n, l, REC}}).
+
+-define(db_sup, ?MODULE).
+-define(shards_sup, emqx_ds_builtin_local_db_shards_sup).
+-define(shard_sup, emqx_ds_builtin_local_db_shard_sup).
+
+-record(?db_sup, {db}).
+-record(?shards_sup, {db}).
+-record(?shard_sup, {db, shard}).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%% Start the top-level supervisor of database `DB'. `Opts' are handed
+%% over to the supervisor's `init/1' callback.
+-spec start_db(emqx_ds:db(), emqx_ds_builtin_local:db_opts()) -> {ok, pid()}.
+start_db(DB, Opts) ->
+    SupId = #?db_sup{db = DB},
+    start_link_sup(SupId, Opts).
+
+%% Attach a supervisor for the given shard to the DB's shards
+%% supervisor.
+-spec start_shard(emqx_ds_storage_layer:shard_id()) ->
+    supervisor:startchild_ret().
+start_shard({DB, Shard}) ->
+    ShardsSup = ?via(#?shards_sup{db = DB}),
+    ChildSpec = shard_spec(DB, Shard),
+    supervisor:start_child(ShardsSup, ChildSpec).
+
+%% @doc Terminate the supervisor of the given shard and remove its
+%% child spec from the DB's shards supervisor.
+%%
+%% Errors returned by `supervisor:terminate_child/2' or
+%% `supervisor:delete_child/2' are passed to the caller unchanged, so
+%% the error reason is not limited to `not_found'.
+-spec stop_shard(emqx_ds_storage_layer:shard_id()) -> ok | {error, _Reason}.
+stop_shard({DB, Shard}) ->
+    Sup = ?via(#?shards_sup{db = DB}),
+    case supervisor:terminate_child(Sup, Shard) of
+        ok ->
+            %% The child is stopped; drop its spec as well:
+            supervisor:delete_child(Sup, Shard);
+        {error, _Reason} = Error ->
+            Error
+    end.
+
+%% Terminate the `{Shard, storage}' child of the shard's supervisor.
+-spec terminate_storage(emqx_ds_storage_layer:shard_id()) -> ok | {error, _Reason}.
+terminate_storage({DB, Shard}) ->
+    supervisor:terminate_child(
+        ?via(#?shard_sup{db = DB, shard = Shard}),
+        {Shard, storage}
+    ).
+
+%% Restart the `{Shard, storage}' child of the shard's supervisor.
+-spec restart_storage(emqx_ds_storage_layer:shard_id()) -> {ok, _Child} | {error, _Reason}.
+restart_storage({DB, Shard}) ->
+    supervisor:restart_child(
+        ?via(#?shard_sup{db = DB, shard = Shard}),
+        {Shard, storage}
+    ).
+
+%% Start the shard's supervisor; a shard that is already running
+%% counts as success.
+-spec ensure_shard(emqx_ds_storage_layer:shard_id()) ->
+    ok | {error, _Reason}.
+ensure_shard(ShardId) ->
+    StartResult = start_shard(ShardId),
+    ensure_started(StartResult).
+
+%% List the children of the DB's shards supervisor, in the format
+%% returned by `supervisor:which_children/1'.
+-spec which_shards(emqx_ds:db()) ->
+    [_Child].
+which_shards(DB) ->
+    ShardsSup = ?via(#?shards_sup{db = DB}),
+    supervisor:which_children(ShardsSup).
+
+%% @doc List the builtin DS databases that currently have a top-level
+%% DB supervisor registered in gproc on this node.
+-spec which_dbs() -> [emqx_ds:db()].
+which_dbs() ->
+    %% Match every locally registered name that is a `#?db_sup{}'
+    %% record, and return just the `db' field:
+    NamePattern = {n, l, #?db_sup{_ = '_', db = '$1'}},
+    MatchSpec = [{{NamePattern, '_', '_'}, [], ['$1']}],
+    gproc:select({local, names}, MatchSpec).
+
+%% @doc Collect the pids of the worker processes running under every
+%% shard supervisor of the given DB, keyed by their child id.
+%% Children that are not currently running (no pid) are skipped.
+-spec get_shard_workers(emqx_ds:db()) -> #{_Shard => pid()}.
+get_shard_workers(DB) ->
+    ShardSups = supervisor:which_children(?via(#?shards_sup{db = DB})),
+    Workers = [
+        {Id, Pid}
+     || {_Shard, Sup, _, _} <- ShardSups,
+        is_pid(Sup),
+        {Id, Pid, _, _} <- supervisor:which_children(Sup),
+        is_pid(Pid)
+    ],
+    maps:from_list(Workers).
+
+%%================================================================================
+%% behaviour callbacks
+%%================================================================================
+
+%% Supervisor callback. This module implements three levels of
+%% supervisors; the record in the first element of the argument
+%% selects which of them is being initialized.
+init({#?db_sup{db = DB}, DefaultOpts}) ->
+    %% Spec for the top-level supervisor for the database:
+    logger:notice("Starting DS DB ~p", [DB]),
+    emqx_ds_builtin_metrics:init_for_db(DB),
+    %% Read the stored DB options, or store `DefaultOpts' if the DB
+    %% has no metadata yet:
+    Opts = emqx_ds_builtin_local_meta:open_db(DB, DefaultOpts),
+    Children = [
+        sup_spec(#?shards_sup{db = DB}, Opts)
+    ],
+    SupFlags = #{
+        strategy => one_for_all,
+        %% Zero restart intensity: see the `supervisor' documentation
+        %% for the exact semantics.
+        intensity => 0,
+        period => 1
+    },
+    {ok, {SupFlags, Children}};
+init({#?shards_sup{db = DB}, _Opts}) ->
+    %% Spec for the supervisor that manages the supervisors for
+    %% each local shard of the DB:
+    SupFlags = #{
+        strategy => one_for_one,
+        intensity => 10,
+        period => 1
+    },
+    %% One child per shard listed in the DB metadata:
+    Children = [shard_spec(DB, Shard) || Shard <- emqx_ds_builtin_local_meta:shards(DB)],
+    {ok, {SupFlags, Children}};
+init({#?shard_sup{db = DB, shard = Shard}, _}) ->
+    %% Spec for the supervisor of a single shard. It owns the storage
+    %% and the buffer workers, started in this order:
+    SupFlags = #{
+        strategy => rest_for_one,
+        intensity => 10,
+        period => 100
+    },
+    %% The options passed via the child spec are ignored here; the
+    %% configuration is read from the metadata table instead:
+    Opts = emqx_ds_builtin_local_meta:db_config(DB),
+    Children = [
+        shard_storage_spec(DB, Shard, Opts),
+        shard_buffer_spec(DB, Shard, Opts)
+    ],
+    {ok, {SupFlags, Children}}.
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%% Start a supervisor implemented by this module and register it in
+%% gproc under `Id'. `Id' is also passed to `init/1', where it selects
+%% the supervisor level.
+start_link_sup(Id, Options) ->
+    Name = ?via(Id),
+    supervisor:start_link(Name, ?MODULE, {Id, Options}).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%% Child spec for a nested supervisor implemented by this module. The
+%% child id is the first element of `Id' (the record tag when `Id' is
+%% a record).
+sup_spec(Id, Options) ->
+    ChildId = element(1, Id),
+    #{
+        id => ChildId,
+        type => supervisor,
+        shutdown => infinity,
+        start => {?MODULE, start_link_sup, [Id, Options]}
+    }.
+
+%% Child spec for the supervisor of a single shard. The shard name
+%% doubles as the child id.
+shard_spec(DB, Shard) ->
+    SupId = #?shard_sup{db = DB, shard = Shard},
+    #{
+        id => Shard,
+        type => supervisor,
+        restart => permanent,
+        shutdown => infinity,
+        start => {?MODULE, start_link_sup, [SupId, []]}
+    }.
+
+%% Child spec for the storage layer worker of a shard.
+shard_storage_spec(DB, Shard, Opts) ->
+    ShardId = {DB, Shard},
+    #{
+        id => {Shard, storage},
+        type => worker,
+        restart => permanent,
+        shutdown => 5_000,
+        start => {emqx_ds_storage_layer, start_link, [ShardId, Opts]}
+    }.
+
+%% Child spec for the buffer worker of a shard; the buffer is started
+%% with `emqx_ds_builtin_local' as its first argument.
+shard_buffer_spec(DB, Shard, Options) ->
+    StartArgs = [emqx_ds_builtin_local, Options, DB, Shard],
+    #{
+        id => {Shard, buffer},
+        type => worker,
+        restart => permanent,
+        shutdown => 5_000,
+        start => {emqx_ds_buffer, start_link, StartArgs}
+    }.
+
+%% Normalize the result of a start call: both a fresh start and
+%% `already_started' are reported as `ok'; any other error is
+%% returned unchanged.
+ensure_started(Res) ->
+    case Res of
+        {error, {already_started, _}} ->
+            ok;
+        {ok, _} ->
+            ok;
+        {error, _} = Error ->
+            Error
+    end.

+ 204 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_meta.erl

@@ -0,0 +1,204 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_builtin_local_meta).
+
+-behaviour(gen_server).
+
+%% API:
+-export([
+    start_link/0,
+    open_db/2,
+    drop_db/1,
+    n_shards/1,
+    shards/1,
+    db_config/1,
+    update_db_config/2,
+
+    current_timestamp/1,
+    set_current_timestamp/2,
+    ensure_monotonic_timestamp/1
+]).
+
+%% behavior callbacks:
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).
+
+%% internal exports:
+-export([]).
+
+-export_type([]).
+
+-include_lib("stdlib/include/ms_transform.hrl").
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-define(META_TAB, emqx_ds_builtin_local_metadata_tab).
+-record(?META_TAB, {
+    db :: emqx_ds:db(),
+    db_props :: emqx_ds_builtin_local:db_opts()
+}).
+
+%% We save timestamp of the last written message to a mnesia table.
+%% The saved value is restored when the node restarts. This is needed
+%% to create a timestamp that is truly monotonic even in presence of
+%% node restarts.
+-define(TS_TAB, emqx_ds_builtin_local_timestamp_tab).
+-record(?TS_TAB, {
+    id :: emqx_ds_storage_layer:shard_id(),
+    latest :: integer()
+}).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+-define(SERVER, ?MODULE).
+
+%% Start the metadata server and register it locally as `?SERVER'.
+-spec start_link() -> {ok, pid()}.
+start_link() ->
+    ServerName = {local, ?SERVER},
+    gen_server:start_link(ServerName, ?MODULE, [], []).
+
+%% Return the options stored for `DB'. If the DB has no metadata
+%% record yet, `CreateOpts' are stored and returned instead; the
+%% options of an existing DB are not modified.
+-spec open_db(emqx_ds:db(), emqx_ds_builtin_local:db_opts()) ->
+    emqx_ds_builtin_local:db_opts().
+open_db(DB, CreateOpts = #{backend := builtin_local, storage := _, n_shards := _}) ->
+    ReadOrCreate = fun() ->
+        case mnesia:wread({?META_TAB, DB}) of
+            [#?META_TAB{db_props = StoredOpts}] ->
+                StoredOpts;
+            [] ->
+                mnesia:write(#?META_TAB{db = DB, db_props = CreateOpts}),
+                CreateOpts
+        end
+    end,
+    transaction(ReadOrCreate).
+
+%% Delete the metadata of `DB': the saved timestamp of every shard
+%% whose id has `DB' as its first element, and the DB options record.
+-spec drop_db(emqx_ds:db()) -> ok.
+drop_db(DB) ->
+    DropFun = fun() ->
+        MatchSpec = ets:fun2ms(fun(#?TS_TAB{id = ID}) when element(1, ID) =:= DB ->
+            ID
+        end),
+        ShardIds = mnesia:select(?TS_TAB, MatchSpec, write),
+        lists:foreach(
+            fun(ShardId) -> mnesia:delete({?TS_TAB, ShardId}) end,
+            ShardIds
+        ),
+        mnesia:delete({?META_TAB, DB})
+    end,
+    transaction(DropFun).
+
+%% Overwrite the stored options of `DB' with `Opts' and return them.
+-spec update_db_config(emqx_ds:db(), emqx_ds_builtin_local:db_opts()) ->
+    emqx_ds_builtin_local:db_opts().
+update_db_config(DB, Opts) ->
+    Record = #?META_TAB{db = DB, db_props = Opts},
+    StoreFun = fun() ->
+        mnesia:write(Record),
+        Opts
+    end,
+    transaction(StoreFun).
+
+%% Number of shards configured for `DB', taken from its stored
+%% options.
+-spec n_shards(emqx_ds:db()) -> pos_integer().
+n_shards(DB) ->
+    #{n_shards := Count} = db_config(DB),
+    Count.
+
+%% List the shard names of `DB': the integers from 0 to
+%% `n_shards - 1', each rendered as a binary.
+-spec shards(emqx_ds:db()) -> [emqx_ds_builtin_local:shard()].
+shards(DB) ->
+    LastShard = n_shards(DB) - 1,
+    lists:map(fun integer_to_binary/1, lists:seq(0, LastShard)).
+
+%% Dirty-read the stored options of `DB'. Raises
+%% `error({no_such_db, DB})' when the DB has no metadata record.
+-spec db_config(emqx_ds:db()) -> emqx_ds_builtin_local:db_opts().
+db_config(DB) ->
+    case mnesia:dirty_read(?META_TAB, DB) of
+        [] ->
+            error({no_such_db, DB});
+        [#?META_TAB{db_props = Config}] ->
+            Config
+    end.
+
+%% Store `Time' as the latest timestamp of the shard (dirty write).
+-spec set_current_timestamp(emqx_ds_storage_layer:shard_id(), emqx_ds:time()) -> ok.
+set_current_timestamp(ShardId, Time) ->
+    Record = #?TS_TAB{id = ShardId, latest = Time},
+    mria:dirty_write(?TS_TAB, Record).
+
+%% Dirty-read the latest timestamp saved for the shard, or
+%% `undefined' when nothing has been saved yet.
+-spec current_timestamp(emqx_ds_storage_layer:shard_id()) -> emqx_ds:time() | undefined.
+current_timestamp(ShardId) ->
+    case mnesia:dirty_read(?TS_TAB, ShardId) of
+        [] ->
+            undefined;
+        [#?TS_TAB{latest = Timestamp}] ->
+            Timestamp
+    end.
+
+%% Bump the timestamp counter saved for the shard by one and return
+%% the value reported by `mria:dirty_update_counter/2'.
+%% NOTE(review): presumably this yields a value strictly greater than
+%% any previously saved one - confirm against the counter semantics.
+-spec ensure_monotonic_timestamp(emqx_ds_storage_layer:shard_id()) -> emqx_ds:time().
+ensure_monotonic_timestamp(ShardId) ->
+    Counter = {?TS_TAB, ShardId},
+    mria:dirty_update_counter(Counter, 1).
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+-record(s, {}).
+-define(timer_update, timer_update).
+
+%% gen_server callback: trap exits, make sure the metadata tables
+%% exist, and start with an empty state.
+init([]) ->
+    process_flag(trap_exit, true),
+    ensure_tables(),
+    {ok, #s{}}.
+
+%% The server defines no calls; every request is rejected.
+handle_call(_Request, _From, State) ->
+    Reply = {error, unknown_call},
+    {reply, Reply, State}.
+
+%% Casts are ignored.
+handle_cast(_Msg, State) ->
+    {noreply, State}.
+
+%% Any other message is dropped, keeping the state unchanged.
+handle_info(_Msg, State) ->
+    {noreply, State}.
+
+%% Nothing to clean up on shutdown.
+terminate(_Reason, _State) ->
+    ok.
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%%================================================================================
+%% Internal functions
+%%================================================================================
+
+%% Create the metadata and the timestamp tables. Both are declared
+%% as `local_content' tables stored as `disc_copies'.
+ensure_tables() ->
+    MetaTabOpts = [
+        {local_content, true},
+        {type, ordered_set},
+        {storage, disc_copies},
+        {record_name, ?META_TAB},
+        {attributes, record_info(fields, ?META_TAB)}
+    ],
+    TimestampTabOpts = [
+        {local_content, true},
+        {type, set},
+        {storage, disc_copies},
+        {record_name, ?TS_TAB},
+        {attributes, record_info(fields, ?TS_TAB)}
+    ],
+    ok = mria:create_table(?META_TAB, MetaTabOpts),
+    ok = mria:create_table(?TS_TAB, TimestampTabOpts).
+
+%% Run `Fun' as a transaction on the local content shard. Returns the
+%% result of `Fun' on commit and `{error, Reason}' when the
+%% transaction is aborted.
+%% NOTE(review): the specs of the callers in this module do not
+%% mention the `{error, Reason}' outcome - confirm whether an abort
+%% is possible in practice.
+transaction(Fun) ->
+    Shard = mria:local_content_shard(),
+    case mria:transaction(Shard, Fun) of
+        {aborted, Reason} ->
+            {error, Reason};
+        {atomic, Result} ->
+            Result
+    end.

+ 127 - 0
apps/emqx_ds_builtin_local/src/emqx_ds_builtin_local_sup.erl

@@ -0,0 +1,127 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+%% @doc This supervisor manages the global worker processes needed for
+%% the functioning of builtin local databases, and all builtin local
+%% databases that attach to it.
+-module(emqx_ds_builtin_local_sup).
+
+-behaviour(supervisor).
+
+%% API:
+-export([start_db/2, stop_db/1]).
+
+%% behavior callbacks:
+-export([init/1]).
+
+%% internal exports:
+-export([start_top/0, start_databases_sup/0]).
+
+-export_type([]).
+
+%%================================================================================
+%% Type declarations
+%%================================================================================
+
+-define(top, ?MODULE).
+-define(databases, emqx_ds_builtin_local_db_sup).
+
+%%================================================================================
+%% API functions
+%%================================================================================
+
+%% Start the top supervisor of the backend, registered locally as
+%% `?top'.
+-spec start_top() -> {ok, pid()}.
+start_top() ->
+    SupName = {local, ?top},
+    supervisor:start_link(SupName, ?MODULE, ?top).
+
+%% Attach the supervisor of database `DB' to the `?databases'
+%% supervisor. Note that `?databases' is both the registered name of
+%% that supervisor and the module implementing the DB supervisor.
+-spec start_db(emqx_ds:db(), emqx_ds_builtin_local:db_opts()) ->
+    supervisor:startchild_ret().
+start_db(DB, Opts) ->
+    supervisor:start_child(?databases, #{
+        id => DB,
+        type => supervisor,
+        shutdown => infinity,
+        start => {?databases, start_db, [DB, Opts]}
+    }).
+
+%% Stop the supervisor of `DB' and remove its child spec. This is
+%% best-effort: the results of both supervisor calls are ignored, and
+%% nothing is done when the `?databases' supervisor is not running.
+-spec stop_db(emqx_ds:db()) -> ok.
+stop_db(DB) ->
+    case whereis(?databases) of
+        undefined ->
+            ok;
+        Sup when is_pid(Sup) ->
+            _ = supervisor:terminate_child(?databases, DB),
+            _ = supervisor:delete_child(?databases, DB),
+            ok
+    end.
+
+%%================================================================================
+%% behavior callbacks
+%%================================================================================
+
+%% There are two layers of supervision:
+%%
+%% 1. top supervisor for the builtin backend. It contains the global
+%% worker processes (like the metadata server), and `?databases'
+%% supervisor.
+%%
+%% 2. `?databases': a `one_for_one' supervisor where each child is a
+%% `db' supervisor that contains processes that represent the DB.
+%% Children are attached dynamically to this one.
+init(?top) ->
+    %% The top supervisor owns the metadata server and the supervisor
+    %% of databases. With `one_for_all', a failure of either child
+    %% restarts both.
+    Flags = #{
+        strategy => one_for_all,
+        intensity => 1,
+        period => 1,
+        auto_shutdown => never
+    },
+    MetaServerSpec = #{
+        id => metadata_server,
+        type => worker,
+        restart => permanent,
+        shutdown => 5000,
+        start => {emqx_ds_builtin_local_meta, start_link, []}
+    },
+    DatabasesSupSpec = #{
+        id => ?databases,
+        type => supervisor,
+        restart => permanent,
+        shutdown => infinity,
+        start => {?MODULE, start_databases_sup, []}
+    },
+    {ok, {Flags, [MetaServerSpec, DatabasesSupSpec]}};
+init(?databases) ->
+    %% Starts without children; databases are attached at runtime.
+    Flags = #{
+        strategy => one_for_one,
+        intensity => 10,
+        period => 1
+    },
+    {ok, {Flags, []}}.
+
+%%================================================================================
+%% Internal exports
+%%================================================================================
+
+%% Start the supervisor of databases, registered locally as
+%% `?databases'.
+start_databases_sup() ->
+    SupName = {local, ?databases},
+    supervisor:start_link(SupName, ?MODULE, ?databases).
+
+%%================================================================================
+%% Internal functions
+%%================================================================================

+ 346 - 0
apps/emqx_ds_builtin_local/test/emqx_ds_builtin_local_SUITE.erl

@@ -0,0 +1,346 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+-module(emqx_ds_builtin_local_SUITE).
+
+-compile(export_all).
+-compile(nowarn_export_all).
+
+-include_lib("emqx/include/emqx.hrl").
+-include_lib("common_test/include/ct.hrl").
+-include_lib("stdlib/include/assert.hrl").
+-include_lib("emqx/include/asserts.hrl").
+-include_lib("snabbkaffe/include/snabbkaffe.hrl").
+
+-define(N_SHARDS, 1).
+
+%% DB options shared by the testcases: the `builtin_local' backend
+%% with the reference storage implementation and `?N_SHARDS' shards.
+opts(_Config) ->
+    Storage = {emqx_ds_storage_reference, #{}},
+    #{
+        n_shards => ?N_SHARDS,
+        storage => Storage,
+        backend => builtin_local
+    }.
+
+t_drop_generation_with_never_used_iterator(Config) ->
+    %% This test checks how the iterator behaves when:
+    %%   1) it's created at generation 1 and not consumed from.
+    %%   2) generation 2 is created and 1 dropped.
+    %%   3) iteration begins.
+    %% In this case, consuming the stale iterator is expected to fail with
+    %% an unrecoverable `generation_not_found' error.
+
+    DB = ?FUNCTION_NAME,
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
+
+    TopicFilter = emqx_topic:words(<<"foo/+">>),
+    StartTime = 0,
+    Msgs0 = [
+        message(<<"foo/bar">>, <<"1">>, 0),
+        message(<<"foo/baz">>, <<"2">>, 1)
+    ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs0)),
+
+    %% The iterator is created while the first generation still exists:
+    [{_, Stream0}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
+    {ok, Iter0} = emqx_ds:make_iterator(DB, Stream0, TopicFilter, StartTime),
+
+    ok = emqx_ds:add_generation(DB),
+    ok = emqx_ds:drop_generation(DB, GenId0),
+
+    Now = emqx_message:timestamp_now(),
+    Msgs1 = [
+        message(<<"foo/bar">>, <<"3">>, Now + 100),
+        message(<<"foo/baz">>, <<"4">>, Now + 101)
+    ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs1)),
+
+    %% The old iterator refers to the dropped generation:
+    ?assertError(
+        {error, unrecoverable, generation_not_found},
+        emqx_ds_test_helpers:consume_iter(DB, Iter0)
+    ),
+
+    %% New iterator for the new stream will only see the later messages.
+    [{_, Stream1}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
+    ?assertNotEqual(Stream0, Stream1),
+    {ok, Iter1} = emqx_ds:make_iterator(DB, Stream1, TopicFilter, StartTime),
+
+    {ok, Iter, Batch} = emqx_ds_test_helpers:consume_iter(DB, Iter1, #{batch_size => 1}),
+    ?assertNotEqual(end_of_stream, Iter),
+    ?assertEqual(Msgs1, Batch),
+
+    ok.
+
+t_drop_generation_with_used_once_iterator(Config) ->
+    %% This test checks how the iterator behaves when:
+    %%   1) it's created at generation 1 and consumes at least 1 message.
+    %%   2) generation 2 is created and 1 dropped.
+    %%   3) iteration continues.
+    %% In this case, further consumption from the iterator is expected to
+    %% fail with an unrecoverable `generation_not_found' error.
+
+    DB = ?FUNCTION_NAME,
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
+
+    TopicFilter = emqx_topic:words(<<"foo/+">>),
+    StartTime = 0,
+    Msgs0 =
+        [Msg0 | _] = [
+            message(<<"foo/bar">>, <<"1">>, 0),
+            message(<<"foo/baz">>, <<"2">>, 1)
+        ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs0)),
+
+    %% Consume exactly one message before the generation is dropped:
+    [{_, Stream0}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
+    {ok, Iter0} = emqx_ds:make_iterator(DB, Stream0, TopicFilter, StartTime),
+    {ok, Iter1, Batch1} = emqx_ds:next(DB, Iter0, 1),
+    ?assertNotEqual(end_of_stream, Iter1),
+    ?assertEqual([Msg0], [Msg || {_Key, Msg} <- Batch1]),
+
+    ok = emqx_ds:add_generation(DB),
+    ok = emqx_ds:drop_generation(DB, GenId0),
+
+    Now = emqx_message:timestamp_now(),
+    Msgs1 = [
+        message(<<"foo/bar">>, <<"3">>, Now + 100),
+        message(<<"foo/baz">>, <<"4">>, Now + 101)
+    ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs1)),
+
+    %% The partially consumed iterator refers to the dropped generation:
+    ?assertError(
+        {error, unrecoverable, generation_not_found},
+        emqx_ds_test_helpers:consume_iter(DB, Iter1)
+    ).
+
+t_drop_generation_update_iterator(Config) ->
+    %% This checks the behavior of `emqx_ds:update_iterator' after the generation
+    %% underlying the iterator has been dropped.
+
+    DB = ?FUNCTION_NAME,
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
+
+    TopicFilter = emqx_topic:words(<<"foo/+">>),
+    StartTime = 0,
+    Msgs0 = [
+        message(<<"foo/bar">>, <<"1">>, 0),
+        message(<<"foo/baz">>, <<"2">>, 1)
+    ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs0)),
+
+    %% Read both messages one by one to obtain the key of the second one:
+    [{_, Stream0}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
+    {ok, Iter0} = emqx_ds:make_iterator(DB, Stream0, TopicFilter, StartTime),
+    {ok, Iter1, _Batch1} = emqx_ds:next(DB, Iter0, 1),
+    {ok, _Iter2, [{Key2, _Msg}]} = emqx_ds:next(DB, Iter1, 1),
+
+    ok = emqx_ds:add_generation(DB),
+    ok = emqx_ds:drop_generation(DB, GenId0),
+
+    %% Here the error is returned as a value rather than raised:
+    ?assertEqual(
+        {error, unrecoverable, generation_not_found},
+        emqx_ds:update_iterator(DB, Iter1, Key2)
+    ).
+
+t_make_iterator_stale_stream(Config) ->
+    %% This checks the behavior of `emqx_ds:make_iterator' after the generation underlying
+    %% the stream has been dropped.
+
+    DB = ?FUNCTION_NAME,
+    ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+    [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
+
+    TopicFilter = emqx_topic:words(<<"foo/+">>),
+    StartTime = 0,
+    Msgs0 = [
+        message(<<"foo/bar">>, <<"1">>, 0),
+        message(<<"foo/baz">>, <<"2">>, 1)
+    ],
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs0)),
+
+    %% The stream is obtained while its generation still exists:
+    [{_, Stream0}] = emqx_ds:get_streams(DB, TopicFilter, StartTime),
+
+    ok = emqx_ds:add_generation(DB),
+    ok = emqx_ds:drop_generation(DB, GenId0),
+
+    %% Creating an iterator for the stale stream returns an error value:
+    ?assertEqual(
+        {error, unrecoverable, generation_not_found},
+        emqx_ds:make_iterator(DB, Stream0, TopicFilter, StartTime)
+    ),
+
+    ok.
+
+t_get_streams_concurrently_with_drop_generation(Config) ->
+    %% This checks that we can get all streams while a generation is dropped
+    %% mid-iteration.
+
+    DB = ?FUNCTION_NAME,
+    ?check_trace(
+        #{timetrap => 5_000},
+        begin
+            ?assertMatch(ok, emqx_ds:open_db(DB, opts(Config))),
+
+            %% Three generations in total after adding two more:
+            [GenId0] = maps:keys(emqx_ds:list_generations_with_lifetimes(DB)),
+            ok = emqx_ds:add_generation(DB),
+            ok = emqx_ds:add_generation(DB),
+
+            %% All streams
+            TopicFilter = emqx_topic:words(<<"foo/+">>),
+            StartTime = 0,
+            ?assertMatch([_, _, _], emqx_ds:get_streams(DB, TopicFilter, StartTime)),
+
+            %% Make the `get_streams_get_gen' trace point wait until
+            %% `dropped_gen' has been emitted:
+            ?force_ordering(
+                #{?snk_kind := dropped_gen},
+                #{?snk_kind := get_streams_get_gen}
+            ),
+
+            %% Drop the first generation once the `get_streams_all_gens'
+            %% trace point has been observed:
+            spawn_link(fun() ->
+                {ok, _} = ?block_until(#{?snk_kind := get_streams_all_gens}),
+                ok = emqx_ds:drop_generation(DB, GenId0),
+                ?tp(dropped_gen, #{})
+            end),
+
+            %% Only the streams of the two remaining generations are returned:
+            ?assertMatch([_, _], emqx_ds:get_streams(DB, TopicFilter, StartTime)),
+
+            ok
+        end,
+        []
+    ).
+
+%% This testcase verifies the behavior of `store_batch' operation
+%% when the underlying code experiences recoverable or unrecoverable
+%% problems.
+t_store_batch_fail(Config) ->
+    ?check_trace(
+        #{timetrap => 15_000},
+        try
+            %% `passthrough' keeps the real implementation for every
+            %% function that is not explicitly mocked below:
+            meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
+            DB = ?FUNCTION_NAME,
+            ?assertMatch(ok, emqx_ds:open_db(DB, (opts(Config))#{n_shards => 2})),
+            %% Success:
+            Batch1 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"2">>, 1)
+            ],
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
+            %% Inject unrecoverable error:
+            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
+                {error, unrecoverable, mock}
+            end),
+            Batch2 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"3">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"4">>, 1)
+            ],
+            ?assertMatch(
+                {error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
+            ),
+            %% Inject a recoverable error:
+            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
+                {error, recoverable, mock}
+            end),
+            Batch3 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"5">>, 2),
+                message(<<"C2">>, <<"foo/bar">>, <<"6">>, 2),
+                message(<<"C1">>, <<"foo/bar">>, <<"7">>, 3),
+                message(<<"C2">>, <<"foo/bar">>, <<"8">>, 3)
+            ],
+            %% Note: due to idempotency issues the number of retries
+            %% is currently set to 0:
+            ?assertMatch(
+                {error, recoverable, mock},
+                emqx_ds:store_batch(DB, Batch3, #{sync => true})
+            ),
+            %% Remove the mock, so the same batch can be stored for real:
+            meck:unload(emqx_ds_storage_layer),
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
+            %% The sorted per-stream messages are the result that is
+            %% passed to the checks below:
+            lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1))
+        after
+            meck:unload()
+        end,
+        [
+            {"message ordering", fun(StoredMessages, _Trace) ->
+                %% Batch2 (payloads 3 and 4) was rejected and must not
+                %% appear in either stream:
+                [{_, MessagesFromStream1}, {_, MessagesFromStream2}] = StoredMessages,
+                emqx_ds_test_helpers:diff_messages(
+                    [payload],
+                    [
+                        #message{payload = <<"1">>},
+                        #message{payload = <<"2">>},
+                        #message{payload = <<"5">>},
+                        #message{payload = <<"7">>}
+                    ],
+                    MessagesFromStream1
+                ),
+                emqx_ds_test_helpers:diff_messages(
+                    [payload],
+                    [
+                        #message{payload = <<"6">>},
+                        #message{payload = <<"8">>}
+                    ],
+                    MessagesFromStream2
+                )
+            end}
+        ]
+    ).
+
+%% Same as `message/3', with `from' set to `ClientId'.
+message(ClientId, Topic, Payload, PublishedAt) ->
+    (message(Topic, Payload, PublishedAt))#message{from = ClientId}.
+
+%% Build a `#message{}' with a freshly generated id and the given
+%% topic, payload and timestamp.
+message(Topic, Payload, PublishedAt) ->
+    Id = emqx_guid:gen(),
+    #message{
+        id = Id,
+        topic = Topic,
+        payload = Payload,
+        timestamp = PublishedAt
+    }.
+
+%% Delete messages via `delete/5', counting from zero.
+delete(DB, It, Selector, BatchSize) ->
+    InitialCount = 0,
+    delete(DB, It, Selector, BatchSize, InitialCount).
+
+%% Call `emqx_ds:delete_next/4' repeatedly, `BatchSize' messages at a
+%% time, until a call deletes nothing or the end of the stream is
+%% reached. `Acc' accumulates the number of deleted messages.
+%%
+%% Returns `{ok, Iterator | end_of_stream, NumDeleted}'; any other
+%% result of `emqx_ds:delete_next/4' is returned unchanged.
+delete(DB, It0, Selector, BatchSize, Acc) ->
+    case emqx_ds:delete_next(DB, It0, Selector, BatchSize) of
+        {ok, It, 0} ->
+            {ok, It, Acc};
+        {ok, It, NumDeleted} ->
+            %% The argument order must match the function head:
+            %% `Selector' comes before `BatchSize'.
+            delete(DB, It, Selector, BatchSize, Acc + NumDeleted);
+        {ok, end_of_stream} ->
+            {ok, end_of_stream, Acc};
+        Ret ->
+            Ret
+    end.
+
+%% CT callbacks
+
+%% Common Test callback; the list of testcases is produced by
+%% `emqx_common_test_helpers:all/1' for this module.
+all() ->
+    Suite = ?MODULE,
+    emqx_common_test_helpers:all(Suite).
+
+%% Start `mria' and the backend application in the suite's private
+%% directory, and remember the started apps for `end_per_suite/1'.
+init_per_suite(Config) ->
+    emqx_common_test_helpers:clear_screen(),
+    WorkDir = ?config(priv_dir, Config),
+    Apps = emqx_cth_suite:start([mria, emqx_ds_builtin_local], #{work_dir => WorkDir}),
+    [{apps, Apps} | Config].
+
+%% Stop the applications started by `init_per_suite/1'.
+end_per_suite(Config) ->
+    StartedApps = ?config(apps, Config),
+    ok = emqx_cth_suite:stop(StartedApps).
+
+%% Make sure the backend application (and its dependencies) is
+%% running before each testcase; `end_per_testcase/2' stops it again.
+init_per_testcase(_TC, Config) ->
+    %% Fail here if the application cannot be started, instead of
+    %% letting the testcase fail later for a less obvious reason:
+    {ok, _Started} = application:ensure_all_started(emqx_ds_builtin_local),
+    Config.
+
+%% Tear down everything a testcase may have left behind, so that the
+%% next testcase starts with an empty database.
+end_per_testcase(_TC, _Config) ->
+    snabbkaffe:stop(),
+    ok = application:stop(emqx_ds_builtin_local),
+    %% Stop mria before deleting the mnesia schema of this node; the
+    %% results of both calls are deliberately ignored:
+    mria:stop(),
+    _ = mnesia:delete_schema([node()]),
+    ok.

+ 94 - 0
apps/emqx_ds_builtin_raft/BSL.txt

@@ -0,0 +1,94 @@
+Business Source License 1.1
+
+Licensor:             Hangzhou EMQ Technologies Co., Ltd.
+Licensed Work:        EMQX Enterprise Edition
+                      The Licensed Work is (c) 2024
+                      Hangzhou EMQ Technologies Co., Ltd.
+Additional Use Grant: Students and educators are granted right to copy,
+                      modify, and create derivative work for research
+                      or education.
+Change Date:          2028-06-13
+Change License:       Apache License, Version 2.0
+
+For information about alternative licensing arrangements for the Software,
+please contact Licensor: https://www.emqx.com/en/contact
+
+Notice
+
+The Business Source License (this document, or the “License”) is not an Open
+Source license. However, the Licensed Work will eventually be made available
+under an Open Source License, as stated in this License.
+
+License text copyright (c) 2017, 2024 MariaDB Corporation Ab, All Rights Reserved.
+“Business Source License” is a trademark of MariaDB Corporation Ab.
+
+-----------------------------------------------------------------------------
+
+Business Source License 1.1
+
+Terms
+
+The Licensor hereby grants you the right to copy, modify, create derivative
+works, redistribute, and make non-production use of the Licensed Work. The
+Licensor may make an Additional Use Grant, above, permitting limited
+production use.
+
+Effective on the Change Date, or the fourth anniversary of the first publicly
+available distribution of a specific version of the Licensed Work under this
+License, whichever comes first, the Licensor hereby grants you rights under
+the terms of the Change License, and the rights granted in the paragraph
+above terminate.
+
+If your use of the Licensed Work does not comply with the requirements
+currently in effect as described in this License, you must purchase a
+commercial license from the Licensor, its affiliated entities, or authorized
+resellers, or you must refrain from using the Licensed Work.
+
+All copies of the original and modified Licensed Work, and derivative works
+of the Licensed Work, are subject to this License. This License applies
+separately for each version of the Licensed Work and the Change Date may vary
+for each version of the Licensed Work released by Licensor.
+
+You must conspicuously display this License on each original or modified copy
+of the Licensed Work. If you receive the Licensed Work in original or
+modified form from a third party, the terms and conditions set forth in this
+License apply to your use of that work.
+
+Any use of the Licensed Work in violation of this License will automatically
+terminate your rights under this License for the current and all other
+versions of the Licensed Work.
+
+This License does not grant you any right in any trademark or logo of
+Licensor or its affiliates (provided that you may use a trademark or logo of
+Licensor as expressly required by this License).
+
+TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
+AN “AS IS” BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
+EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
+TITLE.
+
+MariaDB hereby grants you permission to use this License’s text to license
+your works, and to refer to it using the trademark “Business Source License”,
+as long as you comply with the Covenants of Licensor below.
+
+Covenants of Licensor
+
+In consideration of the right to use this License’s text and the “Business
+Source License” name and trademark, Licensor covenants to MariaDB, and to all
+other recipients of the licensed work to be provided by Licensor:
+
+1. To specify as the Change License the GPL Version 2.0 or any later version,
+   or a license that is compatible with GPL Version 2.0 or a later version,
+   where “compatible” means that software provided under the Change License can
+   be included in a program with software provided under GPL Version 2.0 or a
+   later version. Licensor may specify additional Change Licenses without
+   limitation.
+
+2. To either: (a) specify an additional grant of rights to use that does not
+   impose any additional restriction on the right granted in this License, as
+   the Additional Use Grant; or (b) insert the text “None”.
+
+3. To specify a Change Date.
+
+4. Not to modify this License in any other way.

+ 3 - 0
apps/emqx_ds_builtin_raft/README.md

@@ -0,0 +1,3 @@
+# `emqx_ds_builtin_raft`
+
+Replication layer for the builtin EMQX durable storage backend that uses the Raft algorithm.

+ 6 - 0
apps/emqx_ds_builtin_raft/rebar.config

@@ -0,0 +1,6 @@
+%% -*- mode:erlang -*-
+
+{deps, [
+    {emqx_durable_storage, {path, "../emqx_durable_storage"}},
+    {ra, "2.7.3"}
+]}.

+ 11 - 0
apps/emqx_ds_builtin_raft/src/emqx_ds_builtin_raft.app.src

@@ -0,0 +1,11 @@
+%% -*- mode: erlang -*-
+{application, emqx_ds_builtin_raft, [
+    {description, "Raft replication layer for the durable storage"},
+    % strict semver, bump manually!
+    {vsn, "0.1.0"},
+    {modules, []},
+    {registered, []},
+    {applications, [kernel, stdlib, gproc, mria, ra, emqx_durable_storage]},
+    {mod, {emqx_ds_builtin_raft_app, []}},
+    {env, []}
+]}.

+ 11 - 0
apps/emqx_ds_builtin_raft/src/emqx_ds_builtin_raft_app.erl

@@ -0,0 +1,11 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2020-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%--------------------------------------------------------------------
+
+-module(emqx_ds_builtin_raft_app).
+
+-export([start/2]).
+
+%% Application start callback: registers `emqx_ds_replication_layer'
+%% as the `builtin_raft' backend via emqx_ds:register_backend/2 (its
+%% result is ignored), then starts the top supervisor and returns the
+%% result of emqx_ds_builtin_raft_sup:start_top/0.
+start(_StartType, _StartArgs) ->
+    _ = emqx_ds:register_backend(builtin_raft, emqx_ds_replication_layer),
+    emqx_ds_builtin_raft_sup:start_top().

+ 5 - 16
apps/emqx_durable_storage/src/emqx_ds_builtin_db_sup.erl

@@ -1,22 +1,10 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 %% @doc Supervisor that contains all the processes that belong to a
 %% given builtin DS database.
--module(emqx_ds_builtin_db_sup).
+-module(emqx_ds_builtin_raft_db_sup).
 
 -behaviour(supervisor).
 
@@ -150,7 +138,7 @@ get_shard_workers(DB) ->
 init({#?db_sup{db = DB}, DefaultOpts}) ->
     %% Spec for the top-level supervisor for the database:
     logger:notice("Starting DS DB ~p", [DB]),
-    emqx_ds_builtin_sup:clean_gvars(DB),
+    emqx_ds_builtin_raft_sup:clean_gvars(DB),
     emqx_ds_builtin_metrics:init_for_db(DB),
     Opts = emqx_ds_replication_layer_meta:open_db(DB, DefaultOpts),
     ok = start_ra_system(DB, Opts),
@@ -197,7 +185,7 @@ init({#?shard_sup{db = DB, shard = Shard}, _}) ->
     {ok, {SupFlags, Children}}.
 
 start_ra_system(DB, #{replication_options := ReplicationOpts}) ->
-    DataDir = filename:join([emqx_ds:base_dir(), DB, dsrepl]),
+    DataDir = filename:join([emqx_ds_storage_layer:base_dir(), DB, dsrepl]),
     Config = lists:foldr(fun maps:merge/2, #{}, [
         ra_system:default_config(),
         #{
@@ -279,9 +267,10 @@ shard_allocator_spec(DB) ->
     }.
 
 egress_spec(DB, Shard) ->
+    Options = #{},
     #{
         id => Shard,
-        start => {emqx_ds_replication_layer_egress, start_link, [DB, Shard]},
+        start => {emqx_ds_buffer, start_link, [emqx_ds_replication_layer, Options, DB, Shard]},
         shutdown => 5_000,
         restart => permanent,
         type => worker

+ 9 - 28
apps/emqx_durable_storage/src/emqx_ds_builtin_sup.erl

@@ -1,35 +1,23 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 %% @doc This supervisor manages the global worker processes needed for
 %% the functioning of builtin databases, and all builtin database
 %% attach to it.
--module(emqx_ds_builtin_sup).
+-module(emqx_ds_builtin_raft_sup).
 
 -behaviour(supervisor).
 
 %% API:
--export([start_db/2, stop_db/1]).
+-export([start_top/0, start_db/2, stop_db/1]).
 -export([set_gvar/3, get_gvar/3, clean_gvars/1]).
 
 %% behavior callbacks:
 -export([init/1]).
 
 %% internal exports:
--export([start_top/0, start_databases_sup/0]).
+-export([start_databases_sup/0]).
 
 -export_type([]).
 
@@ -39,7 +27,6 @@
 
 -define(top, ?MODULE).
 -define(databases, emqx_ds_builtin_databases_sup).
-
 -define(gvar_tab, emqx_ds_builtin_gvar).
 
 -record(gvar, {
@@ -51,13 +38,16 @@
 %% API functions
 %%================================================================================
 
+%% @doc Start the top-level supervisor of this module, registered
+%% locally under the `?top' name.
+-spec start_top() -> {ok, pid()}.
+start_top() ->
+    SupName = {local, ?top},
+    supervisor:start_link(SupName, ?MODULE, ?top).
+
 -spec start_db(emqx_ds:db(), emqx_ds_replication_layer:builtin_db_opts()) ->
     supervisor:startchild_ret().
 start_db(DB, Opts) ->
-    ensure_top(),
     ChildSpec = #{
         id => DB,
-        start => {emqx_ds_builtin_db_sup, start_db, [DB, Opts]},
+        start => {emqx_ds_builtin_raft_db_sup, start_db, [DB, Opts]},
         type => supervisor,
         shutdown => infinity
     },
@@ -109,7 +99,6 @@ clean_gvars(DB) ->
 %% Children are attached dynamically to this one.
 init(?top) ->
     %% Children:
-    MetricsWorker = emqx_ds_builtin_metrics:child_spec(),
     MetadataServer = #{
         id => metadata_server,
         start => {emqx_ds_replication_layer_meta, start_link, []},
@@ -132,7 +121,7 @@ init(?top) ->
         period => 1,
         auto_shutdown => never
     },
-    {ok, {SupFlags, [MetricsWorker, MetadataServer, DBsSup]}};
+    {ok, {SupFlags, [MetadataServer, DBsSup]}};
 init(?databases) ->
     %% Children are added dynamically:
     SupFlags = #{
@@ -146,17 +135,9 @@ init(?databases) ->
 %% Internal exports
 %%================================================================================
 
--spec start_top() -> {ok, pid()}.
-start_top() ->
-    supervisor:start_link({local, ?top}, ?MODULE, ?top).
-
 start_databases_sup() ->
     supervisor:start_link({local, ?databases}, ?MODULE, ?databases).
 
 %%================================================================================
 %% Internal functions
 %%================================================================================
-
-ensure_top() ->
-    {ok, _} = emqx_ds_sup:attach_backend(builtin, {?MODULE, start_top, []}),
-    ok.

+ 48 - 35
apps/emqx_durable_storage/src/emqx_ds_replication_layer.erl

@@ -1,28 +1,18 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 %% @doc Replication layer for DS backends that don't support
 %% replication on their own.
 -module(emqx_ds_replication_layer).
 
--behaviour(emqx_ds).
+%-behaviour(emqx_ds).
+-behaviour(emqx_ds_buffer).
 
 -export([
     list_shards/1,
     open_db/2,
+    close_db/1,
     add_generation/1,
     update_db_config/2,
     list_generations_with_lifetimes/1,
@@ -36,8 +26,12 @@
     update_iterator/3,
     next/3,
     delete_next/4,
-    shard_of_message/3,
-    current_timestamp/2
+
+    current_timestamp/2,
+
+    shard_of_message/4,
+    flush_buffer/4,
+    init_buffer/3
 ]).
 
 %% internal exports:
@@ -79,7 +73,6 @@
     delete_stream/0,
     iterator/0,
     delete_iterator/0,
-    message_id/0,
     batch/0
 ]).
 
@@ -139,8 +132,6 @@
         ?enc := emqx_ds_storage_layer:delete_iterator()
     }.
 
--type message_id() :: emqx_ds:message_id().
-
 %% TODO: this type is obsolete and is kept only for compatibility with
 %% BPAPIs. Remove it when emqx_ds_proto_v4 is gone (EMQX 5.6)
 -type batch() :: #{
@@ -176,7 +167,7 @@ list_shards(DB) ->
 
 -spec open_db(emqx_ds:db(), builtin_db_opts()) -> ok | {error, _}.
 open_db(DB, CreateOpts) ->
-    case emqx_ds_builtin_sup:start_db(DB, CreateOpts) of
+    case emqx_ds_builtin_raft_sup:start_db(DB, CreateOpts) of
         {ok, _} ->
             ok;
         {error, {already_started, _}} ->
@@ -185,6 +176,10 @@ open_db(DB, CreateOpts) ->
             {error, Err}
     end.
 
+%% @doc Close the database `DB' by delegating to
+%% emqx_ds_builtin_raft_sup:stop_db/1.
+-spec close_db(emqx_ds:db()) -> ok.
+close_db(DB) ->
+    emqx_ds_builtin_raft_sup:stop_db(DB).
+
 -spec add_generation(emqx_ds:db()) -> ok | {error, _}.
 add_generation(DB) ->
     foreach_shard(
@@ -241,7 +236,7 @@ drop_db(DB) ->
     emqx_ds:store_batch_result().
 store_batch(DB, Messages, Opts) ->
     try
-        emqx_ds_replication_layer_egress:store_batch(DB, Messages, Opts)
+        emqx_ds_buffer:store_batch(DB, Messages, Opts)
     catch
         error:{Reason, _Call} when Reason == timeout; Reason == noproc ->
             {error, recoverable, Reason}
@@ -357,17 +352,6 @@ delete_next(DB, Iter0, Selector, BatchSize) ->
             Other
     end.
 
--spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic) ->
-    emqx_ds_replication_layer:shard_id().
-shard_of_message(DB, #message{from = From, topic = Topic}, SerializeBy) ->
-    N = emqx_ds_replication_shard_allocator:n_shards(DB),
-    Hash =
-        case SerializeBy of
-            clientid -> erlang:phash2(From, N);
-            topic -> erlang:phash2(Topic, N)
-        end,
-    integer_to_binary(Hash).
-
 -spec foreach_shard(emqx_ds:db(), fun((shard_id()) -> _)) -> ok.
 foreach_shard(DB, Fun) ->
     lists:foreach(Fun, list_shards(DB)).
@@ -376,12 +360,41 @@ foreach_shard(DB, Fun) ->
 %% local server
 -spec current_timestamp(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> emqx_ds:time().
 current_timestamp(DB, Shard) ->
-    emqx_ds_builtin_sup:get_gvar(DB, ?gv_timestamp(Shard), 0).
+    emqx_ds_builtin_raft_sup:get_gvar(DB, ?gv_timestamp(Shard), 0).
 
 %%================================================================================
-%% behavior callbacks
+%% emqx_ds_buffer callbacks
 %%================================================================================
 
+-record(bs, {}).
+-type egress_state() :: #bs{}.
+
+%% emqx_ds_buffer callback: the buffer state is an empty `#bs{}'
+%% record; none of the arguments are used.
+-spec init_buffer(emqx_ds:db(), shard_id(), _Options) -> {ok, egress_state()}.
+init_buffer(_DB, _Shard, _Options) ->
+    InitialState = #bs{},
+    {ok, InitialState}.
+
+%% emqx_ds_buffer callback: hand the buffered `Messages' of `Shard' to
+%% ra_store_batch/3. A `{timeout, ServerId}' result is reported as a
+%% recoverable error; every other result is passed through as is.
+%% The buffer state is returned unchanged.
+-spec flush_buffer(emqx_ds:db(), shard_id(), [emqx_types:message()], egress_state()) ->
+    {egress_state(), ok | {error, recoverable | unrecoverable, _}}.
+flush_buffer(DB, Shard, Messages, State) ->
+    Result =
+        case ra_store_batch(DB, Shard, Messages) of
+            {timeout, ServerId} ->
+                {error, recoverable, {timeout, ServerId}};
+            Other ->
+                Other
+        end,
+    {State, Result}.
+
+%% emqx_ds_buffer callback: pick the shard for a message.
+%% Depending on `SerializeBy', either the `from' field (`clientid') or
+%% the `topic' field (`topic') is hashed with erlang:phash2/2 into the
+%% range given by the shard allocator's n_shards/1; the hash is
+%% returned as a decimal binary. `_Options' is not used.
+-spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic, _Options) ->
+    emqx_ds_replication_layer:shard_id().
+shard_of_message(DB, #message{from = From, topic = Topic}, SerializeBy, _Options) ->
+    NShards = emqx_ds_replication_shard_allocator:n_shards(DB),
+    HashKey =
+        case SerializeBy of
+            clientid -> From;
+            topic -> Topic
+        end,
+    integer_to_binary(erlang:phash2(HashKey, NShards)).
+
 %%================================================================================
 %% Internal exports (RPC targets)
 %%================================================================================
@@ -402,7 +415,7 @@ current_timestamp(DB, Shard) ->
 -spec do_drop_db_v1(emqx_ds:db()) -> ok | {error, _}.
 do_drop_db_v1(DB) ->
     MyShards = emqx_ds_replication_layer_meta:my_shards(DB),
-    emqx_ds_builtin_sup:stop_db(DB),
+    emqx_ds_builtin_raft_sup:stop_db(DB),
     lists:foreach(
         fun(Shard) ->
             emqx_ds_storage_layer:drop_shard({DB, Shard})
@@ -874,4 +887,4 @@ handle_custom_event(DBShard, Latest, Event) ->
     end.
 
 set_ts({DB, Shard}, TS) ->
-    emqx_ds_builtin_sup:set_gvar(DB, ?gv_timestamp(Shard), TS).
+    emqx_ds_builtin_raft_sup:set_gvar(DB, ?gv_timestamp(Shard), TS).

+ 0 - 12
apps/emqx_durable_storage/src/emqx_ds_replication_layer.hrl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2022, 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 -ifndef(EMQX_DS_REPLICATION_LAYER_HRL).
 -define(EMQX_DS_REPLICATION_LAYER_HRL, true).

+ 1 - 13
apps/emqx_durable_storage/src/emqx_ds_replication_layer_meta.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 %% @doc Metadata storage for the builtin sharded database.
@@ -678,7 +666,7 @@ ensure_tables() ->
     ok = mria:wait_for_tables([?META_TAB, ?NODE_TAB, ?SHARD_TAB]).
 
 ensure_site() ->
-    Filename = filename:join(emqx_ds:base_dir(), "emqx_ds_builtin_site.eterm"),
+    Filename = filename:join(emqx_ds_storage_layer:base_dir(), "emqx_ds_builtin_site.eterm"),
     case file:consult(Filename) of
         {ok, [Site]} ->
             ok;

+ 0 - 12
apps/emqx_durable_storage/src/emqx_ds_replication_layer_shard.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 -module(emqx_ds_replication_layer_shard).

+ 3 - 15
apps/emqx_durable_storage/src/emqx_ds_replication_shard_allocator.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 
 -module(emqx_ds_replication_shard_allocator).
@@ -297,7 +285,7 @@ trans_drop_local(DB, Shard, {del, Site}) ->
 do_drop_local(DB, Shard) ->
     case emqx_ds_replication_layer_shard:drop_local_server(DB, Shard) of
         ok ->
-            ok = emqx_ds_builtin_db_sup:stop_shard({DB, Shard}),
+            ok = emqx_ds_builtin_raft_db_sup:stop_shard({DB, Shard}),
             ok = emqx_ds_storage_layer:drop_shard({DB, Shard}),
             logger:info(#{msg => "Local shard replica dropped"});
         {error, recoverable, Reason} ->
@@ -428,7 +416,7 @@ start_shards(DB, Shards) ->
     lists:foreach(fun(Shard) -> start_shard(DB, Shard) end, Shards).
 
 start_shard(DB, Shard) ->
-    ok = emqx_ds_builtin_db_sup:ensure_shard({DB, Shard}),
+    ok = emqx_ds_builtin_raft_db_sup:ensure_shard({DB, Shard}),
     ok = logger:info(#{msg => "Shard started", shard => Shard}),
     ok.
 
@@ -436,7 +424,7 @@ start_egresses(DB, Shards) ->
     lists:foreach(fun(Shard) -> start_egress(DB, Shard) end, Shards).
 
 start_egress(DB, Shard) ->
-    ok = emqx_ds_builtin_db_sup:ensure_egress({DB, Shard}),
+    ok = emqx_ds_builtin_raft_db_sup:ensure_egress({DB, Shard}),
     ok = logger:info(#{msg => "Egress started", shard => Shard}),
     ok.
 

+ 2 - 2
apps/emqx_durable_storage/src/emqx_ds_replication_snapshot.erl

@@ -195,7 +195,7 @@ start_snapshot_writer(WS) ->
         msg => "dsrepl_snapshot_write_started",
         shard => ShardId
     }),
-    _ = emqx_ds_builtin_db_sup:terminate_storage(ShardId),
+    _ = emqx_ds_builtin_raft_db_sup:terminate_storage(ShardId),
     {ok, SnapWriter} = emqx_ds_storage_layer:accept_snapshot(ShardId),
     {ok, WS#ws{phase = storage_snapshot, writer = SnapWriter}}.
 
@@ -223,7 +223,7 @@ complete_accept(WS = #ws{started_at = StartedAt, writer = SnapWriter}) ->
         duration_ms => erlang:monotonic_time(millisecond) - StartedAt,
         bytes_written => emqx_ds_storage_snapshot:writer_info(bytes_written, SnapWriter)
     }),
-    {ok, _} = emqx_ds_builtin_db_sup:restart_storage(ShardId),
+    {ok, _} = emqx_ds_builtin_raft_db_sup:restart_storage(ShardId),
     write_machine_snapshot(WS).
 
 write_machine_snapshot(#ws{dir = Dir, meta = Meta, state = MachineState}) ->

+ 0 - 12
apps/emqx_durable_storage/src/proto/emqx_ds_proto_v1.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 -module(emqx_ds_proto_v1).
 

+ 0 - 12
apps/emqx_durable_storage/src/proto/emqx_ds_proto_v2.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2023-2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 -module(emqx_ds_proto_v2).
 

+ 0 - 12
apps/emqx_durable_storage/src/proto/emqx_ds_proto_v3.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 -module(emqx_ds_proto_v3).
 

+ 0 - 12
apps/emqx_durable_storage/src/proto/emqx_ds_proto_v4.erl

@@ -1,17 +1,5 @@
 %%--------------------------------------------------------------------
 %% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%%     http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
 %%--------------------------------------------------------------------
 -module(emqx_ds_proto_v4).
 

+ 279 - 40
apps/emqx_durable_storage/test/emqx_ds_replication_SUITE.erl

@@ -35,7 +35,7 @@ opts() ->
 opts(Overrides) ->
     maps:merge(
         #{
-            backend => builtin,
+            backend => builtin_raft,
             %% storage => {emqx_ds_storage_reference, #{}},
             storage => {emqx_ds_storage_bitfield_lts, #{epoch_bits => 10}},
             n_shards => 16,
@@ -56,8 +56,52 @@ appspec(emqx_durable_storage) ->
         override_env => [{egress_flush_interval, 1}]
     }}.
 
+%% Per-testcase setup/teardown for t_metadata/1: `init' starts
+%% `emqx_ds_builtin_raft' in a testcase-specific work directory and
+%% records the started apps under `apps'; `end' stops them again.
+t_metadata(init, Config) ->
+    WorkDir = emqx_cth_suite:work_dir(?FUNCTION_NAME, Config),
+    Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{work_dir => WorkDir}),
+    [{apps, Apps} | Config];
+t_metadata('end', Config) ->
+    StartedApps = ?config(apps, Config),
+    emqx_cth_suite:stop(StartedApps),
+    Config.
+
+%% Open a single-shard, single-site `builtin_raft' DB and verify the
+%% replication metadata: exactly one site exists, the DB has `NShards'
+%% shards, each shard's replica set is that one site, and the set of
+%% shards equals the set of shards owned by this node.
+%% The DB is dropped afterwards regardless of the outcome.
+t_metadata(_Config) ->
+    DB = ?FUNCTION_NAME,
+    NShards = 1,
+    Options = #{
+        backend => builtin_raft,
+        storage => {emqx_ds_storage_reference, #{}},
+        n_shards => NShards,
+        n_sites => 1,
+        replication_factor => 1,
+        replication_options => #{}
+    },
+    try
+        ?assertMatch(ok, emqx_ds:open_db(DB, Options)),
+        %% There must be exactly one site:
+        [Site] = emqx_ds_replication_layer_meta:sites(),
+        Shards = emqx_ds_replication_layer_meta:shards(DB),
+        %% With a single site, every shard should be allocated to it:
+        MyShards = emqx_ds_replication_layer_meta:my_shards(DB),
+        ?assertEqual(NShards, length(Shards)),
+        [
+            ?assertEqual(
+                [Site], emqx_ds_replication_layer_meta:replica_set(DB, Shard)
+            )
+         || Shard <- Shards
+        ],
+        ?assertEqual(lists:sort(Shards), lists:sort(MyShards))
+    after
+        ?assertMatch(ok, emqx_ds:drop_db(DB))
+    end.
+
 t_replication_transfers_snapshots(init, Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     NodeSpecs = emqx_cth_cluster:mk_nodespecs(
         [
             {t_replication_transfers_snapshots1, #{apps => Apps}},
@@ -130,7 +174,7 @@ t_replication_transfers_snapshots(Config) ->
     ).
 
 t_rebalance(init, Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     Nodes = emqx_cth_cluster:start(
         [
             {t_rebalance1, #{apps => Apps}},
@@ -159,18 +203,23 @@ t_rebalance(Config) ->
     ?check_trace(
         #{timetrap => 30_000},
         begin
+            Sites = [S1, S2 | _] = [ds_repl_meta(N, this_site) || N <- Nodes],
             %% 1. Initialize DB on the first node.
             Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
-            ?assertEqual(ok, ?ON(N1, emqx_ds:open_db(?DB, Opts))),
-            ?assertMatch(Shards when length(Shards) == 16, shards_online(N1, ?DB)),
-
-            %% 1.1 Open DB on the rest of the nodes:
             [
                 ?assertEqual(ok, ?ON(Node, emqx_ds:open_db(?DB, Opts)))
              || Node <- Nodes
             ],
 
-            Sites = [S1, S2 | _] = [ds_repl_meta(N, this_site) || N <- Nodes],
+            %% 1.1 Kick all sites except S1 from the replica set as
+            %% the initial condition:
+            ?assertMatch(
+                {ok, [_]},
+                ?ON(N1, emqx_ds_replication_layer_meta:assign_db_sites(?DB, [S1]))
+            ),
+            ?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
+            ?retry(500, 10, ?assertMatch(Shards when length(Shards) == 16, shards_online(N1, ?DB))),
+
             ct:pal("Sites: ~p~n", [Sites]),
 
             Sequence = [
@@ -260,7 +309,7 @@ t_rebalance(Config) ->
     ).
 
 t_join_leave_errors(init, Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     Nodes = emqx_cth_cluster:start(
         [
             {t_join_leave_errors1, #{apps => Apps}},
@@ -275,16 +324,15 @@ t_join_leave_errors('end', Config) ->
 t_join_leave_errors(Config) ->
     %% This testcase verifies that logical errors arising during handling of
     %% join/leave operations are reported correctly.
-
     [N1, N2] = ?config(nodes, Config),
 
     Opts = opts(#{n_shards => 16, n_sites => 1, replication_factor => 3}),
-    ?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?DB, Opts])),
-    ?assertEqual(ok, erpc:call(N2, emqx_ds, open_db, [?DB, Opts])),
+    ?assertEqual(ok, erpc:call(N1, emqx_ds, open_db, [?FUNCTION_NAME, Opts])),
+    ?assertEqual(ok, erpc:call(N2, emqx_ds, open_db, [?FUNCTION_NAME, Opts])),
 
     [S1, S2] = [ds_repl_meta(N, this_site) || N <- [N1, N2]],
 
-    ?assertEqual([S1], ds_repl_meta(N1, db_sites, [?DB])),
+    ?assertEqual(lists:sort([S1, S2]), lists:sort(ds_repl_meta(N1, db_sites, [?FUNCTION_NAME]))),
 
     %% Attempts to join a nonexistent DB / site.
     ?assertEqual(
@@ -293,36 +341,43 @@ t_join_leave_errors(Config) ->
     ),
     ?assertEqual(
         {error, {nonexistent_sites, [<<"NO-MANS-SITE">>]}},
-        ds_repl_meta(N1, join_db_site, [?DB, <<"NO-MANS-SITE">>])
+        ds_repl_meta(N1, join_db_site, [?FUNCTION_NAME, <<"NO-MANS-SITE">>])
     ),
     %% NOTE: Leaving a non-existent site is not an error.
     ?assertEqual(
         {ok, unchanged},
-        ds_repl_meta(N1, leave_db_site, [?DB, <<"NO-MANS-SITE">>])
+        ds_repl_meta(N1, leave_db_site, [?FUNCTION_NAME, <<"NO-MANS-SITE">>])
     ),
 
     %% Should be no-op.
-    ?assertEqual({ok, unchanged}, ds_repl_meta(N1, join_db_site, [?DB, S1])),
-    ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB)),
+    ?assertEqual({ok, unchanged}, ds_repl_meta(N1, join_db_site, [?FUNCTION_NAME, S1])),
+    ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME)),
 
-    %% Impossible to leave the last site.
+    %% Leave S2:
+    ?assertEqual(
+        {ok, [S1]},
+        ds_repl_meta(N1, leave_db_site, [?FUNCTION_NAME, S2])
+    ),
+    %% Impossible to leave the last site:
     ?assertEqual(
         {error, {too_few_sites, []}},
-        ds_repl_meta(N1, leave_db_site, [?DB, S1])
+        ds_repl_meta(N1, leave_db_site, [?FUNCTION_NAME, S1])
     ),
 
     %% "Move" the DB to the other node.
-    ?assertMatch({ok, _}, ds_repl_meta(N1, join_db_site, [?DB, S2])),
-    ?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?DB, S1])),
-    ?assertMatch([_ | _], emqx_ds_test_helpers:transitions(N1, ?DB)),
-    ?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
+    ?assertMatch({ok, _}, ds_repl_meta(N1, join_db_site, [?FUNCTION_NAME, S2])),
+    ?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?FUNCTION_NAME, S1])),
+    ?assertMatch([_ | _], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME)),
+    ?retry(
+        1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME))
+    ),
 
     %% Should be no-op.
-    ?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?DB, S1])),
-    ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB)).
+    ?assertMatch({ok, _}, ds_repl_meta(N2, leave_db_site, [?FUNCTION_NAME, S1])),
+    ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?FUNCTION_NAME)).
 
 t_rebalance_chaotic_converges(init, Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     Nodes = emqx_cth_cluster:start(
         [
             {t_rebalance_chaotic_converges1, #{apps => Apps}},
@@ -351,23 +406,24 @@ t_rebalance_chaotic_converges(Config) ->
     ?check_trace(
         #{},
         begin
+            Sites = [S1, S2, S3] = [ds_repl_meta(N, this_site) || N <- Nodes],
+            ct:pal("Sites: ~p~n", [Sites]),
+
             %% Initialize DB on first two nodes.
             Opts = opts(#{n_shards => 16, n_sites => 2, replication_factor => 3}),
 
+            %% Open DB:
             ?assertEqual(
-                [{ok, ok}, {ok, ok}],
-                erpc:multicall([N1, N2], emqx_ds, open_db, [?DB, Opts])
+                [{ok, ok}, {ok, ok}, {ok, ok}],
+                erpc:multicall([N1, N2, N3], emqx_ds, open_db, [?DB, Opts])
             ),
 
-            %% Open DB on the last node.
-            ?assertEqual(
-                ok,
-                erpc:call(N3, emqx_ds, open_db, [?DB, Opts])
+            %% Kick N3 from the replica set as the initial condition:
+            ?assertMatch(
+                {ok, [_, _]},
+                ?ON(N1, emqx_ds_replication_layer_meta:assign_db_sites(?DB, [S1, S2]))
             ),
-
-            %% Find out which sites there are.
-            Sites = [S1, S2, S3] = [ds_repl_meta(N, this_site) || N <- Nodes],
-            ct:pal("Sites: ~p~n", [Sites]),
+            ?retry(1000, 10, ?assertEqual([], emqx_ds_test_helpers:transitions(N1, ?DB))),
 
             Sequence = [
                 {N1, join_db_site, S3},
@@ -418,7 +474,7 @@ t_rebalance_chaotic_converges(Config) ->
     ).
 
 t_rebalance_offline_restarts(init, Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     Specs = emqx_cth_cluster:mk_nodespecs(
         [
             {t_rebalance_offline_restarts1, #{apps => Apps}},
@@ -435,6 +491,7 @@ t_rebalance_offline_restarts('end', Config) ->
 t_rebalance_offline_restarts(Config) ->
     %% This testcase verifies that rebalancing progresses if nodes restart or
     %% go offline and never come back.
+    ok = snabbkaffe:start_trace(),
 
     Nodes = [N1, N2, N3] = ?config(nodes, Config),
     _Specs = [NS1, NS2, _] = ?config(nodespecs, Config),
@@ -477,7 +534,7 @@ t_rebalance_offline_restarts(Config) ->
     ?assertEqual(lists:sort([S1, S2]), ds_repl_meta(N1, db_sites, [?DB])).
 
 t_drop_generation(Config) ->
-    Apps = [appspec(emqx_durable_storage)],
+    Apps = [appspec(emqx_durable_storage), emqx_ds_builtin_raft],
     [_, _, NS3] =
         NodeSpecs = emqx_cth_cluster:mk_nodespecs(
             [
@@ -554,6 +611,189 @@ t_drop_generation(Config) ->
         end
     ).
 
+t_error_mapping_replication_layer(init, Config) ->
+    Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{
+        work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
+    }),
+    [{apps, Apps} | Config];
+t_error_mapping_replication_layer('end', Config) ->
+    emqx_cth_suite:stop(?config(apps, Config)),
+    Config.
+
+t_error_mapping_replication_layer(_Config) ->
+    %% This checks that the replication layer maps recoverable errors correctly.
+
+    ok = emqx_ds_test_helpers:mock_rpc(),
+    ok = snabbkaffe:start_trace(),
+
+    DB = ?FUNCTION_NAME,
+    ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
+    [Shard1, Shard2] = emqx_ds_replication_layer_meta:shards(DB),
+
+    TopicFilter = emqx_topic:words(<<"foo/#">>),
+    Msgs = [
+        message(<<"C1">>, <<"foo/bar">>, <<"1">>, 0),
+        message(<<"C1">>, <<"foo/baz">>, <<"2">>, 1),
+        message(<<"C2">>, <<"foo/foo">>, <<"3">>, 2),
+        message(<<"C3">>, <<"foo/xyz">>, <<"4">>, 3),
+        message(<<"C4">>, <<"foo/bar">>, <<"5">>, 4),
+        message(<<"C5">>, <<"foo/oof">>, <<"6">>, 5)
+    ],
+
+    ?assertMatch(ok, emqx_ds:store_batch(DB, Msgs)),
+
+    ?block_until(#{?snk_kind := emqx_ds_buffer_flush, shard := Shard1}),
+    ?block_until(#{?snk_kind := emqx_ds_buffer_flush, shard := Shard2}),
+
+    Streams0 = emqx_ds:get_streams(DB, TopicFilter, 0),
+    Iterators0 = lists:map(
+        fun({_Rank, S}) ->
+            {ok, Iter} = emqx_ds:make_iterator(DB, S, TopicFilter, 0),
+            Iter
+        end,
+        Streams0
+    ),
+
+    %% Disrupt the link to the second shard.
+    ok = emqx_ds_test_helpers:mock_rpc_result(
+        fun(_Node, emqx_ds_replication_layer, _Function, Args) ->
+            case Args of
+                [DB, Shard1 | _] -> passthrough;
+                [DB, Shard2 | _] -> unavailable
+            end
+        end
+    ),
+
+    %% Result of `emqx_ds:get_streams/3` will just contain partial results, not an error.
+    Streams1 = emqx_ds:get_streams(DB, TopicFilter, 0),
+    ?assert(
+        length(Streams1) > 0 andalso length(Streams1) =< length(Streams0),
+        Streams1
+    ),
+
+    %% At least one of `emqx_ds:make_iterator/4` will end in an error.
+    Results1 = lists:map(
+        fun({_Rank, S}) ->
+            case emqx_ds:make_iterator(DB, S, TopicFilter, 0) of
+                Ok = {ok, _Iter} ->
+                    Ok;
+                Error = {error, recoverable, {erpc, _}} ->
+                    Error;
+                Other ->
+                    ct:fail({unexpected_result, Other})
+            end
+        end,
+        Streams0
+    ),
+    ?assert(
+        length([error || {error, _, _} <- Results1]) > 0,
+        Results1
+    ),
+
+    %% At least one of `emqx_ds:next/3` over initial set of iterators will end in an error.
+    Results2 = lists:map(
+        fun(Iter) ->
+            case emqx_ds:next(DB, Iter, _BatchSize = 42) of
+                Ok = {ok, _Iter, [_ | _]} ->
+                    Ok;
+                Error = {error, recoverable, {badrpc, _}} ->
+                    Error;
+                Other ->
+                    ct:fail({unexpected_result, Other})
+            end
+        end,
+        Iterators0
+    ),
+    ?assert(
+        length([error || {error, _, _} <- Results2]) > 0,
+        Results2
+    ),
+    meck:unload().
+
+%% This testcase verifies the behavior of `store_batch' operation
+%% when the underlying code experiences recoverable or unrecoverable
+%% problems.
+t_store_batch_fail(init, Config) ->
+    Apps = emqx_cth_suite:start([emqx_ds_builtin_raft], #{
+        work_dir => emqx_cth_suite:work_dir(?FUNCTION_NAME, Config)
+    }),
+    [{apps, Apps} | Config];
+t_store_batch_fail('end', Config) ->
+    emqx_cth_suite:stop(?config(apps, Config)),
+    Config.
+
+t_store_batch_fail(_Config) ->
+    ?check_trace(
+        #{timetrap => 15_000},
+        try
+            meck:new(emqx_ds_storage_layer, [passthrough, no_history]),
+            DB = ?FUNCTION_NAME,
+            ?assertMatch(ok, emqx_ds:open_db(DB, (opts())#{n_shards => 2})),
+            %% Success:
+            Batch1 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"1">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"2">>, 1)
+            ],
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch1, #{sync => true})),
+            %% Inject unrecoverable error:
+            meck:expect(emqx_ds_storage_layer, store_batch, fun(_DB, _Shard, _Messages) ->
+                {error, unrecoverable, mock}
+            end),
+            Batch2 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"3">>, 1),
+                message(<<"C1">>, <<"foo/bar">>, <<"4">>, 1)
+            ],
+            ?assertMatch(
+                {error, unrecoverable, mock}, emqx_ds:store_batch(DB, Batch2, #{sync => true})
+            ),
+            meck:unload(emqx_ds_storage_layer),
+            %% Inject a recoverable error:
+            meck:new(ra, [passthrough, no_history]),
+            meck:expect(ra, process_command, fun(Servers, Shard, Command) ->
+                ?tp(ra_command, #{servers => Servers, shard => Shard, command => Command}),
+                {timeout, mock}
+            end),
+            Batch3 = [
+                message(<<"C1">>, <<"foo/bar">>, <<"5">>, 2),
+                message(<<"C2">>, <<"foo/bar">>, <<"6">>, 2),
+                message(<<"C1">>, <<"foo/bar">>, <<"7">>, 3),
+                message(<<"C2">>, <<"foo/bar">>, <<"8">>, 3)
+            ],
+            %% Note: due to idempotency issues the number of retries
+            %% is currently set to 0:
+            ?assertMatch(
+                {error, recoverable, {timeout, mock}},
+                emqx_ds:store_batch(DB, Batch3, #{sync => true})
+            ),
+            meck:unload(ra),
+            ?assertMatch(ok, emqx_ds:store_batch(DB, Batch3, #{sync => true})),
+            lists:sort(emqx_ds_test_helpers:consume_per_stream(DB, ['#'], 1))
+        after
+            meck:unload()
+        end,
+        [
+            {"message ordering", fun(StoredMessages, _Trace) ->
+                [{_, Stream1}, {_, Stream2}] = StoredMessages,
+                ?assertMatch(
+                    [
+                        #message{payload = <<"1">>},
+                        #message{payload = <<"2">>},
+                        #message{payload = <<"5">>},
+                        #message{payload = <<"7">>}
+                    ],
+                    Stream1
+                ),
+                ?assertMatch(
+                    [
+                        #message{payload = <<"6">>},
+                        #message{payload = <<"8">>}
+                    ],
+                    Stream2
+                )
+            end}
+        ]
+    ).
+
 %%
 
 shard_server_info(Node, DB, Shard, Site, Info) ->
@@ -583,7 +823,7 @@ shards(Node, DB) ->
     erpc:call(Node, emqx_ds_replication_layer_meta, shards, [DB]).
 
 shards_online(Node, DB) ->
-    erpc:call(Node, emqx_ds_builtin_db_sup, which_shards, [DB]).
+    erpc:call(Node, emqx_ds_builtin_raft_db_sup, which_shards, [DB]).
 
 n_shards_online(Node, DB) ->
     length(shards_online(Node, DB)).
@@ -635,7 +875,6 @@ all() -> emqx_common_test_helpers:all(?MODULE).
 
 init_per_testcase(TCName, Config0) ->
     Config = emqx_common_test_helpers:init_per_testcase(?MODULE, TCName, Config0),
-    ok = snabbkaffe:start_trace(),
     Config.
 
 end_per_testcase(TCName, Config) ->

+ 1 - 1
apps/emqx_ds_shared_sub/test/emqx_ds_shared_sub_SUITE.erl

@@ -26,7 +26,7 @@ init_per_suite(Config) ->
                     },
                     <<"durable_storage">> => #{
                         <<"messages">> => #{
-                            <<"backend">> => <<"builtin">>
+                            <<"backend">> => <<"builtin_raft">>
                         }
                     }
                 }

+ 34 - 2
apps/emqx_durable_storage/README.md

@@ -103,7 +103,7 @@ Consumption of messages is done in several stages:
 
 # Documentation links
 
-TBD
+https://docs.emqx.com/en/enterprise/latest/durability/durability_introduction.html
 
 # Usage
 
@@ -146,7 +146,39 @@ The following REST APIs are available for managing the builtin durable storages:
 - `/ds/storages/:ds/replicas/:site` — add or remove replica of the durable storage on the site
 
 # Other
-TBD
+
+Note: this application contains the main interface module and some common utility modules used by the backends, but it doesn't contain any ready-to-use DS backends.
+The backends are instead implemented as separate OTP applications, such as `emqx_ds_builtin_local` and `emqx_ds_builtin_raft`.
+
+There is a helper placeholder application `emqx_ds_backends` that depends on all backend applications available in the release.
+Business logic applications must have `emqx_ds_backends` as a dependency.
+
+The dependency diagram is the following:
+
+```
+                              +------------------------+
+                              |  emqx_durable_storage  |
+                              +------------------------+
+                              /           |            \
+                             /            |             \
+                            /             |              \
+   +------------------------+  +----------------------+   +------+
+   | emqx_ds_builtin_local  |  | emqx_ds_builtin_raft |   | ...  |
+   +------------------------+  +-----------+----------+   +------+
+                            \            |               /
+                             \           |              /
+                              \          |             /
+                             +-------------------------+
+                             |    emqx_ds_backends     |
+                             +-------------------------+
+                                 /              \
+                                /                \
+       ......................../.. business apps .\........................
+                              /                    \
+                         +------+                +-------+
+                         | emqx |                |  ...  |
+                         +------+                +-------+
+```
 
 # Contributing
 Please see our [contributing.md](../../CONTRIBUTING.md).

+ 6 - 6
apps/emqx_durable_storage/include/emqx_ds_metrics.hrl

@@ -19,17 +19,17 @@
 %%%% Egress metrics:
 
 %% Number of successfully flushed batches:
--define(DS_EGRESS_BATCHES, emqx_ds_egress_batches).
+-define(DS_BUFFER_BATCHES, emqx_ds_buffer_batches).
 %% Number of batch flush retries:
--define(DS_EGRESS_BATCHES_RETRY, emqx_ds_egress_batches_retry).
+-define(DS_BUFFER_BATCHES_RETRY, emqx_ds_buffer_batches_retry).
 %% Number of batches that weren't flushed due to unrecoverable errors:
--define(DS_EGRESS_BATCHES_FAILED, emqx_ds_egress_batches_failed).
+-define(DS_BUFFER_BATCHES_FAILED, emqx_ds_buffer_batches_failed).
 %% Total number of messages that were successfully committed to the storage:
--define(DS_EGRESS_MESSAGES, emqx_ds_egress_messages).
+-define(DS_BUFFER_MESSAGES, emqx_ds_buffer_messages).
 %% Total size of payloads that were successfully committed to the storage:
--define(DS_EGRESS_BYTES, emqx_ds_egress_bytes).
+-define(DS_BUFFER_BYTES, emqx_ds_buffer_bytes).
 %% Sliding average of flush time (microseconds):
--define(DS_EGRESS_FLUSH_TIME, emqx_ds_egress_flush_time).
+-define(DS_BUFFER_FLUSH_TIME, emqx_ds_buffer_flush_time).
 
 %%%% Storage layer metrics:
 -define(DS_STORE_BATCH_TIME, emqx_ds_store_batch_time).

+ 31 - 24
apps/emqx_durable_storage/src/emqx_ds.erl

@@ -16,15 +16,16 @@
 
 %% @doc Main interface module for `emqx_durable_storage' application.
 %%
-%% It takes care of forwarding calls to the underlying DBMS. Currently
-%% only the embedded `emqx_ds_replication_layer' storage is supported,
-%% so all the calls are simply passed through.
+%% It takes care of forwarding calls to the underlying DBMS.
 -module(emqx_ds).
 
 %% Management API:
 -export([
-    base_dir/0,
+    register_backend/2,
+
     open_db/2,
+    close_db/1,
+    which_dbs/0,
     update_db_config/2,
     add_generation/1,
     list_generations_with_lifetimes/1,
@@ -60,7 +61,6 @@
     iterator/0,
     delete_iterator/0,
     iterator_id/0,
-    message_id/0,
     message_key/0,
     message_store_opts/0,
     next_result/1, next_result/0,
@@ -136,7 +136,7 @@
 
 -type ds_specific_delete_stream() :: term().
 
--type make_delete_iterator_result(DeleteIterator) :: {ok, DeleteIterator} | {error, term()}.
+-type make_delete_iterator_result(DeleteIterator) :: {ok, DeleteIterator} | error(_).
 
 -type make_delete_iterator_result() :: make_delete_iterator_result(delete_iterator()).
 
@@ -173,10 +173,7 @@
         _ => _
     }.
 
--type create_db_opts() ::
-    emqx_ds_replication_layer:builtin_db_opts() | generic_db_opts().
-
--type message_id() :: emqx_ds_replication_layer:message_id().
+-type create_db_opts() :: generic_db_opts().
 
 %% An opaque term identifying a generation.  Each implementation will possibly add
 %% information to this term to match its inner structure (e.g.: by embedding the shard id,
@@ -199,6 +196,8 @@
 
 -callback open_db(db(), create_db_opts()) -> ok | {error, _}.
 
+-callback close_db(db()) -> ok.
+
 -callback add_generation(db()) -> ok | {error, _}.
 
 -callback update_db_config(db(), create_db_opts()) -> ok | {error, _}.
@@ -247,21 +246,32 @@
 %% API functions
 %%================================================================================
 
--spec base_dir() -> file:filename().
-base_dir() ->
-    application:get_env(?APP, db_data_dir, emqx:data_dir()).
+%% @doc Register DS backend.
+-spec register_backend(atom(), module()) -> ok.
+register_backend(Name, Module) ->
+    persistent_term:put({emqx_ds_backend_module, Name}, Module).
 
 %% @doc Different DBs are completely independent from each other. They
 %% could represent something like different tenants.
 -spec open_db(db(), create_db_opts()) -> ok.
-open_db(DB, Opts = #{backend := Backend}) when Backend =:= builtin orelse Backend =:= fdb ->
-    Module =
-        case Backend of
-            builtin -> emqx_ds_replication_layer;
-            fdb -> emqx_fdb_ds
-        end,
-    persistent_term:put(?persistent_term(DB), Module),
-    ?module(DB):open_db(DB, Opts).
+open_db(DB, Opts = #{backend := Backend}) ->
+    case persistent_term:get({emqx_ds_backend_module, Backend}, undefined) of
+        undefined ->
+            error({no_such_backend, Backend});
+        Module ->
+            persistent_term:put(?persistent_term(DB), Module),
+            emqx_ds_sup:register_db(DB, Backend),
+            ?module(DB):open_db(DB, Opts)
+    end.
+
+-spec close_db(db()) -> ok.
+close_db(DB) ->
+    emqx_ds_sup:unregister_db(DB),
+    ?module(DB):close_db(DB).
+
+-spec which_dbs() -> [{db(), _Backend :: atom()}].
+which_dbs() ->
+    emqx_ds_sup:which_dbs().
 
 -spec add_generation(db()) -> ok.
 add_generation(DB) ->
@@ -286,9 +296,6 @@ drop_generation(DB, GenId) ->
             {error, not_implemented}
     end.
 
-%% @doc TODO: currently if one or a few shards are down, they won't be
-
-%% deleted.
 -spec drop_db(db()) -> ok.
 drop_db(DB) ->
     case persistent_term:get(?persistent_term(DB), undefined) of

+ 63 - 37
apps/emqx_durable_storage/src/emqx_ds_replication_layer_egress.erl

@@ -14,23 +14,15 @@
 %% limitations under the License.
 %%--------------------------------------------------------------------
 
-%% @doc Egress servers are responsible for proxing the outcoming
-%% `store_batch' requests towards EMQX DS shards.
-%%
-%% They re-assemble messages from different local processes into
-%% fixed-sized batches, and introduce centralized channels between the
-%% nodes. They are also responsible for maintaining backpressure
-%% towards the local publishers.
-%%
-%% There is (currently) one egress process for each shard running on
-%% each node, but it should be possible to have a pool of egress
-%% servers, if needed.
--module(emqx_ds_replication_layer_egress).
+%% @doc Buffer servers are responsible for collecting batches from the
+%% local processes, sharding and repackaging them.
+-module(emqx_ds_buffer).
 
 -behaviour(gen_server).
 
 %% API:
--export([start_link/2, store_batch/3]).
+-export([start_link/4, store_batch/3, shard_of_message/3]).
+-export([ls/0]).
 
 %% behavior callbacks:
 -export([init/1, format_status/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).
@@ -47,9 +39,12 @@
 %% Type declarations
 %%================================================================================
 
--define(via(DB, Shard), {via, gproc, {n, l, {?MODULE, DB, Shard}}}).
+-define(name(DB, SHARD), {n, l, {?MODULE, DB, SHARD}}).
+-define(via(DB, SHARD), {via, gproc, ?name(DB, SHARD)}).
 -define(flush, flush).
 
+-define(cbm(DB), {?MODULE, DB}).
+
 -record(enqueue_req, {
     messages :: [emqx_types:message()],
     sync :: boolean(),
@@ -58,13 +53,29 @@
     payload_bytes :: non_neg_integer()
 }).
 
+-callback init_buffer(emqx_ds:db(), _Shard, _Options) -> {ok, _State}.
+
+-callback flush_buffer(emqx_ds:db(), _Shard, [emqx_types:message()], State) ->
+    {State, ok | {error, recoverable | unrecoverable, _}}.
+
+-callback shard_of_message(emqx_ds:db(), emqx_types:message(), topic | clientid, _Options) ->
+    _Shard.
+
 %%================================================================================
 %% API functions
 %%================================================================================
 
--spec start_link(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> {ok, pid()}.
-start_link(DB, Shard) ->
-    gen_server:start_link(?via(DB, Shard), ?MODULE, [DB, Shard], []).
+-spec ls() -> [{emqx_ds:db(), _Shard}].
+ls() ->
+    MS = {{?name('$1', '$2'), '_', '_'}, [], [{{'$1', '$2'}}]},
+    gproc:select({local, names}, [MS]).
+
+-spec start_link(module(), _CallbackOptions, emqx_ds:db(), _ShardId) ->
+    {ok, pid()}.
+start_link(CallbackModule, CallbackOptions, DB, Shard) ->
+    gen_server:start_link(
+        ?via(DB, Shard), ?MODULE, [CallbackModule, CallbackOptions, DB, Shard], []
+    ).
 
 -spec store_batch(emqx_ds:db(), [emqx_types:message()], emqx_ds:message_store_opts()) ->
     emqx_ds:store_batch_result().
@@ -95,13 +106,20 @@ store_batch(DB, Messages, Opts) ->
             repackage_messages(DB, Messages, Sync)
     end.
 
+-spec shard_of_message(emqx_ds:db(), emqx_types:message(), clientid | topic) -> _Shard.
+shard_of_message(DB, Message, ShardBy) ->
+    {CBM, Options} = persistent_term:get(?cbm(DB)),
+    CBM:shard_of_message(DB, Message, ShardBy, Options).
+
 %%================================================================================
 %% behavior callbacks
 %%================================================================================
 
 -record(s, {
+    callback_module :: module(),
+    callback_state :: term(),
     db :: emqx_ds:db(),
-    shard :: emqx_ds_replication_layer:shard_id(),
+    shard :: _ShardId,
     metrics_id :: emqx_ds_builtin_metrics:shard_metrics_id(),
     n_retries = 0 :: non_neg_integer(),
     %% FIXME: Currently max_retries is always 0, because replication
@@ -115,18 +133,22 @@ store_batch(DB, Messages, Opts) ->
     pending_replies = [] :: [gen_server:from()]
 }).
 
-init([DB, Shard]) ->
+init([CBM, CBMOptions, DB, Shard]) ->
     process_flag(trap_exit, true),
     process_flag(message_queue_data, off_heap),
-    logger:update_process_metadata(#{domain => [emqx, ds, egress, DB]}),
+    logger:update_process_metadata(#{domain => [emqx, ds, buffer, DB]}),
     MetricsId = emqx_ds_builtin_metrics:shard_metric_id(DB, Shard),
     ok = emqx_ds_builtin_metrics:init_for_shard(MetricsId),
+    {ok, CallbackS} = CBM:init_buffer(DB, Shard, CBMOptions),
     S = #s{
+        callback_module = CBM,
+        callback_state = CallbackS,
         db = DB,
         shard = Shard,
         metrics_id = MetricsId,
         queue = queue:new()
     },
+    persistent_term:put(?cbm(DB), {CBM, CBMOptions}),
     {ok, S}.
 
 format_status(Status) ->
@@ -179,7 +201,8 @@ handle_info(?flush, S) ->
 handle_info(_Info, S) ->
     {noreply, S}.
 
-terminate(_Reason, _S) ->
+terminate(_Reason, #s{db = DB}) ->
+    persistent_term:erase(?cbm(DB)),
     ok.
 
 %%================================================================================
@@ -234,7 +257,9 @@ flush(S) ->
 do_flush(S0 = #s{n = 0}) ->
     S0;
 do_flush(
-    S = #s{
+    S0 = #s{
+        callback_module = CBM,
+        callback_state = CallbackS0,
         queue = Q,
         pending_replies = Replies,
         db = DB,
@@ -246,16 +271,17 @@ do_flush(
 ) ->
     Messages = queue:to_list(Q),
     T0 = erlang:monotonic_time(microsecond),
-    Result = emqx_ds_replication_layer:ra_store_batch(DB, Shard, Messages),
+    {CallbackS, Result} = CBM:flush_buffer(DB, Shard, Messages, CallbackS0),
+    S = S0#s{callback_state = CallbackS},
     T1 = erlang:monotonic_time(microsecond),
-    emqx_ds_builtin_metrics:observe_egress_flush_time(Metrics, T1 - T0),
+    emqx_ds_builtin_metrics:observe_buffer_flush_time(Metrics, T1 - T0),
     case Result of
         ok ->
-            emqx_ds_builtin_metrics:inc_egress_batches(Metrics),
-            emqx_ds_builtin_metrics:inc_egress_messages(Metrics, S#s.n),
-            emqx_ds_builtin_metrics:inc_egress_bytes(Metrics, S#s.n_bytes),
+            emqx_ds_builtin_metrics:inc_buffer_batches(Metrics),
+            emqx_ds_builtin_metrics:inc_buffer_messages(Metrics, S#s.n),
+            emqx_ds_builtin_metrics:inc_buffer_bytes(Metrics, S#s.n_bytes),
             ?tp(
-                emqx_ds_replication_layer_egress_flush,
+                emqx_ds_buffer_flush,
                 #{db => DB, shard => Shard, batch => Messages}
             ),
             lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Replies),
@@ -266,7 +292,7 @@ do_flush(
                 queue = queue:new(),
                 pending_replies = []
             };
-        {timeout, ServerId} when Retries < MaxRetries ->
+        {error, recoverable, Err} when Retries < MaxRetries ->
             %% Note: this is a hot loop, so we report error messages
             %% with `debug' level to avoid wiping the logs. Instead,
             %% error the detection must rely on the metrics. Debug
@@ -274,11 +300,11 @@ do_flush(
             %% via logger domain.
             ?tp(
                 debug,
-                emqx_ds_replication_layer_egress_flush_retry,
-                #{db => DB, shard => Shard, reason => timeout, server_id => ServerId}
+                emqx_ds_buffer_flush_retry,
+                #{db => DB, shard => Shard, reason => Err}
             ),
             %% Retry sending the batch:
-            emqx_ds_builtin_metrics:inc_egress_batches_retry(Metrics),
+            emqx_ds_builtin_metrics:inc_buffer_batches_retry(Metrics),
             erlang:garbage_collect(),
             %% We block the gen_server until the next retry.
             BlockTime = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
@@ -287,10 +313,10 @@ do_flush(
         Err ->
             ?tp(
                 debug,
-                emqx_ds_replication_layer_egress_flush_failed,
+                emqx_ds_buffer_flush_failed,
                 #{db => DB, shard => Shard, error => Err}
             ),
-            emqx_ds_builtin_metrics:inc_egress_batches_failed(Metrics),
+            emqx_ds_builtin_metrics:inc_buffer_batches_failed(Metrics),
             Reply =
                 case Err of
                     {error, _, _} -> Err;
@@ -311,7 +337,7 @@ do_flush(
     end.
 
 -spec shards_of_batch(emqx_ds:db(), [emqx_types:message()]) ->
-    [{emqx_ds_replication_layer:shard_id(), {NMessages, NBytes}}]
+    [{_ShardId, {NMessages, NBytes}}]
 when
     NMessages :: non_neg_integer(),
     NBytes :: non_neg_integer().
@@ -320,7 +346,7 @@ shards_of_batch(DB, Messages) ->
         lists:foldl(
             fun(Message, Acc) ->
                 %% TODO: sharding strategy must be part of the DS DB schema:
-                Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+                Shard = shard_of_message(DB, Message, clientid),
                 Size = payload_size(Message),
                 maps:update_with(
                     Shard,
@@ -339,7 +365,7 @@ shards_of_batch(DB, Messages) ->
 repackage_messages(DB, Messages, Sync) ->
     Batches = lists:foldl(
         fun(Message, Acc) ->
-            Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+            Shard = shard_of_message(DB, Message, clientid),
             Size = payload_size(Message),
             maps:update_with(
                 Shard,

+ 51 - 51
apps/emqx_durable_storage/src/emqx_ds_builtin_metrics.erl

@@ -22,13 +22,13 @@
 -export([prometheus_meta/0, prometheus_collect/1]).
 
 -export([
-    inc_egress_batches/1,
-    inc_egress_batches_retry/1,
-    inc_egress_batches_failed/1,
-    inc_egress_messages/2,
-    inc_egress_bytes/2,
+    inc_buffer_batches/1,
+    inc_buffer_batches_retry/1,
+    inc_buffer_batches_failed/1,
+    inc_buffer_messages/2,
+    inc_buffer_bytes/2,
 
-    observe_egress_flush_time/2,
+    observe_buffer_flush_time/2,
 
     observe_store_batch_time/2,
 
@@ -68,16 +68,16 @@
 
 -define(DB_METRICS, ?STORAGE_LAYER_METRICS ++ ?FETCH_METRICS).
 
--define(EGRESS_METRICS, [
-    {counter, ?DS_EGRESS_BATCHES},
-    {counter, ?DS_EGRESS_BATCHES_RETRY},
-    {counter, ?DS_EGRESS_BATCHES_FAILED},
-    {counter, ?DS_EGRESS_MESSAGES},
-    {counter, ?DS_EGRESS_BYTES},
-    {slide, ?DS_EGRESS_FLUSH_TIME}
+-define(BUFFER_METRICS, [
+    {counter, ?DS_BUFFER_BATCHES},
+    {counter, ?DS_BUFFER_BATCHES_RETRY},
+    {counter, ?DS_BUFFER_BATCHES_FAILED},
+    {counter, ?DS_BUFFER_MESSAGES},
+    {counter, ?DS_BUFFER_BYTES},
+    {slide, ?DS_BUFFER_FLUSH_TIME}
 ]).
 
--define(SHARD_METRICS, ?EGRESS_METRICS).
+-define(SHARD_METRICS, ?BUFFER_METRICS).
 
 -type shard_metrics_id() :: binary().
 
@@ -96,7 +96,7 @@ child_spec() ->
 init_for_db(DB) ->
     emqx_metrics_worker:create_metrics(?WORKER, DB, ?DB_METRICS, []).
 
--spec shard_metric_id(emqx_ds:db(), emqx_ds_replication_layer:shard_id()) -> shard_metrics_id().
+-spec shard_metric_id(emqx_ds:db(), binary()) -> shard_metrics_id().
 shard_metric_id(DB, ShardId) ->
     iolist_to_binary([atom_to_list(DB), $/, ShardId]).
 
@@ -106,37 +106,37 @@ init_for_shard(ShardId) ->
     emqx_metrics_worker:create_metrics(?WORKER, ShardId, ?SHARD_METRICS, []).
 
 %% @doc Increase the number of successfully flushed batches
--spec inc_egress_batches(shard_metrics_id()) -> ok.
-inc_egress_batches(Id) ->
-    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES).
+-spec inc_buffer_batches(shard_metrics_id()) -> ok.
+inc_buffer_batches(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_BUFFER_BATCHES).
 
-%% @doc Increase the number of time the egress worker had to retry
+%% @doc Increase the number of times the buffer worker had to retry
 %% flushing the batch
--spec inc_egress_batches_retry(shard_metrics_id()) -> ok.
-inc_egress_batches_retry(Id) ->
-    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES_RETRY).
+-spec inc_buffer_batches_retry(shard_metrics_id()) -> ok.
+inc_buffer_batches_retry(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_BUFFER_BATCHES_RETRY).
 
-%% @doc Increase the number of time the egress worker encountered an
+%% @doc Increase the number of times the buffer worker encountered an
 %% unrecoverable error while trying to flush the batch
--spec inc_egress_batches_failed(shard_metrics_id()) -> ok.
-inc_egress_batches_failed(Id) ->
-    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BATCHES_FAILED).
+-spec inc_buffer_batches_failed(shard_metrics_id()) -> ok.
+inc_buffer_batches_failed(Id) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_BUFFER_BATCHES_FAILED).
 
 %% @doc Increase the number of messages successfully saved to the shard
--spec inc_egress_messages(shard_metrics_id(), non_neg_integer()) -> ok.
-inc_egress_messages(Id, NMessages) ->
-    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_MESSAGES, NMessages).
+-spec inc_buffer_messages(shard_metrics_id(), non_neg_integer()) -> ok.
+inc_buffer_messages(Id, NMessages) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_BUFFER_MESSAGES, NMessages).
 
 %% @doc Increase the number of bytes successfully saved to the shard
--spec inc_egress_bytes(shard_metrics_id(), non_neg_integer()) -> ok.
-inc_egress_bytes(Id, NMessages) ->
-    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_EGRESS_BYTES, NMessages).
+-spec inc_buffer_bytes(shard_metrics_id(), non_neg_integer()) -> ok.
+inc_buffer_bytes(Id, NMessages) ->
+    catch emqx_metrics_worker:inc(?WORKER, Id, ?DS_BUFFER_BYTES, NMessages).
 
-%% @doc Add a sample of elapsed time spent flushing the egress to the
+%% @doc Add a sample of elapsed time spent flushing the buffer to the
 %% Raft log (in microseconds)
--spec observe_egress_flush_time(shard_metrics_id(), non_neg_integer()) -> ok.
-observe_egress_flush_time(Id, FlushTime) ->
-    catch emqx_metrics_worker:observe(?WORKER, Id, ?DS_EGRESS_FLUSH_TIME, FlushTime).
+-spec observe_buffer_flush_time(shard_metrics_id(), non_neg_integer()) -> ok.
+observe_buffer_flush_time(Id, FlushTime) ->
+    catch emqx_metrics_worker:observe(?WORKER, Id, ?DS_BUFFER_FLUSH_TIME, FlushTime).
 
 -spec observe_store_batch_time(emqx_ds_storage_layer:shard_id(), non_neg_integer()) -> ok.
 observe_store_batch_time({DB, _}, StoreTime) ->
@@ -176,11 +176,14 @@ prometheus_collect(NodeOrAggr) ->
 
 prometheus_per_db(NodeOrAggr) ->
     lists:foldl(
-        fun(DB, Acc) ->
-            prometheus_per_db(NodeOrAggr, DB, Acc)
+        fun
+            ({DB, Backend}, Acc) when Backend =:= builtin_local; Backend =:= builtin_raft ->
+                prometheus_per_db(NodeOrAggr, DB, Acc);
+            ({_, _}, Acc) ->
+                Acc
         end,
         #{},
-        emqx_ds_builtin_db_sup:which_dbs()
+        emqx_ds:which_dbs()
     ).
 
 %% This function returns the data in the following format:
@@ -221,13 +224,13 @@ prometheus_per_db(NodeOrAggr, DB, Acc0) ->
 
 %% This function returns the data in the following format:
 %% ```
-%% #{emqx_ds_egress_batches =>
+%% #{emqx_ds_buffer_batches =>
 %%       [{[{db,messages},{shard,<<"1">>}],99408},
 %%        {[{db,messages},{shard,<<"0">>}],99409}],
-%%   emqx_ds_egress_batches_retry =>
+%%   emqx_ds_buffer_batches_retry =>
 %%       [{[{db,messages},{shard,<<"1">>}],0},
 %%        {[{db,messages},{shard,<<"0">>}],0}],
-%%   emqx_ds_egress_messages =>
+%%   emqx_ds_buffer_messages =>
 %%        ...
 %%  }
 %% '''
@@ -235,18 +238,15 @@ prometheus_per_db(NodeOrAggr, DB, Acc0) ->
 %% If `NodeOrAggr' = `node' then node name is appended to the list of
 %% labels.
 prometheus_per_shard(NodeOrAggr) ->
+    prometheus_buffer_metrics(NodeOrAggr).
+
+prometheus_buffer_metrics(NodeOrAggr) ->
     lists:foldl(
-        fun(DB, Acc0) ->
-            lists:foldl(
-                fun(Shard, Acc) ->
-                    prometheus_per_shard(NodeOrAggr, DB, Shard, Acc)
-                end,
-                Acc0,
-                emqx_ds_replication_layer_meta:shards(DB)
-            )
+        fun({DB, Shard}, Acc) ->
+            prometheus_per_shard(NodeOrAggr, DB, Shard, Acc)
         end,
         #{},
-        emqx_ds_builtin_db_sup:which_dbs()
+        emqx_ds_buffer:ls()
     ).
 
 prometheus_per_shard(NodeOrAggr, DB, Shard, Acc0) ->

+ 15 - 9
apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl

@@ -55,7 +55,7 @@
 -export([init/1, format_status/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2]).
 
 %% internal exports:
--export([db_dir/1]).
+-export([db_dir/1, base_dir/0]).
 
 -export_type([
     gen_id/0,
@@ -87,6 +87,8 @@
 %% Type declarations
 %%================================================================================
 
+-define(APP, emqx_durable_storage).
+
 %% # "Record" integer keys.  We use maps with integer keys to avoid persisting and sending
 %% records over the wire.
 %% tags:
@@ -104,7 +106,7 @@
     {emqx_ds_storage_reference, emqx_ds_storage_reference:options()}
     | {emqx_ds_storage_bitfield_lts, emqx_ds_storage_bitfield_lts:options()}.
 
--type shard_id() :: {emqx_ds:db(), emqx_ds_replication_layer:shard_id()}.
+-type shard_id() :: {emqx_ds:db(), binary()}.
 
 -type cf_refs() :: [{string(), rocksdb:cf_handle()}].
 
@@ -424,11 +426,11 @@ make_delete_iterator(
                         ?generation => GenId,
                         ?enc => Iter
                     }};
-                {error, _} = Err ->
-                    Err
+                {error, Err} ->
+                    {error, unrecoverable, Err}
             end;
         not_found ->
-            {error, end_of_stream}
+            {error, unrecoverable, generation_not_found}
     end.
 
 -spec update_iterator(shard_id(), iterator(), emqx_ds:message_key()) ->
@@ -447,8 +449,8 @@ update_iterator(
                         ?generation => GenId,
                         ?enc => Iter
                     }};
-                {error, _} = Err ->
-                    Err
+                {error, Err} ->
+                    {error, unrecoverable, Err}
             end;
         not_found ->
             {error, unrecoverable, generation_not_found}
@@ -889,13 +891,17 @@ rocksdb_open(Shard, Options) ->
             Error
     end.
 
+-spec base_dir() -> file:filename().
+base_dir() ->
+    application:get_env(?APP, db_data_dir, emqx:data_dir()).
+
 -spec db_dir(shard_id()) -> file:filename().
 db_dir({DB, ShardId}) ->
-    filename:join([emqx_ds:base_dir(), DB, binary_to_list(ShardId)]).
+    filename:join([base_dir(), DB, binary_to_list(ShardId)]).
 
 -spec checkpoints_dir(shard_id()) -> file:filename().
 checkpoints_dir({DB, ShardId}) ->
-    filename:join([emqx_ds:base_dir(), DB, checkpoints, binary_to_list(ShardId)]).
+    filename:join([base_dir(), DB, checkpoints, binary_to_list(ShardId)]).
 
 -spec checkpoint_dir(shard_id(), _Name :: file:name()) -> file:filename().
 checkpoint_dir(ShardId, Name) ->

+ 15 - 22
apps/emqx_durable_storage/src/emqx_ds_sup.erl

@@ -18,7 +18,8 @@
 -behaviour(supervisor).
 
 %% API:
--export([start_link/0, attach_backend/2]).
+-export([start_link/0]).
+-export([register_db/2, unregister_db/1, which_dbs/0]).
 
 %% behaviour callbacks:
 -export([init/1]).
@@ -28,6 +29,7 @@
 %%================================================================================
 
 -define(SUP, ?MODULE).
+-define(TAB, ?MODULE).
 
 %%================================================================================
 %% API functions
@@ -37,33 +39,24 @@
 start_link() ->
     supervisor:start_link({local, ?SUP}, ?MODULE, top).
 
-%% @doc Attach a child backend-specific supervisor to the top
-%% application supervisor, if not yet present
--spec attach_backend(_BackendId, {module(), atom(), list()}) ->
-    {ok, pid()} | {error, _}.
-attach_backend(Backend, Start) ->
-    Spec = #{
-        id => Backend,
-        start => Start,
-        significant => false,
-        shutdown => infinity,
-        type => supervisor
-    },
-    case supervisor:start_child(?SUP, Spec) of
-        {ok, Pid} ->
-            {ok, Pid};
-        {error, {already_started, Pid}} ->
-            {ok, Pid};
-        {error, Err} ->
-            {error, Err}
-    end.
+%% @doc Record that `DB' is served by `Backend' by inserting a
+%% `{DB, Backend}' row into the `?TAB' ETS table. The table is created
+%% as a `set', so registering the same `DB' again replaces its row.
+-spec register_db(emqx_ds:db(), atom()) -> ok.
+register_db(DB, Backend) ->
+    ets:insert(?TAB, {DB, Backend}),
+    ok.
+
+%% @doc Remove the row keyed by `DB' from the `?TAB' ETS table.
+%% Always returns `ok', whether or not a row for `DB' existed.
+-spec unregister_db(emqx_ds:db()) -> ok.
+unregister_db(DB) ->
+    ets:delete(?TAB, DB),
+    ok.
+
+%% @doc List every row of the `?TAB' ETS table, i.e. the registered
+%% DBs together with their backends, as `{DB, Backend}' tuples.
+-spec which_dbs() -> [{emqx_ds:db(), atom()}].
+which_dbs() ->
+    ets:tab2list(?TAB).
 
 %%================================================================================
 %% behaviour callbacks
 %%================================================================================
 
 init(top) ->
-    Children = [],
+    _ = ets:new(?TAB, [public, set, named_table]),
+    Children = [emqx_ds_builtin_metrics:child_spec()],
     SupFlags = #{
         strategy => one_for_one,
         intensity => 10,

+ 2 - 2
apps/emqx_durable_storage/src/emqx_durable_storage.app.src

@@ -2,10 +2,10 @@
 {application, emqx_durable_storage, [
     {description, "Message persistence and subscription replays for EMQX"},
     % strict semver, bump manually!
-    {vsn, "0.2.1"},
+    {vsn, "0.3.0"},
     {modules, []},
     {registered, []},
-    {applications, [kernel, stdlib, rocksdb, gproc, mria, ra, emqx_utils]},
+    {applications, [kernel, stdlib, rocksdb, gproc, mria, emqx_utils]},
     {mod, {emqx_ds_app, []}},
     {env, []}
 ]}.

+ 11 - 15
apps/emqx_durable_storage/test/emqx_ds_storage_bitfield_lts_SUITE.erl

@@ -26,16 +26,13 @@
 -define(SHARD, shard(?FUNCTION_NAME)).
 
 -define(DEFAULT_CONFIG, #{
-    backend => builtin,
+    backend => builtin_local,
     storage => {emqx_ds_storage_bitfield_lts, #{}},
-    n_shards => 1,
-    n_sites => 1,
-    replication_factor => 1,
-    replication_options => #{}
+    n_shards => 1
 }).
 
 -define(COMPACT_CONFIG, #{
-    backend => builtin,
+    backend => builtin_local,
     storage =>
         {emqx_ds_storage_bitfield_lts, #{
             bits_per_wildcard_level => 8
@@ -138,8 +135,8 @@ t_get_streams(_Config) ->
     [FooBarBaz] = GetStream(<<"foo/bar/baz">>),
     [A] = GetStream(<<"a">>),
     %% Restart shard to make sure trie is persisted and restored:
-    ok = emqx_ds_builtin_sup:stop_db(?FUNCTION_NAME),
-    {ok, _} = emqx_ds_builtin_sup:start_db(?FUNCTION_NAME, #{}),
+    ok = emqx_ds:close_db(?FUNCTION_NAME),
+    ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
     %% Verify that there are no "ghost streams" for topics that don't
     %% have any messages:
     [] = GetStream(<<"bar/foo">>),
@@ -188,8 +185,7 @@ t_new_generation_inherit_trie(_Config) ->
             %% learned trie.
             ok = emqx_ds_storage_layer:add_generation(?SHARD, _Since = 1_000),
             %% Restart the shard, to verify that LTS is persisted.
-            ok = application:stop(emqx_durable_storage),
-            ok = application:start(emqx_durable_storage),
+            ok = emqx_ds:close_db(?FUNCTION_NAME),
             ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
             %% Store a batch of messages with the same set of topics.
             TS2 = 1_500,
@@ -241,8 +237,8 @@ t_replay(_Config) ->
     ?assert(check(?SHARD, <<"+/+/+">>, 0, Messages)),
     ?assert(check(?SHARD, <<"+/+/baz">>, 0, Messages)),
     %% Restart the DB to make sure trie is persisted and restored:
-    ok = emqx_ds_builtin_sup:stop_db(?FUNCTION_NAME),
-    {ok, _} = emqx_ds_builtin_sup:start_db(?FUNCTION_NAME, #{}),
+    ok = emqx_ds:close_db(?FUNCTION_NAME),
+    ok = emqx_ds:open_db(?FUNCTION_NAME, ?DEFAULT_CONFIG),
     %% Learned wildcard topics:
     ?assertNot(check(?SHARD, <<"wildcard/1000/suffix/foo">>, 0, [])),
     ?assert(check(?SHARD, <<"wildcard/1/suffix/foo">>, 0, Messages)),
@@ -279,7 +275,7 @@ t_atomic_store_batch(_Config) ->
             %% Must contain exactly one flush with all messages.
             ?assertMatch(
                 [#{batch := [_, _, _]}],
-                ?of_kind(emqx_ds_replication_layer_egress_flush, Trace)
+                ?of_kind(emqx_ds_buffer_flush, Trace)
             ),
             ok
         end
@@ -308,7 +304,7 @@ t_non_atomic_store_batch(_Config) ->
         end,
         fun(ExpectedMsgs, Trace) ->
             ProcessedMsgs = lists:append(
-                ?projection(batch, ?of_kind(emqx_ds_replication_layer_egress_flush, Trace))
+                ?projection(batch, ?of_kind(emqx_ds_buffer_flush, Trace))
             ),
             ?assertEqual(
                 ExpectedMsgs,
@@ -512,7 +508,7 @@ suite() -> [{timetrap, {seconds, 20}}].
 init_per_suite(Config) ->
     emqx_common_test_helpers:clear_screen(),
     Apps = emqx_cth_suite:start(
-        [emqx_durable_storage],
+        [emqx_ds_builtin_local],
         #{work_dir => emqx_cth_suite:work_dir(Config)}
     ),
     [{apps, Apps} | Config].

+ 30 - 6
apps/emqx_durable_storage/test/emqx_ds_test_helpers.erl

@@ -26,6 +26,17 @@
     emqx_ds_test_helpers:on(NODE, fun() -> BODY end)
 ).
 
+%% @doc Decide whether a suite or group should be skipped based on the
+%% EMQX release edition.
+%%
+%% Returns `false' (do not skip) when `emqx_release:edition()' is `ee'.
+%% For any other edition it returns `{skip, no_ds_replication}'. If
+%% `emqx_release:edition/0' cannot be called at all (`error:undef'),
+%% it returns `{skip, standalone_not_supported}'.
+%%
+%% The `{skip, Reason}' results are meant to be returned as-is from
+%% Common Test `init_per_suite' / `init_per_group' callbacks.
+-spec skip_if_norepl() -> false | {skip, no_ds_replication | standalone_not_supported}.
+skip_if_norepl() ->
+    try emqx_release:edition() of
+        ee ->
+            false;
+        _ ->
+            {skip, no_ds_replication}
+    catch
+        %% Only the edition lookup is protected by this `catch'.
+        error:undef ->
+            {skip, standalone_not_supported}
+    end.
+
 -spec on([node()] | node(), fun(() -> A)) -> A | [A].
 on(Node, Fun) when is_atom(Node) ->
     [Ret] = on([Node], Fun),
@@ -217,9 +228,13 @@ transitions(Node, DB) ->
 
 %% Stream comparison
 
-message_eq(Msg1, {_Key, Msg2}) ->
-    %% Timestamps can be modified by the replication layer, ignore them:
-    Msg1#message{timestamp = 0} =:= Msg2#message{timestamp = 0}.
+%% Compare two messages on the listed `Fields' only.
+%%
+%% Either message may be given as a bare `#message{}' record or as a
+%% `{Key, #message{}}' pair; the key is ignored. Both messages are
+%% converted with `emqx_message:to_map/1' and restricted to `Fields'
+%% before being compared with `=:='. Fields not listed (for example
+%% `timestamp', when the caller omits it) do not affect the result.
+-spec message_eq([atom()], Msg, Msg) -> boolean() when
+    Msg :: #message{} | {term(), #message{}}.
+message_eq(Fields, {_Key, Msg1 = #message{}}, Msg2) ->
+    message_eq(Fields, Msg1, Msg2);
+message_eq(Fields, Msg1, {_Key, Msg2 = #message{}}) ->
+    message_eq(Fields, Msg1, Msg2);
+message_eq(Fields, Msg1 = #message{}, Msg2 = #message{}) ->
+    maps:with(Fields, emqx_message:to_map(Msg1)) =:=
+        maps:with(Fields, emqx_message:to_map(Msg2)).
 
 %% Consuming streams and iterators
 
@@ -242,18 +257,27 @@ verify_stream_effects(DB, TestCase, Nodes0, L) ->
 -spec verify_stream_effects(atom(), binary(), node(), emqx_types:clientid(), ds_stream()) -> ok.
 verify_stream_effects(DB, TestCase, Node, ClientId, ExpectedStream) ->
     ct:pal("Checking consistency of effects for ~p on ~p", [ClientId, Node]),
-    DiffOpts = #{context => 20, window => 1000, compare_fun => fun message_eq/2},
     ?defer_assert(
         begin
             snabbkaffe_diff:assert_lists_eq(
                 ExpectedStream,
                 ds_topic_stream(DB, ClientId, client_topic(TestCase, ClientId), Node),
-                DiffOpts
+                message_diff_options([id, qos, from, flags, headers, topic, payload, extra])
             ),
             ct:pal("Data for client ~p on ~p is consistent.", [ClientId, Node])
         end
     ).
 
+diff_messages(Fields, Expected, Got) ->
+    snabbkaffe_diff:assert_lists_eq(Expected, Got, message_diff_options(Fields)).
+
+message_diff_options(Fields) ->
+    #{
+        context => 20,
+        window => 1000,
+        compare_fun => fun(M1, M2) -> message_eq(Fields, M1, M2) end
+    }.
+
 %% Create a stream from the topic (wildcards are NOT supported for a
 %% good reason: order of messages is implementation-dependent!).
 %%
@@ -297,7 +321,7 @@ nodes_of_clientid(DB, ClientId, Nodes = [N0 | _]) ->
 shard_of_clientid(DB, Node, ClientId) ->
     ?ON(
         Node,
-        emqx_ds_replication_layer:shard_of_message(DB, #message{from = ClientId}, clientid)
+        emqx_ds_buffer:shard_of_message(DB, #message{from = ClientId}, clientid)
     ).
 
 %% Consume eagerly:

+ 3 - 1
apps/emqx_machine/priv/reboot_lists.eterm

@@ -42,6 +42,7 @@
             esasl,
             emqx_utils,
             emqx_durable_storage,
+            emqx_ds_backends,
             emqx_http_lib,
             emqx_resource,
             emqx_connector,
@@ -135,7 +136,8 @@
             emqx_bridge_confluent,
             emqx_ds_shared_sub,
             emqx_auth_ext,
-            emqx_cluster_link
+            emqx_cluster_link,
+            emqx_ds_builtin_raft
         ],
     %% must always be of type `load'
     ce_business_apps =>

+ 2 - 0
apps/emqx_machine/src/emqx_machine_boot.erl

@@ -188,6 +188,8 @@ runtime_deps() ->
         {emqx_connector, fun(App) -> lists:prefix("emqx_bridge_", atom_to_list(App)) end},
         %% emqx_fdb is an EE app
         {emqx_durable_storage, emqx_fdb},
+        %% emqx_ds_builtin is an EE app
+        {emqx_ds_backends, emqx_ds_builtin_raft},
         {emqx_dashboard, emqx_license}
     ].
 

+ 9 - 10
apps/emqx_management/src/emqx_mgmt_api_ds.erl

@@ -15,13 +15,9 @@
 %%--------------------------------------------------------------------
 -module(emqx_mgmt_api_ds).
 
--behaviour(minirest_api).
+-if(?EMQX_RELEASE_EDITION == ee).
 
--include_lib("emqx/include/logger.hrl").
--include_lib("typerefl/include/types.hrl").
--include_lib("hocon/include/hoconsc.hrl").
--include_lib("emqx_utils/include/emqx_utils_api.hrl").
--include_lib("emqx/include/emqx_persistent_message.hrl").
+-behaviour(minirest_api).
 
 -import(hoconsc, [mk/2, ref/1, enum/1, array/1]).
 
@@ -50,10 +46,11 @@
     fields/1
 ]).
 
-%% internal exports:
--export([]).
-
--export_type([]).
+-include_lib("emqx/include/logger.hrl").
+-include_lib("typerefl/include/types.hrl").
+-include_lib("hocon/include/hoconsc.hrl").
+-include_lib("emqx_utils/include/emqx_utils_api.hrl").
+-include_lib("emqx/include/emqx_persistent_message.hrl").
 
 %%================================================================================
 %% Type declarations
@@ -494,3 +491,5 @@ meta_result_to_binary({error, {member_of_replica_sets, DBNames}}) ->
 meta_result_to_binary({error, Err}) ->
     IOList = io_lib:format("Error: ~p", [Err]),
     {error, iolist_to_binary(IOList)}.
+
+-endif.

+ 5 - 0
apps/emqx_management/src/emqx_mgmt_cli.erl

@@ -848,6 +848,7 @@ ds(CMD) ->
             emqx_ctl:usage([{"ds", "Durable storage is disabled"}])
     end.
 
+-if(?EMQX_RELEASE_EDITION == ee).
 do_ds(["info"]) ->
     emqx_ds_replication_layer_meta:print_status();
 do_ds(["set_replicas", DBStr | SitesStr]) ->
@@ -907,6 +908,10 @@ do_ds(_) ->
         {"ds leave <storage> <site>", "Remove site from the replica set of the storage"},
         {"ds forget <site>", "Forcefully remove a site from the list of known sites"}
     ]).
+-else.
+do_ds(_CMD) ->
+    emqx_ctl:usage([{"ds", "DS CLI is not available in this edition of EMQX"}]).
+-endif.
 
 %%--------------------------------------------------------------------
 %% Dump ETS

+ 21 - 16
apps/emqx_management/test/emqx_mgmt_SUITE.erl

@@ -56,22 +56,27 @@ init_per_group(persistence_disabled, Config) ->
         | Config
     ];
 init_per_group(persistence_enabled, Config) ->
-    Apps = emqx_cth_suite:start(
-        [
-            {emqx,
-                "durable_sessions {\n"
-                "  enable = true\n"
-                "  heartbeat_interval = 100ms\n"
-                "  renew_streams_interval = 100ms\n"
-                "}"},
-            emqx_management
-        ],
-        #{work_dir => emqx_cth_suite:work_dir(Config)}
-    ),
-    [
-        {apps, Apps}
-        | Config
-    ];
+    case emqx_ds_test_helpers:skip_if_norepl() of
+        false ->
+            Apps = emqx_cth_suite:start(
+                [
+                    {emqx,
+                        "durable_sessions {\n"
+                        "  enable = true\n"
+                        "  heartbeat_interval = 100ms\n"
+                        "  renew_streams_interval = 100ms\n"
+                        "}"},
+                    emqx_management
+                ],
+                #{work_dir => emqx_cth_suite:work_dir(Config)}
+            ),
+            [
+                {apps, Apps}
+                | Config
+            ];
+        Yes ->
+            Yes
+    end;
 init_per_group(cm_registry_enabled, Config) ->
     [{emqx_config, "broker.enable_session_registry = true"} | Config];
 init_per_group(cm_registry_disabled, Config) ->

+ 1 - 0
apps/emqx_management/test/emqx_mgmt_api_clients_SUITE.erl

@@ -52,6 +52,7 @@ persistent_session_testcases() ->
         t_persistent_sessions_subscriptions1,
         t_list_clients_v2
     ].
+
 client_msgs_testcases() ->
     [
         t_inflight_messages,

+ 15 - 10
apps/emqx_management/test/emqx_mgmt_api_ds_SUITE.erl

@@ -27,16 +27,21 @@ all() ->
     emqx_common_test_helpers:all(?MODULE).
 
 init_per_suite(Config) ->
-    Apps = emqx_cth_suite:start(
-        [
-            {emqx, "durable_sessions.enable = true"},
-            emqx_management,
-            {emqx_dashboard, "dashboard.listeners.http { enable = true, bind = 18083 }"}
-        ],
-        #{work_dir => emqx_cth_suite:work_dir(Config)}
-    ),
-    {ok, _} = emqx_common_test_http:create_default_app(),
-    [{suite_apps, Apps} | Config].
+    case emqx_ds_test_helpers:skip_if_norepl() of
+        false ->
+            Apps = emqx_cth_suite:start(
+                [
+                    {emqx, "durable_sessions.enable = true"},
+                    emqx_management,
+                    {emqx_dashboard, "dashboard.listeners.http { enable = true, bind = 18083 }"}
+                ],
+                #{work_dir => emqx_cth_suite:work_dir(Config)}
+            ),
+            {ok, _} = emqx_common_test_http:create_default_app(),
+            [{suite_apps, Apps} | Config];
+        Yes ->
+            Yes
+    end.
 
 end_per_suite(Config) ->
     ok = emqx_cth_suite:stop(?config(suite_apps, Config)).

+ 6 - 6
apps/emqx_prometheus/src/emqx_prometheus.erl

@@ -504,12 +504,12 @@ emqx_collect(K = emqx_mria_bootstrap_num_keys, D) -> gauge_metrics(?MG(K, D, [])
 emqx_collect(K = emqx_mria_message_queue_len, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = emqx_mria_replayq_len, D) -> gauge_metrics(?MG(K, D, []));
 %% DS
-emqx_collect(K = ?DS_EGRESS_BATCHES, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_EGRESS_BATCHES_RETRY, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_EGRESS_BATCHES_FAILED, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_EGRESS_MESSAGES, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_EGRESS_BYTES, D) -> counter_metrics(?MG(K, D, []));
-emqx_collect(K = ?DS_EGRESS_FLUSH_TIME, D) -> gauge_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_BATCHES, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_BATCHES_RETRY, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_BATCHES_FAILED, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_MESSAGES, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_BYTES, D) -> counter_metrics(?MG(K, D, []));
+emqx_collect(K = ?DS_BUFFER_FLUSH_TIME, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_STORE_BATCH_TIME, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_BUILTIN_NEXT_TIME, D) -> gauge_metrics(?MG(K, D, []));
 emqx_collect(K = ?DS_LTS_SEEK_COUNTER, D) -> counter_metrics(?MG(K, D, []));

+ 16 - 0
changes/ce/breaking-13248.en.md

@@ -0,0 +1,16 @@
+The `builtin` durable storage backend has been replaced with the following two backends:
+
+- `builtin_local`: A durable storage backend that doesn't support replication.
+   It can't be used in a multi-node cluster.
+   This backend is available in both open source and enterprise editions.
+- `builtin_raft`: A durable storage backend that uses the Raft algorithm for replication.
+   This backend is available only in the enterprise edition.
+
+The following Prometheus metrics have been renamed:
+
+- `emqx_ds_egress_batches` -> `emqx_ds_buffer_batches`
+- `emqx_ds_egress_batches_retry` -> `emqx_ds_buffer_batches_retry`
+- `emqx_ds_egress_batches_failed` -> `emqx_ds_buffer_batches_failed`
+- `emqx_ds_egress_messages` -> `emqx_ds_buffer_messages`
+- `emqx_ds_egress_bytes` -> `emqx_ds_buffer_bytes`
+- `emqx_ds_egress_flush_time` -> `emqx_ds_buffer_flush_time`

+ 4 - 1
mix.exs

@@ -205,7 +205,8 @@ defmodule EMQXUmbrella.MixProject do
       :emqx_bridge_syskeeper,
       :emqx_ds_shared_sub,
       :emqx_auth_ext,
-      :emqx_cluster_link
+      :emqx_cluster_link,
+      :emqx_ds_builtin_raft
     ])
   end
 
@@ -341,6 +342,8 @@ defmodule EMQXUmbrella.MixProject do
             :emqx_s3,
             :emqx_opentelemetry,
             :emqx_durable_storage,
+            :emqx_ds_builtin_local,
+            :emqx_ds_builtin_raft,
             :rabbit_common,
             :emqx_eviction_agent,
             :emqx_node_rebalance

+ 1 - 0
rebar.config.erl

@@ -124,6 +124,7 @@ is_community_umbrella_app("apps/emqx_node_rebalance") -> false;
 is_community_umbrella_app("apps/emqx_ds_shared_sub") -> false;
 is_community_umbrella_app("apps/emqx_auth_ext") -> false;
 is_community_umbrella_app("apps/emqx_cluster_link") -> false;
+is_community_umbrella_app("apps/emqx_ds_builtin_raft") -> false;
 is_community_umbrella_app(_) -> true.
 
 %% BUILD_WITHOUT_JQ

+ 18 - 12
rel/i18n/emqx_ds_schema.hocon

@@ -5,15 +5,21 @@ messages.desc:
   """~
   Configuration related to the durable storage of MQTT messages.~"""
 
-builtin.label: "Builtin backend"
-builtin.desc:
+builtin_raft.label: "Builtin backend with Raft replication"
+builtin_raft.desc:
   """~
-  Builtin session storage backend utilizing embedded RocksDB key-value store.~"""
+  Builtin storage backend utilizing embedded RocksDB key-value store.~"""
 
-builtin_backend.label: "Backend type"
-builtin_backend.desc:
+builtin_local.label: "Builtin backend"
+builtin_local.desc:
   """~
-  Built-in backend.~"""
+  Builtin storage backend utilizing an embedded RocksDB key-value store.
+  This backend doesn't support clustering.~"""
+
+backend_type.label: "Backend type"
+backend_type.desc:
+  """~
+  Backend type.~"""
 
 builtin_data_dir.label: "Database location"
 builtin_data_dir.desc:
@@ -39,21 +45,21 @@ builtin_n_sites.desc:
   During this phase at least that many sites should come online to distribute shards between them, otherwise message storage will be unavailable until then.
   After the initialization is complete, sites may be offline, which will affect availability depending on the number of offline sites and replication factor.~"""
 
-builtin_local_write_buffer.label: "Local write buffer"
-builtin_local_write_buffer.desc:
+builtin_write_buffer.label: "Local write buffer"
+builtin_write_buffer.desc:
   """~
   Configuration related to the buffering of messages sent from the local node to the shard leader.
 
   EMQX accumulates PUBLISH messages from the local clients in a write buffer before committing them to the durable storage.
   This helps to hide network latency between EMQX nodes and improves write throughput.~"""
 
-builtin_local_write_buffer_max_items.label: "Max items"
-builtin_local_write_buffer_max_items.desc:
+builtin_write_buffer_max_items.label: "Max items"
+builtin_write_buffer_max_items.desc:
   """~
   This configuration parameter defines the maximum number of messages stored in the local write buffer.~"""
 
-builtin_local_write_buffer_flush_interval.label: "Flush interval"
-builtin_local_write_buffer_flush_interval.desc:
+builtin_write_buffer_flush_interval.label: "Flush interval"
+builtin_write_buffer_flush_interval.desc:
   """~
   Maximum linger time for the buffered messages.
   Local write buffer will be flushed _at least_ as often as `flush_interval`.