@@ -40,6 +40,7 @@
-export_type([]).

+-include_lib("emqx_utils/include/emqx_message.hrl").
-include_lib("snabbkaffe/include/trace.hrl").

%%================================================================================
@@ -49,8 +50,13 @@
-define(via(DB, Shard), {via, gproc, {n, l, {?MODULE, DB, Shard}}}).
-define(flush, flush).

--record(enqueue_req, {message :: emqx_types:message(), sync :: boolean()}).
--record(enqueue_atomic_req, {batch :: [emqx_types:message()], sync :: boolean()}).
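+%% Request to buffer a batch of messages in the egress server. The
+%% message count and payload byte size are precomputed by the caller,
+%% so the server can enforce its buffer limits without traversing the
+%% batch again: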
+-record(enqueue_req, {
+    messages :: [emqx_types:message()],
+    sync :: boolean(),
+    atomic :: boolean(),
+    n_messages :: non_neg_integer(),
+    payload_bytes :: non_neg_integer()
+}).

%%================================================================================
%% API functions
@@ -61,44 +67,32 @@ start_link(DB, Shard) ->
    gen_server:start_link(?via(DB, Shard), ?MODULE, [DB, Shard], []).

-spec store_batch(emqx_ds:db(), [emqx_types:message()], emqx_ds:message_store_opts()) ->
-    ok.
+    emqx_ds:store_batch_result().
store_batch(DB, Messages, Opts) ->
    Sync = maps:get(sync, Opts, true),
-    case maps:get(atomic, Opts, false) of
-        false ->
-            lists:foreach(
-                fun(Message) ->
-                    Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
-                    gen_server:call(
-                        ?via(DB, Shard),
-                        #enqueue_req{
-                            message = Message,
-                            sync = Sync
-                        },
-                        infinity
-                    )
-                end,
-                Messages
+    Atomic = maps:get(atomic, Opts, false),
+    %% Usually we expect all messages in the batch to go into a
+    %% single shard, so this function is optimized for the happy case.
+    case shards_of_batch(DB, Messages) of
+        [{Shard, {NMsgs, NBytes}}] ->
+            %% Happy case:
+            enqueue_call_or_cast(
+                ?via(DB, Shard),
+                #enqueue_req{
+                    messages = Messages,
+                    sync = Sync,
+                    atomic = Atomic,
+                    n_messages = NMsgs,
+                    payload_bytes = NBytes
+                }
            );
-        true ->
-            maps:foreach(
-                fun(Shard, Batch) ->
-                    gen_server:call(
-                        ?via(DB, Shard),
-                        #enqueue_atomic_req{
-                            batch = Batch,
-                            sync = Sync
-                        },
-                        infinity
-                    )
-                end,
-                maps:groups_from_list(
-                    fun(Message) ->
-                        emqx_ds_replication_layer:shard_of_message(DB, Message, clientid)
-                    end,
-                    Messages
-                )
-            )
+        [_, _ | _] when Atomic ->
+            %% It's impossible to commit a batch to multiple shards
+            %% atomically:
+            {error, unrecoverable, atomic_commit_to_multiple_shards};
+        _Shards ->
+            %% Use a slower implementation for the unlikely case:
+            repackage_messages(DB, Messages, Sync)
    end.

%%================================================================================
@@ -108,35 +102,65 @@ store_batch(DB, Messages, Opts) ->
-record(s, {
    db :: emqx_ds:db(),
    shard :: emqx_ds_replication_layer:shard_id(),
+    metrics_id :: emqx_ds_builtin_metrics:shard_metrics_id(),
+    n_retries = 0 :: non_neg_integer(),
+    %% FIXME: Currently max_retries is always 0, because the replication
+    %% layer doesn't guarantee idempotency. Retrying would create
+    %% duplicate messages.
+    max_retries = 0 :: non_neg_integer(),
    n = 0 :: non_neg_integer(),
-    tref :: reference(),
-    batch = [] :: [emqx_types:message()],
+    n_bytes = 0 :: non_neg_integer(),
+    tref :: undefined | reference(),
+    queue :: queue:queue(emqx_types:message()),
    pending_replies = [] :: [gen_server:from()]
}).

init([DB, Shard]) ->
    process_flag(trap_exit, true),
    process_flag(message_queue_data, off_heap),
+    logger:update_process_metadata(#{domain => [emqx, ds, egress, DB]}),
+    MetricsId = emqx_ds_builtin_metrics:shard_metric_id(DB, Shard),
+    ok = emqx_ds_builtin_metrics:init_for_shard(MetricsId),
    S = #s{
        db = DB,
        shard = Shard,
-        tref = start_timer()
+        metrics_id = MetricsId,
+        queue = queue:new()
    },
-    {ok, S}.
+    {ok, start_timer(S)}.

-handle_call(#enqueue_req{message = Msg, sync = Sync}, From, S) ->
-    do_enqueue(From, Sync, Msg, S);
-handle_call(#enqueue_atomic_req{batch = Batch, sync = Sync}, From, S) ->
-    Len = length(Batch),
-    do_enqueue(From, Sync, {atomic, Len, Batch}, S);
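+%% Async requests arrive as casts, so every call is a sync request.
+%% `From' is parked in `pending_replies' and replied to only after the
+%% batch has been flushed downstream (see `do_flush/1'):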
+handle_call(
+    #enqueue_req{
+        messages = Msgs,
+        sync = Sync,
+        atomic = Atomic,
+        n_messages = NMsgs,
+        payload_bytes = NBytes
+    },
+    From,
+    S0 = #s{pending_replies = Replies0}
+) ->
+    S = S0#s{pending_replies = [From | Replies0]},
+    {noreply, enqueue(Sync, Atomic, Msgs, NMsgs, NBytes, S)};
handle_call(_Call, _From, S) ->
    {reply, {error, unknown_call}, S}.

+handle_cast(
+    #enqueue_req{
+        messages = Msgs,
+        sync = Sync,
+        atomic = Atomic,
+        n_messages = NMsgs,
+        payload_bytes = NBytes
+    },
+    S
+) ->
+    {noreply, enqueue(Sync, Atomic, Msgs, NMsgs, NBytes, S)};
handle_cast(_Cast, S) ->
    {noreply, S}.

handle_info(?flush, S) ->
-    {noreply, do_flush(S)};
+    {noreply, flush(S)};
handle_info(_Info, S) ->
    {noreply, S}.
@@ -151,80 +175,215 @@ terminate(_Reason, _S) ->
%% Internal functions
%%================================================================================

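+%% Add a batch to the buffer, flushing first whenever accepting it would
+%% push the buffer over the `egress_batch_size' or `egress_batch_bytes'
+%% limits: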
+enqueue(
+    Sync,
+    Atomic,
+    Msgs,
+    BatchSize,
+    BatchBytes,
+    S0 = #s{n = NMsgs0, n_bytes = NBytes0, queue = Q0}
+) ->
+    %% At this point we don't split the batches, even when they aren't
+    %% atomic. It wouldn't win us anything in terms of memory, and
+    %% EMQX currently feeds data to DS in very small batches, so
+    %% granularity should be fine enough.
+    NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),
+    NBytesMax = application:get_env(emqx_durable_storage, egress_batch_bytes, infinity),
+    NMsgs = NMsgs0 + BatchSize,
+    NBytes = NBytes0 + BatchBytes,
+    case (NMsgs >= NMax orelse NBytes >= NBytesMax) andalso (NMsgs0 > 0) of
+        true ->
+            %% Adding this batch would cause the buffer to overflow.
+            %% Flush it now and retry; the retry cannot loop, since the
+            %% buffer is empty after the flush:
+            cancel_timer(S0),
+            S1 = flush(S0),
+            enqueue(Sync, Atomic, Msgs, BatchSize, BatchBytes, S1);
+        false ->
+            %% Either the batch fits, or the buffer is empty: enqueue
+            %% the batch in its entirety:
+            Q1 = lists:foldl(fun queue:in/2, Q0, Msgs),
+            S1 = S0#s{n = NMsgs, n_bytes = NBytes, queue = Q1},
+            case NMsgs >= NMax orelse NBytes >= NBytesMax of
+                true ->
+                    cancel_timer(S1),
+                    flush(S1);
+                false ->
+                    S1
+            end
+    end.
+
-define(COOLDOWN_MIN, 1000).
-define(COOLDOWN_MAX, 5000).

-do_flush(S = #s{batch = []}) ->
-    S#s{tref = start_timer()};
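+%% Flush the buffer and restart the flush timer: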
+flush(S) ->
+    start_timer(do_flush(S)).
+
+do_flush(S0 = #s{n = 0}) ->
+    S0;
do_flush(
-    S = #s{batch = Messages, pending_replies = Replies, db = DB, shard = Shard}
+    S = #s{
+        queue = Q,
+        pending_replies = Replies,
+        db = DB,
+        shard = Shard,
+        metrics_id = Metrics,
+        n_retries = Retries,
+        max_retries = MaxRetries
+    }
) ->
-    case emqx_ds_replication_layer:ra_store_batch(DB, Shard, lists:reverse(Messages)) of
+    Messages = queue:to_list(Q),
+    T0 = erlang:monotonic_time(microsecond),
+    Result = emqx_ds_replication_layer:ra_store_batch(DB, Shard, Messages),
+    T1 = erlang:monotonic_time(microsecond),
+    emqx_ds_builtin_metrics:observe_egress_flush_time(Metrics, T1 - T0),
+    case Result of
        ok ->
+            emqx_ds_builtin_metrics:inc_egress_batches(Metrics),
+            emqx_ds_builtin_metrics:inc_egress_messages(Metrics, S#s.n),
+            emqx_ds_builtin_metrics:inc_egress_bytes(Metrics, S#s.n_bytes),
            ?tp(
                emqx_ds_replication_layer_egress_flush,
                #{db => DB, shard => Shard, batch => Messages}
            ),
            lists:foreach(fun(From) -> gen_server:reply(From, ok) end, Replies),
-            true = erlang:garbage_collect(),
-            ok;
-        Error ->
-            true = erlang:garbage_collect(),
+            erlang:garbage_collect(),
+            S#s{
+                n = 0,
+                n_bytes = 0,
+                queue = queue:new(),
+                pending_replies = []
+            };
+        {timeout, ServerId} when Retries < MaxRetries ->
+            %% Note: this is a hot loop, so we report error messages
+            %% with `debug' level to avoid wiping the logs. Instead,
+            %% error detection must rely on the metrics. Debug
+            %% logging can be enabled for a particular egress server
+            %% via the logger domain.
+            ?tp(
+                debug,
+                emqx_ds_replication_layer_egress_flush_retry,
+                #{db => DB, shard => Shard, reason => timeout, server_id => ServerId}
+            ),
+            %% Retry sending the batch:
+            emqx_ds_builtin_metrics:inc_egress_batches_retry(Metrics),
+            erlang:garbage_collect(),
+            %% We block the gen_server until the next retry.
+            BlockTime = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
+            timer:sleep(BlockTime),
+            S#s{n_retries = Retries + 1};
+        Err ->
            ?tp(
-                warning,
+                debug,
                emqx_ds_replication_layer_egress_flush_failed,
-                #{db => DB, shard => Shard, reason => Error}
+                #{db => DB, shard => Shard, error => Err}
            ),
-            Cooldown = ?COOLDOWN_MIN + rand:uniform(?COOLDOWN_MAX - ?COOLDOWN_MIN),
-            ok = timer:sleep(Cooldown),
-            %% Since we drop the entire batch here, we at least reply callers with an
-            %% error so they don't hang indefinitely in the `gen_server' call with
-            %% `infinity' timeout.
-            lists:foreach(fun(From) -> gen_server:reply(From, {error, Error}) end, Replies)
-    end,
-    S#s{
-        n = 0,
-        batch = [],
-        pending_replies = [],
-        tref = start_timer()
-    }.
-
-do_enqueue(From, Sync, MsgOrBatch, S0 = #s{n = N, batch = Batch, pending_replies = Replies}) ->
-    NMax = application:get_env(emqx_durable_storage, egress_batch_size, 1000),
-    S1 =
-        case MsgOrBatch of
-            {atomic, NumMsgs, Msgs} ->
-                S0#s{n = N + NumMsgs, batch = Msgs ++ Batch};
-            Msg ->
-                S0#s{n = N + 1, batch = [Msg | Batch]}
-        end,
-    %% TODO: later we may want to delay the reply until the message is
-    %% replicated, but it requies changes to the PUBACK/PUBREC flow to
-    %% allow for async replies. For now, we ack when the message is
-    %% _buffered_ rather than stored.
-    %%
-    %% Otherwise, the client would freeze for at least flush interval,
-    %% or until the buffer is filled.
-    S2 =
-        case Sync of
-            true ->
-                S1#s{pending_replies = [From | Replies]};
-            false ->
-                gen_server:reply(From, ok),
-                S1
+            emqx_ds_builtin_metrics:inc_egress_batches_failed(Metrics),
+            Reply =
+                case Err of
+                    {error, _, _} -> Err;
+                    {timeout, ServerId} -> {error, recoverable, {timeout, ServerId}};
+                    _ -> {error, unrecoverable, Err}
+                end,
+            lists:foreach(
+                fun(From) -> gen_server:reply(From, Reply) end, Replies
+            ),
+            erlang:garbage_collect(),
+            S#s{
+                n = 0,
+                n_bytes = 0,
+                queue = queue:new(),
+                pending_replies = [],
+                n_retries = 0
+            }
+    end.
+
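+%% Group the batch by shard, computing the per-shard message count and
+%% payload byte total. For example (illustrative), assuming all three
+%% messages of a batch map to the same shard:
+%%   shards_of_batch(DB, [M1, M2, M3]) = [{Shard, {3, NBytes}}]
+%% A single-element result is the happy case in store_batch/3: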
+-spec shards_of_batch(emqx_ds:db(), [emqx_types:message()]) ->
+    [{emqx_ds_replication_layer:shard_id(), {NMessages, NBytes}}]
+when
+    NMessages :: non_neg_integer(),
+    NBytes :: non_neg_integer().
+shards_of_batch(DB, Messages) ->
+    maps:to_list(
+        lists:foldl(
+            fun(Message, Acc) ->
+                %% TODO: sharding strategy must be part of the DS DB schema:
+                Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+                Size = payload_size(Message),
+                maps:update_with(
+                    Shard,
+                    fun({N, S}) ->
+                        {N + 1, S + Size}
+                    end,
+                    {1, Size},
+                    Acc
+                )
+            end,
+            #{},
+            Messages
+        )
+    ).
+
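+%% Slow path: split the batch by shard, send one request per shard, and
+%% combine the per-shard results with compose_errors/2: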
+repackage_messages(DB, Messages, Sync) ->
+    Batches = lists:foldl(
+        fun(Message, Acc) ->
+            Shard = emqx_ds_replication_layer:shard_of_message(DB, Message, clientid),
+            Size = payload_size(Message),
+            maps:update_with(
+                Shard,
+                fun({N, S, Msgs}) ->
+                    {N + 1, S + Size, [Message | Msgs]}
+                end,
+                {1, Size, [Message]},
+                Acc
+            )
        end,
-    S =
-        case N >= NMax of
-            true ->
-                _ = erlang:cancel_timer(S2#s.tref),
-                do_flush(S2);
-            false ->
-                S2
+        #{},
+        Messages
+    ),
+    maps:fold(
+        fun(Shard, {NMsgs, ByteSize, RevMessages}, ErrAcc) ->
+            Err = enqueue_call_or_cast(
+                ?via(DB, Shard),
+                #enqueue_req{
+                    messages = lists:reverse(RevMessages),
+                    sync = Sync,
+                    atomic = false,
+                    n_messages = NMsgs,
+                    payload_bytes = ByteSize
+                }
+            ),
+            compose_errors(ErrAcc, Err)
        end,
-    %% TODO: add a backpressure mechanism for the server to avoid
-    %% building a long message queue.
-    {noreply, S}.
+        ok,
+        Batches
+    ).
-start_timer() ->
+enqueue_call_or_cast(To, Req = #enqueue_req{sync = true}) ->
+    gen_server:call(To, Req, infinity);
+enqueue_call_or_cast(To, Req = #enqueue_req{sync = false}) ->
+    gen_server:cast(To, Req).
+
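+%% Fold per-shard results into one: `ok' survives only if every shard
+%% succeeded, an unrecoverable error takes precedence over a recoverable
+%% one, and otherwise the first error is kept: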
+compose_errors(ErrAcc, ok) ->
+    ErrAcc;
+compose_errors(ok, Err) ->
+    Err;
+compose_errors({error, recoverable, _}, {error, unrecoverable, Err}) ->
+    {error, unrecoverable, Err};
+compose_errors(ErrAcc, _Err) ->
+    ErrAcc.
+
+start_timer(S) ->
    Interval = application:get_env(emqx_durable_storage, egress_flush_interval, 100),
-    erlang:send_after(Interval, self(), ?flush).
+    Tref = erlang:send_after(Interval, self(), ?flush),
+    S#s{tref = Tref}.
+
+cancel_timer(#s{tref = undefined}) ->
+    ok;
+cancel_timer(#s{tref = TRef}) ->
+    _ = erlang:cancel_timer(TRef),
+    ok.
+
+%% @doc Return the approximate size of an MQTT message (it does not
+%% take everything into account, e.g. headers and extras):
+payload_size(#message{payload = P, topic = T}) ->
+    size(P) + size(T).