فهرست منبع

chore(dsrepl): improve tracepoints usability a bit

Andrew Mayorov 1 سال پیش
والد
کامیت
b6894c18fa

+ 7 - 6
apps/emqx_ds_builtin_raft/src/emqx_ds_replication_layer.erl

@@ -143,7 +143,12 @@
 
 
 %% Core state of the replication, i.e. the state of ra machine.
 %% Core state of the replication, i.e. the state of ra machine.
 -type ra_state() :: #{
 -type ra_state() :: #{
+    %% Shard ID.
     db_shard := {emqx_ds:db(), shard_id()},
     db_shard := {emqx_ds:db(), shard_id()},
+
+    %% Unique timestamp tracking real time closely.
+    %% With microsecond granularity it should be nearly impossible for it to run
+    %% too far ahead of the real time clock.
     latest := timestamp_us()
     latest := timestamp_us()
 }.
 }.
 
 
@@ -755,11 +760,7 @@ apply(
     },
     },
     #{db_shard := DBShard = {DB, Shard}, latest := Latest0} = State0
     #{db_shard := DBShard = {DB, Shard}, latest := Latest0} = State0
 ) ->
 ) ->
-    %% NOTE
-    %% Unique timestamp tracking real time closely.
-    %% With microsecond granularity it should be nearly impossible for it to run
-    %% too far ahead than the real time clock.
-    ?tp(ds_ra_apply_batch, #{db => DB, shard => Shard, batch => MessagesIn, ts => Latest0}),
+    ?tp(ds_ra_apply_batch, #{db => DB, shard => Shard, batch => MessagesIn, latest => Latest0}),
     {Latest, Messages} = assign_timestamps(Latest0, MessagesIn),
     {Latest, Messages} = assign_timestamps(Latest0, MessagesIn),
     Result = emqx_ds_storage_layer:store_batch(DBShard, Messages, #{}),
     Result = emqx_ds_storage_layer:store_batch(DBShard, Messages, #{}),
     State = State0#{latest := Latest},
     State = State0#{latest := Latest},
@@ -839,7 +840,7 @@ apply(
 tick(TimeMs, #{db_shard := DBShard = {DB, Shard}, latest := Latest}) ->
 tick(TimeMs, #{db_shard := DBShard = {DB, Shard}, latest := Latest}) ->
     %% Leader = emqx_ds_replication_layer_shard:lookup_leader(DB, Shard),
     %% Leader = emqx_ds_replication_layer_shard:lookup_leader(DB, Shard),
     {Timestamp, _} = ensure_monotonic_timestamp(timestamp_to_timeus(TimeMs), Latest),
     {Timestamp, _} = ensure_monotonic_timestamp(timestamp_to_timeus(TimeMs), Latest),
-    ?tp(emqx_ds_replication_layer_tick, #{db => DB, shard => Shard, ts => Timestamp}),
+    ?tp(emqx_ds_replication_layer_tick, #{db => DB, shard => Shard, timestamp => Timestamp}),
     handle_custom_event(DBShard, Timestamp, tick).
     handle_custom_event(DBShard, Timestamp, tick).
 
 
 assign_timestamps(Latest, Messages) ->
 assign_timestamps(Latest, Messages) ->

+ 4 - 2
apps/emqx_ds_builtin_raft/src/emqx_ds_replication_snapshot.erl

@@ -69,6 +69,7 @@ prepare(Index, State) ->
 -spec write(_SnapshotDir :: file:filename(), ra_snapshot:meta(), _State :: ra_state()) ->
 -spec write(_SnapshotDir :: file:filename(), ra_snapshot:meta(), _State :: ra_state()) ->
     ok | {ok, _BytesWritten :: non_neg_integer()} | {error, ra_snapshot:file_err()}.
     ok | {ok, _BytesWritten :: non_neg_integer()} | {error, ra_snapshot:file_err()}.
 write(Dir, Meta, MachineState) ->
 write(Dir, Meta, MachineState) ->
+    ?tp(dsrepl_snapshot_write, #{meta => Meta, state => MachineState}),
     ra_log_snapshot:write(Dir, Meta, MachineState).
     ra_log_snapshot:write(Dir, Meta, MachineState).
 
 
 %% Reading a snapshot.
 %% Reading a snapshot.
@@ -165,6 +166,7 @@ complete_read(RS = #rs{reader = SnapReader, started_at = StartedAt}) ->
 -spec begin_accept(_SnapshotDir :: file:filename(), ra_snapshot:meta()) ->
 -spec begin_accept(_SnapshotDir :: file:filename(), ra_snapshot:meta()) ->
     {ok, ws()}.
     {ok, ws()}.
 begin_accept(Dir, Meta) ->
 begin_accept(Dir, Meta) ->
+    ?tp(dsrepl_snapshot_accept_started, #{meta => Meta}),
     WS = #ws{
     WS = #ws{
         phase = machine_state,
         phase = machine_state,
         started_at = erlang:monotonic_time(millisecond),
         started_at = erlang:monotonic_time(millisecond),
@@ -207,7 +209,7 @@ complete_accept(Chunk, WS = #ws{phase = storage_snapshot, writer = SnapWriter0})
             ?tp(dsrepl_snapshot_write_complete, #{writer => SnapWriter}),
             ?tp(dsrepl_snapshot_write_complete, #{writer => SnapWriter}),
             _ = emqx_ds_storage_snapshot:release_writer(SnapWriter),
             _ = emqx_ds_storage_snapshot:release_writer(SnapWriter),
             Result = complete_accept(WS#ws{writer = SnapWriter}),
             Result = complete_accept(WS#ws{writer = SnapWriter}),
-            ?tp(dsrepl_snapshot_accepted, #{shard => shard_id(WS)}),
+            ?tp(dsrepl_snapshot_accepted, #{shard => shard_id(WS), state => WS#ws.state}),
             Result;
             Result;
         {error, Reason} ->
         {error, Reason} ->
             ?tp(dsrepl_snapshot_write_error, #{reason => Reason, writer => SnapWriter0}),
             ?tp(dsrepl_snapshot_write_error, #{reason => Reason, writer => SnapWriter0}),
@@ -218,7 +220,7 @@ complete_accept(Chunk, WS = #ws{phase = storage_snapshot, writer = SnapWriter0})
 complete_accept(WS = #ws{started_at = StartedAt, writer = SnapWriter}) ->
 complete_accept(WS = #ws{started_at = StartedAt, writer = SnapWriter}) ->
     ShardId = shard_id(WS),
     ShardId = shard_id(WS),
     logger:info(#{
     logger:info(#{
-        msg => "dsrepl_snapshot_read_complete",
+        msg => "dsrepl_snapshot_write_complete",
         shard => ShardId,
         shard => ShardId,
         duration_ms => erlang:monotonic_time(millisecond) - StartedAt,
         duration_ms => erlang:monotonic_time(millisecond) - StartedAt,
         bytes_written => emqx_ds_storage_snapshot:writer_info(bytes_written, SnapWriter)
         bytes_written => emqx_ds_storage_snapshot:writer_info(bytes_written, SnapWriter)

+ 3 - 3
apps/emqx_durable_storage/src/emqx_ds_storage_layer.erl

@@ -312,9 +312,6 @@ store_batch(Shard, Messages, Options) ->
 prepare_batch(Shard, Messages = [{Time, _} | _], Options) ->
 prepare_batch(Shard, Messages = [{Time, _} | _], Options) ->
     %% NOTE
     %% NOTE
     %% We assume that batches do not span generations. Callers should enforce this.
     %% We assume that batches do not span generations. Callers should enforce this.
-    ?tp(emqx_ds_storage_layer_prepare_batch, #{
-        shard => Shard, messages => Messages, options => Options
-    }),
     %% FIXME: always store messages in the current generation
     %% FIXME: always store messages in the current generation
     case generation_at(Shard, Time) of
     case generation_at(Shard, Time) of
         {GenId, #{module := Mod, data := GenData}} ->
         {GenId, #{module := Mod, data := GenData}} ->
@@ -322,6 +319,9 @@ prepare_batch(Shard, Messages = [{Time, _} | _], Options) ->
             Result =
             Result =
                 case Mod:prepare_batch(Shard, GenData, Messages, Options) of
                 case Mod:prepare_batch(Shard, GenData, Messages, Options) of
                     {ok, CookedBatch} ->
                     {ok, CookedBatch} ->
+                        ?tp(emqx_ds_storage_layer_batch_cooked, #{
+                            shard => Shard, gen => GenId, batch => CookedBatch
+                        }),
                         {ok, #{?tag => ?COOKED_BATCH, ?generation => GenId, ?enc => CookedBatch}};
                         {ok, #{?tag => ?COOKED_BATCH, ?generation => GenId, ?enc => CookedBatch}};
                     Error = {error, _, _} ->
                     Error = {error, _, _} ->
                         Error
                         Error