Просмотр исходного кода

fix(resource manager): ensure metrics exist when (re)starting a resource or resetting their metrics

Fixes https://emqx.atlassian.net/browse/EMQX-13496

The reasons for the original issue that led to `emqx_metrics_worker` losing all metrics
are still unknown.  Here, we introduce some mitigations to avoid more drastic measures
such as restarting the node.
Thales Macedo Garitezi 1 год назад
Родитель
Commit
3c4de9d4fb
2 измененных файлов с 21 добавлено и 0 удалено
  1. 10 0
      apps/emqx_resource/src/emqx_resource_manager.erl
  2. 11 0
      changes/ce/fix-14226.en.md

+ 10 - 0
apps/emqx_resource/src/emqx_resource_manager.erl

@@ -399,6 +399,7 @@ get_metrics(ResId) ->
 %% @doc Reset the metrics for the specified resource
 -spec reset_metrics(resource_id()) -> ok.
 reset_metrics(ResId) ->
+    ok = ensure_metrics(ResId),
     emqx_metrics_worker:reset_metrics(?RES_METRICS, ResId).
 
 %% @doc Returns the data for all resources
@@ -788,6 +789,7 @@ handle_remove_event(From, ClearMetrics, Data) ->
 start_resource(Data, From) ->
     %% in case the emqx_resource:call_start/2 hangs, the lookup/1 can read status from the cache
     #data{id = ResId, mod = Mod, config = Config, group = Group, type = Type} = Data,
+    ok = ensure_metrics(ResId),
     case emqx_resource:call_start(ResId, Mod, Config) of
         {ok, ResourceState} ->
             UpdatedData1 = Data#data{status = ?status_connecting, state = ResourceState},
@@ -1894,3 +1896,11 @@ abort_channel_health_check(Pid) ->
         {'EXIT', Pid, _} ->
             ok
     end.
+
+%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
+%% might be lost for a running resource, and future attempts to bump them result in
+%% errors.  As mitigation, we ensure such metrics are created here so that restarting
+%% the resource or resetting its metrics can recreate them.
+ensure_metrics(ResId) ->
+    {ok, _} = emqx_resource:ensure_metrics(ResId),
+    ok.

+ 11 - 0
changes/ce/fix-14226.en.md

@@ -0,0 +1,11 @@
+Previously, under high stress, the node could lose track of a resource's (action/source) metrics and not be able to recover until the node is rebooted.  This is now mitigated by attempting to recreate such metrics when either restarting the resource or resetting its metrics.
+
+Also, warning logs about failures to bump said metrics would flood the logs for "hot-path" metrics such as `matched`.  Now, such logs are throttled to avoid bloating log files.
+
+An example of such throttled log:
+
+```
+2024-11-14T13:56:44.134289+00:00 [warning] tag: RESOURCE, clientid: clientid, msg: handle_resource_metrics_failed, peername: 172.100.239.1:33896, reason: {badkey,matched}, stacktrace: [{erlang,map_get,[matched,#{}],[{error_info,#{module => erl_erts_errors}}]},{emqx_metrics_worker,idx_metric,4,[{file,"src/emqx_metrics_worker.erl"},{line,560}]},...
+
+2024-11-14T13:57:12.490503+00:00 [warning] msg: log_events_throttled_during_last_period, period: 1 minutes, 0 seconds, dropped: #{handle_resource_metrics_failed => 2294}
+```