浏览代码

fix(resource manager): ensure metrics exist when (re)starting a resource or resetting their metrics

Fixes https://emqx.atlassian.net/browse/EMQX-13496

The reasons for the original issue that led to `emqx_metrics_worker` losing all metrics
are still unknown.  Here, we introduce some mitigations to avoid more drastic measures
such as restarting the node.
Thales Macedo Garitezi 1 年之前
父节点
当前提交
3c4de9d4fb
共有 2 个文件被更改，包括 21 次插入和 0 次删除
  1. 10 0
      apps/emqx_resource/src/emqx_resource_manager.erl
  2. 11 0
      changes/ce/fix-14226.en.md

+ 10 - 0
apps/emqx_resource/src/emqx_resource_manager.erl

@@ -399,6 +399,7 @@ get_metrics(ResId) ->
 %% @doc Reset the metrics for the specified resource
 %% @doc Reset the metrics for the specified resource
 -spec reset_metrics(resource_id()) -> ok.
 -spec reset_metrics(resource_id()) -> ok.
 reset_metrics(ResId) ->
 reset_metrics(ResId) ->
+    ok = ensure_metrics(ResId),
     emqx_metrics_worker:reset_metrics(?RES_METRICS, ResId).
     emqx_metrics_worker:reset_metrics(?RES_METRICS, ResId).
 
 
 %% @doc Returns the data for all resources
 %% @doc Returns the data for all resources
@@ -788,6 +789,7 @@ handle_remove_event(From, ClearMetrics, Data) ->
 start_resource(Data, From) ->
 start_resource(Data, From) ->
     %% in case the emqx_resource:call_start/2 hangs, the lookup/1 can read status from the cache
     %% in case the emqx_resource:call_start/2 hangs, the lookup/1 can read status from the cache
     #data{id = ResId, mod = Mod, config = Config, group = Group, type = Type} = Data,
     #data{id = ResId, mod = Mod, config = Config, group = Group, type = Type} = Data,
+    ok = ensure_metrics(ResId),
     case emqx_resource:call_start(ResId, Mod, Config) of
     case emqx_resource:call_start(ResId, Mod, Config) of
         {ok, ResourceState} ->
         {ok, ResourceState} ->
             UpdatedData1 = Data#data{status = ?status_connecting, state = ResourceState},
             UpdatedData1 = Data#data{status = ?status_connecting, state = ResourceState},
@@ -1894,3 +1896,11 @@ abort_channel_health_check(Pid) ->
         {'EXIT', Pid, _} ->
         {'EXIT', Pid, _} ->
             ok
             ok
     end.
     end.
+
+%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
+%% might be lost for a running resource, and future attempts to bump them result in
+%% errors.  As mitigation, we ensure such metrics are created here so that restarting
+%% the resource or resetting its metrics can recreate them.
+ensure_metrics(ResId) ->
+    {ok, _} = emqx_resource:ensure_metrics(ResId),
+    ok.

+ 11 - 0
changes/ce/fix-14226.en.md

@@ -0,0 +1,11 @@
+Previously, under high stress, the node could lose track of a resource's (action/source) metrics and be unable to recover until the node was rebooted.  This is now mitigated by attempting to recreate such metrics when either restarting the resource or resetting its metrics.
+
+Also, warning logs about failures to bump said metrics would flood the logs for "hot-path" metrics such as `matched`.  Now, such logs are throttled to avoid bloating log files.
+
+An example of such throttled log:
+
+```
+2024-11-14T13:56:44.134289+00:00 [warning] tag: RESOURCE, clientid: clientid, msg: handle_resource_metrics_failed, peername: 172.100.239.1:33896, reason: {badkey,matched}, stacktrace: [{erlang,map_get,[matched,#{}],[{error_info,#{module => erl_erts_errors}}]},{emqx_metrics_worker,idx_metric,4,[{file,"src/emqx_metrics_worker.erl"},{line,560}]},...
+
+2024-11-14T13:57:12.490503+00:00 [warning] msg: log_events_throttled_during_last_period, period: 1 minutes, 0 seconds, dropped: #{handle_resource_metrics_failed => 2294}
+```