1 年之前 · 3c4de9d4fb
--- a/apps/emqx_resource/src/emqx_resource_manager.erl
+++ b/apps/emqx_resource/src/emqx_resource_manager.erl
@@ -399,6 +399,7 @@ get_metrics(ResId) ->
 
																 %% @doc Reset the metrics for the specified resource
															
 
																 -spec reset_metrics(resource_id()) -> ok.
															
 
																 reset_metrics(ResId) ->
															
 
																+    ok = ensure_metrics(ResId),
															
 
																     emqx_metrics_worker:reset_metrics(?RES_METRICS, ResId).
															
 
																 %% @doc Returns the data for all resources
															
@@ -788,6 +789,7 @@ handle_remove_event(From, ClearMetrics, Data) ->
 
																 start_resource(Data, From) ->
															
 
																     %% in case the emqx_resource:call_start/2 hangs, the lookup/1 can read status from the cache
															
 
																     #data{id = ResId, mod = Mod, config = Config, group = Group, type = Type} = Data,
															
 
																+    ok = ensure_metrics(ResId),
															
 
																     case emqx_resource:call_start(ResId, Mod, Config) of
															
 
																         {ok, ResourceState} ->
															
 
																             UpdatedData1 = Data#data{status = ?status_connecting, state = ResourceState},
															
@@ -1894,3 +1896,11 @@ abort_channel_health_check(Pid) ->
 
																         {'EXIT', Pid, _} ->
															
 
																             ok
															
 
																     end.
															
 
																+
															
 
																+%% For still unknown reasons (e.g.: `emqx_metrics_worker' process might die?), metrics
															
 
																+%% might be lost for a running resource, and future attempts to bump them result in
															
 
																+%% errors.  As mitigation, we ensure such metrics are created here so that restarting
															
 
																+%% the resource or resetting its metrics can recreate them.
															
 
																+ensure_metrics(ResId) ->
															
 
																+    {ok, _} = emqx_resource:ensure_metrics(ResId),
															
 
																+    ok.
															
--- a/changes/ce/fix-14226.en.md
+++ b/changes/ce/fix-14226.en.md
@@ -0,0 +1,11 @@
 
																+Previously, under high stress, the node could lose track of a resource's (action/source) metrics and not be able to recover until the node was rebooted.  This is now mitigated by attempting to recreate such metrics when either restarting the resource or resetting its metrics.
															
 
																+
															
 
																+Also, warning logs about failures to bump said metrics would flood the logs for "hot-path" metrics such as `matched`.  Now, such logs are throttled to avoid bloating log files.
															
 
																+
															
 
																+An example of such a throttled log:
															
 
																+
															
 
																+```
															
 
																+2024-11-14T13:56:44.134289+00:00 [warning] tag: RESOURCE, clientid: clientid, msg: handle_resource_metrics_failed, peername: 172.100.239.1:33896, reason: {badkey,matched}, stacktrace: [{erlang,map_get,[matched,#{}],[{error_info,#{module => erl_erts_errors}}]},{emqx_metrics_worker,idx_metric,4,[{file,"src/emqx_metrics_worker.erl"},{line,560}]},...
															
 
																+
															
 
																+2024-11-14T13:57:12.490503+00:00 [warning] msg: log_events_throttled_during_last_period, period: 1 minutes, 0 seconds, dropped: #{handle_resource_metrics_failed => 2294}
															
 
																+```