Просмотр исходного кода

fix(emqx_opentelemetry): avoid using `application:ensure_all_started/1` for better deadlock safety

Serge Tupchii 2 лет назад
Родитель
Сommit
2a3f6b749c

+ 1 - 6
apps/emqx_machine/priv/reboot_lists.eterm

@@ -25,12 +25,7 @@
             redbug,
             xmerl,
             {hocon, load},
-            telemetry,
-            {opentelemetry, load},
-            {opentelemetry_api, load},
-            {opentelemetry_experimental, load},
-            {opentelemetry_api_experimental, load},
-            {opentelemetry_exporter, load}
+            telemetry
         ],
     %% must always be of type `load'
     common_business_apps =>

+ 1 - 0
apps/emqx_machine/src/emqx_machine.erl

@@ -50,6 +50,7 @@ start() ->
     start_sysmon(),
     configure_shard_transports(),
     set_mnesia_extra_diagnostic_checks(),
+    emqx_otel_app:configure_otel_deps(),
     ekka:start(),
     ok.
 

+ 1 - 3
apps/emqx_machine/src/emqx_machine_boot.erl

@@ -69,9 +69,7 @@ stop_apps() ->
     ?SLOG(notice, #{msg => "stopping_emqx_apps"}),
     _ = emqx_alarm_handler:unload(),
     ok = emqx_conf_app:unset_config_loaded(),
-    lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())),
-    %% Mute otel deps application.
-    ok = emqx_otel_app:stop_deps().
+    lists:foreach(fun stop_one_app/1, lists:reverse(sorted_reboot_apps())).
 
 %% Those port apps are terminated after the main apps
 %% Don't need to stop when reboot.

+ 6 - 6
apps/emqx_opentelemetry/rebar.config

@@ -3,13 +3,13 @@
 {deps,
     [{emqx, {path, "../emqx"}}
     %% trace
-    , {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api"}}
-    , {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry"}}
-    %% log metrics
-    , {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_experimental"}}
-    , {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_api_experimental"}}
+    , {opentelemetry_api, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api"}}
+    , {opentelemetry, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry"}}
+    %% logs, metrics
+    , {opentelemetry_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_experimental"}}
+    , {opentelemetry_api_experimental, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_api_experimental"}}
     %% export
-    , {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.4-emqx"}, "apps/opentelemetry_exporter"}}
+    , {opentelemetry_exporter, {git_subdir, "https://github.com/emqx/opentelemetry-erlang", {tag, "v1.4.5-emqx"}, "apps/opentelemetry_exporter"}}
     ]}.
 
 {edoc_opts, [{preprocess, true}]}.

+ 4 - 6
apps/emqx_opentelemetry/src/emqx_opentelemetry.app.src

@@ -8,14 +8,12 @@
         stdlib,
         emqx,
         %% otel metrics depend on emqx_mgmt_cache
-        emqx_management
-    ]},
-    {included_applications, [
+        emqx_management,
+        opentelemetry_exporter,
         opentelemetry,
-        opentelemetry_api,
-        opentelemetry_api_experimental,
         opentelemetry_experimental,
-        opentelemetry_exporter
+        opentelemetry_api,
+        opentelemetry_api_experimental
     ]},
     {env, []},
     {modules, []},

+ 10 - 3
apps/emqx_opentelemetry/src/emqx_otel_app.erl

@@ -19,7 +19,7 @@
 -behaviour(application).
 
 -export([start/2, stop/1]).
--export([stop_deps/0]).
+-export([configure_otel_deps/0]).
 
 start(_StartType, _StartArgs) ->
     emqx_otel_config:add_handler(),
@@ -33,5 +33,12 @@ stop(_State) ->
     _ = emqx_otel_config:remove_otel_log_handler(),
     ok.
 
-stop_deps() ->
-    emqx_otel_config:stop_all_otel_apps().
+configure_otel_deps() ->
+    %% default tracer and metrics are started only on demand
+    ok = application:set_env(
+        [
+            {opentelemetry, [{start_default_tracer, false}]},
+            {opentelemetry_experimental, [{start_default_metrics, false}]}
+        ],
+        [{persistent, true}]
+    ).

+ 0 - 36
apps/emqx_opentelemetry/src/emqx_otel_config.erl

@@ -27,7 +27,6 @@
 -export([post_config_update/5]).
 -export([update/1]).
 -export([add_otel_log_handler/0, remove_otel_log_handler/0]).
--export([stop_all_otel_apps/0]).
 -export([otel_exporter/1]).
 
 update(Config) ->
@@ -59,7 +58,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
     MetricsRes = ensure_otel_metrics(New, Old),
     LogsRes = ensure_otel_logs(New, Old),
     TracesRes = ensure_otel_traces(New, Old),
-    _ = maybe_stop_all_otel_apps(New),
     case {MetricsRes, LogsRes, TracesRes} of
         {ok, ok, ok} -> ok;
         Other -> {error, Other}
@@ -67,9 +65,6 @@ post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
 post_config_update(_ConfPath, _Req, _NewConf, _OldConf, _AppEnvs) ->
     ok.
 
-stop_all_otel_apps() ->
-    stop_all_otel_apps(true).
-
 add_otel_log_handler() ->
     ensure_otel_logs(emqx:get_config(?OPTL), #{}).
 
@@ -104,7 +99,6 @@ ensure_otel_logs(#{logs := LogsConf}, #{logs := LogsConf}) ->
     ok;
 ensure_otel_logs(#{logs := #{enable := true} = LogsConf}, _OldConf) ->
     ok = remove_handler_if_present(?OTEL_LOG_HANDLER_ID),
-    ok = ensure_log_apps(),
     HandlerConf = tr_handler_conf(LogsConf),
     %% NOTE: should primary logger level be updated if it's higher than otel log level?
     logger:add_handler(?OTEL_LOG_HANDLER_ID, ?OTEL_LOG_HANDLER, HandlerConf);
@@ -126,21 +120,6 @@ remove_handler_if_present(HandlerId) ->
             ok
     end.
 
-ensure_log_apps() ->
-    {ok, _} = application:ensure_all_started(opentelemetry_exporter),
-    {ok, _} = application:ensure_all_started(opentelemetry_experimental),
-    ok.
-
-maybe_stop_all_otel_apps(#{
-    metrics := #{enable := false},
-    logs := #{enable := false},
-    traces := #{enable := false}
-}) ->
-    IsShutdown = false,
-    stop_all_otel_apps(IsShutdown);
-maybe_stop_all_otel_apps(_) ->
-    ok.
-
 tr_handler_conf(Conf) ->
     #{
         level := Level,
@@ -171,18 +150,3 @@ is_ssl(<<"https://", _/binary>> = _Endpoint) ->
     true;
 is_ssl(_Endpoint) ->
     false.
-
-stop_all_otel_apps(IsShutdown) ->
-    %% if traces were enabled, it's not safe to stop opentelemetry app,
-    %% as there could be not finsihed traces that would crash if spans ETS tables are deleted
-    _ =
-        case IsShutdown of
-            true ->
-                _ = application:stop(opentelemetry);
-            false ->
-                ok
-        end,
-    _ = application:stop(opentelemetry_experimental),
-    _ = application:stop(opentelemetry_experimental_api),
-    _ = application:stop(opentelemetry_exporter),
-    ok.

+ 0 - 4
apps/emqx_opentelemetry/src/emqx_otel_metrics.erl

@@ -74,10 +74,6 @@ setup(_Conf) ->
 
 ensure_apps(Conf) ->
     #{exporter := #{interval := ExporterInterval} = Exporter} = Conf,
-    {ok, _} = application:ensure_all_started(opentelemetry_exporter),
-    {ok, _} = application:ensure_all_started(opentelemetry),
-    {ok, _} = application:ensure_all_started(opentelemetry_experimental),
-    {ok, _} = application:ensure_all_started(opentelemetry_api_experimental),
 
     _ = opentelemetry_experimental:stop_default_metrics(),
     ok = application:set_env(

+ 0 - 1
apps/emqx_opentelemetry/src/emqx_otel_trace.erl

@@ -78,7 +78,6 @@ start(Conf) ->
     ],
     set_trace_all(TraceAll),
     ok = application:set_env([{opentelemetry, OtelEnv}]),
-    _ = application:ensure_all_started(opentelemetry),
     Res = assert_started(opentelemetry:start_default_tracer_provider()),
     case Res of
         ok ->