Procházet zdrojové kódy

Merge pull request #11445 from zhongwencool/remove-os-mon-from-windows

fix: remove os_mon application in Windows release
zhongwencool před 2 roky
rodič
revize
e6f0dead9e

+ 0 - 1
apps/emqx/src/emqx.app.src

@@ -14,7 +14,6 @@
         esockd,
         cowboy,
         sasl,
-        os_mon,
         lc,
         hocon,
         emqx_durable_storage

+ 18 - 14
apps/emqx/src/emqx_os_mon.erl

@@ -38,15 +38,14 @@
 %% gen_server callbacks
 -export([
     init/1,
+    handle_continue/2,
     handle_call/3,
     handle_cast/2,
     handle_info/2,
     terminate/2,
     code_change/3
 ]).
--ifdef(TEST).
--export([is_sysmem_check_supported/0]).
--endif.
+-export([is_os_check_supported/0]).
 
 -include("emqx.hrl").
 
@@ -56,7 +55,7 @@ start_link() ->
     gen_server:start_link({local, ?OS_MON}, ?MODULE, [], []).
 
 update(OS) ->
-    erlang:send(?MODULE, {monitor_conf_update, OS}).
+    gen_server:cast(?MODULE, {monitor_conf_update, OS}).
 
 %%--------------------------------------------------------------------
 %% API
@@ -83,12 +82,17 @@ current_sysmem_percent() ->
 %%--------------------------------------------------------------------
 
 init([]) ->
+    {ok, undefined, {continue, setup}}.
+
+handle_continue(setup, undefined) ->
+    %% start os_mon temporarily
+    {ok, _} = application:ensure_all_started(os_mon),
     %% memsup is not reliable, ignore
     memsup:set_sysmem_high_watermark(1.0),
     SysHW = init_os_monitor(),
     MemRef = start_mem_check_timer(),
     CpuRef = start_cpu_check_timer(),
-    {ok, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}}.
+    {noreply, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}}.
 
 init_os_monitor() ->
     init_os_monitor(emqx:get_config([sysmon, os])).
@@ -110,6 +114,12 @@ handle_call({set_sysmem_high_watermark, New}, _From, #{sysmem_high_watermark :=
 handle_call(Req, _From, State) ->
     {reply, {error, {unexpected_call, Req}}, State}.
 
+handle_cast({monitor_conf_update, OS}, State) ->
+    cancel_outdated_timer(State),
+    SysHW = init_os_monitor(OS),
+    MemRef = start_mem_check_timer(),
+    CpuRef = start_cpu_check_timer(),
+    {noreply, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}};
 handle_cast(Msg, State) ->
     ?SLOG(error, #{msg => "unexpected_cast", cast => Msg}),
     {noreply, State}.
@@ -151,12 +161,6 @@ handle_info({timeout, _Timer, cpu_check}, State) ->
     end,
     Ref = start_cpu_check_timer(),
     {noreply, State#{cpu_time_ref => Ref}};
-handle_info({monitor_conf_update, OS}, State) ->
-    cancel_outdated_timer(State),
-    SysHW = init_os_monitor(OS),
-    MemRef = start_mem_check_timer(),
-    CpuRef = start_cpu_check_timer(),
-    {noreply, #{sysmem_high_watermark => SysHW, mem_time_ref => MemRef, cpu_time_ref => CpuRef}};
 handle_info(Info, State) ->
     ?SLOG(error, #{msg => "unexpected_info", info => Info}),
     {noreply, State}.
@@ -182,12 +186,12 @@ start_cpu_check_timer() ->
         _ -> start_timer(Interval, cpu_check)
     end.
 
-is_sysmem_check_supported() ->
+is_os_check_supported() ->
     {unix, linux} =:= os:type().
 
 start_mem_check_timer() ->
     Interval = emqx:get_config([sysmon, os, mem_check_interval]),
-    case is_integer(Interval) andalso is_sysmem_check_supported() of
+    case is_integer(Interval) andalso is_os_check_supported() of
         true ->
             start_timer(Interval, mem_check);
         false ->
@@ -205,7 +209,7 @@ update_mem_alarm_status(HWM) when HWM > 1.0 orelse HWM < 0.0 ->
         <<"Deactivated mem usage alarm due to out of range threshold">>
     );
 update_mem_alarm_status(HWM) ->
-    is_sysmem_check_supported() andalso
+    is_os_check_supported() andalso
         do_update_mem_alarm_status(HWM),
     ok.
 

+ 7 - 1
apps/emqx/src/emqx_schema.erl

@@ -1582,7 +1582,7 @@ fields("sysmon_os") ->
             sc(
                 hoconsc:union([disabled, duration()]),
                 #{
-                    default => <<"60s">>,
+                    default => default_mem_check_interval(),
                     desc => ?DESC(sysmon_os_mem_check_interval)
                 }
             )},
@@ -3657,3 +3657,9 @@ shared_subscription_strategy() ->
                 desc => ?DESC(broker_shared_subscription_strategy)
             }
         )}.
+
+default_mem_check_interval() ->
+    case emqx_os_mon:is_os_check_supported() of
+        true -> <<"60s">>;
+        false -> disabled
+    end.

+ 6 - 2
apps/emqx/src/emqx_sys_mon.erl

@@ -29,6 +29,7 @@
 %% gen_server callbacks
 -export([
     init/1,
+    handle_continue/2,
     handle_call/3,
     handle_cast/2,
     handle_info/2,
@@ -70,11 +71,14 @@ update(VM) ->
 
 init([]) ->
     emqx_logger:set_proc_metadata(#{sysmon => true}),
-    init_system_monitor(),
+    {ok, undefined, {continue, setup}}.
 
+handle_continue(setup, undefined) ->
+    init_system_monitor(),
     %% Monitor cluster partition event
     ekka:monitor(partition, fun handle_partition_event/1),
-    {ok, start_timer(#{timer => undefined, events => []})}.
+    NewState = start_timer(#{timer => undefined, events => []}),
+    {noreply, NewState, hibernate}.
 
 start_timer(State) ->
     State#{timer := emqx_utils:start_timer(timer:seconds(2), reset)}.

+ 13 - 9
apps/emqx/src/emqx_sys_sup.erl

@@ -19,21 +19,25 @@
 -behaviour(supervisor).
 
 -export([start_link/0]).
-
 -export([init/1]).
 
 start_link() ->
     supervisor:start_link({local, ?MODULE}, ?MODULE, []).
 
 init([]) ->
-    Childs = [
-        child_spec(emqx_sys),
-        child_spec(emqx_alarm),
-        child_spec(emqx_sys_mon),
-        child_spec(emqx_os_mon),
-        child_spec(emqx_vm_mon)
-    ],
-    {ok, {{one_for_one, 10, 100}, Childs}}.
+    OsMon =
+        case emqx_os_mon:is_os_check_supported() of
+            true -> [child_spec(emqx_os_mon)];
+            false -> []
+        end,
+    Children =
+        [
+            child_spec(emqx_sys),
+            child_spec(emqx_alarm),
+            child_spec(emqx_sys_mon),
+            child_spec(emqx_vm_mon)
+        ] ++ OsMon,
+    {ok, {{one_for_one, 10, 100}, Children}}.
 
 %%--------------------------------------------------------------------
 %% Internal functions

+ 19 - 10
apps/emqx/src/emqx_vm.erl

@@ -44,7 +44,7 @@
     get_otp_version/0
 ]).
 
--export([cpu_util/0]).
+-export([cpu_util/0, cpu_util/1]).
 
 -ifdef(TEST).
 -compile(export_all).
@@ -378,16 +378,25 @@ avg15() ->
 cpu_util() ->
     compat_windows(fun cpu_sup:util/0).
 
+cpu_util(Args) ->
+    compat_windows(fun cpu_sup:util/1, Args).
+
 compat_windows(Fun) ->
-    case os:type() of
-        {win32, nt} ->
-            0.0;
-        _Type ->
-            case catch Fun() of
-                Val when is_float(Val) -> floor(Val * 100) / 100;
-                Val when is_number(Val) -> Val;
-                _Error -> 0.0
-            end
+    case compat_windows(Fun, []) of
+        Val when is_float(Val) -> floor(Val * 100) / 100;
+        Val when is_number(Val) -> Val;
+        _ -> 0.0
+    end.
+
+compat_windows(Fun, Args) ->
+    try
+        case emqx_os_mon:is_os_check_supported() of
+            false -> 0.0;
+            true when Args =:= [] -> Fun();
+            true -> Fun(Args)
+        end
+    catch
+        _:_ -> 0.0
     end.
 
 load(Avg) ->

+ 33 - 15
apps/emqx/test/emqx_os_mon_SUITE.erl

@@ -39,29 +39,47 @@ init_per_testcase(t_cpu_check_alarm, Config) ->
         %% 200ms
         cpu_check_interval => 200
     }),
-    ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon),
-    {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon),
+    restart_os_mon(),
     Config;
 init_per_testcase(t_sys_mem_check_alarm, Config) ->
-    case emqx_os_mon:is_sysmem_check_supported() of
+    case emqx_os_mon:is_os_check_supported() of
         true ->
             SysMon = emqx_config:get([sysmon, os], #{}),
             emqx_config:put([sysmon, os], SysMon#{
                 sysmem_high_watermark => 0.51,
                 %% 200ms
                 mem_check_interval => 200
-            }),
-            ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon),
-            {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon),
-            Config;
+            });
         false ->
-            Config
-    end;
+            ok
+    end,
+    restart_os_mon(),
+    Config;
 init_per_testcase(_, Config) ->
-    emqx_common_test_helpers:boot_modules(all),
-    emqx_common_test_helpers:start_apps([]),
+    restart_os_mon(),
     Config.
 
+restart_os_mon() ->
+    case emqx_os_mon:is_os_check_supported() of
+        true ->
+            ok = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon),
+            {ok, _} = supervisor:restart_child(emqx_sys_sup, emqx_os_mon);
+        false ->
+            _ = supervisor:terminate_child(emqx_sys_sup, emqx_os_mon),
+            _ = supervisor:delete_child(emqx_sys_sup, emqx_os_mon),
+            %% emqx_sys_sup does not supervise emqx_os_mon on mac/windows,
+            %% so start it by hand to let the tests run there as well.
+            Mod = emqx_os_mon,
+            OsMon = #{
+                id => Mod,
+                start => {Mod, start_link, []},
+                restart => permanent,
+                shutdown => 5000,
+                type => worker,
+                modules => [Mod]
+            },
+            {ok, _} = supervisor:start_child(emqx_sys_sup, OsMon)
+    end.
+
 t_api(_) ->
     ?assertEqual(0.7, emqx_os_mon:get_sysmem_high_watermark()),
     ?assertEqual(ok, emqx_os_mon:set_sysmem_high_watermark(0.8)),
@@ -81,7 +99,7 @@ t_api(_) ->
     ok.
 
 t_sys_mem_check_disable(Config) ->
-    case emqx_os_mon:is_sysmem_check_supported() of
+    case emqx_os_mon:is_os_check_supported() of
         true -> do_sys_mem_check_disable(Config);
         false -> skip
     end.
@@ -100,7 +118,7 @@ do_sys_mem_check_disable(_Config) ->
     ok.
 
 t_sys_mem_check_alarm(Config) ->
-    case emqx_os_mon:is_sysmem_check_supported() of
+    case emqx_os_mon:is_os_check_supported() of
         true -> do_sys_mem_check_alarm(Config);
         false -> skip
     end.
@@ -167,7 +185,7 @@ t_cpu_check_alarm(_) ->
         util,
         fun() -> CpuUtil end,
         fun() ->
-            timer:sleep(500),
+            timer:sleep(1000),
             Alarms = emqx_alarm:get_alarms(activated),
             ?assert(
                 emqx_vm_mon_SUITE:is_existing(high_cpu_usage, emqx_alarm:get_alarms(activated))
@@ -193,7 +211,7 @@ t_cpu_check_alarm(_) ->
             ?assert(is_binary(Msg)),
             emqx_config:put([sysmon, os, cpu_high_watermark], 1),
             emqx_config:put([sysmon, os, cpu_low_watermark], 0.96),
-            timer:sleep(500),
+            timer:sleep(800),
             ?assertNot(
                 emqx_vm_mon_SUITE:is_existing(high_cpu_usage, emqx_alarm:get_alarms(activated))
             )

+ 2 - 2
apps/emqx_machine/priv/reboot_lists.eterm

@@ -17,7 +17,8 @@
             asn1,
             syntax_tools,
             ssl,
-            os_mon,
+            %% Only loaded here: emqx starts os_mon as a temporary application,
+            %% because running it as a permanent one can crash the VM.
+            {os_mon, load},
             inets,
             compiler,
             runtime_tools,
@@ -36,7 +37,6 @@
         [
             emqx,
             emqx_conf,
-
             esasl,
             observer_cli,
             tools,

+ 27 - 13
apps/emqx_management/src/emqx_mgmt.erl

@@ -185,25 +185,39 @@ node_info(Nodes) ->
 stopped_node_info(Node) ->
     {Node, #{node => Node, node_status => 'stopped', role => core}}.
 
+%% Hide cpu stats if os_check is not supported.
 vm_stats() ->
-    Idle = vm_stats('cpu.idle'),
     {MemUsedRatio, MemTotal} = get_sys_memory(),
-    [
-        {run_queue, vm_stats('run.queue')},
-        {cpu_idle, Idle},
-        {cpu_use, 100 - Idle},
-        {total_memory, MemTotal},
-        {used_memory, erlang:round(MemTotal * MemUsedRatio)}
-    ].
+    cpu_stats() ++
+        [
+            {run_queue, vm_stats('run.queue')},
+            {total_memory, MemTotal},
+            {used_memory, erlang:round(MemTotal * MemUsedRatio)}
+        ].
+
+cpu_stats() ->
+    case emqx_os_mon:is_os_check_supported() of
+        false ->
+            [];
+        true ->
+            Idle = vm_stats('cpu.idle'),
+            [
+                {cpu_idle, Idle},
+                {cpu_use, 100 - Idle}
+            ]
+    end.
 
 vm_stats('cpu.idle') ->
-    case cpu_sup:util([detailed]) of
-        %% Not support for Windows
-        {_, 0, 0, _} -> 0;
-        {_Num, _Use, IdleList, _} -> proplists:get_value(idle, IdleList, 0)
+    case emqx_vm:cpu_util([detailed]) of
+        {_Num, _Use, List, _} when is_list(List) -> proplists:get_value(idle, List, 0);
+        %% return {all, 0, 0, []} when cpu_sup is not started
+        _ -> 0
     end;
 vm_stats('cpu.use') ->
-    100 - vm_stats('cpu.idle');
+    case vm_stats('cpu.idle') of
+        0 -> 0;
+        Idle -> 100 - Idle
+    end;
 vm_stats('total.memory') ->
     {_, MemTotal} = get_sys_memory(),
     MemTotal;

+ 2 - 0
changes/ce/fix-11445.en.md

@@ -0,0 +1,2 @@
+Removed os_mon application monitor support on Windows platforms to prevent VM crashes.
+Functionality remains on non-Windows platforms.

+ 6 - 1
mix.exs

@@ -403,7 +403,8 @@ defmodule EMQXUmbrella.MixProject do
       quicer: enable_quicer?(),
       bcrypt: enable_bcrypt?(),
       jq: enable_jq?(),
-      observer: is_app?(:observer)
+      observer: is_app?(:observer),
+      os_mon: enable_os_mon?()
     }
     |> Enum.reject(&elem(&1, 1))
     |> Enum.map(&elem(&1, 0))
@@ -835,6 +836,10 @@ defmodule EMQXUmbrella.MixProject do
     not win32?()
   end
 
+  defp enable_os_mon?() do
+    not win32?()
+  end
+
   defp enable_jq?() do
     not Enum.any?([
       build_without_jq?(),

+ 12 - 7
rebar.config.erl

@@ -405,12 +405,13 @@ relx_apps(ReleaseType, Edition) ->
             ce -> CEBusinessApps
         end,
     BusinessApps = CommonBusinessApps ++ EditionSpecificApps,
-    ExcludedApps = excluded_apps(ReleaseType),
-    SystemApps ++
-        %% EMQX starts the DB and the business applications:
-        [{App, load} || App <- (DBApps -- ExcludedApps)] ++
-        [emqx_machine] ++
-        [{App, load} || App <- (BusinessApps -- ExcludedApps)].
+    Apps =
+        (SystemApps ++
+            %% EMQX starts the DB and the business applications:
+            [{App, load} || App <- DBApps] ++
+            [emqx_machine] ++
+            [{App, load} || App <- BusinessApps]),
+    lists:foldl(fun proplists:delete/2, Apps, excluded_apps(ReleaseType)).
 
 excluded_apps(ReleaseType) ->
     OptionalApps = [
@@ -418,7 +419,8 @@ excluded_apps(ReleaseType) ->
         {bcrypt, provide_bcrypt_release(ReleaseType)},
         {jq, is_jq_supported()},
         {observer, is_app(observer)},
-        {mnesia_rocksdb, is_rocksdb_supported()}
+        {mnesia_rocksdb, is_rocksdb_supported()},
+        {os_mon, provide_os_mon_release()}
     ],
     [App || {App, false} <- OptionalApps].
 
@@ -524,6 +526,9 @@ is_debug(VarName) ->
 provide_bcrypt_dep() ->
     not is_win32().
 
+provide_os_mon_release() ->
+    not is_win32().
+
 provide_bcrypt_release(ReleaseType) ->
     provide_bcrypt_dep() andalso ReleaseType =:= cloud.
 

+ 6 - 6
rel/i18n/emqx_schema.hocon

@@ -156,7 +156,7 @@ persistent_session_builtin_messages_table.label:
 
 sysmon_os_cpu_low_watermark.desc:
 """The threshold, as percentage of system CPU load,
- for how much system cpu can be used before the corresponding alarm is cleared."""
+ for how much system cpu can be used before the corresponding alarm is cleared. Disabled on Windows platform."""
 
 sysmon_os_cpu_low_watermark.label:
 """CPU low watermark"""
@@ -278,7 +278,7 @@ fields_ws_opts_mqtt_path.label:
 sysmon_os_procmem_high_watermark.desc:
 """The threshold, as percentage of system memory,
  for how much system memory can be allocated by one Erlang process before
- the corresponding alarm is raised."""
+ the corresponding alarm is raised. Disabled on Windows platform."""
 
 sysmon_os_procmem_high_watermark.label:
 """ProcMem high watermark"""
@@ -389,7 +389,7 @@ fields_tcp_opts_sndbuf.label:
 """TCP send buffer"""
 
 sysmon_os_mem_check_interval.desc:
-"""The time interval for the periodic memory check."""
+"""The time interval for the periodic memory check. Disabled on Windows platform."""
 
 sysmon_os_mem_check_interval.label:
 """Mem check interval"""
@@ -742,7 +742,7 @@ common_ssl_opts_schema_keyfile.label:
 
 sysmon_os_cpu_high_watermark.desc:
 """The threshold, as percentage of system CPU load,
- for how much system cpu can be used before the corresponding alarm is raised."""
+ for how much system cpu can be used before the corresponding alarm is raised. Disabled on Windows platform."""
 
 sysmon_os_cpu_high_watermark.label:
 """CPU high watermark"""
@@ -798,7 +798,7 @@ fields_ws_opts_proxy_address_header.label:
 
 sysmon_os_sysmem_high_watermark.desc:
 """The threshold, as percentage of system memory,
- for how much system memory can be allocated before the corresponding alarm is raised."""
+ for how much system memory can be allocated before the corresponding alarm is raised. Disabled on Windows platform."""
 
 sysmon_os_sysmem_high_watermark.label:
 """SysMem high watermark"""
@@ -1521,7 +1521,7 @@ fields_tcp_opts_send_timeout_close.label:
 """TCP send timeout close"""
 
 sysmon_os_cpu_check_interval.desc:
-"""The time interval for the periodic CPU check."""
+"""The time interval for the periodic CPU check. Disabled on Windows platform."""
 
 sysmon_os_cpu_check_interval.label:
 """CPU check interval"""