Преглед изворни кода

feat: e2e tracing config schema

JimMoen пре 1 година
родитељ
комит
3445bac686

+ 9 - 9
apps/emqx_opentelemetry/src/emqx_otel_config.erl

@@ -19,7 +19,7 @@
 
 
 -include_lib("emqx/include/logger.hrl").
 -include_lib("emqx/include/logger.hrl").
 
 
--define(OPTL, [opentelemetry]).
+-define(OTEL, [opentelemetry]).
 -define(CERTS_PATH, filename:join(["opentelemetry", "exporter"])).
 -define(CERTS_PATH, filename:join(["opentelemetry", "exporter"])).
 
 
 -define(OTEL_EXPORTER, opentelemetry_exporter).
 -define(OTEL_EXPORTER, opentelemetry_exporter).
@@ -35,7 +35,7 @@
 update(Config) ->
 update(Config) ->
     case
     case
         emqx_conf:update(
         emqx_conf:update(
-            ?OPTL,
+            ?OTEL,
             Config,
             Config,
             #{rawconf_with_defaults => true, override_to => cluster}
             #{rawconf_with_defaults => true, override_to => cluster}
         )
         )
@@ -47,21 +47,21 @@ update(Config) ->
     end.
     end.
 
 
 add_handler() ->
 add_handler() ->
-    ok = emqx_config_handler:add_handler(?OPTL, ?MODULE),
+    ok = emqx_config_handler:add_handler(?OTEL, ?MODULE),
     ok.
     ok.
 
 
 remove_handler() ->
 remove_handler() ->
-    ok = emqx_config_handler:remove_handler(?OPTL),
+    ok = emqx_config_handler:remove_handler(?OTEL),
     ok.
     ok.
 
 
-pre_config_update(?OPTL, RawConf, RawConf) ->
+pre_config_update(?OTEL, RawConf, RawConf) ->
     {ok, RawConf};
     {ok, RawConf};
-pre_config_update(?OPTL, NewRawConf, _RawConf) ->
+pre_config_update(?OTEL, NewRawConf, _RawConf) ->
     {ok, convert_certs(NewRawConf)}.
     {ok, convert_certs(NewRawConf)}.
 
 
-post_config_update(?OPTL, _Req, Old, Old, _AppEnvs) ->
+post_config_update(?OTEL, _Req, Old, Old, _AppEnvs) ->
     ok;
     ok;
-post_config_update(?OPTL, _Req, New, Old, AppEnvs) ->
+post_config_update(?OTEL, _Req, New, Old, AppEnvs) ->
     application:set_env(AppEnvs),
     application:set_env(AppEnvs),
     MetricsRes = ensure_otel_metrics(New, Old),
     MetricsRes = ensure_otel_metrics(New, Old),
     LogsRes = ensure_otel_logs(New, Old),
     LogsRes = ensure_otel_logs(New, Old),
@@ -74,7 +74,7 @@ post_config_update(_ConfPath, _Req, _NewConf, _OldConf, _AppEnvs) ->
     ok.
     ok.
 
 
 add_otel_log_handler() ->
 add_otel_log_handler() ->
-    ensure_otel_logs(emqx:get_config(?OPTL), #{}).
+    ensure_otel_logs(emqx:get_config(?OTEL), #{}).
 
 
 remove_otel_log_handler() ->
 remove_otel_log_handler() ->
     remove_handler_if_present(?OTEL_LOG_HANDLER_ID).
     remove_handler_if_present(?OTEL_LOG_HANDLER_ID).

+ 115 - 7
apps/emqx_opentelemetry/src/emqx_otel_schema.erl

@@ -15,6 +15,7 @@
 %%--------------------------------------------------------------------
 %%--------------------------------------------------------------------
 -module(emqx_otel_schema).
 -module(emqx_otel_schema).
 
 
+-include("emqx_otel_trace.hrl").
 -include_lib("hocon/include/hoconsc.hrl").
 -include_lib("hocon/include/hoconsc.hrl").
 
 
 -export([
 -export([
@@ -219,7 +220,17 @@ fields("otel_exporter") ->
 fields("trace_filter") ->
 fields("trace_filter") ->
     %% More filters can be implemented in future, e.g. topic, clientid
     %% More filters can be implemented in future, e.g. topic, clientid
     [
     [
+        {trace_mode,
+            ?HOCON(
+                ?ENUM([legacy, e2e]),
+                #{
+                    default => legacy,
+                    desc => ?DESC(trace_mode),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )},
         {trace_all,
         {trace_all,
+            %% Only takes effect when trace_mode is set to `legacy`
             ?HOCON(
             ?HOCON(
                 boolean(),
                 boolean(),
                 #{
                 #{
@@ -227,16 +238,113 @@ fields("trace_filter") ->
                     desc => ?DESC(trace_all),
                     desc => ?DESC(trace_all),
                     importance => ?IMPORTANCE_MEDIUM
                     importance => ?IMPORTANCE_MEDIUM
                 }
                 }
+            )},
+        {e2e_tracing_options,
+            ?HOCON(
+                %% Only takes effect when trace_mode is set to `e2e`
+                ?R_REF("e2e_tracing_options"),
+                #{
+                    desc => ?DESC(e2e_tracing_options),
+                    default => #{},
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )}
+    ];
+fields("e2e_tracing_options") ->
+    [
+        {attribute_meta,
+            ?HOCON(
+                string(),
+                #{
+                    default => emqxcl,
+                    desc => ?DESC(e2e_attribute_meta),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )},
+        {publish_response_trace_level,
+            ?HOCON(
+                emqx_schema:qos(),
+                #{
+                    default => 0,
+                    desc => ?DESC(publish_response_trace_level),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )},
+        {samplers,
+            ?HOCON(
+                ?R_REF("e2e_samplers"),
+                #{
+                    desc => ?DESC(e2e_samplers),
+                    default => #{},
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )}
+    ];
+fields("e2e_samplers") ->
+    [
+        {whitelist_based_sampler,
+            ?HOCON(
+                boolean(),
+                #{
+                    default => true,
+                    desc => ?DESC(whitelist_based_sampler),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )},
+        {event_based_samplers,
+            ?HOCON(
+                ?ARRAY(?R_REF("event_based_samplers")),
+                #{
+                    default => [],
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )}
+    ];
+fields("event_based_samplers") ->
+    [
+        {name,
+            ?HOCON(
+                ?ENUM(root_span_names()),
+                #{
+                    required => true,
+                    desc => ?DESC(event_type),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
+            )},
+        {ratio,
+            ?HOCON(
+                emqx_schema:percent(),
+                #{
+                    default => <<"10%">>,
+                    desc => ?DESC(ratio),
+                    importance => ?IMPORTANCE_MEDIUM
+                }
             )}
             )}
     ].
     ].
 
 
-desc("opentelemetry") -> ?DESC(opentelemetry);
-desc("otel_exporter") -> ?DESC(otel_exporter);
-desc("otel_logs") -> ?DESC(otel_logs);
-desc("otel_metrics") -> ?DESC(otel_metrics);
-desc("otel_traces") -> ?DESC(otel_traces);
-desc("trace_filter") -> ?DESC(trace_filter);
-desc(_) -> undefined.
+desc("opentelemetry") ->
+    ?DESC(opentelemetry);
+desc("otel_exporter") ->
+    ?DESC(otel_exporter);
+desc("otel_logs") ->
+    ?DESC(otel_logs);
+desc("otel_metrics") ->
+    ?DESC(otel_metrics);
+desc("otel_traces") ->
+    ?DESC(otel_traces);
+desc("trace_filter") ->
+    ?DESC(trace_filter);
+desc(_) ->
+    undefined.
+
+root_span_names() ->
+    [
+        ?CLIENT_CONNECT_SPAN_NAME,
+        ?CLIENT_DISCONNECT_SPAN_NAME,
+        ?CLIENT_SUBSCRIBE_SPAN_NAME,
+        ?CLIENT_UNSUBSCRIBE_SPAN_NAME,
+        ?CLIENT_PUBLISH_SPAN_NAME
+    ].
 
 
 %% Compatibility with the previous schema that defined only metrics fields
 %% Compatibility with the previous schema that defined only metrics fields
 legacy_metrics_converter(OtelConf, _Opts) when is_map(OtelConf) ->
 legacy_metrics_converter(OtelConf, _Opts) when is_map(OtelConf) ->

+ 3 - 6
apps/emqx_opentelemetry/src/emqx_otel_trace.erl

@@ -95,8 +95,8 @@ start(#{traces := TracesConf, exporter := ExporterConf}) ->
     #{
     #{
         max_queue_size := MaxQueueSize,
         max_queue_size := MaxQueueSize,
         exporting_timeout := ExportingTimeout,
         exporting_timeout := ExportingTimeout,
-        scheduled_delay := ScheduledDelay,
-        filter := #{trace_all := TraceAll}
+        scheduled_delay := ScheduledDelay
+        %% filter := Filter
     } = TracesConf,
     } = TracesConf,
     OtelEnv = [
     OtelEnv = [
         {bsp_scheduled_delay_ms, ScheduledDelay},
         {bsp_scheduled_delay_ms, ScheduledDelay},
@@ -106,7 +106,7 @@ start(#{traces := TracesConf, exporter := ExporterConf}) ->
         %% TODO: any sampler's options
         %% TODO: any sampler's options
         {sampler, {emqx_otel_sampler, #{opt_key => opt_value}}}
         {sampler, {emqx_otel_sampler, #{opt_key => opt_value}}}
     ],
     ],
-    set_trace_all(TraceAll),
+    %% set_e2e_trace_sampler
     ok = application:set_env([{opentelemetry, OtelEnv}]),
     ok = application:set_env([{opentelemetry, OtelEnv}]),
     Res = assert_started(opentelemetry:start_default_tracer_provider()),
     Res = assert_started(opentelemetry:start_default_tracer_provider()),
     case Res of
     case Res of
@@ -893,6 +893,3 @@ assert_started({ok, _Pid}) -> ok;
 assert_started({ok, _Pid, _Info}) -> ok;
 assert_started({ok, _Pid, _Info}) -> ok;
 assert_started({error, {already_started, _Pid}}) -> ok;
 assert_started({error, {already_started, _Pid}}) -> ok;
 assert_started({error, Reason}) -> {error, Reason}.
 assert_started({error, Reason}) -> {error, Reason}.
-
-set_trace_all(TraceAll) ->
-    persistent_term:put({?MODULE, trace_all}, TraceAll).

+ 51 - 1
rel/i18n/emqx_otel_schema.hocon

@@ -49,8 +49,58 @@ trace_filter.label: "Trace Filter"
 
 
 trace_all.desc:
 trace_all.desc:
 """If enabled, all published messages are traced, a new trace ID is generated if it can't be extracted from the message.
 """If enabled, all published messages are traced, a new trace ID is generated if it can't be extracted from the message.
-Otherwise, only messages published with trace context are traced. Disabled by default."""
+Otherwise, only messages published with trace context are traced. <br/>
+Disabled by default.<br/>
+Note: this config only takes effect when <code>trace_mode</code> is set to <code>legacy</code>."""
 trace_all.label: "Trace All"
 trace_all.label: "Trace All"
 
 
+trace_mode.desc:
+"""OpenTelemetry tracing mode.<br/>
+- `legacy`: follow the old tracing method, only trace message publishing and delivery.<br/>
+  Span names remain compatible with the old version.<br/>
+- `e2e`: end-to-end tracing mode. All kinds of client behaviors will be traced:<br/>
+  Connection/Disconnection/Subscription/Unsubscription/Message Publishing/Message Delivery.<br/>
+  Further options and sampling behavior are configured through the `e2e_tracing_options` sub-configuration item.<br/>"""
+trace_mode.label: "Trace Mode"
+
+e2e_tracing_options.desc: "End-to-end tracing options"
+e2e_tracing_options.label: "End-to-End Tracing Options"
+
+e2e_attribute_meta.desc:
+"""Simple attribute meta value added into Span's Attributes.<br/>
+Typically set a simple and easily recognizable name or use the cluster name to identify different EMQX clusters."""
+e2e_attribute_meta.label: "Attribute Meta Value"
+
+publish_response_trace_level.desc:
+"""Trace level for all message exchanges during the message publishing process.<br/>
+Note: this config only takes effect when <code>sample</code> is set to <code>false</code>.
+- `0`: Only `PUBLISH` packets are sampled for all QoS levels (QoS0, QoS1, and QoS2).
+- `1`: In addition to `PUBLISH` packets for all QoS levels, `PUBACK` and `PUBREC` are also sampled.
+   That is, the first response packet in the QoS1 or QoS2 message interaction.
+- `2`: Both `PUBLISH` packets and all response packets `PUBACK`, `PUBREC`, `PUBREL`, `PUBCOMP` will be sampled."""
+publish_response_trace_level.label: "Publish Trace Level"
+
+e2e_samplers.desc: """End-to-end Tracing Samplers."""
+e2e_samplers.label: "Samplers"
+
+whitelist_based_sampler.desc:
+"""Whitelist-based Sampler. All root spans that match the whitelist will be sampled."""
+whitelist_based_sampler.label: "Whitelist Based Sampler"
+
+event_based_samplers.desc:
+"""Event-based Samplers. Spans that do not match the whitelist are sampled based on the event type and the ratio setting."""
+event_based_samplers.label: "Event Based Samplers"
+
+event_type.desc:
+"""Tracing event type.<br/>
+- `"client.connect"`: Trace client Connect.<br/>
+- `"client.disconnect"`: Trace client Disconnect.<br/>
+- `"client.subscribe"`: Trace client Subscribe.<br/>
+- `"client.unsubscribe"`: Trace client Unsubscribe.<br/>
+- `"client.publish"`: Trace client message publishing. Message delivery to subscribers is also sampled as part of the message publishing process."""
+event_type.label: "Event Type"
+
+ratio.desc: "Sampling ratio for the event type."
+ratio.label: "Event based Sampling Ratio"
 
 
 }
 }