Procházet zdrojové kódy

fix(logger): fallback to hex format if invalid unicode(utf8)

zmstone před 1 rokem
rodič
revize
5cd393d06a

+ 79 - 27
apps/emqx/src/emqx_packet.erl

@@ -54,7 +54,7 @@
     format/2
     format/2
 ]).
 ]).
 
 
--export([format_truncated_payload/3]).
+-export([format_payload/2]).
 
 
 -define(TYPE_NAMES,
 -define(TYPE_NAMES,
     {'CONNECT', 'CONNACK', 'PUBLISH', 'PUBACK', 'PUBREC', 'PUBREL', 'PUBCOMP', 'SUBSCRIBE',
     {'CONNECT', 'CONNACK', 'PUBLISH', 'PUBACK', 'PUBREC', 'PUBREL', 'PUBCOMP', 'SUBSCRIBE',
@@ -506,7 +506,7 @@ format_variable(undefined, _, _) ->
 format_variable(Variable, undefined, PayloadEncode) ->
 format_variable(Variable, undefined, PayloadEncode) ->
     format_variable(Variable, PayloadEncode);
     format_variable(Variable, PayloadEncode);
 format_variable(Variable, Payload, PayloadEncode) ->
 format_variable(Variable, Payload, PayloadEncode) ->
-    [format_variable(Variable, PayloadEncode), ", ", format_payload(Payload, PayloadEncode)].
+    [format_variable(Variable, PayloadEncode), ", ", format_payload_label(Payload, PayloadEncode)].
 
 
 format_variable(
 format_variable(
     #mqtt_packet_connect{
     #mqtt_packet_connect{
@@ -537,7 +537,7 @@ format_variable(
                     ", Will(Q~p, R~p, Topic=~ts ",
                     ", Will(Q~p, R~p, Topic=~ts ",
                     [WillQoS, i(WillRetain), WillTopic]
                     [WillQoS, i(WillRetain), WillTopic]
                 ),
                 ),
-                format_payload(WillPayload, PayloadEncode),
+                format_payload_label(WillPayload, PayloadEncode),
                 ")"
                 ")"
             ];
             ];
         false ->
         false ->
@@ -617,32 +617,84 @@ format_password(undefined) -> "";
 format_password(<<>>) -> "";
 format_password(<<>>) -> "";
 format_password(_Password) -> "******".
 format_password(_Password) -> "******".
 
 
+format_payload_label(Payload, Type) ->
+    ["Payload=", format_payload(Payload, Type)].
+
 format_payload(_, hidden) ->
 format_payload(_, hidden) ->
-    "Payload=******";
-format_payload(Payload, text) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
-    ["Payload=", unicode:characters_to_list(Payload)];
-format_payload(Payload, hex) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
-    ["Payload(hex)=", binary:encode_hex(Payload)];
-format_payload(<<Part:?TRUNCATED_PAYLOAD_SIZE/binary, _/binary>> = Payload, Type) ->
-    [
-        "Payload=",
-        format_truncated_payload(Part, byte_size(Payload), Type)
-    ].
+    "******";
+format_payload(<<>>, _) ->
+    "";
+format_payload(Payload, Type) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
+    %% under the 1KB limit
+    format_payload_limit(Type, Payload, size(Payload));
+format_payload(Payload, Type) ->
+    %% too long, truncate to 100B
+    format_payload_limit(Type, Payload, ?TRUNCATED_PAYLOAD_SIZE).
+
+format_payload_limit(Type, Payload, Limit) when size(Payload) > Limit ->
+    {Part, TruncatedBytes} = truncate_payload(Type, Limit, Payload),
+    case TruncatedBytes > 0 of
+        true ->
+            [do_format_payload(Type, Part), "...(", integer_to_list(TruncatedBytes), " bytes)"];
+        false ->
+            do_format_payload(Type, Payload)
+    end;
+format_payload_limit(Type, Payload, _Limit) ->
+    do_format_payload(Type, Payload).
 
 
-format_truncated_payload(Bin, Size, Type) ->
-    Bin2 =
-        case Type of
-            text -> Bin;
-            hex -> binary:encode_hex(Bin)
-        end,
-    unicode:characters_to_list(
-        [
-            Bin2,
-            "... The ",
-            integer_to_list(Size - ?TRUNCATED_PAYLOAD_SIZE),
-            " bytes of this log are truncated"
-        ]
-    ).
+do_format_payload(text, Bytes) ->
+    try
+        [_ | _] = unicode:characters_to_list(Bytes)
+    catch
+        _:_ ->
+            do_format_payload(hex, Bytes)
+    end;
+do_format_payload(hex, Bytes) ->
+    ["hex:", binary:encode_hex(Bytes)].
+
+truncate_payload(hex, Limit, Payload) ->
+    <<Part:Limit/binary, Rest/binary>> = Payload,
+    {Part, size(Rest)};
+truncate_payload(text, Limit, Payload) ->
+    truncate_utf8(Limit, Payload).
+
+truncate_utf8(Limit, Payload) ->
+    CompleteLen = find_complete_utf8_len(Limit, Payload),
+    <<Part:CompleteLen/binary, Rest/binary>> = Payload,
+    {Part, size(Rest)}.
+
+find_complete_utf8_len(StartLen, Payload) ->
+    %% look ahead at most 3 bytes to find the first byte of the next utf8 encoded character
+    CheckAhead = min(size(Payload) - StartLen, 3),
+    find_complete_utf8_len(StartLen, 0, CheckAhead, Payload).
+
+find_complete_utf8_len(Len, Shift, MaxShift, _Payload) when Shift > MaxShift ->
+    %% hopeless case, failed to find a utf8 character boundary
+    Len;
+find_complete_utf8_len(Len, Shift, MaxShift, Payload) ->
+    <<_:(Len + Shift)/binary, NextByte, _/binary>> = Payload,
+    case is_first_utf8(NextByte) of
+        true ->
+            Len + Shift;
+        false ->
+            find_complete_utf8_len(Len, Shift + 1, MaxShift, Payload)
+    end.
+
+-compile({inline, is_first_utf8/1}).
+is_first_utf8(Byte) when Byte band 128 =:= 0 ->
+    %% Start of a 1-byte character (0xxxxxxx).
+    true;
+is_first_utf8(Byte) when Byte band 224 =:= 192 ->
+    %% Start of a 2-byte character (110xxxxx).
+    true;
+is_first_utf8(Byte) when Byte band 240 =:= 224 ->
+    %% Start of a 3-byte character (1110xxxx).
+    true;
+is_first_utf8(Byte) when Byte band 248 =:= 240 ->
+    %% Start of a 4-byte character (11110xxx).
+    true;
+is_first_utf8(_) ->
+    false.
 
 
 i(true) -> 1;
 i(true) -> 1;
 i(false) -> 0;
 i(false) -> 0;

+ 2 - 9
apps/emqx/src/emqx_trace/emqx_trace_formatter.erl

@@ -98,16 +98,9 @@ format_packet(Packet, Encode) ->
 
 
 format_payload(undefined, _) ->
 format_payload(undefined, _) ->
     "";
     "";
-format_payload(_, hidden) ->
-    "******";
-format_payload(Payload, text) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
-    unicode:characters_to_list(Payload);
-format_payload(Payload, hex) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) -> binary:encode_hex(Payload);
-format_payload(<<Part:?TRUNCATED_PAYLOAD_SIZE/binary, _/binary>> = Payload, Type) ->
-    emqx_packet:format_truncated_payload(Part, byte_size(Payload), Type);
+format_payload(Payload, Type) when is_binary(Payload) ->
+    emqx_packet:format_payload(Payload, Type);
 format_payload(Payload, _) ->
 format_payload(Payload, _) ->
-    %% We don't want to crash if there is a field named payload with some other
-    %% type of value
     Payload.
     Payload.
 
 
 to_iolist(Atom) when is_atom(Atom) -> atom_to_list(Atom);
 to_iolist(Atom) when is_atom(Atom) -> atom_to_list(Atom);

+ 4 - 7
apps/emqx/src/emqx_trace/emqx_trace_json_formatter.erl

@@ -185,13 +185,10 @@ format_packet(Packet, Encode) -> emqx_packet:format(Packet, Encode).
 
 
 format_payload(undefined, _) ->
 format_payload(undefined, _) ->
     "";
     "";
-format_payload(_, hidden) ->
-    "******";
-format_payload(Payload, text) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) ->
-    unicode:characters_to_list(Payload);
-format_payload(Payload, hex) when ?MAX_PAYLOAD_FORMAT_LIMIT(Payload) -> binary:encode_hex(Payload);
-format_payload(<<Part:?TRUNCATED_PAYLOAD_SIZE/binary, _/binary>> = Payload, Type) ->
-    emqx_packet:format_truncated_payload(Part, byte_size(Payload), Type).
+format_payload(Payload, Type) when is_binary(Payload) ->
+    emqx_packet:format_payload(Payload, Type);
+format_payload(Payload, _) ->
+    Payload.
 
 
 format_map_set_to_list(Map) ->
 format_map_set_to_list(Map) ->
     Items = [
     Items = [

+ 94 - 0
apps/emqx/test/emqx_packet_tests.erl

@@ -0,0 +1,94 @@
+%%--------------------------------------------------------------------
+%% Copyright (c) 2024 EMQ Technologies Co., Ltd. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%--------------------------------------------------------------------
+
+-module(emqx_packet_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+-include_lib("emqx/include/emqx_mqtt.hrl").
+
+format_payload_test_() ->
+    Hidden = fun(Payload) -> emqx_packet:format_payload(Payload, hidden) end,
+    Hex = fun(Payload) -> bin(emqx_packet:format_payload(Payload, hex)) end,
+    [
+        {"hidden", fun() -> ?assertEqual("******", Hidden(<<>>)) end},
+        {"hex empty", fun() -> ?assertEqual(<<"">>, Hex(<<"">>)) end},
+        {"hex short", fun() -> ?assertEqual(<<"hex:303030">>, Hex(<<"000">>)) end},
+        {"hex at limit", fun() ->
+            Payload = bin(lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, 0)),
+            Expected = bin(
+                [
+                    "hex:",
+                    binary:encode_hex(bin(lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, 0)))
+                ]
+            ),
+            ?assertEqual(Expected, Hex(Payload))
+        end},
+        {"hex long", fun() ->
+            Payload = bin(lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE + 2, 0)),
+            Prefix = binary:encode_hex(bin(lists:duplicate(?TRUNCATED_PAYLOAD_SIZE, 0))),
+            Lost = size(Payload) - ?TRUNCATED_PAYLOAD_SIZE,
+            Expected = bin(["hex:", Prefix, "...(", integer_to_list(Lost), " bytes)"]),
+            ?assertEqual(Expected, Hex(Payload))
+        end}
+    ].
+
+format_payload_utf8_test_() ->
+    Fmt = fun(P) -> bin(emqx_packet:format_payload(P, text)) end,
+    [
+        {"empty", fun() -> ?assertEqual(<<"">>, Fmt(<<>>)) end},
+        {"short ascii", fun() -> ?assertEqual(<<"abc">>, Fmt(<<"abc">>)) end},
+        {"short unicode", fun() -> ?assertEqual(<<"日志"/utf8>>, Fmt(<<"日志"/utf8>>)) end},
+        {"unicode at limit", fun() ->
+            Payload = bin(lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE div 2, <<"¢"/utf8>>)),
+            Expected = bin(["", Payload]),
+            ?assertEqual(Expected, Fmt(Payload))
+        end}
+    ].
+
+format_payload_utf8_cutoff_test_() ->
+    Fmt = fun(P) -> bin(emqx_packet:format_payload(P, text)) end,
+    Check = fun(MultiBytesChar) ->
+        Prefix = [lists:duplicate(?TRUNCATED_PAYLOAD_SIZE - 1, $a), MultiBytesChar],
+        Payload = bin([Prefix, MultiBytesChar, lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, $b)]),
+        Lost = size(Payload) - iolist_size(Prefix),
+        Expected = bin([Prefix, "...(", integer_to_list(Lost), " bytes)"]),
+        ?assertEqual(Expected, Fmt(Payload))
+    end,
+    [
+        {"utf8 1B", fun() -> Check(<<"x"/utf8>>) end},
+        {"utf8 2B", fun() -> Check(<<"¢"/utf8>>) end},
+        {"utf8 3B", fun() -> Check(<<"€"/utf8>>) end},
+        {"utf8 4B", fun() -> Check(<<"𐍈"/utf8>>) end}
+    ].
+
+invalid_utf8_fallback_test() ->
+    %% truncate after the first byte of a utf8 encoded unicode character
+    <<FirstByte:8, Last3Bytes/binary>> = <<"𐍈"/utf8>>,
+    Prefix = iolist_to_binary([lists:duplicate(?TRUNCATED_PAYLOAD_SIZE - 1, $a), FirstByte]),
+    %% append an invalid utf8 byte, so formatting should fall back to hex
+    InvalidUtf8 = 255,
+    Payload = iolist_to_binary([
+        Prefix, Last3Bytes, InvalidUtf8, lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, $b)
+    ]),
+    Lost = size(Payload) - iolist_size(Prefix),
+    Expected = iolist_to_binary([
+        "hex:", binary:encode_hex(Prefix), "...(", integer_to_list(Lost), " bytes)"
+    ]),
+    ?assertEqual(Expected, bin(emqx_packet:format_payload(Payload, text))),
+    ok.
+
+bin(X) ->
+    unicode:characters_to_binary(X).