Просмотр исходного кода

refactor: use utf8 bit-syntax to detect codepoint boundary

zmstone 1 год назад
Родитель
Commit
a7d888e192
2 измененных файлов с 10 добавлено и 33 удалено
  1. 9 32
      apps/emqx/src/emqx_packet.erl
  2. 1 1
      apps/emqx/test/emqx_packet_tests.erl

+ 9 - 32
apps/emqx/src/emqx_packet.erl

@@ -659,42 +659,19 @@ truncate_payload(text, Limit, Payload) ->
     truncate_utf8(Limit, Payload).
     truncate_utf8(Limit, Payload).
 
 
 truncate_utf8(Limit, Payload) ->
 truncate_utf8(Limit, Payload) ->
-    CompleteLen = find_complete_utf8_len(Limit, Payload),
+    CompleteLen = max(Limit, find_complete_utf8_len(Limit, Payload)),
     <<Part:CompleteLen/binary, Rest/binary>> = Payload,
     <<Part:CompleteLen/binary, Rest/binary>> = Payload,
     {Part, size(Rest)}.
     {Part, size(Rest)}.
 
 
-find_complete_utf8_len(StartLen, Payload) ->
-    %% check ahead 3 bytes, to find the next 1st byte utf8 encoded character
-    CheckAhead = min(size(Payload) - StartLen, 3),
-    find_complete_utf8_len(StartLen, 0, CheckAhead, Payload).
-
-find_complete_utf8_len(Len, Shift, MaxShift, _Payload) when Shift > MaxShift ->
-    %% hopeless case, failed to find a utf8 character boundary
-    Len;
-find_complete_utf8_len(Len, Shift, MaxShift, Payload) ->
-    <<_:(Len + Shift)/binary, NextByte, _/binary>> = Payload,
-    case is_first_utf8(NextByte) of
-        true ->
-            Len + Shift;
-        false ->
-            find_complete_utf8_len(Len, Shift + 1, MaxShift, Payload)
-    end.
+find_complete_utf8_len(Limit, Payload) ->
+    TailLen = trim_utf8(Limit, Payload),
+    size(Payload) - TailLen.
 
 
--compile({inline, is_first_utf8/1}).
-is_first_utf8(Byte) when Byte band 128 =:= 0 ->
-    %% Start of a 1-byte character (0xxxxxxx).
-    true;
-is_first_utf8(Byte) when Byte band 224 =:= 192 ->
-    %% Start of a 2-byte character (110xxxxx).
-    true;
-is_first_utf8(Byte) when Byte band 240 =:= 224 ->
-    %% Start of a 3-byte character (1110xxxx).
-    true;
-is_first_utf8(Byte) when Byte band 248 =:= 240 ->
-    %% Start of a 4-byte character (11110xxx).
-    true;
-is_first_utf8(_) ->
-    false.
+trim_utf8(Count, <<_/utf8, Rest/binary>> = All) when Count > 0 ->
+    trim_utf8(Count - (size(All) - size(Rest)), Rest);
+trim_utf8(_Count, Rest) ->
+    %% either limit =< 0, or there is no valid utf8 char as prefix
+    size(Rest).
 
 
 i(true) -> 1;
 i(true) -> 1;
 i(false) -> 0;
 i(false) -> 0;

+ 1 - 1
apps/emqx/test/emqx_packet_tests.erl

@@ -81,7 +81,7 @@ invalid_utf8_fallback_test() ->
     %% invalidate utf8 byte sequence, so it should fallback to hex
     %% invalidate utf8 byte sequence, so it should fallback to hex
     InvalidUtf8 = 255,
     InvalidUtf8 = 255,
     Payload = iolist_to_binary([
     Payload = iolist_to_binary([
-        Prefix, Last3Bytes, InvalidUtf8, lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, $b)
+        Prefix, InvalidUtf8, lists:duplicate(?MAX_PAYLOAD_FORMAT_SIZE, $b)
     ]),
     ]),
     Lost = size(Payload) - iolist_size(Prefix),
     Lost = size(Payload) - iolist_size(Prefix),
     Expected = iolist_to_binary([
     Expected = iolist_to_binary([