Просмотр исходного кода

feat(rule engine SQL): add an `unescape` function

The added `unescape` function unescapes escape sequences, transforming
them back to their represented characters. The following escape
sequences are supported:

- Standard C escape sequences:
  - `\n` for newline (LF)
  - `\t` for horizontal tab (HT)
  - `\r` for carriage return (CR)
  - `\b` for backspace (BS)
  - `\f` for formfeed (FF)
  - `\v` for vertical tab (VT)
  - `\'` for single quote (')
  - `\"` for double quote (")
  - `\\` for backslash (\)
  - `\?` for question mark (?)
  - `\a` for alert (bell, BEL)

- Hexadecimal escape codes:
  - `\xH...` where `H...` is one or more hexadecimal digits (0-9, A-F,
    a-f). The digits are read as the number of a Unicode code point, so
    any Unicode character can be written this way.

If an escape sequence is not recognized, or if the hexadecimal escape
does not form a valid Unicode character, the function generates an
exception.

Fixes:
https://github.com/emqx/emqx/issues/12460
https://emqx.atlassian.net/browse/EMQX-11847
Kjell Winblad 2 лет назад
Родитель
Commit
5a6f96212d

+ 134 - 1
apps/emqx_rule_engine/src/emqx_rule_funcs.erl

@@ -161,7 +161,8 @@
     join_to_string/2,
     join_to_sql_values_string/1,
     jq/2,
-    jq/3
+    jq/3,
+    unescape/1
 ]).
 
 %% Map Funcs
@@ -937,6 +938,138 @@ jq(FilterProgram, JSONBin) ->
         ])
     ).
 
+%% Replace the escape sequences in a UTF-8 binary with the characters they
+%% stand for and return the result as a UTF-8 binary.
+%%
+%% Throws `{invalid_unicode_character, Reason}' when the input is not valid
+%% UTF-8 or when the unescaped text cannot be encoded as UTF-8 (for example
+%% a `\x' escape whose value is above 16#10FFFF). Exceptions thrown by
+%% unescape_string/1 for unrecognized escape sequences pass through unchanged.
+unescape(Bin) when is_binary(Bin) ->
+    case unicode:characters_to_list(Bin, utf8) of
+        UnicodeList when is_list(UnicodeList) ->
+            UnescapedUnicodeList = unescape_string(UnicodeList),
+            case unicode:characters_to_binary(UnescapedUnicodeList, utf32, utf8) of
+                Out when is_binary(Out) ->
+                    Out;
+                Error ->
+                    throw({invalid_unicode_character, Error})
+            end;
+        %% characters_to_list/2 returns an `error' or `incomplete' tuple for
+        %% malformed UTF-8. Report it like any other invalid character
+        %% instead of crashing with function_clause in unescape_string/1.
+        InvalidInput ->
+            throw({invalid_unicode_character, InvalidInput})
+    end.
+
+%% Unescape a list of Unicode code points, starting from an empty
+%% accumulator.
+unescape_string(CodePoints) ->
+    unescape_string(CodePoints, []).
+
+%% Walk a list of Unicode code points and replace every escape sequence with
+%% the character it stands for. `Acc' holds the output in reverse order.
+%%
+%% Throws `{unrecognized_escape_sequence, Seq}', where `Seq' is the UTF-8
+%% binary of the backslash and the character after it, for any escape that
+%% is not handled below. A single backslash at the very end of the input is
+%% copied through unchanged.
+unescape_string([], Acc) ->
+    lists:reverse(Acc);
+unescape_string([$\\, $\\ | Rest], Acc) ->
+    unescape_string(Rest, [$\\ | Acc]);
+unescape_string([$\\, $n | Rest], Acc) ->
+    unescape_string(Rest, [$\n | Acc]);
+unescape_string([$\\, $t | Rest], Acc) ->
+    unescape_string(Rest, [$\t | Acc]);
+unescape_string([$\\, $r | Rest], Acc) ->
+    unescape_string(Rest, [$\r | Acc]);
+unescape_string([$\\, $b | Rest], Acc) ->
+    unescape_string(Rest, [$\b | Acc]);
+unescape_string([$\\, $f | Rest], Acc) ->
+    unescape_string(Rest, [$\f | Acc]);
+unescape_string([$\\, $v | Rest], Acc) ->
+    unescape_string(Rest, [$\v | Acc]);
+unescape_string([$\\, $' | Rest], Acc) ->
+    unescape_string(Rest, [$' | Acc]);
+unescape_string([$\\, $" | Rest], Acc) ->
+    unescape_string(Rest, [$" | Acc]);
+unescape_string([$\\, $? | Rest], Acc) ->
+    unescape_string(Rest, [$? | Acc]);
+unescape_string([$\\, $a | Rest], Acc) ->
+    %% Erlang source has no `\a' escape (`$\a' is just `$a'), so the BEL
+    %% character is written as its code point.
+    unescape_string(Rest, [7 | Acc]);
+%% Start of HEX escape code: `\x' must be followed by at least one hex digit.
+%% The digits stay in the list so the hex parser can consume all of them.
+unescape_string([$\\, $x | [H | _] = HexStringStart], Acc) when
+    is_integer(H) andalso
+        ((H >= $0 andalso H =< $9) orelse
+            (H >= $A andalso H =< $F) orelse
+            (H >= $a andalso H =< $f))
+->
+    unescape_handle_hex_string(HexStringStart, Acc);
+%% We treat all other escape sequences as not valid input to leave room for
+%% extending the function to support more escape codes
+unescape_string([$\\, X | _Rest], _Acc) ->
+    %% X can be any code point, so encode with the unicode module:
+    %% list_to_binary/1 raises badarg for values above 255.
+    Seq = unicode:characters_to_binary([$\\, X]),
+    erlang:throw({unrecognized_escape_sequence, Seq});
+unescape_string([First | Rest], Acc) ->
+    unescape_string(Rest, [First | Acc]).
+
+%% Parse the run of hex digits at the head of `HexDigits', push the resulting
+%% integer onto `Acc' and continue unescaping whatever follows the digits.
+unescape_handle_hex_string(HexDigits, Acc) ->
+    {Tail, Value} = parse_hex_string(HexDigits),
+    unescape_string(Tail, [Value | Acc]).
+
+%% Split the leading hex digits off a list and return `{Remaining, Value}'.
+%% The list is expected to start with at least one hex digit.
+parse_hex_string(HexPrefixed) ->
+    parse_hex_string(HexPrefixed, []).
+
+%% Collect hex digits (in reverse) in `DigitsRev' until the input ends or a
+%% non-hex character is found, then return the unread input together with
+%% the digits converted to an integer.
+parse_hex_string([Char | Tail] = Input, DigitsRev) ->
+    case is_hex_digit(Char) of
+        true ->
+            parse_hex_string(Tail, [Char | DigitsRev]);
+        false ->
+            {Input, list_to_integer(lists:reverse(DigitsRev), 16)}
+    end;
+parse_hex_string([], DigitsRev) ->
+    {[], list_to_integer(lists:reverse(DigitsRev), 16)}.
+
+%% Return `true' when the argument is the character code of a hexadecimal
+%% digit (0-9, A-F or a-f) and `false' for anything else.
+is_hex_digit(Char) when is_integer(Char), Char >= $0, Char =< $9 -> true;
+is_hex_digit(Char) when is_integer(Char), Char >= $A, Char =< $F -> true;
+is_hex_digit(Char) when is_integer(Char), Char >= $a, Char =< $f -> true;
+is_hex_digit(_) -> false.
+
 %%------------------------------------------------------------------------------
 %% Array Funcs
 %%------------------------------------------------------------------------------

+ 54 - 0
apps/emqx_rule_engine/test/emqx_rule_funcs_SUITE.erl

@@ -736,6 +736,60 @@ t_regex_replace(_) ->
     ?assertEqual(<<"aebed">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"e">>])),
     ?assertEqual(<<"a[cc]b[c]d">>, apply_func(regex_replace, [<<"accbcd">>, <<"c+">>, <<"[&]">>])).
 
+%% Covers the single-character escapes, plain text, unrecognized escapes and
+%% `\x' escapes whose value is below 256.
+t_unescape(_) ->
+    % Single-character C escape sequences
+    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\n">>)),
+    ?assertEqual(<<"\t">>, emqx_rule_funcs:unescape(<<"\\t">>)),
+    ?assertEqual(<<"\r">>, emqx_rule_funcs:unescape(<<"\\r">>)),
+    ?assertEqual(<<"\b">>, emqx_rule_funcs:unescape(<<"\\b">>)),
+    ?assertEqual(<<"\f">>, emqx_rule_funcs:unescape(<<"\\f">>)),
+    ?assertEqual(<<"\v">>, emqx_rule_funcs:unescape(<<"\\v">>)),
+    ?assertEqual(<<"'">>, emqx_rule_funcs:unescape(<<"\\'">>)),
+    ?assertEqual(<<"\"">>, emqx_rule_funcs:unescape(<<"\\\"">>)),
+    ?assertEqual(<<"?">>, emqx_rule_funcs:unescape(<<"\\?">>)),
+    % Erlang source has no `\a` escape (<<"\a">> is just <<"a">>), so the
+    % expected BEL character is written as its byte value.
+    ?assertEqual(<<7>>, emqx_rule_funcs:unescape(<<"\\a">>)),
+    % Test escaping backslash itself
+    ?assertEqual(<<"\\">>, emqx_rule_funcs:unescape(<<"\\\\">>)),
+    % Test a string without any escape sequences
+    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, World!">>)),
+    % Test a string with escape sequences
+    ?assertEqual(<<"Hello,\t World\n!">>, emqx_rule_funcs:unescape(<<"Hello,\\t World\\n!">>)),
+    % Test unrecognized escape sequence (should throw an error)
+    ?assertException(
+        throw, {unrecognized_escape_sequence, <<$\\, $L>>}, emqx_rule_funcs:unescape(<<"\\L">>)
+    ),
+    % Test hexadecimal escape sequences
+
+    % Newline as the whole input
+    ?assertEqual(<<"\n">>, emqx_rule_funcs:unescape(<<"\\x0A">>)),
+    % Newline at the end of the input
+    ?assertEqual(<<"hej\n">>, emqx_rule_funcs:unescape(<<"hej\\x0A">>)),
+    % Newline at the start; the non-hex "h" ends the digit run
+    ?assertEqual(<<"\nhej">>, emqx_rule_funcs:unescape(<<"\\x0Ahej">>)),
+    % Newline in the middle of the input
+    ?assertEqual(<<"hej\nhej">>, emqx_rule_funcs:unescape(<<"hej\\x0Ahej">>)),
+    % "ABC"
+    ?assertEqual(<<"ABC">>, emqx_rule_funcs:unescape(<<"\\x41\\x42\\x43">>)),
+    % "\xFF" = 255 in decimal, expected as the UTF-8 encoding of U+00FF
+    ?assertEqual(<<"\xFF"/utf8>>, emqx_rule_funcs:unescape(<<"\\xFF">>)),
+    % "W" = \x57
+    ?assertEqual(<<"Hello, World!">>, emqx_rule_funcs:unescape(<<"Hello, \\x57orld!">>)).
+
+%% Covers `\x' escapes: leading zeros, code points above 255, an invalid
+%% first digit and a value outside the Unicode range.
+t_unescape_hex(_) ->
+    % {EscapedInput, ExpectedOutput} pairs for valid hex escapes
+    ValidCases = [
+        {<<"\\x41">>, <<"A"/utf8>>},
+        {<<"\\x48\\x65\\x6c\\x6c\\x6f">>, <<"Hello"/utf8>>},
+        {<<"\\x0041">>, <<"A"/utf8>>},
+        {<<"\\x20AC">>, <<"€"/utf8>>},
+        {<<"\\x2764">>, <<"❤"/utf8>>},
+        {<<"Hello, \\x00004E16\\x0000754C">>, <<"Hello, 世界"/utf8>>}
+    ],
+    lists:foreach(
+        fun({Escaped, Expected}) ->
+            ?assertEqual(Expected, emqx_rule_funcs:unescape(Escaped))
+        end,
+        ValidCases
+    ),
+    % "G" is not a hex digit, so "\x" itself is the unrecognized sequence
+    ?assertException(
+        throw, {unrecognized_escape_sequence, <<"\\x">>}, emqx_rule_funcs:unescape(<<"\\xG1">>)
+    ),
+    % The parsed value is too large to be a Unicode character
+    ?assertException(
+        throw, {invalid_unicode_character, _}, emqx_rule_funcs:unescape(<<"\\x11000000">>)
+    ).
+
 %% Convert a JSON string to a binary, decode it with the `json_decode' rule
 %% function and wrap the decoded term in a single-element list.
 jq_1_elm_res(JSONString) ->
     [apply_func(json_decode, [list_to_binary(JSONString)])].