Skip to content

Commit

Permalink
Merge pull request #284 from zmstone/0213-multiline-string-indentation
Browse files Browse the repository at this point in the history
0213 multiline string indentation
  • Loading branch information
zmstone authored Feb 13, 2024
2 parents 8442790 + 0d0c736 commit 98cd69a
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 34 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ HOCON spec for reference: https://lightbend.github.io/config/
* `key={a: 1}\n{b: 2}`
* `key={a=1, b=2}`
- `url()/file()/classpath()` includes are not supported
- Quotes next to triple-quotes need to be escaped, otherwise they are discarded.
This means `"""a""""` is parsed as `a`, not `a"`. To correctly express `a"`, it must be written in one of the ways below:
* Escape the last `"`: `"""a\""""`;
* Or add `~` around the string value: `"""~a"~"""` (see below).
- Multiline strings allow indentation (spaces, not tabs).
If `~\n` (or `~\r\n`) are the only characters following the opening triple-quote, then it's a multiline string with indentation:
* The first line `~\n` is ignored;
* The indentation spaces of the following lines are trimmed;
* Indentation is allowed but not required for empty lines;
* Indentation level is determined by the least number of leading spaces among the non-empty lines;
* Backslashes are treated as escape characters, i.e. should be escaped with another backslash;
* There is no need to escape quotes in multiline strings, but it's allowed;
* The closing triple-quote can be either `"""` or `~"""` (`~` allows the string to end with `"` without escaping).

## Schema

Expand Down
2 changes: 1 addition & 1 deletion etc/unescape.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ sql_laitin1_with_escape_1 = "SELECT * FROM \"t/1\""
sql_laitin1_with_escape_2 = "SELECT * FROM \\\"t/1\\\""
sql_unicode_with_escape_1 = "SELECT * FROM \"t/1\" WHERE clientid = \"-测试专用-\""
sql_unicode_with_escape_2 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试专用-\""
sql_unicode_with_escape_3 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试\\\n\r\t专用-\""
sql_unicode_with_escape_3 = "SELECT * FROM \\\"t/1\\\" WHERE clientid = \"-测试\\\r\n\t专用-\""
z = 1
z1 = "1"
103 changes: 92 additions & 11 deletions src/hocon_pp.erl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
-include("hocon_private.hrl").

-define(INDENT, " ").
-define(TRIPLE_QUOTE, <<"\"\"\"">>).

%% @doc Pretty print HOCON value.
%% Options are:
Expand Down Expand Up @@ -99,10 +100,10 @@ gen(Bin, Opts) when is_binary(Bin) ->
gen(S, Opts) when is_list(S) ->
case io_lib:printable_latin1_list(S) of
true ->
maybe_quote_latin1_str(S);
gen_str(S, latin1);
false ->
case io_lib:printable_unicode_list(S) of
true -> <<"\"", (format_escape_sequences(S))/binary, "\"">>;
true -> gen_str(S, unicode);
false -> gen_list(S, Opts)
end
end;
Expand All @@ -124,26 +125,100 @@ gen(Value, Opts) ->
options => Opts
}).

%% Render a printable string, picking the triple-quoted form when
%% is_triple_quote_str/1 says so and the single-quoted form otherwise.
gen_str(S, Codec) ->
    NeedsTripleQuote = is_triple_quote_str(S),
    case NeedsTripleQuote of
        false -> gen_single_quote_str(S, Codec);
        true -> gen_triple_quote_str(S)
    end.

%% A string is a triple quote string when at least one of its characters
%% is changed by esc/1 (i.e. it would need escaping).
%% Exception: a string which itself contains three consecutive quotes
%% is never a triple quote string.
is_triple_quote_str(Chars) ->
    (not has_triple_quotes(Chars)) andalso
        lists:any(fun(Char) -> esc(Char) =/= Char end, Chars).

%% 'true' when the string contains three consecutive double quotes,
%% 'false' otherwise.
has_triple_quotes(Chars) ->
    case string:find(Chars, "\"\"\"") of
        nomatch -> false;
        _Found -> true
    end.

%% Decide whether a (non-empty) string is rendered as a multiline string:
%%  - it contains a '\n', or
%%  - it starts or ends with a quote; rendering it as multiline means
%%    that quote does not have to be escaped.
%% The empty string is never multiline.
is_multiline([]) ->
    false;
is_multiline(Chars) ->
    HasNewline = lists:member($\n, Chars),
    HasNewline orelse is_leading_quote(Chars) orelse is_trailling_quote(Chars).

%% 'true' when the first element of the list is a double quote character.
is_leading_quote([First | _]) -> First =:= $";
is_leading_quote(_Other) -> false.

%% 'true' when the last element of the list is a double quote character.
%% Implemented by checking the head of the reversed list.
is_trailling_quote(Chars) ->
    Reversed = lists:reverse(Chars),
    is_leading_quote(Reversed).

%% Render a string in the regular (non triple-quoted) form.
%% unicode: always wrapped in double quotes around the output of
%%          format_escape_sequences/1.
%% latin1:  delegated to maybe_quote_latin1_str/1.
gen_single_quote_str(S, unicode) ->
    Escaped = format_escape_sequences(S),
    <<"\"", Escaped/binary, "\"">>;
gen_single_quote_str(S, latin1) ->
    maybe_quote_latin1_str(S).

%% Render a string between triple quotes.
%% Backslashes are escaped first, then the body is laid out as an
%% indented multiline block if maybe_indent/1 decides so.
gen_triple_quote_str(Str) ->
    Escaped = esc_backslashes(Str),
    Body = maybe_indent(Escaped),
    [?TRIPLE_QUOTE, Body, ?TRIPLE_QUOTE].

%% Wrap a multiline string body in '~' markers with its lines marked for
%% indentation; any other string is returned unchanged.
maybe_indent(Chars) ->
    case is_multiline(Chars) of
        false ->
            Chars;
        true ->
            Indented = indent_multiline_str(Chars),
            ["~", Indented, "~"]
    end.
%% Split a multiline string value into lines and mark them for indentation.
%%
%% hocon_scanner:split_lines/1 drops one trailing '~' from the last line,
%% because on the scanner side that '~' belongs to the closing `~"""`.
%% Here Chars is the value to print, so a trailing '~' is real data and must
%% survive. A sentinel '~' is appended for split_lines/1 to consume; for
%% values that do not end with '~' the result is the same as before.
indent_multiline_str(Chars) ->
    Lines = hocon_scanner:split_lines(Chars ++ "~"),
    indent_str_value_lines(Lines).
%% Tag every non-empty line with 'indent'.
%% Empty lines are never tagged: an empty last line becomes ?NL and an
%% empty line before the last one becomes a bare newline.
indent_str_value_lines([Only]) ->
    case Only of
        [] ->
            %% last line being empty
            [?NL];
        _ ->
            %% last line is not empty
            [{indent, bin(Only)}]
    end;
indent_str_value_lines([Line | More]) ->
    Item =
        case Line of
            %% do not indent empty line
            [] -> <<"\n">>;
            _ -> {indent, bin(Line)}
        end,
    [Item | indent_str_value_lines(More)].
gen_list(L, Opts) ->
case is_oneliner(L) of
true ->
%% one line
["[", infix([gen(I, Opts) || I <- L], ", "), "]"];
false ->
do_gen_list(L, Opts)
gen_multiline_list(L, Opts)
end.
do_gen_list([_ | _] = L, Opts) ->
gen_multiline_list([_ | _] = L, Opts) ->
[
["[", ?NL],
do_gen_list_loop(L, Opts#{no_obj_nl => true}),
["["],
gen_multiline_list_loop(L, Opts#{no_obj_nl => true}),
["]", ?NL]
].
do_gen_list_loop([I], Opts) ->
gen_multiline_list_loop([I], Opts) ->
[{indent, gen(I, Opts)}];
do_gen_list_loop([H | T], Opts) ->
[{indent, [gen(H, Opts), ","]} | do_gen_list_loop(T, Opts)].
gen_multiline_list_loop([H | T], Opts) ->
[{indent, [gen(H, Opts), ","]} | gen_multiline_list_loop(T, Opts)].
is_oneliner(L) when is_list(L) ->
lists:all(fun(X) -> is_number(X) orelse is_binary(X) orelse is_atom(X) end, L);
Expand All @@ -153,7 +228,7 @@ is_oneliner(M) when is_map(M) ->
gen_map(M, Opts) ->
case is_oneliner(M) of
true -> ["{", infix(gen_map_fields(M, Opts, ""), ", "), "}"];
false -> [["{", ?NL], {indent, gen_map_fields(M, Opts, ?NL)}, "}"]
false -> ["{", {indent, gen_map_fields(M, Opts, ?NL)}, [?NL, "}"]]
end.
gen_map_fields(M, Opts, NL) ->
Expand Down Expand Up @@ -224,7 +299,7 @@ fmt(L) when is_list(L) ->
bin(lists:map(fun fmt/1, L));
fmt({indent, Block}) ->
FormattedBlock = fmt(Block),
bin([[?INDENT, Line, ?NL] || Line <- split(FormattedBlock)]).
bin([[?NL, ?INDENT, Line] || Line <- split(FormattedBlock)]).

%% Split a binary on every ?NL and drop the empty segments.
split(Bin) ->
    Segments = binary:split(Bin, ?NL, [global]),
    lists:filter(fun(Segment) -> Segment =/= <<>> end, Segments).
Expand Down Expand Up @@ -256,3 +331,9 @@ esc($\") -> "\\\"";
% \
esc($\\) -> "\\\\";
esc(Char) -> Char.

%% Apply esc_backslash/1 to every element of the string.
%% The result may be a deep list, since an escaped backslash is
%% itself a two character string.
esc_backslashes(Str) ->
    [esc_backslash(Char) || Char <- Str].

%% Replace a backslash with two backslashes; any other character is
%% returned unchanged.
esc_backslash(Char) when Char =:= $\\ -> "\\\\";
esc_backslash(Other) -> Other.
66 changes: 65 additions & 1 deletion src/hocon_scanner.xrl
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Rules.
{Integer} : {token, {integer, TokenLine, list_to_integer(TokenChars)}}.
{Float} : {token, {float, TokenLine, to_float(TokenChars)}}.
{String} : {token, {string, TokenLine, unquote(TokenChars, force_escape)}}.
{MultilineString} : {token, {string, TokenLine, unquote(TokenChars, allow_unescaped)}}.
{MultilineString} : {token, {string, TokenLine, unindent(unquote(TokenChars, allow_unescaped))}}.
{Bytesize} : {token, {string, TokenLine, TokenChars}}.
{Percent} : {token, {string, TokenLine, TokenChars}}.
{Duration} : {token, {string, TokenLine, TokenChars}}.
Expand All @@ -92,6 +92,8 @@ Rules.
Erlang code.
-export([split_lines/1]).
%% Build the token for an unquoted word: the literal word "include"
%% yields an 'include' token, anything else an 'unqstr' token carrying
%% the original characters.
maybe_include(TokenChars, TokenLine) ->
    case TokenChars of
        "include" -> {include, TokenLine};
        _ -> {unqstr, TokenLine, TokenChars}
    end.
Expand All @@ -111,6 +113,68 @@ strip_surrounded_quotes([$" | Rem]) ->
strip_surrounded_quotes(Str) ->
Str.

%% Strip indentation from a multiline string body.
%% Only bodies starting with "~\n" or "~\r\n" are processed (the marker
%% line itself is dropped); any other input is returned unchanged.
unindent([$~ | AfterTilde] = Chars) ->
    case AfterTilde of
        [$\n | Body] -> do_unindent(Body);
        [$\r, $\n | Body] -> do_unindent(Body);
        _ -> Chars
    end;
unindent(Chars) ->
    Chars.

%% Remove the common indentation from all lines of a multiline string body
%% and join the lines back together with '\n'.
%%
%% The common indentation is the smallest number of leading spaces among
%% the non-empty lines (see min_indent/1).
do_unindent(Chars) ->
    Lines = split_lines(Chars),
    %% Lines are split on '\n' only, so with CRLF line endings a blank line
    %% is "\r" rather than "". Such a line has no indentation of its own and
    %% must not pull the common indentation level down to 0.
    IsBlankCrLf = fun(Line) -> Line =:= "\r" end,
    Indent = min_indent([Line || Line <- Lines, not IsBlankCrLf(Line)]),
    Trim = fun(Line) ->
        case IsBlankCrLf(Line) of
            %% keep as is: there are no leading spaces to trim
            true -> Line;
            false -> trim_indents(Line, Indent)
        end
    end,
    lists:flatten(lists:join($\n, lists:map(Trim, Lines))).

%% Split a multiline string body on '\n'.
%% The '\n' separators are not part of the returned lines, and one
%% trailing '~' (from a closing `~"""`) is dropped from the last line.
%% For example the body of
%% """~
%% line1
%% line2
%% ~"""
%% i.e. "line1\nline2\n~", is split into ["line1", "line2", ""]
split_lines(Chars) ->
    split_lines(Chars, [], []).

%% CurrentR is the current line accumulated in reverse order,
%% Done holds the completed lines, most recent first.
split_lines([$\n | Rest], CurrentR, Done) ->
    split_lines(Rest, [], [lists:reverse(CurrentR) | Done]);
split_lines([Char | Rest], CurrentR, Done) ->
    split_lines(Rest, [Char | CurrentR], Done);
split_lines([], [$~ | CurrentR], Done) ->
    %% the last line ends with '~': drop it
    lists:reverse([lists:reverse(CurrentR) | Done]);
split_lines([], CurrentR, Done) ->
    lists:reverse([lists:reverse(CurrentR) | Done]).

%% Smallest indentation level among the given lines.
%% Lines for which indent_level/1 does not return an integer are not
%% taken into account; 0 is returned when no line qualifies.
min_indent(Lines) ->
    Levels = [Level || Level <- lists:map(fun indent_level/1, Lines), is_integer(Level)],
    case Levels of
        [] -> 0;
        [_ | _] -> lists:min(Levels)
    end.

%% Number of leading spaces of a line.
%% An empty line has no indentation level: 'ignore' is returned.
indent_level(Line) when Line =/= "" ->
    indent_level(Line, 0);
indent_level("") ->
    ignore.

%% Count leading spaces, starting the count at Count.
indent_level(Line, Count) ->
    case Line of
        [$\s | Rest] -> indent_level(Rest, Count + 1);
        _ -> Count
    end.

%% Drop up to Indent leading spaces from a line.
%% Trimming stops early when the line runs out of characters.
trim_indents([$\s | Rest], Remaining) when Remaining > 0 ->
    trim_indents(Rest, Remaining - 1);
trim_indents([], _Remaining) ->
    [];
trim_indents(Line, 0) ->
    Line.

% the first clause is commented out on purpose
% meaning below two escape sequence (in a hocon file)
% key="\\""
Expand Down
47 changes: 28 additions & 19 deletions test/hocon_pp_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,6 @@ do(File) ->
?assertEqual(Conf, Conf3),
file:delete(TmpFile).

%% Load etc/unescape.conf, pretty print it into a sibling ".pp" file and
%% check that the written file has the same content as the source file.
pp_escape_to_file_test() ->
    Source = "etc/unescape.conf",
    Output = Source ++ ".pp",
    {ok, Conf} = hocon:load(Source),
    Printed = hocon_pp:do(Conf, #{}),
    file:write_file(Output, [Printed]),
    ?assertEqual(file:read_file(Source), file:read_file(Output)),
    file:delete(Output),
    ok.

pp_quote_test() ->
Fun = fun(Map, ExpectBin) ->
Bin = iolist_to_binary(hocon_pp:do(Map, #{})),
Expand All @@ -100,29 +90,48 @@ pp_quote_test() ->
Fun(#{<<"$d_dfdk2f">> => <<"12">>}, <<"\"$d_dfdk2f\" = \"12\"\n">>),

%% backslash
Fun(#{<<"test_backslash">> => <<"\\emqx">>}, <<"test_backslash = \"\\\\emqx\"\n">>),
Fun(#{<<"test_backslash">> => <<"emqx\\emqx">>}, <<"test_backslash = \"emqx\\\\emqx\"\n">>),
Fun(#{<<"test_backslash">> => <<"emqx\\">>}, <<"test_backslash = \"emqx\\\\\"\n">>),
Fun(#{<<"a">> => <<"\\emqx">>}, <<"a = \"\"\"\\\\emqx\"\"\"\n">>),
Fun(#{<<"b">> => <<"emqx\\emqx">>}, <<"b = \"\"\"emqx\\\\emqx\"\"\"\n">>),
Fun(#{<<"c">> => <<"emqx\\">>}, <<"c = \"\"\"emqx\\\\\"\"\"\n">>),

%% quote
Fun(#{<<"test_quote">> => <<"\"emqx">>}, <<"test_quote = \"\\\"emqx\"\n">>),
Fun(#{<<"test_quote">> => <<"emqx\"emqx">>}, <<"test_quote = \"emqx\\\"emqx\"\n">>),
Fun(#{<<"test_quote">> => <<"emqx\"">>}, <<"test_quote = \"emqx\\\"\"\n">>),
Fun(#{<<"A">> => <<"\"emqx">>}, <<"A = \"\"\"~\n \"emqx~\"\"\"\n">>),
Fun(#{<<"B">> => <<"emqx\"emqx">>}, <<"B = \"\"\"emqx\"emqx\"\"\"\n">>),
Fun(#{<<"C">> => <<"emqx\"">>}, <<"C = \"\"\"~\n emqx\"~\"\"\"\n">>),
Fun(#{<<"D">> => <<"emqx\"\"\"">>}, <<"D = \"emqx\\\"\\\"\\\"\"\n">>),

%% '${}[]:=,+#`^?!@*& ' should quote
lists:foreach(
fun(Char) ->
Header = list_to_binary([Char | "emqx"]),
Tail = list_to_binary("emqx" ++ [Char]),
Middle = <<Tail/binary, "emqx">>,
Fun(#{<<"test_key">> => Header}, <<"test_key = \"", Header/binary, "\"\n">>),
Fun(#{<<"test_key">> => Tail}, <<"test_key = \"", Tail/binary, "\"\n">>),
Fun(#{<<"test_key">> => Middle}, <<"test_key = \"", Middle/binary, "\"\n">>)
Fun(#{<<"D">> => Header}, <<"D = \"", Header/binary, "\"\n">>),
Fun(#{<<"E">> => Tail}, <<"E = \"", Tail/binary, "\"\n">>),
Fun(#{<<"F">> => Middle}, <<"F = \"", Middle/binary, "\"\n">>)
end,
"'${}[]:=,+#`^?!@*& "
),
ok.

%% Pretty print a nested map whose innermost value is a string containing
%% newlines, and compare the output with the expected text: the value is
%% printed between `"""~` and `~"""`, and the empty lines inside the value
%% are printed as bare newlines.
multi_line_str_indent_test() ->
Struct = #{<<"a">> => #{<<"b">> => #{<<"c">> => "line1\n\nline2\n\nline3\n"}}},
Expected = <<
"a {\n"
" b {\n"
" c = \"\"\"~\n"
" line1\n"
"\n"
" line2\n"
"\n"
" line3\n"
" ~\"\"\"\n"
" }\n"
"}\n"
>>,
?assertEqual(Expected, iolist_to_binary(hocon_pp:do(Struct, #{}))),
ok.

load_file_pp_test() ->
TmpF = "/tmp/load_file_pp_test",
F = fun(Raw, Format) ->
Expand Down
29 changes: 27 additions & 2 deletions test/hocon_tests.erl
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,33 @@ escape_test_() ->
)
].

%% Test generator returning an empty list, i.e. it contributes no test cases.
multiline_string_test_() ->
[].
%% Test generator for triple-quoted string values.
%% Parse wraps Str as `a = """<Str>"""`, hands it to binary/1 and returns
%% the value stored under key <<"a">> of the result.
%% NOTE(review): binary/1 is defined elsewhere in this test module;
%% presumably it parses the given HOCON text into a map -- confirm.
triple_quote_string_test_() ->
Parse = fun(Str) -> maps:get(<<"a">>, binary(<<"a = \"\"\"", Str/binary, "\"\"\"">>)) end,
[
?_assertEqual(<<"1">>, Parse(<<"1">>)),
?_assertEqual(<<"1">>, Parse(<<"~\n1~">>)),
?_assertEqual(<<"1\n">>, Parse(<<"~\n1\n~">>)),
?_assertEqual(<<"1\r\n">>, Parse(<<"~\r\n1\r\n">>)),
?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n1\n\n2">>)),
?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n 1\n\n 2">>)),
?_assertEqual(<<"1\n\n2">>, Parse(<<"~\n 1\n \n 2">>)),
?_assertEqual(<<" 1\n\n2">>, Parse(<<"~\n 1\n \n 2">>)),
?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n">>)),
?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n ">>)),
?_assertEqual(<<" 1\n\n2\n">>, Parse(<<"~\n 1\n \n 2\n ~">>)),
?_assertEqual(<<" 1\n\n2\n ">>, Parse(<<"~\n 1\n \n 2\n ~">>)),
?_assertEqual(<<"1\"\"\n2">>, Parse(<<"~\n 1\"\"\n 2">>)),
%% a trailing quote next to the closing """ must be escaped
?_assertEqual(<<"1\"">>, Parse(<<"1\\\"">>)),
%% a leading quote next to the opening """ must be escaped
?_assertEqual(<<"\"1">>, Parse(<<"\\\"1">>)),
%% no need to escape quotes unless it's next to """
?_assertEqual(<<"1\"2">>, Parse(<<"1\"2">>)),
%% empty string with closing quote in the next line
?_assertEqual(<<"">>, Parse(<<"~\n">>)),
%% empty string with indented closing quote in the next line
?_assertEqual(<<"">>, Parse(<<"~\n ~">>))
].
obj_inside_array_test_() ->
[
Expand Down

0 comments on commit 98cd69a

Please sign in to comment.