Skip to content

Commit

Permalink
Merge pull request #80 from semgrep/martin/to-js-sort-rules
Browse files Browse the repository at this point in the history
Add option to sort rule definitions
  • Loading branch information
aryx authored Jun 25, 2024
2 parents c085bc5 + 67c7a48 commit d70955e
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 77 deletions.
29 changes: 25 additions & 4 deletions src/gen/bin/Ocaml_tree_sitter_main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ type simplify_conf = {
type to_js_conf = {
input_path: string option;
output_path: string option;
sort_choices: bool;
sort_rules: bool;
}

type cmd_conf =
Expand Down Expand Up @@ -61,7 +63,10 @@ let simplify (conf : simplify_conf) =
Simplify_grammar.run conf.grammar conf.output_path

let to_js (conf : to_js_conf) =
To_JS.run conf.input_path conf.output_path
To_JS.run
~sort_choices:conf.sort_choices
~sort_rules:conf.sort_rules
conf.input_path conf.output_path

let run conf =
safe_run (fun () ->
Expand Down Expand Up @@ -160,6 +165,20 @@ let to_js_cmd =
in
Arg.value (Arg.pos 1 Arg.(some string) None info) in

let sort_choices_term : bool Term.t =
let info = Arg.info ["sort-choices"]
~doc:"Sort the elements of the 'choice()' constructs. \
This may not preserve the parsing behavior perfectly."
in
Arg.value (Arg.flag info) in

let sort_rules_term : bool Term.t =
let info = Arg.info ["sort-rules"]
~doc:"Sort the rule definitions alphabetically. The first rule \
remains first because it is the grammar's entry point."
in
Arg.value (Arg.flag info) in

let doc =
"recover a tree-sitter grammar.js from grammar.json" in

Expand All @@ -172,13 +191,15 @@ let to_js_cmd =
https://github.com/returntocorp/ocaml-tree-sitter/issues.";
] in
let info = Term.info ~doc ~man "to-js" in
let config input_path output_path =
To_JS { input_path; output_path }
let config input_path output_path sort_choices sort_rules =
To_JS { input_path; output_path; sort_choices; sort_rules }
in
let cmdline_term = Term.(
const config
$ input_path_term
$ output_path_term) in
$ output_path_term
$ sort_choices_term
$ sort_rules_term) in
(cmdline_term, info)

let gen_cmd =
Expand Down
194 changes: 123 additions & 71 deletions src/gen/lib/To_JS.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,38 @@ let pattern s =
let rule s =
sprintf "$.%s" s

(*
Move the leading underscore to the end of the string for sorting purposes:
"_foo" -> "foo_"
*)
let move_leading_underscore str =
if str <> "" && str.[0] = '_' then
String.sub str 1 (String.length str - 1) ^ "_"
else
str

(*
Compare rule names alphabetically, using the leading '_' only as a
last-resort disambiguator to allow 'foo' and '_foo' to appear in the
same region when comparing two grammars.
*)
let compare_rule_name a b =
String.compare (move_leading_underscore a) (move_leading_underscore b)

(*
This is for sorting choice() elements
*)
let compare_rule_body (a : rule_body) (b : rule_body) =
match a, b with
| SYMBOL a, SYMBOL b -> compare_rule_name a b
| _ -> compare a b

let pp_conflict ?prefix:_ ?(is_last = true) rule_list =
let rules = List.map rule rule_list |> String.concat ", " in
[ Line (sprintf "[%s]" rules |> comma is_last) ]

let rec map (f : ?prefix:string -> ?is_last:bool -> _ -> _) xs =
let rec map
(f : ?prefix:string -> ?is_last:bool -> _ -> _) xs =
match xs with
| [] -> []
| [x] -> [f ~is_last:true x]
Expand All @@ -46,76 +73,90 @@ let pp_prec_value ~is_last x =
let s = string_of_prec_value x in
pp_value ~is_last s

let rec pp_body ?(prefix = "") ?(is_last = true) body =
let cons, args =
match body with
| SYMBOL s -> rule s, None
| STRING s -> str s, None
| PATTERN s -> pattern s, None
| BLANK -> "\"\" /* blank */", None
| REPEAT x -> "repeat", Some (pp_body x)
| REPEAT1 x -> "repeat1", Some (pp_body x)
| CHOICE [x; BLANK] -> "optional", Some (pp_body x)
| CHOICE xs -> "choice", Some (map pp_body xs |> flatten)
| SEQ xs -> "seq", Some (map pp_body xs |> flatten)
| PREC (prec_value, x) ->
"prec", Some [
Inline (pp_prec_value ~is_last:false prec_value);
Inline (pp_body ~is_last:true x);
(*
The 'sort_choices' setting is the same for all recursive calls
unlike the other options 'prefix' and 'is_last'.
*)
let pp_body ~sort_choices ?prefix ?is_last body =
let rec pp_body ?(prefix = "") ?(is_last = true) body =
let cons, args =
match body with
| SYMBOL s -> rule s, None
| STRING s -> str s, None
| PATTERN s -> pattern s, None
| BLANK -> "\"\" /* blank */", None
| REPEAT x -> "repeat", Some (pp_body x)
| REPEAT1 x -> "repeat1", Some (pp_body x)
| CHOICE [x; BLANK] -> "optional", Some (pp_body x)
| CHOICE xs ->
let xs =
if sort_choices then
List.sort compare_rule_body xs
else
xs
in
"choice", Some (map pp_body xs |> flatten)
| SEQ xs -> "seq", Some (map pp_body xs |> flatten)
| PREC (prec_value, x) ->
"prec", Some [
Inline (pp_prec_value ~is_last:false prec_value);
Inline (pp_body ~is_last:true x);
]
| PREC_DYNAMIC (n, x) ->
"prec.dynamic", Some [
Inline (pp_value ~is_last:false (string_of_int n));
Inline (pp_body ~is_last:true x)
]
| PREC_LEFT (opt_prec_value, x) ->
"prec.left", Some (pp_opt_prec opt_prec_value x)
| PREC_RIGHT (opt_prec_value, x) ->
"prec.right", Some (pp_opt_prec opt_prec_value x)
| ALIAS x -> "alias", Some (pp_alias x)
| FIELD (name, x) -> "field", Some (pp_field name x)
| IMMEDIATE_TOKEN x -> "token.immediate", Some (pp_body x)
| TOKEN x -> "token", Some (pp_body x)
in
match args with
| None ->
[ Line (sprintf "%s%s" prefix (cons |> comma is_last)) ]
| Some args ->
[
Line (sprintf "%s%s(" prefix cons);
Block args;
Line (")" |> comma is_last)
]
| PREC_DYNAMIC (n, x) ->
"prec.dynamic", Some [
Inline (pp_value ~is_last:false (string_of_int n));
Inline (pp_body ~is_last:true x)

and pp_opt_prec opt_prec_value x =
let body = pp_body ~is_last:true x in
match opt_prec_value with
| None -> body
| Some prec_value ->
[
Inline (pp_prec_value ~is_last:false prec_value);
Inline body;
]
| PREC_LEFT (opt_prec_value, x) ->
"prec.left", Some (pp_opt_prec opt_prec_value x)
| PREC_RIGHT (opt_prec_value, x) ->
"prec.right", Some (pp_opt_prec opt_prec_value x)
| ALIAS x -> "alias", Some (pp_alias x)
| FIELD (name, x) -> "field", Some (pp_field name x)
| IMMEDIATE_TOKEN x -> "token.immediate", Some (pp_body x)
| TOKEN x -> "token", Some (pp_body x)
in
match args with
| None ->
[ Line (sprintf "%s%s" prefix (cons |> comma is_last)) ]
| Some args ->
[
Line (sprintf "%s%s(" prefix cons);
Block args;
Line (")" |> comma is_last)
]

and pp_opt_prec opt_prec_value x =
let body = pp_body ~is_last:true x in
match opt_prec_value with
| None -> body
| Some prec_value ->
[
Inline (pp_prec_value ~is_last:false prec_value);
Inline body;
]

and pp_alias x =
let new_name =
match x.named with
| true -> rule x.value
| false -> str x.value
in
[
Inline (pp_body ~is_last:false x.content);
Line new_name;
]

and pp_field name x =
[
Line (str name ^ ",");
Inline (pp_body ~is_last:true x);
]
and pp_alias x =
let new_name =
match x.named with
| true -> rule x.value
| false -> str x.value
in
[
Inline (pp_body ~is_last:false x.content);
Line new_name;
]

and pp_field name x =
[
Line (str name ^ ",");
Inline (pp_body ~is_last:true x);
]
in
pp_body ?prefix ?is_last body

let pp_rule ?prefix:_ ?is_last (name, body) =
pp_body ~prefix:(sprintf "%s: $ => " name) ?is_last body
let pp_rule ~sort_choices ?prefix:_ ?is_last (name, body) =
pp_body ~sort_choices ~prefix:(sprintf "%s: $ => " name) ?is_last body

let pp_word (x : ident option) =
match x with
Expand All @@ -135,7 +176,18 @@ let pp_precedence_level ?prefix:_ ?(is_last = true) level =
in
[ Line (sprintf "[%s]" level |> comma is_last) ]

let pp_grammar (x : grammar) : Indent.t =
let pp_grammar ~sort_choices ~sort_rules (x : grammar) : Indent.t =
let rules =
if sort_rules then
match x.rules with
| entrypoint :: other_rules ->
entrypoint
:: List.sort (fun (a, _) (b, _) -> compare_rule_name a b) other_rules
| [] -> []
else
x.rules
in
let pp_body = pp_body ~sort_choices in
[
Line
"// JavaScript grammar recovered from JSON by 'ocaml-tree-sitter to-js'";
Expand Down Expand Up @@ -166,21 +218,21 @@ let pp_grammar (x : grammar) : Indent.t =
Block (map pp_body x.extras |> flatten);
Line "],";
Line "rules: {";
Block (map pp_rule x.rules |> flatten);
Block (map (pp_rule ~sort_choices) rules |> flatten);
Line "}";
];
Line "});";
]

let run input_path output_path =
let run ~sort_choices ~sort_rules input_path output_path =
let grammar =
match input_path with
| None ->
Atdgen_runtime.Util.Json.from_channel Tree_sitter_j.read_grammar stdin
| Some file ->
Atdgen_runtime.Util.Json.from_file Tree_sitter_j.read_grammar file
in
let tree = pp_grammar grammar in
let tree = pp_grammar ~sort_choices ~sort_rules grammar in
match output_path with
| None -> Indent.to_channel stdout tree
| Some file -> Indent.to_file file tree
16 changes: 14 additions & 2 deletions src/gen/lib/To_JS.mli
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,17 @@
debugging.
*)

(* Usage: run (Some "grammar.json") (Some "grammar.js") *)
val run : string option -> string option -> unit
(* Usage: run (Some "grammar.json") (Some "grammar.js")
sort_choices: sort the elements of the 'choice()' constructs.
This normalization may not completely preserve the same parsing behavior
but in general it should and it's convenient for comparing two grammars.
sort_rules: sort all the rule definitions alphabetically except for the
first one because it must stay in place to be identified by tree-sitter
as the grammar's entry point.
*)
val run :
sort_choices:bool ->
sort_rules:bool ->
string option -> string option -> unit

0 comments on commit d70955e

Please sign in to comment.