Properly support parsing regexp in extended mode

kddnewton · kddnewton · commit bedc4585ed81 · 2024-07-01T10:42:06.000-04:00
diff --git a/fuzz/regexp.c b/fuzz/regexp.c
@@ -15,7 +15,7 @@ harness(const uint8_t *input, size_t size) {
     pm_parser_t parser;
     pm_parser_init(&parser, input, size, NULL);
 
-    pm_regexp_parse(&parser, input, size, regexp_name_callback, NULL, regexp_error_callback, NULL);
+    pm_regexp_parse(&parser, input, size, false, regexp_name_callback, NULL, regexp_error_callback, NULL);
 
     pm_parser_free(&parser);
 }
diff --git a/include/prism/regexp.h b/include/prism/regexp.h
@@ -32,11 +32,12 @@ typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *
  * @param parser The parser that is currently being used.
  * @param source The source code to parse.
  * @param size The size of the source code.
+ * @param extended_mode Whether to parse the regular expression in extended mode.
  * @param name_callback The optional callback to call when a named capture group is found.
  * @param name_data The optional data to pass to the name callback.
  * @param error_callback The callback to call when a parse error is found.
  * @param error_data The data to pass to the error callback.
  */
-PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
+PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
 
 #endif
diff --git a/src/prism.c b/src/prism.c
@@ -17393,7 +17393,7 @@ parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_
         .shared = unescaped->type == PM_STRING_SHARED
     };
 
-    pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), NULL, NULL, parse_regular_expression_error, &error_data);
+    pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data);
 }
 
 /**
@@ -20164,7 +20164,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
  * match write node.
  */
 static pm_node_t *
-parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) {
+parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) {
     parse_regular_expression_named_capture_data_t callback_data = {
         .parser = parser,
         .call = call,
@@ -20179,7 +20179,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
         .shared = content->type == PM_STRING_SHARED
     };
 
-    pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
+    pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
     pm_constant_id_list_free(&callback_data.names);
 
     if (callback_data.match != NULL) {
@@ -20674,14 +20674,14 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
                     pm_string_t owned;
                     pm_string_owned_init(&owned, (uint8_t *) memory, total_length);
 
-                    result = parse_regular_expression_named_captures(parser, &owned, call);
+                    result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
                     pm_string_free(&owned);
                 }
             } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) {
                 // If we have a regular expression node, then we can just parse
                 // the named captures directly off the unescaped string.
                 const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped;
-                result = parse_regular_expression_named_captures(parser, content, call);
+                result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
             }
 
             return result;
diff --git a/src/regexp.c b/src/regexp.c
@@ -18,6 +18,12 @@ typedef struct {
     /** A pointer to the end of the source that we are parsing. */
     const uint8_t *end;
 
+    /**
+     * Whether or not the regular expression currently being parsed is in
+     * extended mode, wherein whitespace is ignored and comments are allowed.
+     */
+    bool extended_mode;
+
     /** Whether the encoding has changed from the default. */
     bool encoding_changed;
 
@@ -418,6 +424,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
     return false;
 }
 
+/**
+ * Returns true only when the given key lies within the tracked option slot range and its slot currently holds PM_REGEXP_OPTION_STATE_ADDED.
+ */
+static bool
+pm_regexp_options_added_p(pm_regexp_options_t *options, uint8_t key) {
+    if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
+        key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); // rebase the key so it indexes options->values from zero
+        return options->values[key] == PM_REGEXP_OPTION_STATE_ADDED;
+    }

+    return false; // keys outside the slot range are never reported as added
+}
+
 /**
  * Groups can have quite a few different patterns for syntax. They basically
  * just wrap a set of expressions, but they can potentially have options after a
@@ -443,16 +462,16 @@ static bool
 pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
     const uint8_t *group_start = parser->cursor;
 
+    pm_regexp_options_t options;
+    pm_regexp_options_init(&options);
+
     // First, parse any options for the group.
     if (pm_regexp_char_accept(parser, '?')) {
         if (pm_regexp_char_is_eof(parser)) {
             pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
             return false;
         }
 
-        pm_regexp_options_t options;
-        pm_regexp_options_init(&options);
-
         switch (*parser->cursor) {
             case '#': { // inline comments
                 parser->cursor++;
@@ -560,6 +579,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
                     return false;
                 }
 
+                // If we are at the end of the group of options and there is no
+                // subexpression, then we are going to be setting the options
+                // for the parent group. In this case we are safe to return now.
+                if (*parser->cursor == ')') {
+                    if (pm_regexp_options_added_p(&options, 'x')) parser->extended_mode = true;
+                    parser->cursor++;
+                    return true;
+                }
+
                 // If we hit a -, then we're done parsing options.
                 if (*parser->cursor != '-') break;
 
@@ -577,6 +605,16 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
                 if (pm_regexp_char_is_eof(parser)) {
                     return false;
                 }
+
+                // If we are at the end of the group of options and there is no
+                // subexpression, then we are going to be setting the options
+                // for the parent group. In this case we are safe to return now.
+                if (*parser->cursor == ')') {
+                    if (pm_regexp_options_added_p(&options, 'x')) parser->extended_mode = true;
+                    parser->cursor++;
+                    return true;
+                }
+
                 break;
             default:
                 parser->cursor++;
@@ -585,15 +623,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
         }
     }
 
+    bool extended_mode = parser->extended_mode;
+    if (pm_regexp_options_added_p(&options, 'x')) {
+        parser->extended_mode = true;
+    }
+
     // Now, parse the expressions within this group.
     while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
         if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
+            parser->extended_mode = extended_mode;
             return false;
         }
         pm_regexp_char_accept(parser, '|');
     }
 
     // Finally, make sure we have a closing parenthesis.
+    parser->extended_mode = extended_mode;
     if (pm_regexp_char_expect(parser, ')')) return true;
 
     pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
@@ -641,6 +686,12 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
             parser->cursor++;
             pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
             return true;
+        case '#':
+            if (parser->extended_mode) {
+                if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
+                return true;
+            }
+        /* fallthrough */
         default: {
             size_t width;
             if (!parser->encoding_changed) {
@@ -702,12 +753,13 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
  * groups.
  */
 PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
+pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
     pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
         .parser = parser,
         .start = source,
         .cursor = source,
         .end = source + size,
+        .extended_mode = extended_mode,
         .encoding_changed = parser->encoding_changed,
         .encoding = parser->encoding,
         .name_callback = name_callback,

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ harness(const uint8_t *input, size_t size) {`
`15`	`15`	`pm_parser_t parser;`
`16`	`16`	`pm_parser_init(&parser, input, size, NULL);`
`17`	`17`
`18`		`- pm_regexp_parse(&parser, input, size, regexp_name_callback, NULL, regexp_error_callback, NULL);`
	`18`	`+ pm_regexp_parse(&parser, input, size, false, regexp_name_callback, NULL, regexp_error_callback, NULL);`
`19`	`19`
`20`	`20`	`pm_parser_free(&parser);`
`21`	`21`	`}`