Skip to content

Commit bedc458

Browse files
committed
Properly support parsing regexp in extended mode
1 parent 6b78f53 commit bedc458

File tree

4 files changed

+64
-11
lines changed

4 files changed

+64
-11
lines changed

fuzz/regexp.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ harness(const uint8_t *input, size_t size) {
1515
pm_parser_t parser;
1616
pm_parser_init(&parser, input, size, NULL);
1717

18-
pm_regexp_parse(&parser, input, size, regexp_name_callback, NULL, regexp_error_callback, NULL);
18+
pm_regexp_parse(&parser, input, size, false, regexp_name_callback, NULL, regexp_error_callback, NULL);
1919

2020
pm_parser_free(&parser);
2121
}

include/prism/regexp.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@ typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *
3232
* @param parser The parser that is currently being used.
3333
* @param source The source code to parse.
3434
* @param size The size of the source code.
35+
* @param extended_mode Whether to parse the regular expression in extended mode.
3536
* @param name_callback The optional callback to call when a named capture group is found.
3637
* @param name_data The optional data to pass to the name callback.
3738
* @param error_callback The callback to call when a parse error is found.
3839
* @param error_data The data to pass to the error callback.
3940
*/
40-
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
41+
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
4142

4243
#endif

src/prism.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -17393,7 +17393,7 @@ parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_
1739317393
.shared = unescaped->type == PM_STRING_SHARED
1739417394
};
1739517395

17396-
pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), NULL, NULL, parse_regular_expression_error, &error_data);
17396+
pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data);
1739717397
}
1739817398

1739917399
/**
@@ -20164,7 +20164,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
2016420164
* match write node.
2016520165
*/
2016620166
static pm_node_t *
20167-
parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) {
20167+
parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) {
2016820168
parse_regular_expression_named_capture_data_t callback_data = {
2016920169
.parser = parser,
2017020170
.call = call,
@@ -20179,7 +20179,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
2017920179
.shared = content->type == PM_STRING_SHARED
2018020180
};
2018120181

20182-
pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
20182+
pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
2018320183
pm_constant_id_list_free(&callback_data.names);
2018420184

2018520185
if (callback_data.match != NULL) {
@@ -20674,14 +20674,14 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
2067420674
pm_string_t owned;
2067520675
pm_string_owned_init(&owned, (uint8_t *) memory, total_length);
2067620676

20677-
result = parse_regular_expression_named_captures(parser, &owned, call);
20677+
result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
2067820678
pm_string_free(&owned);
2067920679
}
2068020680
} else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) {
2068120681
// If we have a regular expression node, then we can just parse
2068220682
// the named captures directly off the unescaped string.
2068320683
const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped;
20684-
result = parse_regular_expression_named_captures(parser, content, call);
20684+
result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED));
2068520685
}
2068620686

2068720687
return result;

src/regexp.c

+56-4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ typedef struct {
1818
/** A pointer to the end of the source that we are parsing. */
1919
const uint8_t *end;
2020

21+
/**
22+
* Whether or not the regular expression currently being parsed is in
23+
* extended mode, wherein whitespace is ignored and comments are allowed.
24+
*/
25+
bool extended_mode;
26+
2127
/** Whether the encoding has changed from the default. */
2228
bool encoding_changed;
2329

@@ -418,6 +424,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
418424
return false;
419425
}
420426

427+
/**
428+
* Returns true if the given option key is currently recorded as added in the
* given options, i.e. the entry for that key in options->values is equal to
* PM_REGEXP_OPTION_STATE_ADDED. Keys that fall outside of the range
* PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM through
* PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM (inclusive) are not looked up and
* always yield false.
429+
*/
430+
static bool
431+
pm_regexp_options_added_p(pm_regexp_options_t *options, uint8_t key) {
432+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
433+
// Rebase the key so that the minimum slot key maps to index 0 of values.
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
434+
return options->values[key] == PM_REGEXP_OPTION_STATE_ADDED;
435+
}
436+
437+
// The key is outside of the tracked slot range, so it cannot be added.
return false;
438+
}
439+
421440
/**
422441
* Groups can have quite a few different patterns for syntax. They basically
423442
* just wrap a set of expressions, but they can potentially have options after a
@@ -443,16 +462,16 @@ static bool
443462
pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
444463
const uint8_t *group_start = parser->cursor;
445464

465+
pm_regexp_options_t options;
466+
pm_regexp_options_init(&options);
467+
446468
// First, parse any options for the group.
447469
if (pm_regexp_char_accept(parser, '?')) {
448470
if (pm_regexp_char_is_eof(parser)) {
449471
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
450472
return false;
451473
}
452474

453-
pm_regexp_options_t options;
454-
pm_regexp_options_init(&options);
455-
456475
switch (*parser->cursor) {
457476
case '#': { // inline comments
458477
parser->cursor++;
@@ -560,6 +579,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
560579
return false;
561580
}
562581

582+
// If we are at the end of the group of options and there is no
583+
// subexpression, then we are going to be setting the options
584+
// for the parent group. In this case we are safe to return now.
585+
if (*parser->cursor == ')') {
586+
if (pm_regexp_options_added_p(&options, 'x')) parser->extended_mode = true;
587+
parser->cursor++;
588+
return true;
589+
}
590+
563591
// If we hit a -, then we're done parsing options.
564592
if (*parser->cursor != '-') break;
565593

@@ -577,6 +605,16 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
577605
if (pm_regexp_char_is_eof(parser)) {
578606
return false;
579607
}
608+
609+
// If we are at the end of the group of options and there is no
610+
// subexpression, then we are going to be setting the options
611+
// for the parent group. In this case we are safe to return now.
612+
if (*parser->cursor == ')') {
613+
if (pm_regexp_options_added_p(&options, 'x')) parser->extended_mode = true;
614+
parser->cursor++;
615+
return true;
616+
}
617+
580618
break;
581619
default:
582620
parser->cursor++;
@@ -585,15 +623,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
585623
}
586624
}
587625

626+
bool extended_mode = parser->extended_mode;
627+
if (pm_regexp_options_added_p(&options, 'x')) {
628+
parser->extended_mode = true;
629+
}
630+
588631
// Now, parse the expressions within this group.
589632
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
590633
if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
634+
parser->extended_mode = extended_mode;
591635
return false;
592636
}
593637
pm_regexp_char_accept(parser, '|');
594638
}
595639

596640
// Finally, make sure we have a closing parenthesis.
641+
parser->extended_mode = extended_mode;
597642
if (pm_regexp_char_expect(parser, ')')) return true;
598643

599644
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
@@ -641,6 +686,12 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
641686
parser->cursor++;
642687
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
643688
return true;
689+
case '#':
690+
if (parser->extended_mode) {
691+
if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
692+
return true;
693+
}
694+
/* fallthrough */
644695
default: {
645696
size_t width;
646697
if (!parser->encoding_changed) {
@@ -702,12 +753,13 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
702753
* groups.
703754
*/
704755
PRISM_EXPORTED_FUNCTION void
705-
pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
756+
pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
706757
pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
707758
.parser = parser,
708759
.start = source,
709760
.cursor = source,
710761
.end = source + size,
762+
.extended_mode = extended_mode,
711763
.encoding_changed = parser->encoding_changed,
712764
.encoding = parser->encoding,
713765
.name_callback = name_callback,

0 commit comments

Comments
 (0)