@@ -18,6 +18,12 @@ typedef struct {
18
18
/** A pointer to the end of the source that we are parsing. */
19
19
const uint8_t * end ;
20
20
21
+ /**
22
+ * Whether or not the regular expression currently being parsed is in
23
+ * extended mode, wherein whitespace is ignored and comments are allowed.
24
+ */
25
+ bool extended_mode ;
26
+
21
27
/** Whether the encoding has changed from the default. */
22
28
bool encoding_changed ;
23
29
@@ -418,6 +424,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
418
424
return false;
419
425
}
420
426
427
+ /**
428
+ * True if the given key is set in the options.
429
+ */
430
+ static bool
431
+ pm_regexp_options_added_p (pm_regexp_options_t * options , uint8_t key ) {
432
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM ) {
433
+ key = (uint8_t ) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM );
434
+ return options -> values [key ] == PM_REGEXP_OPTION_STATE_ADDED ;
435
+ }
436
+
437
+ return false;
438
+ }
439
+
421
440
/**
422
441
* Groups can have quite a few different patterns for syntax. They basically
423
442
* just wrap a set of expressions, but they can potentially have options after a
@@ -443,16 +462,16 @@ static bool
443
462
pm_regexp_parse_group (pm_regexp_parser_t * parser , uint16_t depth ) {
444
463
const uint8_t * group_start = parser -> cursor ;
445
464
465
+ pm_regexp_options_t options ;
466
+ pm_regexp_options_init (& options );
467
+
446
468
// First, parse any options for the group.
447
469
if (pm_regexp_char_accept (parser , '?' )) {
448
470
if (pm_regexp_char_is_eof (parser )) {
449
471
pm_regexp_parse_error (parser , group_start , parser -> cursor , "end pattern in group" );
450
472
return false;
451
473
}
452
474
453
- pm_regexp_options_t options ;
454
- pm_regexp_options_init (& options );
455
-
456
475
switch (* parser -> cursor ) {
457
476
case '#' : { // inline comments
458
477
parser -> cursor ++ ;
@@ -560,6 +579,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
560
579
return false;
561
580
}
562
581
582
+ // If we are at the end of the group of options and there is no
583
+ // subexpression, then we are going to be setting the options
584
+ // for the parent group. In this case we are safe to return now.
585
+ if (* parser -> cursor == ')' ) {
586
+ if (pm_regexp_options_added_p (& options , 'x' )) parser -> extended_mode = true;
587
+ parser -> cursor ++ ;
588
+ return true;
589
+ }
590
+
563
591
// If we hit a -, then we're done parsing options.
564
592
if (* parser -> cursor != '-' ) break ;
565
593
@@ -577,6 +605,16 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
577
605
if (pm_regexp_char_is_eof (parser )) {
578
606
return false;
579
607
}
608
+
609
+ // If we are at the end of the group of options and there is no
610
+ // subexpression, then we are going to be setting the options
611
+ // for the parent group. In this case we are safe to return now.
612
+ if (* parser -> cursor == ')' ) {
613
+ if (pm_regexp_options_added_p (& options , 'x' )) parser -> extended_mode = true;
614
+ parser -> cursor ++ ;
615
+ return true;
616
+ }
617
+
580
618
break ;
581
619
default :
582
620
parser -> cursor ++ ;
@@ -585,15 +623,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
585
623
}
586
624
}
587
625
626
+ bool extended_mode = parser -> extended_mode ;
627
+ if (pm_regexp_options_added_p (& options , 'x' )) {
628
+ parser -> extended_mode = true;
629
+ }
630
+
588
631
// Now, parse the expressions within this group.
589
632
while (!pm_regexp_char_is_eof (parser ) && * parser -> cursor != ')' ) {
590
633
if (!pm_regexp_parse_expression (parser , (uint16_t ) (depth + 1 ))) {
634
+ parser -> extended_mode = extended_mode ;
591
635
return false;
592
636
}
593
637
pm_regexp_char_accept (parser , '|' );
594
638
}
595
639
596
640
// Finally, make sure we have a closing parenthesis.
641
+ parser -> extended_mode = extended_mode ;
597
642
if (pm_regexp_char_expect (parser , ')' )) return true;
598
643
599
644
pm_regexp_parse_error (parser , group_start , parser -> cursor , "end pattern with unmatched parenthesis" );
@@ -641,6 +686,12 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
641
686
parser -> cursor ++ ;
642
687
pm_regexp_parse_error (parser , parser -> cursor - 1 , parser -> cursor , "unmatched close parenthesis" );
643
688
return true;
689
+ case '#' :
690
+ if (parser -> extended_mode ) {
691
+ if (!pm_regexp_char_find (parser , '\n' )) parser -> cursor = parser -> end ;
692
+ return true;
693
+ }
694
+ /* fallthrough */
644
695
default : {
645
696
size_t width ;
646
697
if (!parser -> encoding_changed ) {
@@ -702,12 +753,13 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
702
753
* groups.
703
754
*/
704
755
PRISM_EXPORTED_FUNCTION void
705
- pm_regexp_parse (pm_parser_t * parser , const uint8_t * source , size_t size , pm_regexp_name_callback_t name_callback , void * name_data , pm_regexp_error_callback_t error_callback , void * error_data ) {
756
+ pm_regexp_parse (pm_parser_t * parser , const uint8_t * source , size_t size , bool extended_mode , pm_regexp_name_callback_t name_callback , void * name_data , pm_regexp_error_callback_t error_callback , void * error_data ) {
706
757
pm_regexp_parse_pattern (& (pm_regexp_parser_t ) {
707
758
.parser = parser ,
708
759
.start = source ,
709
760
.cursor = source ,
710
761
.end = source + size ,
762
+ .extended_mode = extended_mode ,
711
763
.encoding_changed = parser -> encoding_changed ,
712
764
.encoding = parser -> encoding ,
713
765
.name_callback = name_callback ,
0 commit comments