diff --git a/README.md b/README.md index f92f8b1..c8166a4 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,7 @@ The internals of `regexp2` always operate on `[]rune` so `Index` and `Length` da | named back reference `\k'name'` | no | yes | | named ascii character class `[[:foo:]]`| yes | no (yes in RE2 compat mode) | | conditionals `(?(expr)yes\|no)` | no | yes | +| PCRE capture group order | no | no (yes in MaintainCaptureOrder mode) | ## RE2 compatibility mode The default behavior of `regexp2` is to match the .NET regexp engine, however the `RE2` option is provided to change the parsing to increase compatibility with RE2. Using the `RE2` option when compiling a regexp will not take away any features, but will change the following behaviors: @@ -90,6 +91,20 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch { This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?). +## MaintainCaptureOrder mode +The default behavior of `regexp2` is to match the .NET regexp engine, which, unlike PCRE, does not number capture groups strictly from left to right: named capture groups are appended after all of the unnamed capture groups. Using the `MaintainCaptureOrder` option when compiling a regexp will number named and unnamed capture groups together, in the order they appear in the pattern.
+ +```go +re := regexp2.MustCompile(`(?<first>This) (is) a (?<last>test)`, regexp2.MaintainCaptureOrder) +if match, _ := re.FindStringMatch(`This is a test`); match != nil { + // match.Groups()[1].String() == "This" + // match.Groups()[1].Name == "first" + // match.Groups()[2].String() == "is" + // match.Groups()[2].Name == "2" + // match.Groups()[3].String() == "test" + // match.Groups()[3].Name == "last" +} +``` ## Library features that I'm still working on - Regex split diff --git a/regexp.go b/regexp.go index 7c7b01d..5179468 100644 --- a/regexp.go +++ b/regexp.go @@ -121,6 +121,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 (regexp package) compatibility mode + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func (re *Regexp) RightToLeft() bool { diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go new file mode 100644 index 0000000..bde5f4c --- /dev/null +++ b/regexp_MaintainCaptureOrder_test.go @@ -0,0 +1,406 @@ +package regexp2 + +import ( + "testing" +) + +func TestMaintainCaptureOrder_Basic(t *testing.T) { + r, err := Compile("(?this).+?(testing).+?(?stuff)", MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := `this is a testing stuff` + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got
:= `this`, m.GroupByName(`first`).String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, m.regex.GroupNameFromNumber(1); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `2`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, m.GroupByNumber(3).String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_Mode_Not_Enabled(t *testing.T) { + r, err := Compile("(?this).+?(testing).+?(?stuff)", 0) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := `this is a testing stuff` + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `1`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, m.GroupByName(`first`).String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, m.regex.GroupNameFromNumber(2); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, 
groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, m.GroupByNumber(3).String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { + r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a \ntesting stuff" + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `2`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_No_Capture_Groups(t 
*testing.T) { + r, err := Compile("this.+?testing.+?stuff", MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := `this is a testing stuff` + m, err := r.FindStringMatch(text) + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := 1, len(groups); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_NestedCaptures(t *testing.T) { + r, err := Compile( + `(?This)(?(.)+?(?testing)).+?(some.+?(other).+?(?stuff)) (?\k)`, MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a testing some other stuff testing" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ` is a testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `second`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, 
got) + } + if want, got := groups[2].String(), groups[2].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ` `, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `a`, groups[3].Captures[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `3`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `test`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `some other stuff`, groups[5].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `5`, groups[5].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `other`, groups[6].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `6`, groups[6].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[7].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `last`, groups[7].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := 8, len(groups); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_RE2_And_NumBackref(t *testing.T) { + r, err := Compile( + `(?'first'This).+?(?Ptesting) (some).+?(?<4>stuff) \2`, MaintainCaptureOrder | RE2) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := "This is a testing some other stuff testing" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", 
m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `This`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `testing`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `test`, groups[2].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `some`, groups[3].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `3`, groups[3].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `stuff`, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `4`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} + +func TestMaintainCaptureOrder_Balancing_Conditional_Alternation(t *testing.T) { + r, err := Compile( + `^[^<>]*(((?'Open'<)[^<>]*)+((?'Close-Open'>)[^<>]*)+)*(?(Open)(?!))$`, MaintainCaptureOrder) + // t.Logf("code dump: %v", r.code.Dump()) + if err != nil { + t.Errorf("unexpected compile err: %v", err) + } + text := ">" + m, err := r.FindStringMatch(text) + + if err != nil { + t.Errorf("unexpected match err: %v", err) + } + if m == nil { + t.Error("Nil match, expected success") + } else { + //t.Logf("Match: %v", m.dump()) + } + + groups := m.Groups() + if want, got := text, m.String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := text, groups[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + 
} + if want, got := ``, groups[1].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[1].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `1`, groups[1].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := ``, groups[4].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `>`, groups[4].Captures[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `4`, groups[4].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `mno`, groups[5].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `abc`, groups[5].Captures[0].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `xyz`, groups[5].Captures[1].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `mno`, groups[5].Captures[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `Close`, groups[5].Name; want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } +} diff --git a/syntax/parser.go b/syntax/parser.go index d86f332..d75ee90 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -22,6 +22,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 compat mode + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func optionFromCode(ch rune) RegexOptions { @@ -129,8 +130,9 @@ type parser struct { captop int capsize int - caps map[int]int - capnames map[string]int + caps map[int]int + capnames 
map[string]int + capnamenums map[string]int capnumlist []int capnamelist []string @@ -214,6 +216,17 @@ func (p *parser) noteCaptureName(name string, pos int) { p.capnames = make(map[string]int) } + if p.useMaintainCaptureOrder() { + if p.capnamenums == nil { + p.capnamenums = make(map[string]int) + } + + if _, ok := p.capnamenums[name]; !ok { + p.capnamenums[name] = p.autocap + p.noteCaptureSlot(p.consumeAutocap(), pos) + } + } + if _, ok := p.capnames[name]; !ok { p.capnames[name] = pos p.capnamelist = append(p.capnamelist, name) @@ -221,6 +234,15 @@ func (p *parser) noteCaptureName(name string, pos int) { } func (p *parser) assignNameSlots() { + if p.useMaintainCaptureOrder() { + p.capnames = p.capnamenums + // Prepend `0` to capnamelist if it's not set (MaintainCaptureOrder was enabled inline) + if len(p.capnamelist) == 0 || p.capnamelist[0] != `0` { + p.capnamelist = append([]string{fmt.Sprint(0)}, p.capnamelist...) + } + return + } + if p.capnames != nil { for _, name := range p.capnamelist { for p.isCaptureSlot(p.autocap) { @@ -301,7 +323,11 @@ func (p *parser) consumeAutocap() int { func (p *parser) countCaptures() error { var ch rune - p.noteCaptureSlot(0, 0) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(0), 0) + } else { + p.noteCaptureSlot(0, 0) + } p.autocap = 1 @@ -350,7 +376,11 @@ func (p *parser) countCaptures() error { if err != nil { return err } - p.noteCaptureSlot(dec, pos) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(dec), pos) + } else { + p.noteCaptureSlot(dec, pos) + } } else { p.noteCaptureName(p.scanCapname(), pos) } @@ -386,7 +416,11 @@ func (p *parser) countCaptures() error { } } else { if !p.useOptionN() && !p.ignoreNextParen { - p.noteCaptureSlot(p.consumeAutocap(), pos) + if p.useMaintainCaptureOrder() { + p.noteCaptureName(fmt.Sprint(p.autocap), pos) + } else { + p.noteCaptureSlot(p.consumeAutocap(), pos) + } } } } @@ -921,6 +955,12 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { if 
p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { return nil, p.getErr(ErrInvalidGroupName) } + + if capnum != -1 && p.useMaintainCaptureOrder() { + // Successfully scanned a named capture group so we need to increment + // our cap number to maintain the order + p.consumeAutocap() + } } else if ch == '-' { proceed = true } else { @@ -1062,6 +1102,9 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { // actually make the node if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { + if p.useMaintainCaptureOrder() { + p.consumeAutocap() + } return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil } goto BreakRecognize @@ -1968,6 +2011,11 @@ func (p *parser) useRE2() bool { return (p.options & RE2) != 0 } +// true to use MaintainCaptureOrder parsing behavior. +func (p *parser) useMaintainCaptureOrder() bool { + return (p.options & MaintainCaptureOrder) != 0 +} + // True if options stack is empty. func (p *parser) emptyOptionsStack() bool { return len(p.optionsStack) == 0