From fb16ed8d932a94d1530e3e097e5f039cc5e1e8dc Mon Sep 17 00:00:00 2001 From: Dmitry Panov Date: Tue, 22 Apr 2025 23:46:49 +0100 Subject: [PATCH 1/2] Improved compatibility for named capture groups for ECMA mode --- regexp_test.go | 83 +++++++++++++++++++++ syntax/charclass.go | 9 +++ syntax/parser.go | 174 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 244 insertions(+), 22 deletions(-) diff --git a/regexp_test.go b/regexp_test.go index ed47ee8..b316839 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -853,6 +853,89 @@ func TestECMANamedGroup(t *testing.T) { } } +func TestECMAGroupNameUnicode(t *testing.T) { + t.Run("unicode-escape", func(t *testing.T) { + re := MustCompile(`(?<\u03C0>a)`, ECMAScript) + names := re.GetGroupNames() + if len(names) != 2 || names[1] != "π" { + t.Fatalf("Group names: %v", names) + } + }) + + t.Run("extended-unicode-escape", func(t *testing.T) { + re := MustCompile(`(?<\u{03C0}>a)`, ECMAScript|Unicode) + names := re.GetGroupNames() + if len(names) != 2 || names[1] != "π" { + t.Fatalf("Group names: %v", names) + } + m, err := re.FindStringMatch("bab") + if err != nil { + t.Fatal(err) + } + if m == nil { + t.Fatal("Expected match") + } + if g := m.GroupByName("π"); g != nil { + if s := g.Capture.String(); s != "a" { + t.Fatalf("Group capture != a ('%s')", s) + } + } else { + t.Fatal("No group capture by name") + } + if g := m.GroupByNumber(1); g != nil { + if s := g.Capture.String(); s != "a" { + t.Fatalf("Group capture != a ('%s')", s) + } + } else { + t.Fatal("No group capture by number") + } + }) + + t.Run("invalid-escape-x", func(t *testing.T) { + _, err := Compile(`(?<\x68>>a)`, ECMAScript) + if err == nil { + t.Fatal("Expected error") + } + }) + + t.Run("invalid-escape-u", func(t *testing.T) { + _, err := Compile(`(?<\ubob>>a)`, ECMAScript) + if err == nil { + t.Fatal("Expected error") + } + }) + + t.Run("duplicate-name", func(t *testing.T) { + _, err := Compile(`(?a)(?a)`, ECMAScript) + if err == nil { + 
t.Fatal("Expected error") + } + }) + +} + +func TestECMANamedGroupNumberAssignment(t *testing.T) { + re := MustCompile(`(.)(?a)(?\1)(\k)`, ECMAScript) + m, err := re.FindStringMatch("baba") + if err != nil { + t.Fatal(err) + } + if m == nil { + t.Fatal("Expected match") + } + groups := m.Groups() + if len(groups) != 5 { + t.Fatalf("Groups: %v", groups) + } + if groups[0].Name != "0" || groups[0].Index != 0 || groups[0].String() != "baba" { + t.Fatalf("Groups[0]: %v", groups[0]) + } + + for _, group := range m.Groups() { + t.Log(group.Index, group.Name, group.String()) + } +} + func TestECMAInvalidEscapeCharClass(t *testing.T) { re := MustCompile(`[\x0]`, ECMAScript) if m, err := re.MatchString("x"); err != nil { diff --git a/syntax/charclass.go b/syntax/charclass.go index 6881a0e..fa2488b 100644 --- a/syntax/charclass.go +++ b/syntax/charclass.go @@ -354,6 +354,15 @@ func IsECMAWordChar(r rune) bool { //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' } +func IsECMAIdentifierStartChar(r rune) bool { + return r == '$' || r == '_' || unicode.In(r, unicode.L, unicode.Nl, unicode.Other_ID_Start) +} + +func IsECMAIdentifierChar(r rune) bool { + return IsECMAIdentifierStartChar(r) || r == '\u200C' || r == '\u200D' || + unicode.In(r, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue) +} + // SingletonChar will return the char from the first range without validation. 
// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input func (c CharSet) SingletonChar() rune { diff --git a/syntax/parser.go b/syntax/parser.go index 4ff0aaa..74db3d3 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -6,6 +6,7 @@ import ( "os" "sort" "strconv" + "strings" "unicode" ) @@ -87,6 +88,8 @@ const ( ErrTooManyAlternates = "too many | in (?()|)" ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" + ErrInvalidECMAGroupName = "invalid capture group name" + ErrDuplicateGroupName = "duplicate capture group name" ErrCapNumNotZero = "capture number cannot be zero" ErrUndefinedBackRef = "reference to undefined group number %v" ErrUndefinedNameRef = "reference to undefined group name %v" @@ -209,18 +212,34 @@ func (p *parser) noteCaptureSlot(i, pos int) { p.captop = i + 1 } } + if p.useOptionE() { + p.capnamelist = append(p.capnamelist, "") + } } } -func (p *parser) noteCaptureName(name string, pos int) { +func (p *parser) noteCaptureName(name string, pos int) error { if p.capnames == nil { p.capnames = make(map[string]int) } if _, ok := p.capnames[name]; !ok { - p.capnames[name] = pos p.capnamelist = append(p.capnamelist, name) + if p.useOptionE() { + slot := p.consumeAutocap() + p.caps[slot] = pos + p.capcount++ + p.captop = slot + 1 + p.capnames[name] = slot + } else { + p.capnames[name] = pos + } + } else { + if p.useOptionE() { + return p.getErr(ErrDuplicateGroupName) + } } + return nil } func (p *parser) assignNameSlots() { @@ -285,9 +304,12 @@ func (p *parser) assignNameSlots() { } else { //feature: culture? 
- str := strconv.Itoa(j) + var str string + if !p.useOptionE() { + str = strconv.Itoa(j) + p.capnames[str] = j + } p.capnamelist = append(p.capnamelist, str) - p.capnames[str] = j } } } @@ -347,15 +369,22 @@ func (p *parser) countCaptures() error { p.moveRight(1) ch = p.rightChar(0) - if ch != '0' && IsWordChar(ch) { - if ch >= '1' && ch <= '9' { + if ch != '0' && p.isGroupNameStartChar(ch) { + if ch >= '1' && ch <= '9' && !p.useOptionE() { dec, err := p.scanDecimal() if err != nil { return err } p.noteCaptureSlot(dec, pos) } else { - p.noteCaptureName(p.scanCapname(), pos) + cn, err := p.scanCapname() + if err != nil { + return err + } + err = p.noteCaptureName(cn, pos) + if err != nil { + return err + } } } } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { @@ -363,7 +392,14 @@ func (p *parser) countCaptures() error { p.moveRight(2) ch = p.rightChar(0) if IsWordChar(ch) { - p.noteCaptureName(p.scanCapname(), pos) + cn, err := p.scanCapname() + if err != nil { + return err + } + err = p.noteCaptureName(cn, pos) + if err != nil { + return err + } } } else { @@ -399,7 +435,9 @@ func (p *parser) countCaptures() error { } } - p.assignNameSlots() + if !p.useOptionE() { + p.assignNameSlots() + } return nil } @@ -781,7 +819,10 @@ func (p *parser) scanDollar() (*regexNode, error) { } } } else if angled && IsWordChar(ch) { - capname := p.scanCapname() + capname, err := p.scanCapname() + if err != nil { + return nil, err + } if p.charsRight() > 0 && p.moveRightGetChar() == '}' { if p.isCaptureName(capname) { @@ -819,6 +860,13 @@ func (p *parser) scanDollar() (*regexNode, error) { return newRegexNodeCh(ntOne, p.options, '$'), nil } +func (p *parser) isGroupNameStartChar(ch rune) bool { + if p.useOptionE() { + return IsECMAIdentifierStartChar(ch) || ch == '\\' + } + return IsWordChar(ch) +} + // scanGroupOpen scans chars following a '(' (not counting the '('), and returns // a RegexNode for the type of group scanned, or nil if the 
group // simply changed options (?cimsx-cimsx) or was a comment (#...). @@ -897,7 +945,7 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { // grab part before - - if ch >= '0' && ch <= '9' { + if ch >= '0' && ch <= '9' && !p.useOptionE() { if capnum, err = p.scanDecimal(); err != nil { return nil, err } @@ -913,27 +961,42 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { if capnum == 0 { return nil, p.getErr(ErrCapNumNotZero) } - } else if IsWordChar(ch) { - capname := p.scanCapname() + } else if p.isGroupNameStartChar(ch) { + capname, err := p.scanCapname() + if err != nil { + return nil, err + } if p.isCaptureName(capname) { capnum = p.captureSlotFromName(capname) + if p.useOptionE() { + // We need to keep track of the slot numbers. + // This works because capture names are required to be unique, however + // a better approach would be having a map pos->slot and use that rather than p.autocap + p.consumeAutocap() + } } // check if we have bogus character after the name if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { + if p.useOptionE() { + return nil, p.getErr(ErrInvalidECMAGroupName) + } return nil, p.getErr(ErrInvalidGroupName) } } else if ch == '-' { proceed = true } else { // bad group name - starts with something other than a word character and isn't a number + if p.useOptionE() { + return nil, p.getErr(ErrInvalidECMAGroupName) + } return nil, p.getErr(ErrInvalidGroupName) } // grab part after - if any - if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { + if !p.useOptionE() && (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { p.moveRight(1) //no more chars left, no closing char, etc @@ -956,7 +1019,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { return nil, p.getErr(ErrInvalidGroupName) } } else if IsWordChar(ch) { - uncapname := p.scanCapname() + uncapname, err := p.scanCapname() + if err != nil { + return nil, err + } if 
!p.isCaptureName(uncapname) { return nil, p.getErr(ErrUndefinedNameRef, uncapname) @@ -1004,7 +1070,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { return nil, p.getErr(ErrMalformedReference, capnum) } else if IsWordChar(ch) { - capname := p.scanCapname() + capname, err := p.scanCapname() + if err != nil { + return nil, err + } if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil @@ -1051,7 +1120,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { if IsWordChar(ch) { capnum := -1 - capname := p.scanCapname() + capname, err := p.scanCapname() + if err != nil { + return nil, err + } if p.isCaptureName(capname) { capnum = p.captureSlotFromName(capname) @@ -1203,9 +1275,9 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { // According to ECMAScript specification, \k is only parsed as a named group reference if // there is at least one group name in the regexp. - // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7. + // See https://tc39.es/ecma262/2020/#sec-isvalidregularexpressionliteral, step 7. // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok. 
- if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) { + if ch == 'k' && (!p.useOptionE() || p.useOptionU() || len(p.capnames) > 0) { if p.charsRight() >= 2 { p.moveRight(1) ch = p.moveRightGetChar() @@ -1271,7 +1343,10 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { } } else if angled { - capname := p.scanCapname() + capname, err := p.scanCapname() + if err != nil { + return nil, err + } if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close { @@ -1425,7 +1500,54 @@ func (p *parser) scanBlank() error { return nil } -func (p *parser) scanCapname() string { +func (p *parser) scaneCapnameECMA() (string, error) { + startpos := p.textpos() + var sb strings.Builder + hasEscape := false + for p.charsRight() > 0 { + savedpos := p.textpos() + ch := p.moveRightGetChar() + var err error + if ch == '\\' { + if p.charsRight() > 0 && p.rightChar(0) == 'u' { + var r rune + p.moveRight(1) + if p.charsRight() > 0 && p.rightChar(0) == '{' { + // ECMAScript specification says the \u{...} syntax should only be supported in full Unicode mode + // (https://tc39.es/ecma262/#prod-RegExpUnicodeEscapeSequence), however every implementation + // I've tried happily accepts it regardless. 
+ p.moveRight(1) + r, err = p.scanHexUntilBrace() + } else { + r, err = p.scanHex(4) + } + if err == nil { + if !hasEscape { + sb.WriteString(string(p.pattern[startpos:savedpos])) + hasEscape = true + } + ch = r + } + } + } + if err != nil { + return "", err + } + if !IsECMAIdentifierChar(ch) { + p.textto(savedpos) + break + } + if hasEscape { + sb.WriteRune(ch) + } + } + if hasEscape { + return sb.String(), nil + } + return string(p.pattern[startpos:p.textpos()]), nil +} + +func (p *parser) scanWord() string { startpos := p.textpos() for p.charsRight() > 0 { @@ -1438,6 +1560,14 @@ func (p *parser) scanCapname() string { return string(p.pattern[startpos:p.textpos()]) } +func (p *parser) scanCapname() (string, error) { + if p.useOptionE() { + return p.scaneCapnameECMA() + } + + return p.scanWord(), nil +} + // Scans contents of [] (not including []'s), and converts to a set. func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { ch := '\x00' @@ -1548,7 +1678,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { p.moveRight(1) } - nm := p.scanCapname() // snag the name + nm := p.scanWord() // snag the name if !scanOnly && p.useRE2() { // look up the name since these are valid for RE2 // add the group based on the name From 93eb3e7bfcf172c296a8ced4a723cbdbf2db4667 Mon Sep 17 00:00:00 2001 From: Dmitry Panov Date: Wed, 23 Apr 2025 21:10:41 +0100 Subject: [PATCH 2/2] Do not name group 0 in ECMAScript mode, added tests to ensure the changes do not affect non-ECMAScript mode. 
--- match.go | 4 +++- regexp_test.go | 30 ++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/match.go b/match.go index 759cf8c..0704ac7 100644 --- a/match.go +++ b/match.go @@ -69,7 +69,9 @@ func newMatch(regex *Regexp, capcount int, text []rune, startpos int) *Match { textstart: startpos, balancing: false, } - m.Name = "0" + if regex.options&ECMAScript == 0 { + m.Name = "0" + } m.text = text m.matches[0] = make([]int, 2) return &m diff --git a/regexp_test.go b/regexp_test.go index b316839..5732205 100644 --- a/regexp_test.go +++ b/regexp_test.go @@ -855,11 +855,16 @@ func TestECMANamedGroup(t *testing.T) { func TestECMAGroupNameUnicode(t *testing.T) { t.Run("unicode-escape", func(t *testing.T) { - re := MustCompile(`(?<\u03C0>a)`, ECMAScript) + const RE = `(?<\u03C0>a)` + re := MustCompile(RE, ECMAScript) names := re.GetGroupNames() if len(names) != 2 || names[1] != "π" { t.Fatalf("Group names: %v", names) } + _, err := Compile(RE, 0) + if err == nil { + t.Fatal("Expected error") + } }) t.Run("extended-unicode-escape", func(t *testing.T) { @@ -906,10 +911,15 @@ func TestECMAGroupNameUnicode(t *testing.T) { }) t.Run("duplicate-name", func(t *testing.T) { - _, err := Compile(`(?a)(?a)`, ECMAScript) + const RE = `(?a)(?a)` + _, err := Compile(RE, ECMAScript) if err == nil { t.Fatal("Expected error") } + _, err = Compile(RE, 0) + if err != nil { + t.Fatal(err) + } }) } @@ -927,12 +937,20 @@ func TestECMANamedGroupNumberAssignment(t *testing.T) { if len(groups) != 5 { t.Fatalf("Groups: %v", groups) } - if groups[0].Name != "0" || groups[0].Index != 0 || groups[0].String() != "baba" { + if groups[0].Name != "" || groups[0].Index != 0 || groups[0].String() != "baba" { t.Fatalf("Groups[0]: %v", groups[0]) } - - for _, group := range m.Groups() { - t.Log(group.Index, group.Name, group.String()) + if groups[1].Name != "" || groups[1].Index != 0 || groups[1].String() != "b" { + t.Fatalf("Groups[1]: %v", groups[1]) + } + if 
groups[2].Name != "x" || groups[2].Index != 1 || groups[2].String() != "a" { + t.Fatalf("Groups[2]: %v", groups[2]) + } + if groups[3].Name != "y" || groups[3].Index != 2 || groups[3].String() != "b" { + t.Fatalf("Groups[3]: %v", groups[3]) + } + if groups[4].Name != "" || groups[4].Index != 3 || groups[4].String() != "a" { + t.Fatalf("Groups[4]: %v", groups[4]) } }