diff --git a/match.go b/match.go
index 759cf8c..0704ac7 100644
--- a/match.go
+++ b/match.go
@@ -69,7 +69,11 @@ func newMatch(regex *Regexp, capcount int, text []rune, startpos int) *Match {
textstart: startpos,
balancing: false,
}
- m.Name = "0"
+ // Group 0 is named "0" unless the ECMAScript option is set, in which case it stays unnamed.
+ // The flag has to be tested with &: (options|ECMAScript) == 0 can never be true for a non-zero flag.
+ if regex.options&ECMAScript == 0 {
+ m.Name = "0"
+ }
m.text = text
m.matches[0] = make([]int, 2)
return &m
diff --git a/regexp_test.go b/regexp_test.go
index ed47ee8..5732205 100644
--- a/regexp_test.go
+++ b/regexp_test.go
@@ -853,6 +853,107 @@ func TestECMANamedGroup(t *testing.T) {
}
}
+// TestECMAGroupNameUnicode checks group names in ECMAScript mode: \uXXXX and \u{...} escapes
+// are decoded into the name, \x escapes and malformed \u escapes are rejected, and a repeated
+// name is an error.
+func TestECMAGroupNameUnicode(t *testing.T) {
+ t.Run("unicode-escape", func(t *testing.T) {
+ const RE = `(?<\u03C0>a)`
+ re := MustCompile(RE, ECMAScript)
+ names := re.GetGroupNames()
+ if len(names) != 2 || names[1] != "π" {
+ t.Fatalf("Group names: %v", names)
+ }
+ // The same pattern must not compile without the ECMAScript option.
+ _, err := Compile(RE, 0)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+ })
+
+ t.Run("extended-unicode-escape", func(t *testing.T) {
+ re := MustCompile(`(?<\u{03C0}>a)`, ECMAScript|Unicode)
+ names := re.GetGroupNames()
+ if len(names) != 2 || names[1] != "π" {
+ t.Fatalf("Group names: %v", names)
+ }
+ m, err := re.FindStringMatch("bab")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if m == nil {
+ t.Fatal("Expected match")
+ }
+ if g := m.GroupByName("π"); g != nil {
+ if s := g.Capture.String(); s != "a" {
+ t.Fatalf("Group capture != a ('%s')", s)
+ }
+ } else {
+ t.Fatal("No group capture by name")
+ }
+ if g := m.GroupByNumber(1); g != nil {
+ if s := g.Capture.String(); s != "a" {
+ t.Fatalf("Group capture != a ('%s')", s)
+ }
+ } else {
+ t.Fatal("No group capture by number")
+ }
+ })
+
+ t.Run("invalid-escape-x", func(t *testing.T) {
+ _, err := Compile(`(?<\x68>>a)`, ECMAScript)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+ })
+
+ t.Run("invalid-escape-u", func(t *testing.T) {
+ _, err := Compile(`(?<\ubob>>a)`, ECMAScript)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+ })
+
+ t.Run("duplicate-name", func(t *testing.T) {
+ // Two groups share the name x: an error in ECMAScript mode, accepted with default options.
+ const RE = `(?<x>a)(?<x>a)`
+ _, err := Compile(RE, ECMAScript)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+ _, err = Compile(RE, 0)
+ if err != nil {
+ t.Fatal(err)
+ }
+ })
+
+}
+
+// TestECMANamedGroupNumberAssignment checks that in ECMAScript mode named and unnamed groups
+// are numbered together in pattern order, and that unnamed groups (group 0 included) report
+// an empty Name.
+func TestECMANamedGroupNumberAssignment(t *testing.T) {
+ // Group 1 is unnamed, 2 is x, 3 is y (a back-reference to group 1), 4 is unnamed (a back-reference to x).
+ re := MustCompile(`(.)(?<x>a)(?<y>\1)(\k<x>)`, ECMAScript)
+ m, err := re.FindStringMatch("baba")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if m == nil {
+ t.Fatal("Expected match")
+ }
+ groups := m.Groups()
+ if len(groups) != 5 {
+ t.Fatalf("Groups: %v", groups)
+ }
+ if groups[0].Name != "" || groups[0].Index != 0 || groups[0].String() != "baba" {
+ t.Fatalf("Groups[0]: %v", groups[0])
+ }
+ if groups[1].Name != "" || groups[1].Index != 0 || groups[1].String() != "b" {
+ t.Fatalf("Groups[1]: %v", groups[1])
+ }
+ if groups[2].Name != "x" || groups[2].Index != 1 || groups[2].String() != "a" {
+ t.Fatalf("Groups[2]: %v", groups[2])
+ }
+ if groups[3].Name != "y" || groups[3].Index != 2 || groups[3].String() != "b" {
+ t.Fatalf("Groups[3]: %v", groups[3])
+ }
+ if groups[4].Name != "" || groups[4].Index != 3 || groups[4].String() != "a" {
+ t.Fatalf("Groups[4]: %v", groups[4])
+ }
+}
+
func TestECMAInvalidEscapeCharClass(t *testing.T) {
re := MustCompile(`[\x0]`, ECMAScript)
if m, err := re.MatchString("x"); err != nil {
diff --git a/syntax/charclass.go b/syntax/charclass.go
index 6881a0e..fa2488b 100644
--- a/syntax/charclass.go
+++ b/syntax/charclass.go
@@ -354,6 +354,15 @@ func IsECMAWordChar(r rune) bool {
//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
}
+// IsECMAIdentifierStartChar reports whether r is '$', '_', or a member of the
+// Unicode L, Nl or Other_ID_Start tables.
+func IsECMAIdentifierStartChar(r rune) bool {
+ switch r {
+ case '$', '_':
+ return true
+ }
+ return unicode.In(r, unicode.L, unicode.Nl, unicode.Other_ID_Start)
+}
+
+// IsECMAIdentifierChar reports whether r satisfies IsECMAIdentifierStartChar, is
+// U+200C or U+200D, or is a member of the Unicode Mn, Mc, Nd, Pc or
+// Other_ID_Continue tables.
+func IsECMAIdentifierChar(r rune) bool {
+ if IsECMAIdentifierStartChar(r) || r == '\u200C' || r == '\u200D' {
+ return true
+ }
+ return unicode.In(r, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue)
+}
+
// SingletonChar will return the char from the first range without validation.
// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
func (c CharSet) SingletonChar() rune {
diff --git a/syntax/parser.go b/syntax/parser.go
index 4ff0aaa..74db3d3 100644
--- a/syntax/parser.go
+++ b/syntax/parser.go
@@ -6,6 +6,7 @@ import (
"os"
"sort"
"strconv"
+ "strings"
"unicode"
)
@@ -87,6 +88,8 @@ const (
ErrTooManyAlternates = "too many | in (?()|)"
ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
+ ErrInvalidECMAGroupName = "invalid capture group name"
+ ErrDuplicateGroupName = "duplicate capture group name"
ErrCapNumNotZero = "capture number cannot be zero"
ErrUndefinedBackRef = "reference to undefined group number %v"
ErrUndefinedNameRef = "reference to undefined group name %v"
@@ -209,18 +212,34 @@ func (p *parser) noteCaptureSlot(i, pos int) {
p.captop = i + 1
}
}
+ if p.useOptionE() {
+ p.capnamelist = append(p.capnamelist, "")
+ }
}
}
-func (p *parser) noteCaptureName(name string, pos int) {
+// noteCaptureName records the capture group name together with the pos value supplied by the caller.
+// A name that is already in capnames is left untouched, except in ECMAScript mode (useOptionE),
+// where it is reported as ErrDuplicateGroupName. In every other case the result is nil.
+func (p *parser) noteCaptureName(name string, pos int) error {
if p.capnames == nil {
p.capnames = make(map[string]int)
}
if _, ok := p.capnames[name]; !ok {
- p.capnames[name] = pos
p.capnamelist = append(p.capnamelist, name)
+ if p.useOptionE() {
+ // ECMAScript mode: the name maps to the slot returned by consumeAutocap
+ // (presumably the next automatic capture number — confirm), and that slot
+ // is entered into caps/capcount/captop right away.
+ slot := p.consumeAutocap()
+ p.caps[slot] = pos
+ p.capcount++
+ p.captop = slot + 1
+ p.capnames[name] = slot
+ } else {
+ // Other modes: the name maps to pos, as it did before this change.
+ p.capnames[name] = pos
+ }
+ } else {
+ if p.useOptionE() {
+ return p.getErr(ErrDuplicateGroupName)
+ }
}
+ return nil
}
func (p *parser) assignNameSlots() {
@@ -285,9 +304,12 @@ func (p *parser) assignNameSlots() {
} else {
//feature: culture?
- str := strconv.Itoa(j)
+ var str string
+ if !p.useOptionE() {
+ str = strconv.Itoa(j)
+ p.capnames[str] = j
+ }
p.capnamelist = append(p.capnamelist, str)
- p.capnames[str] = j
}
}
}
@@ -347,15 +369,22 @@ func (p *parser) countCaptures() error {
p.moveRight(1)
ch = p.rightChar(0)
- if ch != '0' && IsWordChar(ch) {
- if ch >= '1' && ch <= '9' {
+ if ch != '0' && p.isGroupNameStartChar(ch) {
+ if ch >= '1' && ch <= '9' && !p.useOptionE() {
dec, err := p.scanDecimal()
if err != nil {
return err
}
p.noteCaptureSlot(dec, pos)
} else {
- p.noteCaptureName(p.scanCapname(), pos)
+ cn, err := p.scanCapname()
+ if err != nil {
+ return err
+ }
+ err = p.noteCaptureName(cn, pos)
+ if err != nil {
+ return err
+ }
}
}
} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
@@ -363,7 +392,14 @@ func (p *parser) countCaptures() error {
p.moveRight(2)
ch = p.rightChar(0)
if IsWordChar(ch) {
- p.noteCaptureName(p.scanCapname(), pos)
+ cn, err := p.scanCapname()
+ if err != nil {
+ return err
+ }
+ err = p.noteCaptureName(cn, pos)
+ if err != nil {
+ return err
+ }
}
} else {
@@ -399,7 +435,9 @@ func (p *parser) countCaptures() error {
}
}
- p.assignNameSlots()
+ if !p.useOptionE() {
+ p.assignNameSlots()
+ }
return nil
}
@@ -781,7 +819,10 @@ func (p *parser) scanDollar() (*regexNode, error) {
}
}
} else if angled && IsWordChar(ch) {
- capname := p.scanCapname()
+ capname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
if p.isCaptureName(capname) {
@@ -819,6 +860,13 @@ func (p *parser) scanDollar() (*regexNode, error) {
return newRegexNodeCh(ntOne, p.options, '$'), nil
}
+// isGroupNameStartChar reports whether ch may begin a group name. With the ECMAScript
+// option that is a backslash or any rune accepted by IsECMAIdentifierStartChar; otherwise
+// it is any rune accepted by IsWordChar.
+func (p *parser) isGroupNameStartChar(ch rune) bool {
+ if !p.useOptionE() {
+ return IsWordChar(ch)
+ }
+ // A backslash is accepted as is; the escape it starts is not inspected here.
+ return ch == '\\' || IsECMAIdentifierStartChar(ch)
+}
+
// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
// a RegexNode for the type of group scanned, or nil if the group
// simply changed options (?cimsx-cimsx) or was a comment (#...).
@@ -897,7 +945,7 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
// grab part before -
- if ch >= '0' && ch <= '9' {
+ if ch >= '0' && ch <= '9' && !p.useOptionE() {
if capnum, err = p.scanDecimal(); err != nil {
return nil, err
}
@@ -913,27 +961,42 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
if capnum == 0 {
return nil, p.getErr(ErrCapNumNotZero)
}
- } else if IsWordChar(ch) {
- capname := p.scanCapname()
+ } else if p.isGroupNameStartChar(ch) {
+ capname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if p.isCaptureName(capname) {
capnum = p.captureSlotFromName(capname)
+ if p.useOptionE() {
+ // We need to keep track of the slot numbers.
+ // This works because capture names are required to be unique, however
+ // a better approach would be having a map pos->slot and use that rather than p.autocap
+ p.consumeAutocap()
+ }
}
// check if we have bogus character after the name
if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+ if p.useOptionE() {
+ return nil, p.getErr(ErrInvalidECMAGroupName)
+ }
return nil, p.getErr(ErrInvalidGroupName)
}
} else if ch == '-' {
proceed = true
} else {
// bad group name - starts with something other than a word character and isn't a number
+ if p.useOptionE() {
+ return nil, p.getErr(ErrInvalidECMAGroupName)
+ }
return nil, p.getErr(ErrInvalidGroupName)
}
// grab part after - if any
- if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
+ if !p.useOptionE() && (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
p.moveRight(1)
//no more chars left, no closing char, etc
@@ -956,7 +1019,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
return nil, p.getErr(ErrInvalidGroupName)
}
} else if IsWordChar(ch) {
- uncapname := p.scanCapname()
+ uncapname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if !p.isCaptureName(uncapname) {
return nil, p.getErr(ErrUndefinedNameRef, uncapname)
@@ -1004,7 +1070,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
return nil, p.getErr(ErrMalformedReference, capnum)
} else if IsWordChar(ch) {
- capname := p.scanCapname()
+ capname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
@@ -1051,7 +1120,10 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
if IsWordChar(ch) {
capnum := -1
- capname := p.scanCapname()
+ capname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if p.isCaptureName(capname) {
capnum = p.captureSlotFromName(capname)
@@ -1203,9 +1275,9 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
// According to ECMAScript specification, \k is only parsed as a named group reference if
// there is at least one group name in the regexp.
- // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
+ // See https://tc39.es/ecma262/2020/#sec-isvalidregularexpressionliteral, step 7.
// Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
- if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
+ if ch == 'k' && (!p.useOptionE() || p.useOptionU() || len(p.capnames) > 0) {
if p.charsRight() >= 2 {
p.moveRight(1)
ch = p.moveRightGetChar()
@@ -1271,7 +1343,10 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
}
} else if angled {
- capname := p.scanCapname()
+ capname, err := p.scanCapname()
+ if err != nil {
+ return nil, err
+ }
if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
@@ -1425,7 +1500,54 @@ func (p *parser) scanBlank() error {
return nil
}
-func (p *parser) scanCapname() string {
+// scaneCapnameECMA scans a capture group name from the current position, accepting characters
+// for which IsECMAIdentifierChar is true and decoding \uXXXX and \u{...} escapes into the rune
+// they denote. It rewinds to just before the first character that is not an identifier
+// character and returns the name read so far. An error from scanning a \u escape is returned as is.
+// NOTE(review): "scane" looks like a typo for "scan"; renaming also requires updating scanCapname.
+// NOTE(review): the first character is checked with IsECMAIdentifierChar only, so a name whose
+// first character is written as an escape is never checked against IsECMAIdentifierStartChar
+// (isGroupNameStartChar lets a leading backslash through) — confirm this is intended.
+func (p *parser) scaneCapnameECMA() (string, error) {
+ startpos := p.textpos()
+ var sb strings.Builder
+ // sb is only filled once an escape has been decoded; a name without escapes is sliced
+ // straight out of the pattern at the end.
+ hasEscape := false
+ for p.charsRight() > 0 {
+ savedpos := p.textpos()
+ ch := p.moveRightGetChar()
+ var err error
+ if ch == '\\' {
+ // Only \u is decoded. For any other backslash ch stays '\\', which fails the
+ // identifier check below and ends the name in front of the backslash.
+ if p.charsRight() > 0 && p.rightChar(0) == 'u' {
+ var r rune
+ p.moveRight(1)
+ if p.charsRight() > 0 && p.rightChar(0) == '{' {
+ // ECMAScript specification says the \u{...} syntax should only be supported in full Unicode mode
+ // (https://tc39.es/ecma262/#prod-RegExpUnicodeEscapeSequence), however every implementation
+ // I've tried happily accepts it regardless.
+ p.moveRight(1)
+ r, err = p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(4)
+ }
+ if err == nil {
+ if !hasEscape {
+ // First escape: copy the raw characters scanned before it into sb.
+ sb.WriteString(string(p.pattern[startpos:savedpos]))
+ hasEscape = true
+ }
+ // From here on the decoded rune is treated like a literal character.
+ ch = r
+ }
+ }
+ }
+ if err != nil {
+ return "", err
+ }
+ if !IsECMAIdentifierChar(ch) {
+ // Not part of the name: restore the position saved before this character and stop.
+ p.textto(savedpos)
+ break
+ }
+ if hasEscape {
+ sb.WriteRune(ch)
+ }
+ }
+ if hasEscape {
+ return sb.String(), nil
+ }
+ return string(p.pattern[startpos:p.textpos()]), nil
+}
+
+func (p *parser) scanWord() string {
startpos := p.textpos()
for p.charsRight() > 0 {
@@ -1438,6 +1560,14 @@ func (p *parser) scanCapname() string {
return string(p.pattern[startpos:p.textpos()])
}
+// scanCapname scans a group name at the current position. With the ECMAScript option it
+// returns whatever scaneCapnameECMA returns; otherwise it returns scanWord's result and a
+// nil error.
+func (p *parser) scanCapname() (string, error) {
+ if !p.useOptionE() {
+ return p.scanWord(), nil
+ }
+
+ return p.scaneCapnameECMA()
+}
+
// Scans contents of [] (not including []'s), and converts to a set.
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
ch := '\x00'
@@ -1548,7 +1678,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
p.moveRight(1)
}
- nm := p.scanCapname() // snag the name
+ nm := p.scanWord() // snag the name
if !scanOnly && p.useRE2() {
// look up the name since these are valid for RE2
// add the group based on the name