Skip to content

Commit d509f92

Browse files
ferstlfstapelberg
authored andcommitted
encoding/prototext: Support URL chars in type URLs in text-format.
Change the text-format parser to allow for more characters and formats in the type URL prefixes of expanded Any protos. This follows a recent change to the text-format spec [1,2] and the Go implementation is now almost exactly implementing this spec (with some exceptions to ensure backwards compatibility). Note that we are also making the whitespace handling between [ and ] more lenient compared to the current implementation. This is also in line with the new spec. Refs: - [1] https://protobuf.dev/reference/protobuf/textformat-spec/#characters - [2] https://protobuf.dev/reference/protobuf/textformat-spec/#field-names - go/protobuf-improved-type-url-support Change-Id: I55055becec0248c1d161e776f1937eaaa4af2066 Reviewed-on: https://go-review.googlesource.com/c/protobuf/+/726780 Reviewed-by: Michael Stapelberg <stapelberg@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Damien Neil <dneil@google.com>
1 parent b85b189 commit d509f92

3 files changed

Lines changed: 169 additions & 48 deletions

File tree

encoding/prototext/decode_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,6 +1554,13 @@ value: "some bytes"
15541554
wantMessage: &anypb.Any{
15551555
TypeUrl: "foo.com/pb2.Nested",
15561556
},
1557+
}, {
1558+
desc: "Any expanded with URL chars in type URL prefix",
1559+
inputMessage: &anypb.Any{},
1560+
inputText: `[foo.com/bar//=*+./pb2.Nested]: {}`,
1561+
wantMessage: &anypb.Any{
1562+
TypeUrl: "foo.com/bar//=*+./pb2.Nested",
1563+
},
15571564
}, {
15581565
desc: "Any expanded with missing required",
15591566
inputMessage: &anypb.Any{},

internal/encoding/text/decode.go

Lines changed: 79 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -424,67 +424,94 @@ func (d *Decoder) parseFieldName() (tok Token, err error) {
424424
return Token{}, d.newSyntaxError("invalid field name: %s", errId(d.in))
425425
}
426426

427-
// parseTypeName parses Any type URL or extension field name. The name is
428-
// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
429-
// strings. This implementation is more liberal and allows for the pattern
430-
// ^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`). Whitespaces and comments are allowed
431-
// in between [ ], '.', '/' and the sub names.
427+
// parseTypeName parses an Any type URL or an extension field name. The name is
428+
// enclosed in [ and ] characters. We allow almost arbitrary type URL prefixes,
429+
// closely following the text-format spec [1,2]. We implement "ExtensionName |
430+
// AnyName" as follows (with some exceptions for backwards compatibility):
431+
//
432+
// char = [-_a-zA-Z0-9]
433+
// url_char = char | [.~!$&'()*+,;=] | "%", hex, hex
434+
//
435+
// Ident = char, { char }
436+
// TypeName = Ident, { ".", Ident } ;
437+
// UrlPrefix = url_char, { url_char | "/" } ;
438+
// ExtensionName = "[", TypeName, "]" ;
439+
// AnyName = "[", UrlPrefix, "/", TypeName, "]" ;
440+
//
441+
// Additionally, we allow arbitrary whitespace and comments between [ and ].
442+
//
443+
// [1] https://protobuf.dev/reference/protobuf/textformat-spec/#characters
444+
// [2] https://protobuf.dev/reference/protobuf/textformat-spec/#field-names
432445
func (d *Decoder) parseTypeName() (Token, error) {
433-
startPos := len(d.orig) - len(d.in)
434446
// Use alias s to advance first in order to use d.in for error handling.
435-
// Caller already checks for [ as first character.
447+
// Caller already checks for [ as first character (d.in[0] == '[').
436448
s := consume(d.in[1:], 0)
437449
if len(s) == 0 {
438450
return Token{}, ErrUnexpectedEOF
439451
}
440452

453+
// Collect everything between [ and ] in name.
441454
var name []byte
442-
for len(s) > 0 && isTypeNameChar(s[0]) {
443-
name = append(name, s[0])
444-
s = s[1:]
445-
}
446-
s = consume(s, 0)
447-
448455
var closed bool
449456
for len(s) > 0 && !closed {
450457
switch {
451458
case s[0] == ']':
452459
s = s[1:]
453460
closed = true
454461

455-
case s[0] == '/', s[0] == '.':
456-
if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
457-
return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
458-
d.orig[startPos:len(d.orig)-len(s)+1])
459-
}
462+
case s[0] == '/' || isTypeNameChar(s[0]) || isUrlExtraChar(s[0]):
460463
name = append(name, s[0])
461-
s = s[1:]
462-
s = consume(s, 0)
463-
for len(s) > 0 && isTypeNameChar(s[0]) {
464-
name = append(name, s[0])
465-
s = s[1:]
464+
s = consume(s[1:], 0)
465+
466+
// URL percent-encoded chars
467+
case s[0] == '%':
468+
if len(s) < 3 || !isHexChar(s[1]) || !isHexChar(s[2]) {
469+
return Token{}, d.parseTypeNameError(s, 3)
466470
}
467-
s = consume(s, 0)
471+
name = append(name, s[0], s[1], s[2])
472+
s = consume(s[3:], 0)
468473

469474
default:
470-
return Token{}, d.newSyntaxError(
471-
"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
475+
return Token{}, d.parseTypeNameError(s, 1)
472476
}
473477
}
474478

475479
if !closed {
476480
return Token{}, ErrUnexpectedEOF
477481
}
478482

479-
// First character cannot be '.'. Last character cannot be '.' or '/'.
480-
size := len(name)
481-
if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
482-
return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
483-
d.orig[startPos:len(d.orig)-len(s)])
483+
// Split collected name on last '/' into urlPrefix and typeName (if '/' is
484+
// present).
485+
typeName := name
486+
if i := bytes.LastIndexByte(name, '/'); i != -1 {
487+
urlPrefix := name[:i]
488+
typeName = name[i+1:]
489+
490+
// urlPrefix may be empty (for backwards compatibility).
491+
// If non-empty, it must not start with '/'.
492+
if len(urlPrefix) > 0 && urlPrefix[0] == '/' {
493+
return Token{}, d.parseTypeNameError(s, 0)
494+
}
484495
}
485496

497+
// typeName must not be empty (note: "" splits to [""]) and all identifier
498+
// parts must not be empty.
499+
for _, ident := range bytes.Split(typeName, []byte{'.'}) {
500+
if len(ident) == 0 {
501+
return Token{}, d.parseTypeNameError(s, 0)
502+
}
503+
}
504+
505+
// typeName must not contain any percent-encoded or special URL chars.
506+
for _, b := range typeName {
507+
if b == '%' || (b != '.' && isUrlExtraChar(b)) {
508+
return Token{}, d.parseTypeNameError(s, 0)
509+
}
510+
}
511+
512+
startPos := len(d.orig) - len(d.in)
513+
endPos := len(d.orig) - len(s)
486514
d.in = s
487-
endPos := len(d.orig) - len(d.in)
488515
d.consume(0)
489516

490517
return Token{
@@ -496,16 +523,32 @@ func (d *Decoder) parseTypeName() (Token, error) {
496523
}, nil
497524
}
498525

526+
func (d *Decoder) parseTypeNameError(s []byte, numUnconsumedChars int) error {
527+
return d.newSyntaxError(
528+
"invalid type URL/extension field name: %s",
529+
d.in[:len(d.in)-len(s)+min(numUnconsumedChars, len(s))],
530+
)
531+
}
532+
533+
func isHexChar(b byte) bool {
534+
return ('0' <= b && b <= '9') ||
535+
('a' <= b && b <= 'f') ||
536+
('A' <= b && b <= 'F')
537+
}
538+
499539
func isTypeNameChar(b byte) bool {
500-
return (b == '-' || b == '_' ||
540+
return b == '-' || b == '_' ||
501541
('0' <= b && b <= '9') ||
502542
('a' <= b && b <= 'z') ||
503-
('A' <= b && b <= 'Z'))
543+
('A' <= b && b <= 'Z')
504544
}
505545

506-
func isWhiteSpace(b byte) bool {
546+
// isUrlExtraChar complements isTypeNameChar with extra characters that we allow
547+
// in URLs but not in type names. Note that '/' is not included so that it can
548+
// be treated specially.
549+
func isUrlExtraChar(b byte) bool {
507550
switch b {
508-
case ' ', '\n', '\r', '\t':
551+
case '.', '~', '!', '$', '&', '(', ')', '*', '+', ',', ';', '=':
509552
return true
510553
default:
511554
return false

internal/encoding/text/decode_test.go

Lines changed: 83 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,14 @@ func TestDecoder(t *testing.T) {
408408
{K: text.Name, T: NT{K: text.TypeName, S: "type"}, RS: "[type]"},
409409
},
410410
},
411+
{
412+
in: "[",
413+
want: []R{{E: text.ErrUnexpectedEOF.Error()}},
414+
},
415+
{
416+
in: "[]",
417+
want: []R{{E: "invalid type URL/extension field name: []"}},
418+
},
411419
{
412420
// V1 allows this syntax. C++ does not, however, C++ also fails if
413421
// field is Any and does not contain '/'.
@@ -416,6 +424,10 @@ func TestDecoder(t *testing.T) {
416424
{K: text.Name, T: NT{K: text.TypeName, S: "/type"}},
417425
},
418426
},
427+
{
428+
in: "[/domain.com/type]",
429+
want: []R{{E: "invalid type URL/extension field name: [/domain.com/type]"}},
430+
},
419431
{
420432
in: "[.type]",
421433
want: []R{{E: "invalid type URL/extension field name: [.type]"}},
@@ -451,6 +463,25 @@ func TestDecoder(t *testing.T) {
451463
},
452464
},
453465
},
466+
{
467+
// V3 allows more valid URL chars
468+
in: "[url.special.chars/.~!$&()*+,;=/type]",
469+
want: []R{
470+
{
471+
K: text.Name,
472+
T: NT{K: text.TypeName, S: "url.special.chars/.~!$&()*+,;=/type"},
473+
},
474+
},
475+
},
476+
{
477+
// V3 allows multiple slashes and empty path segments in URL prefix.
478+
in: `message_field{[val/id//type]`,
479+
want: []R{
480+
{K: text.Name},
481+
{K: text.MessageOpen},
482+
{K: text.Name, T: NT{K: text.TypeName, S: "val/id//type"}},
483+
},
484+
},
454485
{
455486
// V2 no longer allows a quoted string for the Any type URL.
456487
in: `["domain.com/pkg.type"]`,
@@ -466,14 +497,18 @@ func TestDecoder(t *testing.T) {
466497
want: []R{{E: "invalid type URL/extension field name: [pkg.Foo.extension_field:"}},
467498
},
468499
{
469-
// V2 no longer allows whitespace within identifier "word".
470-
in: "[proto.packa ge.field]",
471-
want: []R{{E: "invalid type URL/extension field name: [proto.packa g"}},
500+
// V3 allows whitespace anywhere between [ and ].
501+
in: "[ do\tmain.\ncom/pro\rto.packa ge.field ]",
502+
want: []R{
503+
{K: text.Name, T: NT{K: text.TypeName, S: "domain.com/proto.package.field"}},
504+
},
472505
},
473506
{
474-
// V2 no longer allows comments within identifier "word".
475-
in: "[proto.packa # comment\n ge.field]",
476-
want: []R{{E: "invalid type URL/extension field name: [proto.packa # comment\n g"}},
507+
// V3 allows comments anywhere between [ and ].
508+
in: "[# comment\n domain# comment\n .com/proto.packa # comment\n ge.field# comment\n]",
509+
want: []R{
510+
{K: text.Name, T: NT{K: text.TypeName, S: "domain.com/proto.package.field"}},
511+
},
477512
},
478513
{
479514
in: "[proto.package.]",
@@ -492,12 +527,48 @@ func TestDecoder(t *testing.T) {
492527
},
493528
},
494529
{
495-
in: `message_field{[invalid//type]`,
496-
want: []R{
497-
{K: text.Name},
498-
{K: text.MessageOpen},
499-
{E: `invalid type URL/extension field name: [invalid//`},
500-
},
530+
// V3 allows URL percent-encoded chars
531+
in: "[percent.encoded/%0a%1B%2c%3D%4e%F5%A6%b7%C8%f9/type]",
532+
want: []R{{
533+
K: text.Name,
534+
T: NT{K: text.TypeName, S: "percent.encoded/%0a%1B%2c%3D%4e%F5%A6%b7%C8%f9/type"},
535+
}},
536+
},
537+
{
538+
in: `[percent.encode.incomplete%/type]`,
539+
want: []R{{E: `invalid type URL/extension field name: [percent.encode.incomplete%`}},
540+
},
541+
{
542+
in: `[percent.encode.nohex%Z/type]`,
543+
want: []R{{E: `invalid type URL/extension field name: [percent.encode.nohex%Z`}},
544+
},
545+
{
546+
in: `[percent.encode.nohex%aZ/type]`,
547+
want: []R{{E: `invalid type URL/extension field name: [percent.encode.nohex%aZ`}},
548+
},
549+
{
550+
in: `[percent.encode.eof%`,
551+
want: []R{{E: `invalid type URL/extension field name: [percent.encode.eof%`}},
552+
},
553+
{
554+
in: `[percent.encode.eof%a`,
555+
want: []R{{E: `invalid type URL/extension field name: [percent.encode.eof%a`}},
556+
},
557+
{
558+
in: `[invalid.type.char/type!]`,
559+
want: []R{{E: `invalid type URL/extension field name: [invalid.type.char/type!`}},
560+
},
561+
{
562+
in: `[invalid.type.char/type=]`,
563+
want: []R{{E: `invalid type URL/extension field name: [invalid.type.char/type=`}},
564+
},
565+
{
566+
in: `[invalid.type.char/type+]`,
567+
want: []R{{E: `invalid type URL/extension field name: [invalid.type.char/type+`}},
568+
},
569+
{
570+
in: `[invalid.type.char/type%2b]`,
571+
want: []R{{E: `invalid type URL/extension field name: [invalid.type.char/type%`}},
501572
},
502573
{
503574
in: `message_field{[proto.package.]`,

0 commit comments

Comments
 (0)