Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions experimental/parser/lex.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ func Lex(ctx token.Context, errs *report.Report) {
} else {
text = l.SeekEOF()
}

l.Push(len("//")+len(text), token.Comment)

case r == '/' && l.Peek() == '*':
l.cursor++ // Skip the *.

Expand All @@ -116,7 +118,9 @@ func Lex(ctx token.Context, errs *report.Report) {
l.Error(ErrUnmatched{Span: l.SpanFrom(l.cursor - 2)})
text = l.SeekEOF()
}

l.Push(len("/*")+len(text), token.Comment)

case r == '*' && l.Peek() == '/':
l.cursor++ // Skip the /.

Expand Down
199 changes: 195 additions & 4 deletions experimental/parser/lex_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
package parser

import (
"fmt"
"slices"
"strings"
"unicode/utf8"

Expand All @@ -31,11 +33,12 @@ type lexer struct {
cursor, count int

braces []token.ID
}

func (l *lexer) Push(length int, kind token.Kind) token.Token {
l.count++
return l.Stream.Push(length, kind)
prev token.ID // The last non-skippable token.

firstCommentSincePrev token.ID
firstCommentOnSameLine bool
parStart, parEnd token.ID
}

func (l *lexer) Cursor() int {
Expand Down Expand Up @@ -107,6 +110,194 @@ func (l *lexer) SpanFrom(start int) report.Span {
return l.Span(start, l.cursor)
}

// Push pushes a token of the given length and kind onto the stream and
// performs comment attribution: comments seen since the previous
// non-skippable token are classified as trailing (donated to that previous
// token), leading (attached to the new token), or detached, and runs of
// adjacent line comments are fused into paragraphs.
//
// NOTE(review): the debug fmt.Println calls that were here have been removed;
// if nothing else in this file uses fmt, the "fmt" import should be dropped
// as well — confirm against the full file.
func (l *lexer) Push(length int, kind token.Kind) token.Token {
	l.count++
	prev := l.prev.In(l.Context)
	tok := l.Stream.Push(length, kind)
	// NOTE: tok will have the Stream rather than l.Context as its context,
	// which will cause issues when we call NewCursor below.
	tok = tok.ID().In(l.Context)

	// NOTE: For the purposes of attributing comments, we need to know what line
	// certain offsets are at. Although we could track this as we advance cursor,
	// we instead use other methods to determine if two tokens are on the same
	// line. This is for a couple of reasons.
	//
	// 1. Getting a line number from the line index is O(log n), but we can
	//    instead use strings.Index and friends in some places without going
	//    quadratic.
	//
	// 2. Having to examine every character directly locks us out of using e.g.
	//    strings.Index for certain operations, which is much more efficient
	//    than the naive for loop.

	switch {
	case tok.Kind() == token.Comment:
		isLineComment := strings.HasPrefix(tok.Text(), "//")

		if l.firstCommentSincePrev.Nil() {
			l.firstCommentSincePrev = tok.ID()

			if !prev.Nil() && l.newLinesBetween(prev, tok, 1) == 0 {
				// The first comment is always in a paragraph by itself if there
				// is no newline between it and the comment start.
				l.firstCommentOnSameLine = true
				break
			}
		}

		if !isLineComment {
			// Block comments cannot be made into paragraphs, so we must
			// interrupt the current paragraph.
			l.fuseParagraph()
			break
		}

		// Start building up a line comment paragraph if there isn't one
		// currently.
		if l.parStart.Nil() {
			l.parStart = tok.ID()
		}
		l.parEnd = tok.ID()

	case tok.Kind() == token.Space:
		// Note that line comments contain their newlines, except for a line
		// comment at the end of the file. Thus, seeing a new line in a space
		// token means we are interrupting a line comment paragraph, and thus
		// we must fuse the current paragraph.
		if strings.Contains(tok.Text(), "\n") {
			l.fuseParagraph()
		}

	default:
		l.fuseParagraph()
		//nolint:dupword // False positive due to comments describing an algorithm.
		if !l.firstCommentSincePrev.Nil() {
			comments := token.NewCursor(l.firstCommentSincePrev.In(l.Context), tok)
			var first, second, penultimate, last token.Token
			for { // Don't use l.Done() here, that tosses comment tokens.
				next := comments.PopSkippable()
				if next.Nil() {
					break
				} else if next.Kind() == token.Comment {
					switch {
					case first.Nil():
						first = next
					case second.Nil():
						second = next
					}
					penultimate = last
					last = next
				}
			}

			// Determine if we need to donate first to the previous comment.
			var donate bool
			switch {
			case prev.Nil():
				donate = false
			case l.firstCommentOnSameLine:
				donate = true
			case l.newLinesBetween(prev, first, 2) < 2:
				// Now we need to check the remaining three criteria for
				// donate. These are:
				//
				// 1. Is there more than one comment.
				// 2. Is the token one of the closers ), ], or } (but not
				//    >).
				// 3. The line of the current token minus the end line of
				//    the first comment is greater than one.
				switch {
				case !second.Nil():
					donate = true
				case slices.Contains([]string{")", "]", "}"}, tok.Text()):
					donate = true
				case l.newLinesBetween(first, tok, 2) > 1:
					donate = true
				}
			}

			if donate {
				prev.Comments().SetTrailing(first)
				first = second
			}

			// The leading comment must have precisely one newline between
			// it and the new token.
			if !first.Nil() && !last.Nil() && l.newLinesBetween(last, tok, 2) == 1 {
				tok.Comments().SetLeading(last)
				last = penultimate
			}

			// Check if we have any detached comments left. This is the case
			// when first and last are both non-nil and <=. If we donated the
			// only comment, second will have been nil, so first is now nil.
			//
			// If we attached the only remaining comment after donating a
			// comment, we would have had the following value evolution for
			// first, second, penultimate and last:
			//
			//   before donate: a, b, a, b
			//   after donate:  b, b, a, b
			//   after attach:  b, b, a, a
			//
			// Thus, when we check b < a, we find that we have nothing left to
			// attach.
			if !first.Nil() && !last.Nil() && first.ID() <= last.ID() {
				tok.Comments().SetDetachedRange(first, last)
			}

			l.firstCommentSincePrev = 0
			l.firstCommentOnSameLine = false
		}

		l.prev = tok.ID()
	}
	return tok
}

// fuseParagraph fuses the in-progress run of line comments into a single
// paragraph token, then resets the paragraph state. A paragraph consisting
// of a single comment (or no paragraph at all) is left untouched.
func (l *lexer) fuseParagraph() {
	start, end := l.parStart, l.parEnd
	l.parStart = 0

	if start.Nil() || end == start {
		return
	}
	token.Fuse(start.In(l.Context), end.In(l.Context))
}

// newLinesBetween counts the number of \n characters between the end of a
// and the start of b, up to max.
//
// The final rune of a is included in this count, since comments may end in a
// \n rune.
//
//nolint:revive,predeclared // Complains about redefining max.
func (l *lexer) newLinesBetween(a, b token.Token, max int) int {
	end := a.Span().End
	if end > 0 {
		// Back up one byte so the final rune of a is visible to the count.
		end--
	}

	rest := l.Text()[end:b.Span().Start]

	total := 0
	for total < max {
		nl := strings.Index(rest, "\n")
		if nl < 0 {
			break
		}
		rest = rest[nl+1:]
		total++
	}
	return total
}

// mustProgress returns a progress checker for this lexer.
func (l *lexer) mustProgress() mustProgress {
return mustProgress{l, -1}
Expand Down
17 changes: 17 additions & 0 deletions experimental/parser/lex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/bufbuild/protocompile/experimental/report"
"github.com/bufbuild/protocompile/experimental/token"
"github.com/bufbuild/protocompile/internal/golden"
"github.com/bufbuild/protocompile/internal/iters"
)

func TestRender(t *testing.T) {
Expand Down Expand Up @@ -87,6 +88,22 @@ func TestRender(t *testing.T) {
}
}

comments := tok.Comments()
iters.Enumerate(comments.Detached())(func(i int, t token.Token) bool {
if i == 0 {
fmt.Fprintf(&tsv, "\t\tdetached:%v", t.ID())
} else {
fmt.Fprintf(&tsv, ",%v", t.ID())
}
return true
})
if leading := comments.Leading(); !leading.Nil() {
fmt.Fprintf(&tsv, "\t\tleading:%v", leading.ID())
}
if trailing := comments.Trailing(); !trailing.Nil() {
fmt.Fprintf(&tsv, "\t\ttrailing:%v", trailing.ID())
}

tsv.WriteByte('\n')
return true
})
Expand Down
35 changes: 35 additions & 0 deletions experimental/parser/testdata/lexer/comments/attribution.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// This, as expected, is a leading comment for Foo.
message Foo {
// This is the TRAILING comment for Foo. (It is NOT
// a detached comment for baz.)

// leading comment for baz
string baz = 1;
// trailing comment for baz
}
// This is NOT a trailing comment. It's also not considered
// a detached comment for Bar. It is discarded.

// This IS a detached comment for Bar.

// A leading comment for Bar.
message Bar {
}

string name = 1; // trailing comment for name
// leading comment for id
uint64 id = 2;

previousToken // this comment
// won't get merged into a
// group with these two lines
/* block comments */ /* are always their own groups */ // line comments
// can usually get joined into
// groups with adjacent lines

// empty lines separate groups
// indentation does not impact grouping
/* a single block
* comment can span lines
*/
currentToken
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# kind offsets linecol text
0 Comment 000:052 001:001 "// This, as expected, is a leading comment for Foo.\n"
1 Ident 052:059 002:001 "message" leading:Token(0)
2 Space 059:060 002:008 " "
3 Ident 060:063 002:009 "Foo"
4 Space 063:064 002:012 " "
5 Punct 064:233 002:013 "{" close:Token(23) trailing:Token(7)
6 Space 065:068 002:014 "\n "
7 Comment 068:154 003:003 "// This is the TRAILING comment for Foo. (It is NOT\n" close:Token(9)
8 Space 120:122 004:001 " "
9 Comment 068:154 003:003 "// a detached comment for baz.)\n" open:Token(7)
10 Space 154:157 005:001 "\n "
11 Comment 157:184 006:003 "// leading comment for baz\n"
12 Space 184:186 007:001 " "
13 Ident 186:192 007:003 "string" leading:Token(11)
14 Space 192:193 007:009 " "
15 Ident 193:196 007:010 "baz"
16 Space 196:197 007:013 " "
17 Punct 197:198 007:014 "="
18 Space 198:199 007:015 " "
19 Number 199:200 007:016 "1" int:1
20 Punct 200:201 007:017 ";" trailing:Token(22)
21 Space 201:204 007:018 "\n "
22 Comment 204:232 008:003 "// trailing comment for baz\n"
23 Punct 064:233 002:013 "}" open:Token(5) trailing:Token(25)
24 Space 233:234 009:002 "\n"
25 Comment 234:342 010:001 "// This is NOT a trailing comment. It's also not considered\n" close:Token(26)
26 Comment 234:342 010:001 "// a detached comment for Bar. It is discarded.\n" open:Token(25)
27 Space 342:343 012:001 "\n"
28 Comment 343:382 013:001 "// This IS a detached comment for Bar.\n"
29 Space 382:383 014:001 "\n"
30 Comment 383:413 015:001 "// A leading comment for Bar.\n"
31 Ident 413:420 016:001 "message" detached:Token(28) leading:Token(30)
32 Space 420:421 016:008 " "
33 Ident 421:424 016:009 "Bar"
34 Space 424:425 016:012 " "
35 Punct 425:428 016:013 "{" close:Token(37)
36 Space 426:427 016:014 "\n"
37 Punct 425:428 016:013 "}" open:Token(35)
38 Space 428:430 017:002 "\n\n"
39 Ident 430:436 019:001 "string"
40 Space 436:437 019:007 " "
41 Ident 437:441 019:008 "name"
42 Space 441:442 019:012 " "
43 Punct 442:443 019:013 "="
44 Space 443:444 019:014 " "
45 Number 444:445 019:015 "1" int:1
46 Punct 445:446 019:016 ";" trailing:Token(48)
47 Space 446:447 019:017 " "
48 Comment 447:476 019:018 "// trailing comment for name\n"
49 Comment 476:502 020:001 "// leading comment for id\n"
50 Ident 502:508 021:001 "uint64" leading:Token(49)
51 Space 508:509 021:007 " "
52 Ident 509:511 021:008 "id"
53 Space 511:512 021:010 " "
54 Punct 512:513 021:011 "="
55 Space 513:514 021:012 " "
56 Number 514:515 021:013 "2" int:2
57 Punct 515:516 021:014 ";"
58 Space 516:518 021:015 "\n\n"
59 Ident 518:531 023:001 "previousToken" trailing:Token(61)
60 Space 531:532 023:014 " "
61 Comment 532:548 023:015 "// this comment\n"
62 Comment 548:605 024:001 "// won't get merged into a\n" close:Token(63)
63 Comment 548:605 024:001 "// group with these two lines\n" open:Token(62)
64 Comment 605:625 026:001 "/* block comments */"
65 Space 625:626 026:021 " "
66 Comment 626:659 026:022 "/* are always their own groups */"
67 Space 659:660 026:055 " "
68 Comment 660:738 026:056 "// line comments\n" close:Token(70)
69 Comment 677:708 027:001 "// can usually get joined into\n"
70 Comment 660:738 026:056 "// groups with adjacent lines\n" open:Token(68)
71 Space 738:742 029:001 "\n "
72 Comment 742:813 030:004 "// empty lines separate groups\n" close:Token(73)
73 Comment 742:813 030:004 "// indentation does not impact grouping\n" open:Token(72)
74 Comment 813:860 032:001 "/* a single block\n * comment can span lines\n */"
75 Space 860:861 034:004 "\n"
76 Ident 861:873 035:001 "currentToken" detached:Token(62),Token(64),Token(66),Token(68),Token(72) leading:Token(74)
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# kind offsets linecol text
0 Comment 000:039 001:001 "/*\n Nesting\n /* is not allowed */"
1 Space 039:040 003:025 "\n"
2 Unrecognized 040:042 004:001 "*/"
2 Unrecognized 040:042 004:001 "*/" leading:Token(0)
Loading
Loading