Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit b2e550c

Browse files
feat(search): Enable improved symbol parsing for large repos (when using Rockskip) (#63988)
During an investigation, we saw that Rockskip was not using scip-ctags for symbol parsing when applicable. This means that: 1. Rockskip is getting less-than-optimal symbols for certain languages (like Go); 2. Rockskip is getting no symbols for languages not in universal ctags (Magik). This PR attempts to solve this problem by updating Rockskip to re-use the ctags parser pool logic from the symbol service. ### Key Changes - Update parser pool to be re-usable - Push common logic for parser type detection into the parser pool module - Update rockskip service config to take a parser pool - Update and add unit/integration tests ## Questions - What performance impact will using this pooled parser have compared to its previous behavior of spawning a new ctags process each time? ## Test plan - [x] Add unit tests - [x] Update integration tests - [x] Manually test rockskip - [x] Manually test symbolservice (in case of regression) --------- Co-authored-by: Keegan Carruthers-Smith <[email protected]>
1 parent 60d450b commit b2e550c

File tree

18 files changed

+333
-146
lines changed

18 files changed

+333
-146
lines changed

cmd/symbols/internal/api/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ go_test(
5656
"//cmd/symbols/internal/gitserver",
5757
"//cmd/symbols/internal/parser",
5858
"//internal/api",
59+
"//internal/conf",
5960
"//internal/ctags_config",
6061
"//internal/database/dbmocks",
6162
"//internal/diskcache",

cmd/symbols/internal/api/handler_test.go

Lines changed: 74 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package api
33
import (
44
"context"
55
"net/http/httptest"
6+
"sort"
67
"testing"
78
"time"
89

@@ -17,6 +18,7 @@ import (
1718
"github.com/sourcegraph/sourcegraph/cmd/symbols/internal/fetcher"
1819
"github.com/sourcegraph/sourcegraph/cmd/symbols/internal/gitserver"
1920
"github.com/sourcegraph/sourcegraph/cmd/symbols/internal/parser"
21+
"github.com/sourcegraph/sourcegraph/internal/conf"
2022
"github.com/sourcegraph/sourcegraph/internal/ctags_config"
2123
"github.com/sourcegraph/sourcegraph/internal/database/dbmocks"
2224
"github.com/sourcegraph/sourcegraph/internal/diskcache"
@@ -38,42 +40,68 @@ func TestHandler(t *testing.T) {
3840
tmpDir := t.TempDir()
3941

4042
cache := diskcache.NewStore(tmpDir, "symbols", diskcache.WithBackgroundTimeout(20*time.Minute))
41-
43+
// This ensures the ctags config is initialized properly
44+
conf.MockAndNotifyWatchers(&conf.Unified{})
4245
parserFactory := func(source ctags_config.ParserType) (ctags.Parser, error) {
43-
pathToEntries := map[string][]*ctags.Entry{
44-
"a.js": {
45-
{
46-
Name: "x",
47-
Path: "a.js",
48-
Language: "JavaScript",
49-
Line: 1, // ctags line numbers are 1-based
46+
var pathToEntries map[string][]*ctags.Entry
47+
if source == ctags_config.UniversalCtags {
48+
pathToEntries = map[string][]*ctags.Entry{
49+
"a.pl": {
50+
{
51+
Name: "x",
52+
Path: "a.pl",
53+
Language: "Perl",
54+
Line: 1, // ctags line numbers are 1-based
55+
},
56+
{
57+
Name: "y",
58+
Path: "a.pl",
59+
Language: "Perl",
60+
Line: 2,
61+
},
5062
},
51-
{
52-
Name: "y",
53-
Path: "a.js",
54-
Language: "JavaScript",
55-
Line: 2,
63+
".zshrc": {
64+
{
65+
Name: "z",
66+
Path: ".zshrc",
67+
Language: "Zsh",
68+
Line: 1,
69+
},
5670
},
57-
},
58-
".zshrc": {
59-
{
60-
Name: "z",
61-
Path: ".zshrc",
62-
Language: "Zsh",
63-
Line: 1,
71+
}
72+
} else if source == ctags_config.ScipCtags {
73+
pathToEntries = map[string][]*ctags.Entry{
74+
"b.magik": {
75+
{
76+
Name: "v",
77+
Path: "b.magik",
78+
Language: "Magik",
79+
Line: 1, // ctags line numbers are 1-based
80+
},
81+
{
82+
Name: "w",
83+
Path: "b.magik",
84+
Language: "Magik",
85+
Line: 2,
86+
},
6487
},
65-
},
88+
}
89+
} else {
90+
t.Errorf("Invalid ctags type %d", source)
6691
}
92+
6793
return newMockParser(pathToEntries), nil
6894
}
69-
parserPool, err := parser.NewParserPool(parserFactory, 15, parser.DefaultParserTypes)
95+
96+
parserPool, err := parser.NewParserPool(observation.TestContextTB(t), "test", parserFactory, 15, parser.DefaultParserTypes)
7097
if err != nil {
7198
t.Fatal(err)
7299
}
73100

74101
files := map[string]string{
75-
"a.js": "var x = 1\nvar y = 2",
76-
".zshrc": "z=42",
102+
"a.pl": "$x = 1\n$y = 2",
103+
".zshrc": "z=42",
104+
"b.magik": "v << 1\nw<<2",
77105
}
78106
gitserverClient := NewMockGitserverClient()
79107
gitserverClient.FetchTarFunc.SetDefaultHook(gitserver.CreateTestFetchTarFunc(files))
@@ -94,16 +122,18 @@ func TestHandler(t *testing.T) {
94122
GRPCConnectionCache: connectionCache,
95123
}
96124

97-
x := result.Symbol{Name: "x", Path: "a.js", Language: "JavaScript", Line: 0, Character: 4}
98-
y := result.Symbol{Name: "y", Path: "a.js", Language: "JavaScript", Line: 1, Character: 4}
125+
x := result.Symbol{Name: "x", Path: "a.pl", Language: "Perl", Line: 0, Character: 1}
126+
y := result.Symbol{Name: "y", Path: "a.pl", Language: "Perl", Line: 1, Character: 1}
99127
z := result.Symbol{Name: "z", Path: ".zshrc", Language: "Zsh", Line: 0, Character: 0}
128+
v := result.Symbol{Name: "v", Path: "b.magik", Language: "Magik", Line: 0, Character: 0}
129+
w := result.Symbol{Name: "w", Path: "b.magik", Language: "Magik", Line: 1, Character: 0}
100130

101131
testCases := map[string]struct {
102132
args search.SymbolsParameters
103133
expected result.Symbols
104134
}{
105135
"simple": {
106-
args: search.SymbolsParameters{IncludePatterns: []string{"^a.js$"}, First: 10},
136+
args: search.SymbolsParameters{IncludePatterns: []string{"^a.pl$"}, First: 10},
107137
expected: []result.Symbol{x, y},
108138
},
109139
"onematch": {
@@ -127,38 +157,48 @@ func TestHandler(t *testing.T) {
127157
expected: nil,
128158
},
129159
"caseinsensitiveexactpathmatch": {
130-
args: search.SymbolsParameters{IncludePatterns: []string{"^A.js$"}, First: 10},
160+
args: search.SymbolsParameters{IncludePatterns: []string{"^A.pl$"}, First: 10},
131161
expected: []result.Symbol{x, y},
132162
},
133163
"casesensitiveexactpathmatch": {
134-
args: search.SymbolsParameters{IncludePatterns: []string{"^a.js$"}, IsCaseSensitive: true, First: 10},
164+
args: search.SymbolsParameters{IncludePatterns: []string{"^a.pl$"}, IsCaseSensitive: true, First: 10},
135165
expected: []result.Symbol{x, y},
136166
},
137167
"casesensitivenoexactpathmatch": {
138-
args: search.SymbolsParameters{IncludePatterns: []string{"^A.js$"}, IsCaseSensitive: true, First: 10},
168+
args: search.SymbolsParameters{IncludePatterns: []string{"^A.pl$"}, IsCaseSensitive: true, First: 10},
139169
expected: nil,
140170
},
141171
"exclude": {
142-
args: search.SymbolsParameters{ExcludePattern: "a.js", IsCaseSensitive: true, First: 10},
143-
expected: []result.Symbol{z},
172+
args: search.SymbolsParameters{ExcludePattern: "a.pl", IsCaseSensitive: true, First: 10},
173+
expected: []result.Symbol{z, v, w},
144174
},
145175
"include lang filters": {
146-
args: search.SymbolsParameters{Query: ".*", IncludeLangs: []string{"Javascript"}, IsCaseSensitive: true, First: 10},
176+
args: search.SymbolsParameters{Query: ".*", IncludeLangs: []string{"Perl"}, IsCaseSensitive: true, First: 10},
147177
expected: []result.Symbol{x, y},
148178
},
149179
"include lang filters with ctags conversion": {
150180
args: search.SymbolsParameters{Query: ".*", IncludeLangs: []string{"Shell"}, IsCaseSensitive: true, First: 10},
151181
expected: []result.Symbol{z},
152182
},
153183
"exclude lang filters": {
154-
args: search.SymbolsParameters{Query: ".*", ExcludeLangs: []string{"Javascript"}, IsCaseSensitive: true, First: 10},
184+
args: search.SymbolsParameters{Query: ".*", ExcludeLangs: []string{"Perl", "Magik"}, IsCaseSensitive: true, First: 10},
155185
expected: []result.Symbol{z},
156186
},
187+
"scip-ctags only language": {
188+
args: search.SymbolsParameters{Query: ".*", IncludeLangs: []string{"Magik"}, IsCaseSensitive: true, First: 10},
189+
expected: []result.Symbol{v, w},
190+
},
157191
}
158192

159193
for label, testCase := range testCases {
160194
t.Run(label, func(t *testing.T) {
161195
resultSymbols, limitHit, err := client.Search(context.Background(), testCase.args)
196+
197+
// Sort to ensure consistent comparisons
198+
sort.Slice(resultSymbols, func(i, j int) bool {
199+
return resultSymbols[i].Path < resultSymbols[j].Path
200+
})
201+
162202
if err != nil {
163203
t.Fatalf("unexpected error performing search: %s", err)
164204
}

cmd/symbols/internal/parser/observability.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@ import (
1313

1414
type operations struct {
1515
parsing prometheus.Gauge
16-
parseQueueSize prometheus.Gauge
17-
parseQueueTimeouts prometheus.Counter
1816
parseFailed prometheus.Counter
1917
parseCanceled prometheus.Counter
2018
parse *observation.Operation
@@ -29,20 +27,6 @@ func newOperations(observationCtx *observation.Context) *operations {
2927
})
3028
observationCtx.Registerer.MustRegister(parsing)
3129

32-
parseQueueSize := prometheus.NewGauge(prometheus.GaugeOpts{
33-
Namespace: "src",
34-
Name: "codeintel_symbols_parse_queue_size",
35-
Help: "The number of parse jobs enqueued.",
36-
})
37-
observationCtx.Registerer.MustRegister(parseQueueSize)
38-
39-
parseQueueTimeouts := prometheus.NewCounter(prometheus.CounterOpts{
40-
Namespace: "src",
41-
Name: "codeintel_symbols_parse_queue_timeouts_total",
42-
Help: "The total number of parse jobs that timed out while enqueued.",
43-
})
44-
observationCtx.Registerer.MustRegister(parseQueueTimeouts)
45-
4630
parseFailed := prometheus.NewCounter(prometheus.CounterOpts{
4731
Namespace: "src",
4832
Name: "codeintel_symbols_parse_failed_total",
@@ -87,8 +71,6 @@ func newOperations(observationCtx *observation.Context) *operations {
8771

8872
return &operations{
8973
parsing: parsing,
90-
parseQueueSize: parseQueueSize,
91-
parseQueueTimeouts: parseQueueTimeouts,
9274
parseFailed: parseFailed,
9375
parseCanceled: parseCanceled,
9476
parse: observationCtx.Operation(op("Parse")),

cmd/symbols/internal/parser/parser.go

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"github.com/sourcegraph/sourcegraph/internal/observation"
2020
"github.com/sourcegraph/sourcegraph/internal/search"
2121
"github.com/sourcegraph/sourcegraph/internal/search/result"
22-
"github.com/sourcegraph/sourcegraph/lib/codeintel/languages"
2322
"github.com/sourcegraph/sourcegraph/lib/errors"
2423
)
2524

@@ -33,7 +32,7 @@ type SymbolOrError struct {
3332
}
3433

3534
type parser struct {
36-
parserPool *parserPool
35+
parserPool *ParserPool
3736
repositoryFetcher fetcher.RepositoryFetcher
3837
requestBufferSize int
3938
numParserProcesses int
@@ -42,7 +41,7 @@ type parser struct {
4241

4342
func NewParser(
4443
observationCtx *observation.Context,
45-
parserPool *parserPool,
44+
parserPool *ParserPool,
4645
repositoryFetcher fetcher.RepositoryFetcher,
4746
requestBufferSize int,
4847
numParserProcesses int,
@@ -144,20 +143,20 @@ func (p *parser) handleParseRequest(
144143
}})
145144
defer endObservation(1, observation.Args{})
146145

147-
language, found := languages.GetMostLikelyLanguage(parseRequest.Path, string(parseRequest.Data))
148-
if !found {
149-
return nil
146+
parser, parserType, err := p.parserPool.GetParser(ctx, parseRequest.Path, parseRequest.Data)
147+
148+
// If the language has a parser but we cannot retrieve it, we get an error
149+
if err != nil {
150+
return err
150151
}
151152

152-
source := GetParserType(language)
153-
if ctags_config.ParserIsNoop(source) {
153+
// If we cannot determine type of ctags it means we don't support symbols for
154+
// this file type so we bail out early. This is not considered an error since
155+
// many file types may not be supported
156+
if ctags_config.ParserIsNoop(parserType) {
154157
return nil
155158
}
156159

157-
parser, err := p.parserFromPool(ctx, source)
158-
if err != nil {
159-
return err
160-
}
161160
trace.AddEvent("parser", attribute.String("event", "acquired parser from pool"))
162161

163162
defer func() {
@@ -168,7 +167,7 @@ func (p *parser) handleParseRequest(
168167
}
169168

170169
if err == nil {
171-
p.parserPool.Done(parser, source)
170+
p.parserPool.Done(parser, parserType)
172171
} else {
173172
// If we are canceled we still kill the parser just in case, but
174173
// we do not record as failure nor logspam since this is a more
@@ -182,7 +181,7 @@ func (p *parser) handleParseRequest(
182181
// Close parser and return nil to pool, indicating that the next
183182
// receiver should create a new parser
184183
parser.Close()
185-
p.parserPool.Done(nil, source)
184+
p.parserPool.Done(nil, parserType)
186185
}
187186
}()
188187

@@ -240,27 +239,6 @@ func (p *parser) handleParseRequest(
240239
return nil
241240
}
242241

243-
func (p *parser) parserFromPool(ctx context.Context, source ctags_config.ParserType) (ctags.Parser, error) {
244-
if ctags_config.ParserIsNoop(source) {
245-
return nil, errors.New("Should not pass Noop ParserType to this function")
246-
}
247-
248-
p.operations.parseQueueSize.Inc()
249-
defer p.operations.parseQueueSize.Dec()
250-
251-
parser, err := p.parserPool.Get(ctx, source)
252-
if err != nil {
253-
if err == context.DeadlineExceeded {
254-
p.operations.parseQueueTimeouts.Inc()
255-
}
256-
if err != ctx.Err() {
257-
err = errors.Wrap(err, "failed to create parser")
258-
}
259-
}
260-
261-
return parser, err
262-
}
263-
264242
func shouldPersistEntry(e *ctags.Entry) bool {
265243
if e.Name == "" {
266244
return false

0 commit comments

Comments
 (0)