Skip to content

Commit 9a3bcbc

Browse files
mkustermannCommit Bot
authored andcommitted
[CFE] Improve [StringCanonicalizer] implementation
The current [StringCanonicalizer] implementation has some issues: * It hangs on to large [Uint8List]/[String] objects in it's cache => This requires users (such as analyzer) to clear the cache frequently * Has api that works on dynamic input (which is assumed to be String or List<int> / Uint8List) => Call sites have typing information we loose when doing the call * It uses dynamic [] calls to compare input bytes with cached bytes / input strings with cached strings => Dynamic calls come with overhead => Will cause substring generation for every character comparison => Will compare bytes with strings (which doesn't make sense) To address these issues we * Use the canonicalized [String] to compare against instead of the (much larger) source strings, thereby no longer hanging on to large strings in the canonicalizer cache (it's still an issue with [Uint8List]s though) * Make seperate API for canonicalization of strings, sub-strings or sub-utf8-bytes and use it from the token implementation. * For canonicalization of strings use String.== (instead of char-by-char comparison) * For canonicalization of sub-strings use String.charCodeAt instead of [] (which creates substrings) * Seperate out cache node entries into two classes and reduce memory consumption of the nodes that represent strings by 16 bytes (it does an additional `is` check on lookups in the cache, but that is better than paying for dynamic calls on the payload - which causes the compiler to do implicit checks) => This CL reduces RAM consumption and makes CFE scan/scan_bytes benchmarks a little faster. TEST=ci Change-Id: I157c298d26d25ac5da82c32eedfa270a590156f0 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/255121 Commit-Queue: Martin Kustermann <[email protected]> Reviewed-by: Jens Johansen <[email protected]>
1 parent d353323 commit 9a3bcbc

File tree

3 files changed

+118
-40
lines changed

3 files changed

+118
-40
lines changed

pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart

Lines changed: 105 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,30 @@ library _fe_analyzer_shared.scanner.string_canonicalizer;
66

77
import 'dart:convert';
88

9-
class Node {
10-
dynamic /* String | List<int> */ data;
11-
int start;
12-
int end;
13-
String payload;
9+
abstract class Node {
10+
final String payload;
1411
Node? next;
15-
Node(this.data, this.start, this.end, this.payload, this.next);
12+
13+
Node(this.payload, this.next);
14+
15+
int get hash;
16+
}
17+
18+
class StringNode extends Node {
19+
StringNode(super.payload, super.next);
20+
21+
int get hash => StringCanonicalizer.hashString(payload, 0, payload.length);
22+
}
23+
24+
class Utf8Node extends Node {
25+
final List<int> data;
26+
final int start;
27+
final int end;
28+
29+
Utf8Node(this.data, this.start, this.end, String payload, Node? next)
30+
: super(payload, next);
31+
32+
int get hash => StringCanonicalizer.hashBytes(data, start, end);
1633
}
1734

1835
/// A hash table for triples:
@@ -41,7 +58,7 @@ class StringCanonicalizer {
4158
if (asciiOnly) {
4259
s = new String.fromCharCodes(data, start, end);
4360
} else {
44-
s = new Utf8Decoder(allowMalformed: true).convert(data, start, end);
61+
s = const Utf8Decoder(allowMalformed: true).convert(data, start, end);
4562
}
4663
return s;
4764
}
@@ -69,9 +86,7 @@ class StringCanonicalizer {
6986
Node? t = _nodes[i];
7087
while (t != null) {
7188
Node? n = t.next;
72-
int newIndex = t.data is String
73-
? hashString(t.data, t.start, t.end) & (newSize - 1)
74-
: hashBytes(t.data, t.start, t.end) & (newSize - 1);
89+
int newIndex = t.hash & (newSize - 1);
7590
Node? s = newNodes[newIndex];
7691
t.next = s;
7792
newNodes[newIndex] = t;
@@ -83,36 +98,98 @@ class StringCanonicalizer {
8398
}
8499

85100
String canonicalize(data, int start, int end, bool asciiOnly) {
101+
if (data is String) {
102+
if (start == 0 && (end == data.length - 1)) {
103+
return canonicalizeString(data);
104+
}
105+
return canonicalizeSubString(data, start, end);
106+
}
107+
return canonicalizeBytes(data as List<int>, start, end, asciiOnly);
108+
}
109+
110+
String canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
86111
if (_count > _size) rehash();
87-
int index = data is String
88-
? hashString(data, start, end)
89-
: hashBytes(data, start, end);
90-
index = index & (_size - 1);
112+
final int index = hashBytes(data, start, end) & (_size - 1);
91113
Node? s = _nodes[index];
92114
Node? t = s;
93115
int len = end - start;
94116
while (t != null) {
95-
if (t.end - t.start == len) {
96-
int i = start, j = t.start;
97-
while (i < end && data[i] == t.data[j]) {
98-
i++;
99-
j++;
117+
if (t is Utf8Node) {
118+
final List<int> tData = t.data;
119+
if (t.end - t.start == len) {
120+
int i = start, j = t.start;
121+
while (i < end && data[i] == tData[j]) {
122+
i++;
123+
j++;
124+
}
125+
if (i == end) {
126+
return t.payload;
127+
}
100128
}
101-
if (i == end) {
102-
return t.payload;
129+
}
130+
t = t.next;
131+
}
132+
String payload = decode(data, start, end, asciiOnly);
133+
_nodes[index] = new Utf8Node(data, start, end, payload, s);
134+
_count++;
135+
return payload;
136+
}
137+
138+
String canonicalizeSubString(String data, int start, int end) {
139+
if (_count > _size) rehash();
140+
final int index = hashString(data, start, end) & (_size - 1);
141+
Node? s = _nodes[index];
142+
Node? t = s;
143+
int len = end - start;
144+
while (t != null) {
145+
if (t is StringNode) {
146+
final String tData = t.payload;
147+
if (identical(data, tData)) return tData;
148+
if (tData.length == len) {
149+
int i = start, j = 0;
150+
while (i < end && data.codeUnitAt(i) == tData.codeUnitAt(j)) {
151+
i++;
152+
j++;
153+
}
154+
if (i == end) {
155+
return tData;
156+
}
103157
}
104158
}
105159
t = t.next;
106160
}
107-
String payload;
108-
if (data is String) {
109-
payload = data.substring(start, end);
110-
} else {
111-
payload = decode(data, start, end, asciiOnly);
161+
return insertStringNode(index, s, data.substring(start, end));
162+
}
163+
164+
String canonicalizeString(String data) {
165+
if (_count > _size) rehash();
166+
final int index = hashString(data, 0, data.length) & (_size - 1);
167+
Node? s = _nodes[index];
168+
Node? t = s;
169+
while (t != null) {
170+
if (t is StringNode) {
171+
final String tData = t.payload;
172+
if (identical(data, tData)) return tData;
173+
if (data == tData) return tData;
174+
}
175+
t = t.next;
112176
}
113-
_nodes[index] = new Node(data, start, end, payload, s);
177+
return insertStringNode(index, s, data);
178+
}
179+
180+
String insertStringNode(int index, Node? next, String value) {
181+
final StringNode newNode = new StringNode(value, next);
182+
_nodes[index] = newNode;
114183
_count++;
115-
return payload;
184+
return value;
185+
}
186+
187+
String insertUtf8Node(int index, Node? next, List<int> buffer, int start,
188+
int end, String value) {
189+
final Utf8Node newNode = new Utf8Node(buffer, start, end, value, next);
190+
_nodes[index] = newNode;
191+
_count++;
192+
return value;
116193
}
117194

118195
clear() {

pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ class StringTokenImpl extends SimpleToken implements StringToken {
4141
*/
4242
StringTokenImpl.fromString(TokenType type, String value, int charOffset,
4343
{bool canonicalize: false, CommentToken? precedingComments})
44-
: valueOrLazySubstring = canonicalizedString(
45-
value, /* start = */ 0, value.length, canonicalize),
44+
: valueOrLazySubstring =
45+
canonicalize ? canonicalizedString(value) : value,
4646
super(type, charOffset, precedingComments);
4747

4848
/**
@@ -56,7 +56,7 @@ class StringTokenImpl extends SimpleToken implements StringToken {
5656
int length = end - start;
5757
if (length <= LAZY_THRESHOLD) {
5858
valueOrLazySubstring =
59-
canonicalizedString(data, start, end, canonicalize);
59+
canonicalizedSubString(data, start, end, canonicalize);
6060
} else {
6161
valueOrLazySubstring =
6262
new _LazySubstring(data, start, length, canonicalize);
@@ -89,7 +89,7 @@ class StringTokenImpl extends SimpleToken implements StringToken {
8989
int start = valueOrLazySubstring.start;
9090
int end = start + (valueOrLazySubstring as _LazySubstring).length;
9191
if (data is String) {
92-
valueOrLazySubstring = canonicalizedString(
92+
valueOrLazySubstring = canonicalizedSubString(
9393
data, start, end, valueOrLazySubstring.boolValue);
9494
} else {
9595
valueOrLazySubstring =
@@ -107,14 +107,18 @@ class StringTokenImpl extends SimpleToken implements StringToken {
107107

108108
static final StringCanonicalizer canonicalizer = new StringCanonicalizer();
109109

110-
static String canonicalizedString(
110+
static String canonicalizedString(String s) {
111+
return canonicalizer.canonicalizeString(s);
112+
}
113+
114+
static String canonicalizedSubString(
111115
String s, int start, int end, bool canonicalize) {
112-
if (!canonicalize) return s;
113-
return canonicalizer.canonicalize(s, start, end, /* asciiOnly = */ false);
116+
if (!canonicalize) return s.substring(start, end);
117+
return canonicalizer.canonicalizeSubString(s, start, end);
114118
}
115119

116120
static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
117-
return canonicalizer.canonicalize(data, start, end, asciiOnly);
121+
return canonicalizer.canonicalizeBytes(data, start, end, asciiOnly);
118122
}
119123

120124
@override

pkg/front_end/test/static_types/cfe_allowed.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,5 @@
2020
},
2121
"pkg/front_end/lib/src/fasta/dill/dill_member_builder.dart": {
2222
"Dynamic access of 'fileUri'.": 1
23-
},
24-
"pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart": {
25-
"Dynamic invocation of '[]'.": 2
2623
}
27-
}
24+
}

0 commit comments

Comments
 (0)