[CFE] Improve [StringCanonicalizer] implementation

mkustermann · Commit Bot · commit 9a3bcbcdb0b3 · 2022-08-15T17:06:58.000Z
The current [StringCanonicalizer] implementation has some issues: * It hangs on to large [Uint8List]/[String] objects in it's cache => This requires users (such as analyzer) to clear the cache frequently * Has api that works on dynamic input (which is assumed to be String or List<int> / Uint8List) => Call sites have typing information we loose when doing the call * It uses dynamic [] calls to compare input bytes with cached bytes / input strings with cached strings => Dynamic calls come with overhead => Will cause substring generation for every character comparison => Will compare bytes with strings (which doesn't make sense) To address these issues we * Use the canonicalized [String] to compare against instead of the (much larger) source strings, thereby no longer hanging on to large strings in the canonicalizer cache (it's still an issue with [Uint8List]s though) * Make seperate API for canonicalization of strings, sub-strings or sub-utf8-bytes and use it from the token implementation. * For canonicalization of strings use String.== (instead of char-by-char comparison) * For canonicalization of sub-strings use String.charCodeAt instead of [] (which creates substrings) * Seperate out cache node entries into two classes and reduce memory consumption of the nodes that represent strings by 16 bytes (it does an additional `is` check on lookups in the cache, but that is better than paying for dynamic calls on the payload - which causes the compiler to do implicit checks) => This CL reduces RAM consumption and makes CFE scan/scan_bytes benchmarks a little faster. TEST=ci Change-Id: I157c298d26d25ac5da82c32eedfa270a590156f0 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/255121 Commit-Queue: Martin Kustermann <kustermann@google.com> Reviewed-by: Jens Johansen <jensj@google.com>
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart
@@ -6,13 +6,30 @@ library _fe_analyzer_shared.scanner.string_canonicalizer;
 
 import 'dart:convert';
 
-class Node {
-  dynamic /* String | List<int> */ data;
-  int start;
-  int end;
-  String payload;
+abstract class Node {
+  final String payload;
   Node? next;
-  Node(this.data, this.start, this.end, this.payload, this.next);
+
+  Node(this.payload, this.next);
+
+  int get hash;
+}
+
+class StringNode extends Node {
+  StringNode(super.payload, super.next);
+
+  int get hash => StringCanonicalizer.hashString(payload, 0, payload.length);
+}
+
+class Utf8Node extends Node {
+  final List<int> data;
+  final int start;
+  final int end;
+
+  Utf8Node(this.data, this.start, this.end, String payload, Node? next)
+      : super(payload, next);
+
+  int get hash => StringCanonicalizer.hashBytes(data, start, end);
 }
 
 /// A hash table for triples:
@@ -41,7 +58,7 @@ class StringCanonicalizer {
     if (asciiOnly) {
       s = new String.fromCharCodes(data, start, end);
     } else {
-      s = new Utf8Decoder(allowMalformed: true).convert(data, start, end);
+      s = const Utf8Decoder(allowMalformed: true).convert(data, start, end);
     }
     return s;
   }
@@ -69,9 +86,7 @@ class StringCanonicalizer {
       Node? t = _nodes[i];
       while (t != null) {
         Node? n = t.next;
-        int newIndex = t.data is String
-            ? hashString(t.data, t.start, t.end) & (newSize - 1)
-            : hashBytes(t.data, t.start, t.end) & (newSize - 1);
+        int newIndex = t.hash & (newSize - 1);
         Node? s = newNodes[newIndex];
         t.next = s;
         newNodes[newIndex] = t;
@@ -83,36 +98,98 @@ class StringCanonicalizer {
   }
 
   String canonicalize(data, int start, int end, bool asciiOnly) {
+    if (data is String) {
+      if (start == 0 && (end == data.length - 1)) {
+        return canonicalizeString(data);
+      }
+      return canonicalizeSubString(data, start, end);
+    }
+    return canonicalizeBytes(data as List<int>, start, end, asciiOnly);
+  }
+
+  String canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
     if (_count > _size) rehash();
-    int index = data is String
-        ? hashString(data, start, end)
-        : hashBytes(data, start, end);
-    index = index & (_size - 1);
+    final int index = hashBytes(data, start, end) & (_size - 1);
     Node? s = _nodes[index];
     Node? t = s;
     int len = end - start;
     while (t != null) {
-      if (t.end - t.start == len) {
-        int i = start, j = t.start;
-        while (i < end && data[i] == t.data[j]) {
-          i++;
-          j++;
+      if (t is Utf8Node) {
+        final List<int> tData = t.data;
+        if (t.end - t.start == len) {
+          int i = start, j = t.start;
+          while (i < end && data[i] == tData[j]) {
+            i++;
+            j++;
+          }
+          if (i == end) {
+            return t.payload;
+          }
         }
-        if (i == end) {
-          return t.payload;
+      }
+      t = t.next;
+    }
+    String payload = decode(data, start, end, asciiOnly);
+    _nodes[index] = new Utf8Node(data, start, end, payload, s);
+    _count++;
+    return payload;
+  }
+
+  String canonicalizeSubString(String data, int start, int end) {
+    if (_count > _size) rehash();
+    final int index = hashString(data, start, end) & (_size - 1);
+    Node? s = _nodes[index];
+    Node? t = s;
+    int len = end - start;
+    while (t != null) {
+      if (t is StringNode) {
+        final String tData = t.payload;
+        if (identical(data, tData)) return tData;
+        if (tData.length == len) {
+          int i = start, j = 0;
+          while (i < end && data.codeUnitAt(i) == tData.codeUnitAt(j)) {
+            i++;
+            j++;
+          }
+          if (i == end) {
+            return tData;
+          }
         }
       }
       t = t.next;
     }
-    String payload;
-    if (data is String) {
-      payload = data.substring(start, end);
-    } else {
-      payload = decode(data, start, end, asciiOnly);
+    return insertStringNode(index, s, data.substring(start, end));
+  }
+
+  String canonicalizeString(String data) {
+    if (_count > _size) rehash();
+    final int index = hashString(data, 0, data.length) & (_size - 1);
+    Node? s = _nodes[index];
+    Node? t = s;
+    while (t != null) {
+      if (t is StringNode) {
+        final String tData = t.payload;
+        if (identical(data, tData)) return tData;
+        if (data == tData) return tData;
+      }
+      t = t.next;
     }
-    _nodes[index] = new Node(data, start, end, payload, s);
+    return insertStringNode(index, s, data);
+  }
+
+  String insertStringNode(int index, Node? next, String value) {
+    final StringNode newNode = new StringNode(value, next);
+    _nodes[index] = newNode;
     _count++;
-    return payload;
+    return value;
+  }
+
+  String insertUtf8Node(int index, Node? next, List<int> buffer, int start,
+      int end, String value) {
+    final Utf8Node newNode = new Utf8Node(buffer, start, end, value, next);
+    _nodes[index] = newNode;
+    _count++;
+    return value;
   }
 
   clear() {
diff --git a/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart b/pkg/_fe_analyzer_shared/lib/src/scanner/token_impl.dart
@@ -41,8 +41,8 @@ class StringTokenImpl extends SimpleToken implements StringToken {
    */
   StringTokenImpl.fromString(TokenType type, String value, int charOffset,
       {bool canonicalize: false, CommentToken? precedingComments})
-      : valueOrLazySubstring = canonicalizedString(
-            value, /* start = */ 0, value.length, canonicalize),
+      : valueOrLazySubstring =
+            canonicalize ? canonicalizedString(value) : value,
         super(type, charOffset, precedingComments);
 
   /**
@@ -56,7 +56,7 @@ class StringTokenImpl extends SimpleToken implements StringToken {
     int length = end - start;
     if (length <= LAZY_THRESHOLD) {
       valueOrLazySubstring =
-          canonicalizedString(data, start, end, canonicalize);
+          canonicalizedSubString(data, start, end, canonicalize);
     } else {
       valueOrLazySubstring =
           new _LazySubstring(data, start, length, canonicalize);
@@ -89,7 +89,7 @@ class StringTokenImpl extends SimpleToken implements StringToken {
       int start = valueOrLazySubstring.start;
       int end = start + (valueOrLazySubstring as _LazySubstring).length;
       if (data is String) {
-        valueOrLazySubstring = canonicalizedString(
+        valueOrLazySubstring = canonicalizedSubString(
             data, start, end, valueOrLazySubstring.boolValue);
       } else {
         valueOrLazySubstring =
@@ -107,14 +107,18 @@ class StringTokenImpl extends SimpleToken implements StringToken {
 
   static final StringCanonicalizer canonicalizer = new StringCanonicalizer();
 
-  static String canonicalizedString(
+  static String canonicalizedString(String s) {
+    return canonicalizer.canonicalizeString(s);
+  }
+
+  static String canonicalizedSubString(
       String s, int start, int end, bool canonicalize) {
-    if (!canonicalize) return s;
-    return canonicalizer.canonicalize(s, start, end, /* asciiOnly = */ false);
+    if (!canonicalize) return s.substring(start, end);
+    return canonicalizer.canonicalizeSubString(s, start, end);
   }
 
   static String decodeUtf8(List<int> data, int start, int end, bool asciiOnly) {
-    return canonicalizer.canonicalize(data, start, end, asciiOnly);
+    return canonicalizer.canonicalizeBytes(data, start, end, asciiOnly);
   }
 
   @override
diff --git a/pkg/front_end/test/static_types/cfe_allowed.json b/pkg/front_end/test/static_types/cfe_allowed.json
@@ -20,8 +20,5 @@
   },
   "pkg/front_end/lib/src/fasta/dill/dill_member_builder.dart": {
     "Dynamic access of 'fileUri'.": 1
-  },
-  "pkg/_fe_analyzer_shared/lib/src/scanner/string_canonicalizer.dart": {
-    "Dynamic invocation of '[]'.": 2
   }
-}
+}