Skip to content

Commit ab717da

Browse files
committed
fix: Improve performance of the UTF-8 string comparison logic.
The semantics of this logic were originally fixed by #1967, but this fix caused a material performance degradation, which was then improved by #2021. The performance was, however, still suboptimal, and this PR further improves the speed back to close to its original speed and, serendipitously, simplifies the algorithm too. This commit effectively ports the following two PRs from the firebase-android-sdk repository: - firebase/firebase-android-sdk#7098 - firebase/firebase-android-sdk#7109
1 parent 54ce2c7 commit ab717da

File tree

2 files changed

+51
-36
lines changed
  • google-cloud-firestore/src

2 files changed

+51
-36
lines changed

google-cloud-firestore/src/main/java/com/google/cloud/firestore/Order.java

Lines changed: 36 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
package com.google.cloud.firestore;
1818

19+
import static java.lang.Character.isSurrogate;
20+
1921
import com.google.firestore.v1.MapValue;
2022
import com.google.firestore.v1.Value;
2123
import com.google.firestore.v1.Value.ValueTypeCase;
@@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {
136138

137139
/** Compare strings in UTF-8 encoded byte order */
138140
public static int compareUtf8Strings(String left, String right) {
139-
int i = 0;
140-
while (i < left.length() && i < right.length()) {
141-
int leftCodePoint = left.codePointAt(i);
142-
int rightCodePoint = right.codePointAt(i);
143-
144-
if (leftCodePoint != rightCodePoint) {
145-
if (leftCodePoint < 128 && rightCodePoint < 128) {
146-
// ASCII comparison
147-
return Integer.compare(leftCodePoint, rightCodePoint);
148-
} else {
149-
// UTF-8 encode the character at index i for byte comparison.
150-
ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i));
151-
ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i));
152-
int comp = compareByteStrings(leftBytes, rightBytes);
153-
if (comp != 0) {
154-
return comp;
155-
} else {
156-
// EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are
157-
// identical. This can happen with malformed input (invalid surrogate pairs), where
158-
// Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate
159-
// inputs get converted to "?" by protocol buffer while round tripping, so we almost
160-
// never receive invalid strings from backend.
161-
// Fallback to code point comparison for graceful handling.
162-
return Integer.compare(leftCodePoint, rightCodePoint);
163-
}
164-
}
141+
// noinspection StringEquality
142+
if (left == right) {
143+
return 0;
144+
}
145+
146+
// Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and,
147+
// if found, use that character to determine the relative ordering of the two strings as a
148+
// whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by
149+
// comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8
150+
// and UTF-16 happen to represent Unicode code points.
151+
//
152+
// After finding the first pair of differing characters, there are two cases:
153+
//
154+
// Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or
155+
// both are surrogates from a surrogate pair (that collectively represent code points greater
156+
// than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the
157+
// lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is
158+
// sufficient.
159+
//
160+
// Case 2: One character is a surrogate and the other is not. In this case the surrogate-
161+
// containing string is always ordered after the non-surrogate. This is because surrogates are
162+
// used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations
163+
// and are lexicographically greater than the 1, 2, or 3-byte representations of code points
164+
// less than or equal to 0xFFFF.
165+
final int length = Math.min(left.length(), right.length());
166+
for (int i = 0; i < length; i++) {
167+
final char leftChar = left.charAt(i);
168+
final char rightChar = right.charAt(i);
169+
if (leftChar != rightChar) {
170+
return (isSurrogate(leftChar) == isSurrogate(rightChar))
171+
? Character.compare(leftChar, rightChar)
172+
: isSurrogate(leftChar) ? 1 : -1;
165173
}
166-
// Increment by 2 for surrogate pairs, 1 otherwise.
167-
i += Character.charCount(leftCodePoint);
168174
}
169175

170-
// Compare lengths if all characters are equal
176+
// Use the lengths of the strings to determine the overall comparison result since either the
177+
// strings were equal or one is a prefix of the other.
171178
return Integer.compare(left.length(), right.length());
172179
}
173180

174-
private static String getUtf8SafeBytes(String str, int index) {
175-
int firstCodePoint = str.codePointAt(index);
176-
return str.substring(index, index + Character.charCount(firstCodePoint));
177-
}
178-
179181
private int compareBlobs(Value left, Value right) {
180182
ByteString leftBytes = left.getBytesValue();
181183
ByteString rightBytes = right.getBytesValue();

google-cloud-firestore/src/test/java/com/google/cloud/firestore/OrderTest.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
218218
ByteString b2 = ByteString.copyFromUtf8(s2);
219219
int expected = compareByteStrings(b1, b2);
220220

221-
if (actual == expected) {
221+
if (Integer.signum(actual) == Integer.signum(expected)) {
222222
passCount++;
223223
} else {
224224
errors.add(
@@ -227,9 +227,12 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
227227
+ "\", s2=\""
228228
+ s2
229229
+ "\") returned "
230+
+ signName(actual)
231+
+ " ("
230232
+ actual
233+
+ ")"
231234
+ ", but expected "
232-
+ expected
235+
+ signName(expected)
233236
+ " (i="
234237
+ i
235238
+ ", s1.length="
@@ -252,6 +255,16 @@ public void compareUtf8StringsShouldReturnCorrectValue() {
252255
}
253256
}
254257

258+
private static String signName(int value) {
259+
if (value < 0) {
260+
return "a negative value";
261+
} else if (value > 0) {
262+
return "a positive value";
263+
} else {
264+
return "0";
265+
}
266+
}
267+
255268
private static class StringPairGenerator {
256269

257270
private final StringGenerator stringGenerator;

0 commit comments

Comments
 (0)