|
16 | 16 |
|
17 | 17 | package com.google.cloud.firestore;
|
18 | 18 |
|
| 19 | +import static java.lang.Character.isSurrogate; |
| 20 | + |
19 | 21 | import com.google.firestore.v1.MapValue;
|
20 | 22 | import com.google.firestore.v1.Value;
|
21 | 23 | import com.google.firestore.v1.Value.ValueTypeCase;
|
@@ -136,46 +138,46 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {
|
136 | 138 |
|
137 | 139 | /** Compare strings in UTF-8 encoded byte order */
|
138 | 140 | public static int compareUtf8Strings(String left, String right) {
|
139 |
| - int i = 0; |
140 |
| - while (i < left.length() && i < right.length()) { |
141 |
| - int leftCodePoint = left.codePointAt(i); |
142 |
| - int rightCodePoint = right.codePointAt(i); |
143 |
| - |
144 |
| - if (leftCodePoint != rightCodePoint) { |
145 |
| - if (leftCodePoint < 128 && rightCodePoint < 128) { |
146 |
| - // ASCII comparison |
147 |
| - return Integer.compare(leftCodePoint, rightCodePoint); |
148 |
| - } else { |
149 |
| - // UTF-8 encode the character at index i for byte comparison. |
150 |
| - ByteString leftBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(left, i)); |
151 |
| - ByteString rightBytes = ByteString.copyFromUtf8(getUtf8SafeBytes(right, i)); |
152 |
| - int comp = compareByteStrings(leftBytes, rightBytes); |
153 |
| - if (comp != 0) { |
154 |
| - return comp; |
155 |
| - } else { |
156 |
| - // EXTREMELY RARE CASE: Code points differ, but their UTF-8 byte representations are |
157 |
| - // identical. This can happen with malformed input (invalid surrogate pairs), where |
158 |
| - // Java's encoding leads to unexpected byte sequences. Meanwhile, any invalid surrogate |
159 |
| - // inputs get converted to "?" by protocol buffer while round tripping, so we almost |
160 |
| - // never receive invalid strings from backend. |
161 |
| - // Fallback to code point comparison for graceful handling. |
162 |
| - return Integer.compare(leftCodePoint, rightCodePoint); |
163 |
| - } |
164 |
| - } |
| 141 | + // noinspection StringEquality |
| 142 | + if (left == right) { |
| 143 | + return 0; |
| 144 | + } |
| 145 | + |
| 146 | + // Find the first differing character (a.k.a. "UTF-16 code unit") in the two strings and, |
| 147 | + // if found, use that character to determine the relative ordering of the two strings as a |
| 148 | + // whole. Comparing UTF-16 strings in UTF-8 byte order can be done simply and efficiently by |
| 149 | + // comparing the UTF-16 code units (chars). This serendipitously works because of the way UTF-8 |
| 150 | + // and UTF-16 happen to represent Unicode code points. |
| 151 | + // |
| 152 | + // After finding the first pair of differing characters, there are two cases: |
| 153 | + // |
| 154 | + // Case 1: Both characters are non-surrogates (code points less than or equal to 0xFFFF) or |
| 155 | + // both are surrogates from a surrogate pair (that collectively represent code points greater |
| 156 | + // than 0xFFFF). In this case their numeric order as UTF-16 code units is the same as the |
| 157 | + // lexicographical order of their corresponding UTF-8 byte sequences. A direct comparison is |
| 158 | + // sufficient. |
| 159 | + // |
| 160 | + // Case 2: One character is a surrogate and the other is not. In this case the surrogate- |
| 161 | + // containing string is always ordered after the non-surrogate. This is because surrogates are |
| 162 | + // used to represent code points greater than 0xFFFF which have 4-byte UTF-8 representations |
| 163 | + // and are lexicographically greater than the 1, 2, or 3-byte representations of code points |
| 164 | + // less than or equal to 0xFFFF. |
| 165 | + final int length = Math.min(left.length(), right.length()); |
| 166 | + for (int i = 0; i < length; i++) { |
| 167 | + final char leftChar = left.charAt(i); |
| 168 | + final char rightChar = right.charAt(i); |
| 169 | + if (leftChar != rightChar) { |
| 170 | + return (isSurrogate(leftChar) == isSurrogate(rightChar)) |
| 171 | + ? Character.compare(leftChar, rightChar) |
| 172 | + : isSurrogate(leftChar) ? 1 : -1; |
165 | 173 | }
|
166 |
| - // Increment by 2 for surrogate pairs, 1 otherwise. |
167 |
| - i += Character.charCount(leftCodePoint); |
168 | 174 | }
|
169 | 175 |
|
170 |
| - // Compare lengths if all characters are equal |
| 176 | + // Use the lengths of the strings to determine the overall comparison result since either the |
| 177 | + // strings were equal or one is a prefix of the other. |
171 | 178 | return Integer.compare(left.length(), right.length());
|
172 | 179 | }
|
173 | 180 |
|
174 |
| - private static String getUtf8SafeBytes(String str, int index) { |
175 |
| - int firstCodePoint = str.codePointAt(index); |
176 |
| - return str.substring(index, index + Character.charCount(firstCodePoint)); |
177 |
| - } |
178 |
| - |
179 | 181 | private int compareBlobs(Value left, Value right) {
|
180 | 182 | ByteString leftBytes = left.getBytesValue();
|
181 | 183 | ByteString rightBytes = right.getBytesValue();
|
|
0 commit comments