Skip to content

Commit 105720d

Browse files
iterate over Unicode White_Space directly, rather than testing each of 1.1M code points
#Cleanup PiperOrigin-RevId: 778538165
1 parent 5564b82 commit 105720d

File tree

1 file changed

+27
-36
lines changed

1 file changed

+27
-36
lines changed

tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc

Lines changed: 27 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -14,62 +14,53 @@
1414

1515
#include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h"
1616

17-
#include <iterator>
1817
#include <string>
1918

20-
#include "icu4c/source/common/unicode/appendable.h"
21-
#include "icu4c/source/common/unicode/bytestream.h"
22-
#include "icu4c/source/common/unicode/edits.h"
23-
#include "icu4c/source/common/unicode/normalizer2.h"
24-
#include "icu4c/source/common/unicode/schriter.h"
25-
#include "icu4c/source/common/unicode/stringoptions.h"
26-
#include "icu4c/source/common/unicode/stringpiece.h"
2719
#include "icu4c/source/common/unicode/uchar.h"
28-
#include "icu4c/source/common/unicode/ucnv.h"
29-
#include "icu4c/source/common/unicode/ucnv_err.h"
3020
#include "icu4c/source/common/unicode/umachine.h"
3121
#include "icu4c/source/common/unicode/uniset.h"
32-
#include "icu4c/source/common/unicode/unistr.h"
3322
#include "icu4c/source/common/unicode/uset.h"
34-
#include "icu4c/source/common/unicode/utf.h"
3523
#include "icu4c/source/common/unicode/utf8.h"
3624
#include "icu4c/source/common/unicode/utypes.h"
3725

3826
namespace tensorflow {
3927
namespace text {
4028

29+
namespace {
30+
31+
// Returns a reference to the ICU set of code points that have the Unicode
// White_Space binary property (UCHAR_WHITE_SPACE), obtained through the C API
// call u_getBinaryPropertySet and viewed as a C++ icu::UnicodeSet.
//
// This function neither copies nor deletes the set; presumably ICU owns it and
// it stays valid for the life of the process -- confirm against the ICU docs.
const icu::UnicodeSet& WhiteSpaceSet() {
32+
// Will not fail because the data is hardcoded in the ICU library.
33+
UErrorCode error_code = U_ZERO_ERROR;
34+
const USet* c_set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &error_code);
35+
// NOTE(review): error_code is never inspected because the assertion below is
// commented out. If the call ever did fail, c_set would presumably be null
// and the dereference at the return would be undefined behavior.
// assert(U_SUCCESS(error_code));
36+
// Convert the C handle (USet*) to the corresponding C++ UnicodeSet pointer.
const icu::UnicodeSet* set = icu::UnicodeSet::fromUSet(c_set);
37+
return *set;
38+
}
39+
40+
} // namespace
41+
4142
// Builds and returns a UTF-8 encoded std::string made of the Unicode
// White_Space code points, one after another.
//
// NOTE: this is a rendered diff, not compilable source. A line that follows an
// "N-" gutter marker belongs to the removed implementation; a line that
// follows an "N+" marker belongs to its replacement; a line that follows a
// marker with two numbers is unchanged context shared by both versions.
std::string BuildWhitespaceString() {
42-
// Removed version: collected matching code points into a UnicodeString and
// converted the whole string to UTF-8 at the end.
icu::UnicodeString unicode_string;
43-
icu::UnicodeStringAppendable appendable_unicode_string(unicode_string);
44-
// The maximum codepoint in Unicode is 0x0010FFFF.
45-
// Removed version: tested every code point in [0, 0x10FFFF] individually.
for (UChar32 cp = 0; cp <= 0x0010FFFF; ++cp) {
46-
if (U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp)) {
47-
appendable_unicode_string.appendCodePoint(cp);
48-
}
49-
}
5043
std::string str;
51-
unicode_string.toUTF8String(str);
44+
// New version: scratch buffer sized to hold one UTF-8 encoded code point.
char buf[U8_MAX_LENGTH];
45+
// New version: visit only the members of the White_Space set.
for (auto cp : WhiteSpaceSet().codePoints()) {
46+
// len restarts at 0 for each code point, so buf is overwritten each time.
int len = 0;
47+
// Writes the UTF-8 bytes of cp into buf and advances len to the byte count.
U8_APPEND_UNSAFE(buf, len, cp);
48+
str.append(buf, len);
49+
}
5250
return str;
5351
}
5452

5553
// Builds and returns a bit array packed into a std::string: for each
// White_Space code point cp, bit (cp & 7) of byte (cp >> 3) is set. The
// returned string is (largest_whitespace >> 3) + 1 bytes long, i.e. it ends at
// the byte that holds the largest whitespace code point.
//
// NOTE: this is a rendered diff, not compilable source. A line that follows an
// "N-" gutter marker belongs to the removed implementation; a line that
// follows an "N+" marker belongs to its replacement; a line that follows a
// marker with two numbers is unchanged context shared by both versions.
std::string BuildWhitespaceTokenizerConfig() {
56-
// The maximum codepoint in Unicode is 0x0010FFFF.
57-
UChar32 max_unicode_char = 0x0010FFFF;
54+
const icu::UnicodeSet& set = WhiteSpaceSet();
55+
int range_count = set.getRangeCount();
56+
// New version: takes the end of the last range as the largest member.
// Assumes the set's ranges are ordered ascending -- confirm in the ICU docs.
// NOTE(review): an empty set would make this index -1; there is no guard.
UChar32 largest_whitespace = set.getRangeEnd(range_count - 1);
5857
// The string will hold our bit array
59-
// Removed version: allocated one bit for every code point up to 0x10FFFF and
// trimmed the result afterwards with substr.
std::string bitset((max_unicode_char >> 3) + 1, 0);
60-
auto bitdata = bitset.begin();
61-
UChar32 largest_whitespace = 0;
62-
int shift = 0;
63-
// Removed version: walked every code point, tracking the current byte
// (bitdata) and the bit position within it (shift).
for (UChar32 cp = 0; cp <= max_unicode_char; ++cp, ++shift) {
64-
if (shift == 8) {
65-
++bitdata;
66-
shift = 0;
67-
}
68-
bool is_whitespace = U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp);
69-
largest_whitespace = is_whitespace ? cp : largest_whitespace;
70-
*bitdata |= is_whitespace << shift;
58+
// New version: allocates only up to the byte holding largest_whitespace,
// zero-filled.
std::string bitset((largest_whitespace >> 3) + 1, 0);
59+
for (auto cp : set.codePoints()) {
60+
// Byte offset of cp within the bit array.
int index = cp >> 3;
61+
// Set the bit for cp; the low three bits of cp select the bit in the byte.
bitset[index] |= 1 << (cp & 7);
7162
}
72-
return bitset.substr(0, (largest_whitespace >> 3) + 1);
63+
return bitset;
7364
}
7465

7566
} // namespace text

0 commit comments

Comments
 (0)