|
14 | 14 |
|
15 | 15 | #include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h" |
16 | 16 |
|
17 | | -#include <iterator> |
18 | 17 | #include <string> |
19 | 18 |
|
20 | | -#include "icu4c/source/common/unicode/appendable.h" |
21 | | -#include "icu4c/source/common/unicode/bytestream.h" |
22 | | -#include "icu4c/source/common/unicode/edits.h" |
23 | | -#include "icu4c/source/common/unicode/normalizer2.h" |
24 | | -#include "icu4c/source/common/unicode/schriter.h" |
25 | | -#include "icu4c/source/common/unicode/stringoptions.h" |
26 | | -#include "icu4c/source/common/unicode/stringpiece.h" |
27 | 19 | #include "icu4c/source/common/unicode/uchar.h" |
28 | | -#include "icu4c/source/common/unicode/ucnv.h" |
29 | | -#include "icu4c/source/common/unicode/ucnv_err.h" |
30 | 20 | #include "icu4c/source/common/unicode/umachine.h" |
31 | 21 | #include "icu4c/source/common/unicode/uniset.h" |
32 | | -#include "icu4c/source/common/unicode/unistr.h" |
33 | 22 | #include "icu4c/source/common/unicode/uset.h" |
34 | | -#include "icu4c/source/common/unicode/utf.h" |
35 | 23 | #include "icu4c/source/common/unicode/utf8.h" |
36 | 24 | #include "icu4c/source/common/unicode/utypes.h" |
37 | 25 |
|
38 | 26 | namespace tensorflow { |
39 | 27 | namespace text { |
40 | 28 |
|
| 29 | +namespace { |
| 30 | + |
| 31 | +const icu::UnicodeSet& WhiteSpaceSet() { |
| 32 | + // Returns the ICU set for the UCHAR_WHITE_SPACE binary property, viewed as a C++ icu::UnicodeSet. The lookup is expected not to fail because the data is hardcoded in the ICU library, so error_code is not inspected below. |
| 33 | + UErrorCode error_code = U_ZERO_ERROR; |
| 34 | + const USet* c_set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &error_code); |
| 35 | + // Disabled sanity check, kept for reference: assert(U_SUCCESS(error_code)); NOTE(review): if the lookup ever failed, c_set would presumably be null and the dereference below would be invalid -- confirm against the ICU documentation. |
| 36 | + const icu::UnicodeSet* set = icu::UnicodeSet::fromUSet(c_set); |
| 37 | + return *set; |
| 38 | +} |
| 39 | + |
| 40 | +} // namespace (file-local helpers) |
| 41 | + |
41 | 42 | std::string BuildWhitespaceString() { | // Returns one std::string holding every code point of WhiteSpaceSet(), appended in the set's iteration order.
42 | | - icu::UnicodeString unicode_string; |
43 | | - icu::UnicodeStringAppendable appendable_unicode_string(unicode_string); |
44 | | - // The maximum codepoint in Unicode is 0x0010FFFF. |
45 | | - for (UChar32 cp = 0; cp <= 0x0010FFFF; ++cp) { |
46 | | - if (U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp)) { |
47 | | - appendable_unicode_string.appendCodePoint(cp); |
48 | | - } |
49 | | - } |
50 | 43 | std::string str; |
51 | | - unicode_string.toUTF8String(str); |
| 44 | + char buf[U8_MAX_LENGTH]; | // Scratch buffer reused for each code point; sized with ICU's U8_MAX_LENGTH.
| 45 | + for (auto cp : WhiteSpaceSet().codePoints()) { |
| 46 | + int len = 0; | // Reset per code point so buf is always written from its start.
| 47 | + U8_APPEND_UNSAFE(buf, len, cp); | // Writes cp into buf and advances len; presumably the UTF-8 encoding with no bounds checking -- see ICU utf8.h.
| 48 | + str.append(buf, len); | // Append only the len bytes just written.
| 49 | + } |
52 | 50 | return str; |
53 | 51 | } |
54 | 52 |
|
55 | 53 | std::string BuildWhitespaceTokenizerConfig() { | // Returns a bit array packed into a std::string: bit (cp & 7) of byte (cp >> 3) is set for each code point cp in WhiteSpaceSet().
56 | | - // The maximum codepoint in Unicode is 0x0010FFFF. |
57 | | - UChar32 max_unicode_char = 0x0010FFFF; |
| 54 | + const icu::UnicodeSet& set = WhiteSpaceSet(); |
| 55 | + int range_count = set.getRangeCount(); |
| 56 | + UChar32 largest_whitespace = set.getRangeEnd(range_count - 1); | // End of the last range; assumes ranges are in ascending order and the set is non-empty -- TODO confirm against ICU docs.
58 | 57 | // The string holds the bit array, one bit per code point and eight code points per byte; it is sized so the byte containing largest_whitespace is the last one, and every byte starts as 0. |
59 | | - std::string bitset((max_unicode_char >> 3) + 1, 0); |
60 | | - auto bitdata = bitset.begin(); |
61 | | - UChar32 largest_whitespace = 0; |
62 | | - int shift = 0; |
63 | | - for (UChar32 cp = 0; cp <= max_unicode_char; ++cp, ++shift) { |
64 | | - if (shift == 8) { |
65 | | - ++bitdata; |
66 | | - shift = 0; |
67 | | - } |
68 | | - bool is_whitespace = U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp); |
69 | | - largest_whitespace = is_whitespace ? cp : largest_whitespace; |
70 | | - *bitdata |= is_whitespace << shift; |
| 58 | + std::string bitset((largest_whitespace >> 3) + 1, 0); |
| 59 | + for (auto cp : set.codePoints()) { |
| 60 | + int index = cp >> 3; | // Byte that holds this code point's bit.
| 61 | + bitset[index] |= 1 << (cp & 7); | // Low three bits of cp select the bit within that byte.
71 | 62 | } |
72 | | - return bitset.substr(0, (largest_whitespace >> 3) + 1); |
| 63 | + return bitset; |
73 | 64 | } |
74 | 65 |
|
75 | 66 | } // namespace text |
|
0 commit comments