iterate over Unicode White_Space directly, rather than testing each of 1.1M code points

tf-text-github-robot · tf-text-github-robot · commit 105720db46a4 · 2025-07-02T09:38:36.000-07:00
#Cleanup

PiperOrigin-RevId: 778538165
diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.cc
@@ -14,62 +14,53 @@
 
 #include "tensorflow_text/core/kernels/whitespace_tokenizer_config_builder.h"
 
-#include <iterator>
 #include <string>
 
-#include "icu4c/source/common/unicode/appendable.h"
-#include "icu4c/source/common/unicode/bytestream.h"
-#include "icu4c/source/common/unicode/edits.h"
-#include "icu4c/source/common/unicode/normalizer2.h"
-#include "icu4c/source/common/unicode/schriter.h"
-#include "icu4c/source/common/unicode/stringoptions.h"
-#include "icu4c/source/common/unicode/stringpiece.h"
 #include "icu4c/source/common/unicode/uchar.h"
-#include "icu4c/source/common/unicode/ucnv.h"
-#include "icu4c/source/common/unicode/ucnv_err.h"
 #include "icu4c/source/common/unicode/umachine.h"
 #include "icu4c/source/common/unicode/uniset.h"
-#include "icu4c/source/common/unicode/unistr.h"
 #include "icu4c/source/common/unicode/uset.h"
-#include "icu4c/source/common/unicode/utf.h"
 #include "icu4c/source/common/unicode/utf8.h"
 #include "icu4c/source/common/unicode/utypes.h"
 
 namespace tensorflow {
 namespace text {
 
+namespace {
+
+const icu::UnicodeSet& WhiteSpaceSet() {  // ICU's White_Space property set.
+  // Will not fail (data is hardcoded in ICU), so error_code is not checked.
+  UErrorCode error_code = U_ZERO_ERROR;
+  const USet* c_set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &error_code);
+  // assert(U_SUCCESS(error_code));
+  const icu::UnicodeSet* set = icu::UnicodeSet::fromUSet(c_set);
+  return *set;  // Dereferenced without a null check.
+}
+
+}  // namespace
+
 std::string BuildWhitespaceString() {
-  icu::UnicodeString unicode_string;
-  icu::UnicodeStringAppendable appendable_unicode_string(unicode_string);
-  // The maximum codepoint in Unicode is 0x0010FFFF.
-  for (UChar32 cp = 0; cp <= 0x0010FFFF; ++cp) {
-    if (U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp)) {
-      appendable_unicode_string.appendCodePoint(cp);
-    }
-  }
   std::string str;
-  unicode_string.toUTF8String(str);
+  char buf[U8_MAX_LENGTH];  // Scratch space for one code point's UTF-8 bytes.
+  for (auto cp : WhiteSpaceSet().codePoints()) {
+    int len = 0;  // Reset per code point; counts bytes written to buf.
+    U8_APPEND_UNSAFE(buf, len, cp);  // Encodes cp into buf and advances len.
+    str.append(buf, len);
+  }
   return str;
 }
 
 std::string BuildWhitespaceTokenizerConfig() {
-  // The maximum codepoint in Unicode is 0x0010FFFF.
-  UChar32 max_unicode_char = 0x0010FFFF;
+  const icu::UnicodeSet& set = WhiteSpaceSet();
+  int range_count = set.getRangeCount();  // Used to index the last range.
+  UChar32 largest_whitespace = set.getRangeEnd(range_count - 1);
   // The string will hold our bit array
-  std::string bitset((max_unicode_char >> 3) + 1, 0);
-  auto bitdata = bitset.begin();
-  UChar32 largest_whitespace = 0;
-  int shift = 0;
-  for (UChar32 cp = 0; cp <= max_unicode_char; ++cp, ++shift) {
-    if (shift == 8) {
-      ++bitdata;
-      shift = 0;
-    }
-    bool is_whitespace = U_IS_UNICODE_CHAR(cp) && u_isUWhiteSpace(cp);
-    largest_whitespace = is_whitespace ? cp : largest_whitespace;
-    *bitdata |= is_whitespace << shift;
+  std::string bitset((largest_whitespace >> 3) + 1, 0);  // 8 cps per byte.
+  for (auto cp : set.codePoints()) {
+    int index = cp >> 3;  // Byte that holds this code point's bit.
+    bitset[index] |= 1 << (cp & 7);  // Low 3 bits of cp select the bit.
   }
-  return bitset.substr(0, (largest_whitespace >> 3) + 1);
+  return bitset;
 }
 
 }  // namespace text