From e0eaa1ed7453952d5a7909edc6f5f42ccb8b098b Mon Sep 17 00:00:00 2001
From: Sergey Mironov
Date: Sat, 7 Mar 2020 18:44:31 +0300
Subject: [PATCH 1/2] Update tokenizer: unhardcode alphanumeric char set

---
 official/nlp/transformer/utils/tokenizer.py | 49 ++++++++++++++-------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/official/nlp/transformer/utils/tokenizer.py b/official/nlp/transformer/utils/tokenizer.py
index 20302266acc..b40970a0180 100644
--- a/official/nlp/transformer/utils/tokenizer.py
+++ b/official/nlp/transformer/utils/tokenizer.py
@@ -45,12 +45,15 @@
 
 _UNDEFINED_UNICODE = u"\u3013"
 
-# Set contains all letter and number characters.
-_ALPHANUMERIC_CHAR_SET = set(
+def alphanumeric_char_set():
+  return set(
     six.unichr(i) for i in xrange(sys.maxunicode)
     if (unicodedata.category(six.unichr(i)).startswith("L") or
         unicodedata.category(six.unichr(i)).startswith("N")))
 
+# Set contains all letter and number characters.
+_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
+
 # min_count is the minimum number of times a subtoken must appear in the data
 # before before it is added to the vocabulary. The value is found using binary
 # search to obtain the target vocabulary size.
@@ -61,11 +64,15 @@
 class Subtokenizer(object):
   """Encodes and decodes strings to/from integer IDs."""
 
-  def __init__(self, vocab_file, reserved_tokens=None):
+  def __init__(self, vocab_file, reserved_tokens=None,
+               master_char_set=None):
     """Initializes class, creating a vocab file if data_files is provided."""
     tf.compat.v1.logging.info("Initializing Subtokenizer from file %s."
                               % vocab_file)
 
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
+
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
 
@@ -80,11 +87,13 @@ def __init__(self, vocab_file, reserved_tokens=None):
     # Create cache to speed up subtokenization
     self._cache_size = 2 ** 20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
 
   @staticmethod
   def init_from_files(
       vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True,
+      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.
 
     Args:
@@ -105,6 +114,8 @@ def init_from_files(
     Returns:
      Subtokenizer object
    """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
 
@@ -112,7 +123,7 @@
       tf.compat.v1.logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       tf.compat.v1.logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip, master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -120,12 +131,12 @@
       tf.compat.v1.logging.info("Generated vocabulary with %d subtokens."
                                 % len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)
 
   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
       ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
@@ -161,7 +172,8 @@
           "Subtokens argument passed into decode() must be a list of integers.")
 
     return _unicode_to_native(
-        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens),
+                               self._master_char_set))
 
   def _subtoken_ids_to_tokens(self, subtokens):
     """Convert list of int subtoken ids to a list of string tokens."""
@@ -218,16 +230,16 @@ def _unicode_to_native(s):
     return s
 
 
-def _split_string_to_tokens(text):
+def _split_string_to_tokens(text, master_char_set):
   """Splits text to a list of string tokens."""
   if not text:
     return []
   ret = []
   token_start = 0
   # Classify each character in the input string
-  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  is_master = [c in master_char_set for c in text]
   for pos in xrange(1, len(text)):
-    if is_alnum[pos] != is_alnum[pos - 1]:
+    if is_master[pos] != is_master[pos - 1]:
       token = text[token_start:pos]
       if token != u" " or token_start == 0:
         ret.append(token)
@@ -237,12 +249,12 @@
   return ret
 
 
-def _join_tokens_to_string(tokens):
+def _join_tokens_to_string(tokens, master_char_set):
   """Join a list of string tokens into a single string."""
-  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  token_is_master = [t[0] in master_char_set for t in tokens]
   ret = []
   for i, token in enumerate(tokens):
-    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
       ret.append(u" ")
     ret.append(token)
   return "".join(ret)
@@ -324,7 +336,8 @@ def match(m):
   return _UNESCAPE_REGEX.sub(match, token)
 
 
-def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
+def _count_tokens(files, file_byte_limit=1e6, correct_strip=True,
+                  master_char_set=None):
   """Return token counts of words in the files.
 
   Samples file_byte_limit bytes from each file, and counts the words that appear
@@ -342,6 +355,9 @@
     Dictionary mapping tokens to the number of times they appear in the sampled
     lines from the files.
""" + if master_char_set is None: + master_char_set = _ALPHANUMERIC_CHAR_SET + token_counts = collections.defaultdict(int) for filepath in files: @@ -362,7 +378,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True): counter = 0 # Add words to token counts - for token in _split_string_to_tokens(native_to_unicode(line)): + for token in _split_string_to_tokens(native_to_unicode(line), + master_char_set): token_counts[token] += 1 return token_counts From 30579e0f53264f3accf92697f1d243848f45cc88 Mon Sep 17 00:00:00 2001 From: Sergey Mironov Date: Sat, 7 Mar 2020 18:50:12 +0300 Subject: [PATCH 2/2] Update tokenizer: do the safety check before inserting EOL --- official/nlp/transformer/utils/tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/official/nlp/transformer/utils/tokenizer.py b/official/nlp/transformer/utils/tokenizer.py index b40970a0180..a2b88047d27 100644 --- a/official/nlp/transformer/utils/tokenizer.py +++ b/official/nlp/transformer/utils/tokenizer.py @@ -140,6 +140,8 @@ def encode(self, raw_string, add_eos=False): for token in tokens: ret.extend(self._token_to_subtoken_ids(token)) if add_eos: + assert EOS in self.subtoken_list, \ + "Can't append 'EOS' because it is not in list of known subtokens." ret.append(EOS_ID) return ret