Skip to content

Commit

Permalink
Remove variation selector after space
Browse files (browse the repository at this point in the history)
  • Loading branch information
tsproisl committed Aug 2, 2019
1 parent af15d0a commit 6e8082b
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions somajo/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False

self.spaces = re.compile(r"\s+")
self.controls = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
self.stranded_variation_selector = re.compile(r" \uFE0F")
# soft hyphen (00AD), zero-width space (200B), zero-width
# non-joiner (200C), zero-width joiner (200D), Arabic letter
# mark (061C), left-to-right mark (200E), right-to-left mark
Expand Down Expand Up @@ -556,6 +557,9 @@ def _tokenize(self, paragraph):
# get rid of control characters
paragraph = self.controls.sub("", paragraph)

# get rid of isolated variation selectors
paragraph = self.stranded_variation_selector.sub("", paragraph)

# normalize whitespace
paragraph = self.spaces.sub(" ", paragraph)

Expand Down

0 comments on commit 6e8082b

Please sign in to comment.