From 6e8082bde505b7dc2576a77d85a5ecaebd616672 Mon Sep 17 00:00:00 2001
From: Thomas Proisl
Date: Fri, 2 Aug 2019 15:31:24 +0200
Subject: [PATCH] Remove variation selector after space

---
 somajo/tokenizer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/somajo/tokenizer.py b/somajo/tokenizer.py
index fa6fc7e..1ef54aa 100644
--- a/somajo/tokenizer.py
+++ b/somajo/tokenizer.py
@@ -38,6 +38,7 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
 
         self.spaces = re.compile(r"\s+")
         self.controls = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
+        self.stranded_variation_selector = re.compile(r" \uFE0F")
         # soft hyphen (00AD), zero-width space (200B), zero-width
         # non-joiner (200C), zero-width joiner (200D), Arabic letter
         # mark (061C), left-to-right mark (200E), right-to-left mark
@@ -556,6 +557,9 @@ def _tokenize(self, paragraph):
 
         # get rid of control characters
         paragraph = self.controls.sub("", paragraph)
 
+        # get rid of isolated variation selectors
+        paragraph = self.stranded_variation_selector.sub("", paragraph)
+
         # normalize whitespace
         paragraph = self.spaces.sub(" ", paragraph)