Replace punctuation in text with whitespace before tokenizing

soheilrt · soheilrt · commit d9e55f8937bf · 2021-12-11T00:04:49.000+03:30
diff --git a/tokenizers/word_tokenizer.py b/tokenizers/word_tokenizer.py
@@ -1,9 +1,15 @@
 from typing import List
+
 from nltk import word_tokenize
 
 from models.tokenizer import Tokenizer
 
 
 class WordTokenizer(Tokenizer):
+    __punctuations: str = '!"#$%&\'()*+,./:;<=>?@[\\]^_-`{|}~'
+
     def tokenize(self, text) -> List[str]:
-        return word_tokenize(text)
+        return word_tokenize(self.__translate_punctuations_to_space(text))
+
+    def __translate_punctuations_to_space(self, text):
+        return text.translate(str.maketrans(self.__punctuations, ' ' * len(self.__punctuations)))