We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 34ca2c4, commit d9e55f8. Copy full SHA for d9e55f8.
tokenizers/word_tokenizer.py
@@ -1,9 +1,15 @@
1
from typing import List
2
+
3
from nltk import word_tokenize
4
5
from models.tokenizer import Tokenizer
6
7
8
class WordTokenizer(Tokenizer):
    """Tokenizer that replaces ASCII punctuation with spaces, then word-tokenizes.

    Each character listed in ``__punctuations`` is mapped to a single space
    before the text is handed to ``nltk.word_tokenize``, so punctuation acts
    purely as a separator and never appears in the returned tokens.
    """

    # Characters that are replaced by a space prior to tokenization.
    __punctuations: str = '!"#$%&\'()*+,./:;<=>?@[\\]^_-`{|}~'

    # Translation table built once at class-creation time; the punctuation set
    # is constant, so rebuilding the table on every call is wasted work.
    __punctuation_table = str.maketrans(__punctuations, ' ' * len(__punctuations))

    def tokenize(self, text: str) -> List[str]:
        """Return the tokens of ``text`` with punctuation treated as whitespace.

        Args:
            text: The raw string to tokenize.

        Returns:
            The list produced by ``word_tokenize`` on the punctuation-free text.
        """
        return word_tokenize(self.__translate_punctuations_to_space(text))

    def __translate_punctuations_to_space(self, text: str) -> str:
        """Return ``text`` with every listed punctuation character replaced by a space."""
        return text.translate(self.__punctuation_table)
0 commit comments