Skip to content

Commit d9e55f8

Browse files
committed
Replace punctuation in text with whitespace before tokenizing
1 parent 34ca2c4 commit d9e55f8

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

tokenizers/word_tokenizer.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,15 @@
11
from typing import List
2+
23
from nltk import word_tokenize
34

45
from models.tokenizer import Tokenizer
56

67

78
class WordTokenizer(Tokenizer):
9+
__punctuations: str = '!"#$%&\'()*+,./:;<=>?@[\\]^_-`{|}~'
10+
811
def tokenize(self, text) -> List[str]:
9-
return word_tokenize(text)
12+
return word_tokenize(self.__translate_punctuations_to_space(text))
13+
14+
def __translate_punctuations_to_space(self, text):
15+
return text.translate(str.maketrans(self.__punctuations, ' ' * len(self.__punctuations)))

0 commit comments

Comments
 (0)