text_processing.py
"""
Set of functions that process texts.
They are meant as parameters of GraphBuilder objects
"""
import string
import enchant
import nltk


def clean_stopwords(tokens):
    """
    Removes English stopwords from a list of tokens.
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))
    not_stop_words = []
    for token in tokens:
        if token not in stopwords:
            not_stop_words.append(token)
    return not_stop_words


def remove_punctuation(tokens):
    """
    Removes punctuation from a list of tokens.
    Uses the string.punctuation character set.
    """
    REMOVE_PUNCTUATION_MAPPING = dict.fromkeys(map(ord, string.punctuation))
    not_punct_words = []
    for token in tokens:
        # Careful: not enough for Unicode punctuation, e.g. Spanish or other
        # languages' punctuation symbols. Not important in this context though.
        not_punct_token = token.translate(REMOVE_PUNCTUATION_MAPPING)
        if not_punct_token != '':
            not_punct_words.append(not_punct_token)
    return not_punct_words


def remove_dictionary_words(tokens):
    """
    Removes tokens found in the en_US dictionary, keeping only words
    the dictionary does not recognize.
    """
    d = enchant.Dict("en_US")
    new_tokens = []
    for token in tokens:
        if not d.check(token):
            new_tokens.append(token)
    return new_tokens


def clean_punctuation_and_stopwords(tokens):
    """
    Removes punctuation, then stopwords, from a list of tokens.
    """
    tokens = remove_punctuation(tokens)
    tokens = clean_stopwords(tokens)
    return tokens


def only_non_dictionary_words(tokens, remove=('nt', 've', 'fox', 'huff', 'cnn')):
    """
    Removes punctuation, stopwords and dictionary words, then drops any token
    that contains one of the `remove` substrings or is not purely alphabetic.
    """
    tokens = remove_punctuation(tokens)
    tokens = clean_stopwords(tokens)
    tokens = remove_dictionary_words(tokens)

    # TODO: some particles such as 'nt' still slip through tokenization;
    # the `remove` substrings filter catches the known cases.
    def _passes(word):
        if any(p in word for p in remove):
            return False
        return word.isalpha()

    return [t for t in tokens if _passes(t)]
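

# A minimal usage sketch (an addition for illustration, not part of the original
# module). It shows how these functions are meant to be applied to tokenized
# text. It assumes the NLTK 'punkt' and 'stopwords' corpora have been downloaded
# and that the pyenchant package (with an en_US dictionary) is installed; the
# sample sentence is hypothetical, and the GraphBuilder integration mentioned in
# the module docstring is not shown here.
if __name__ == '__main__':
    sample = "Huff and CNN don't cover the quick brown fox story"
    sample_tokens = nltk.word_tokenize(sample.lower())

    # Basic cleaning: strip punctuation, then English stopwords.
    print(clean_punctuation_and_stopwords(sample_tokens))

    # Keep only tokens the en_US dictionary does not recognize and that are not
    # caught by the default `remove` substrings ('nt', 'fox', 'huff', 'cnn', ...).
    print(only_non_dictionary_words(sample_tokens))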