utils.py (forked from dmitrySorokin/cluster_docs)

from tika import parser
import re

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# pos_tag needs the 'averaged_perceptron_tagger' NLTK data and the
# lemmatizer needs 'wordnet'; install both via nltk.download(...).
_LEM = WordNetLemmatizer()


def normalize(text):
    """Lemmatize each word using its part-of-speech tag."""
    words = text.split()
    tagged_tokens = pos_tag(words)
    normalized_words = []
    for word, tag in tagged_tokens:
        # Map Penn Treebank tags to WordNet POS: 'J*' is an adjective ('a'),
        # 'R*' an adverb, 'V*' a verb; everything else defaults to noun.
        # (Checking for 'a' directly never matched, since Penn adjective
        # tags start with 'J', so adjectives were lemmatized as nouns.)
        stem_tag = tag[0].lower()
        if stem_tag == 'j':
            stem_tag = 'a'
        elif stem_tag not in ('r', 'n', 'v'):
            stem_tag = 'n'
        normalized_words.append(_LEM.lemmatize(word.lower(), stem_tag))
    # TODO merge not, merge collocations?
    return ' '.join(normalized_words)
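
# Example (assuming NLTK's default tagger): normalize('cats are running')
# tags cats/NNS, are/VBP, running/VBG, which lemmatizes to 'cat be run'.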


def cleanup(text):
    """Replace newlines with spaces and drop everything but ASCII letters and spaces."""
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text


def parse(path):
    """Extract plain text from a document with Apache Tika."""
    try:
        content = parser.from_file(path)['content']
    except Exception:
        print('cannot parse', path)
        content = None
    if content is None:
        content = ''
        print('cannot read', path)
    return content


def process(file_id, file_path, label_ids):
    """Run the full pipeline (parse -> cleanup -> normalize) on one file."""
    print('process', file_path)
    text = parse(file_path)
    print('parsed')
    text = cleanup(text)
    print('cleaned')
    text = normalize(text)
    print('normalized')
    return file_id, file_path, label_ids, text
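

# Usage sketch, not part of the original module: the file path and label ids
# below are hypothetical. Assumes tika can reach (or auto-start) its server
# and that the NLTK data noted above is installed.
if __name__ == '__main__':
    file_id, file_path, label_ids, text = process(0, 'docs/example.pdf', [1])
    print(text[:200])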