PresolvedCSVFilter.py
forked from SMAT-Lab/APIMatchmaker

import os
import csv
import re
import string

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import threadpool

from Helper.common import *
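
# Filters presolved API-usage CSVs: keeps rows whose distinct method-invocation
# count falls in a fixed range, drops apps with too few surviving rows or a
# missing/non-English description, and writes a stemmed copy of each kept
# description (see PresolvedCSVFilter below).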


class PresolvedCSVFilter:
    def __init__(self, input_path, description_path, new_description_path, save_path, MD_min, MI_min, max_job):
        self.input_path = input_path
        self.save_path = save_path
        self.description_path = description_path
        self.new_description_path = new_description_path
        self.MD_min = MD_min
        self.MI_min = MI_min
        self.max_jobs = max_job
        self.english_stopwords = stopwords.words("english")

    def processone(self, file):
        filename = os.path.split(file)[-1][:-4]
        file_new = os.path.join(self.save_path, filename + ".csv")  # was the module-level save_path
        if os.path.exists(file_new):
            return
        with open(file, "r") as fr:
            reader = csv.reader(fr)
            headings = next(reader)
            fw = open(file_new, "w")
            writer = csv.writer(fw)
            writer.writerow(headings)
            for line in reader:
                line_new = []
                raw = line[1].strip('\"[] ')  # renamed: `string` shadowed the string module
                pattern = r'(<.*?>)'
                mi = re.findall(pattern, raw)
                new_mi, count = self.count_MIs(mi)  # count = len(mi) without duplicates
                if self.MI_min <= count <= 30:  # was hard-coded 15 (== MI_min in __main__)
                    line_new.append(line[0])
                    line_new.append(new_mi)
                    line_new.append(reader.line_num)
                    writer.writerow(line_new)
            fw.close()
        if row_count(file_new) < self.MD_min:  # was hard-coded 6 (== MD_min in __main__)
            os.remove(file_new)
            return
        desc_file = os.path.join(self.description_path, filename + ".txt")
        if not os.path.exists(desc_file):
            os.remove(file_new)
            return
        if not self.check_lang(desc_file):
            os.remove(file_new)
            return
        new_desc = self.normalize(desc_file)
        new_desc_path = os.path.join(self.new_description_path, filename + ".txt")
        with open(new_desc_path, "w") as fw:
            fw.write(" ".join(new_desc))
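
    # Illustrative (hypothetical) CSV cell for line[1] above:
    #   "[<android.app.Activity: void onCreate(android.os.Bundle)>, <java.io.File: boolean exists()>]"
    # The regex r'(<.*?>)' pulls out each <...> method signature non-greedily.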

    def check_lang(self, file):
        count = 0
        with open(file, "r") as fr:
            content = fr.read()
        if not content:  # guard against empty files (would divide by zero)
            return False
        for char in content:
            if ord(char) > 128:
                count += 1
        # Non-English chars / all chars < 0.1: keep; otherwise, remove.
        return 1.0 * count / len(content) < 0.1
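
    # Example: a 2000-character description with 150 characters above the ASCII
    # range gives 150 / 2000 = 0.075 < 0.1, so the description is kept.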

    # lemmatizer (kept for reference; normalize() below uses stemming instead)
    def lemmatizer(self, tokens):
        wordnet_lemmatizer = WordNetLemmatizer()
        return [wordnet_lemmatizer.lemmatize(item) for item in tokens]

    # stemming
    def stem_tokens(self, tokens):
        stemmer = PorterStemmer()
        return [stemmer.stem(item) for item in tokens]

    def judge_pure_english(self, keyword):
        return all(ord(c) < 128 for c in keyword)
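    # e.g. judge_pure_english("cafe") -> True, judge_pure_english("café") -> False (ord("é") == 233)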

    def normalize(self, file):
        """Remove punctuation, lowercase, stem."""
        with open(file, "r") as fr:
            text = fr.read()
        words_cut = word_tokenize(text)
        words_lower = [i.lower() for i in words_cut if len(i) > 3]
        words_clear = []
        for i in words_lower:
            if not self.judge_pure_english(i):
                continue
            if i not in self.english_stopwords and i not in string.punctuation:
                i1 = re.sub('[^a-zA-Z]', '', i)
                words_clear.append(i1)
        return self.stem_tokens(words_clear)
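
    # Illustrative run on a hypothetical description "This tool provides weather
    # forecasts.": tokens longer than 3 chars are lowered, the stopword "this" is
    # dropped, and Porter stemming yields roughly ["tool", "provid", "weather", "forecast"].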

    def count_MIs(self, lst):
        # Deduplicate while preserving order; count is the number of distinct items.
        new_lst = []
        count = 0
        for item in lst:
            if item not in new_lst:
                new_lst.append(item)
                count += 1
        return new_lst, count

    def start(self):
        csvs = getFileList(self.input_path, ".csv")
        self.all = len(csvs)
        print("[+] Total csvs ", self.all)
        print("[+] Saving results to " + self.save_path)
        # makeRequests hands each list item to processone as its sole argument,
        # so the csv paths can be passed directly (the old `[(apk) for apk in csvs]`
        # built the same list: `(apk)` is not a tuple).
        pool = threadpool.ThreadPool(self.max_jobs)
        requests = threadpool.makeRequests(self.processone, csvs)
        [pool.putRequest(req) for req in requests]
        pool.wait()
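
# Note: `threadpool` is an old third-party package. A minimal sketch of the same
# fan-out in start() using only the standard library (an assumption, not part of
# the original code) would be:
#
#     from concurrent.futures import ThreadPoolExecutor
#     with ThreadPoolExecutor(max_workers=self.max_jobs) as pool:
#         list(pool.map(self.processone, csvs))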


if __name__ == '__main__':
    input_path = "/home/username/APIRecommendation/Presolved/"
    description_path = "/home/username/APIRecommendation/Description_fromGP/"
    # new_description_path = "/home/username/APIRecommendation/Description_presolved/"
    new_description_path = "/home/username/APIRecommendation/Description_lemmatized/"
    save_path = "/home/username/APIRecommendation/Presolved_lemmatized/"
    check_and_mk_dir(save_path)
    check_and_mk_dir(new_description_path)
    PresolvedCSVFilter(input_path, description_path, new_description_path,
                       save_path, MD_min=6, MI_min=15, max_job=15).start()