-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathunkify_topical.py
More file actions
38 lines (32 loc) · 1006 Bytes
/
unkify_topical.py
File metadata and controls
38 lines (32 loc) · 1006 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pickle
import sys
import argparse
# Command-line interface: three mandatory string options, each naming a file.
parser = argparse.ArgumentParser(description='Remove topical words')
for _flag, _help in (
    ('--remove_file', 'File containing words to be removed from each class'),
    ('--train_file', 'Train file'),
    ('--write_file', 'File name to write the unkified text'),
):
    # Every option takes a string value and must be supplied.
    parser.add_argument(_flag, type=str, required=True, help=_help)
args = parser.parse_args()
# Load the mapping from lower-cased class label to the words to mask for that
# class.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# pass a --remove_file that comes from a trusted source.
with open(args.remove_file, "rb") as _remove_handle:
    removewords = pickle.load(_remove_handle)


def _unkify_text(text, rwords):
    """Return *text* with every token found in *rwords* replaced by "UNK".

    Tokens are the whitespace-separated pieces of *text*. Each output token is
    followed by a single space, so a non-empty result ends with a trailing
    space; this matches the format the script has always written.
    """
    # NOTE(review): rwords is presumably a collection of word strings (the
    # code only relies on `in`) -- confirm against the file's producer.
    return "".join("UNK " if w in rwords else w + " " for w in text.split())


# Each input line is expected to be "<text>\t<label>[\t<more fields>...]".
# The text field is rewritten; all other fields are copied through unchanged.
# Both files are handled as UTF-8 so the result does not depend on the locale,
# and the `with` block closes them even if a line fails to process.
with open(args.train_file, encoding="utf-8") as src, \
        open(args.write_file, "w", encoding="utf-8") as dst:
    for lineno, line in enumerate(src, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # example; copy them through so line numbers stay aligned.
            dst.write("\n")
            continue
        p = stripped.split("\t")
        if len(p) < 2:
            raise ValueError(
                f"{args.train_file}:{lineno}: expected at least two "
                f"tab-separated fields (text, label), got {len(p)}"
            )
        label = p[1].lower()
        try:
            rwords = removewords[label]
        except KeyError:
            raise KeyError(
                f"{args.train_file}:{lineno}: no removal list for class "
                f"{label!r} in {args.remove_file}"
            ) from None
        p[0] = _unkify_text(p[0], rwords)
        dst.write("\t".join(p) + "\n")

print("Done", len(removewords))