cooccurrence.py
from bs4 import BeautifulSoup
import nltk
import os
import cPickle
from collections import Counter, defaultdict
import itertools
import numpy as np
import json
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer("english")
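
# The NLTK resources used below ("words" for the dictionary filter, "punkt" for
# word_tokenize, "wordnet" for the lemmatizer) are assumed to be downloaded
# already, e.g. via nltk.download("words").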

class Cooccurrence():

    def __init__(self, savepath, stem=False):
        self.dictionary = set(nltk.corpus.words.words())
        self.savepath = savepath
        self.stem = stem
        self.wordcounts = Counter()
        self.pairs = Counter()
        self.n_sentences = 0
        self.targets = []

    def load(self):
        with open(self.savepath, 'rb') as f:
            tmp_dict = cPickle.load(f)
        self.__dict__.update(tmp_dict)

    def save(self):
        with open(self.savepath, 'wb') as f:
            cPickle.dump(self.__dict__, f, 2)

    def read2010train(self):
        self.targets = []
        for folder in ["SemEval-2010/training_data/nouns/", "SemEval-2010/training_data/verbs/"]:
            for file in os.listdir(folder):
                self.read2010file(folder+file)
                if self.stem:
                    # use the module-level stemmer (the class has no stemmer attribute)
                    self.targets.append(stemmer.stem(file.split(".")[0]))
                else:
                    self.targets.append(file.split(".")[0])
                print(file)
        self.save()
        print(str(self.n_sentences)+" processed sentences. Finished.")

    def read2010file(self, path):
        with open(path) as fp:
            soup = BeautifulSoup(fp, "html5lib")
        # each SemEval-2010 training file wraps its sentence tags in a single top-level element
        for sentence_tag in soup.body.contents[0].contents:
            self.process_sentence(sentence_tag.text)

    def process_sentence(self, sent):
        # should we encode to ascii? get errors using str(sent) ...
        # LEMMATIZE??
        # STOPWORDS?
        if self.stem:
            word_bag = [stemmer.stem(w) for w in nltk.word_tokenize(sent.lower()) if w in self.dictionary]
        else:
            word_bag = [w for w in nltk.word_tokenize(sent.lower()) if w in self.dictionary]
        # remove duplicates; sort so the word-pair keys in self.pairs have a consistent order
        word_bag = list(set(word_bag))
        word_bag.sort()
        self.wordcounts.update(word_bag)
        self.pairs.update(itertools.combinations(word_bag, 2))
        self.n_sentences += 1
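
    # Illustration of process_sentence (assumed input, all tokens taken to be in
    # the NLTK word list): for "The cat sat on the mat", word_bag becomes the
    # sorted, deduplicated tokens ['cat', 'mat', 'on', 'sat', 'the']; wordcounts
    # is bumped once per word and pairs gains every 2-combination such as
    # ('cat', 'mat').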

    def make_pmi(self, topn=20000, thresh=np.log(2), expand_thresh=None):
        self.topn = topn
        self.thresh = thresh
        if expand_thresh is None:
            self.expand_thresh = self.thresh
        else:
            self.expand_thresh = expand_thresh
        good_words, _ = zip(*self.wordcounts.most_common(self.topn))
        self.good_words = set(good_words)
        self.all_nodes = set()
        self.targets2edges = defaultdict(list)
        self.pmi_mat = {}
        lgn = np.log(self.n_sentences)
        for key, val in self.pairs.iteritems():
            if key[0] in self.good_words and key[1] in self.good_words:
                pmi = np.log(val) - np.log(self.wordcounts[key[0]]) - np.log(self.wordcounts[key[1]]) + lgn
                if pmi > self.thresh:
                    self.all_nodes.update(key)  # adds both
                    # expand a pair involving a target word into two edges via an intermediate edge node
                    if (key[0] in self.targets or key[1] in self.targets) and pmi > self.expand_thresh:
                        edgelabel = key[0]+"_"+key[1]
                        self.all_nodes.add(edgelabel)
                        if key[0] in self.targets:
                            self.targets2edges[key[0]].append(key[1])
                        if key[1] in self.targets:
                            self.targets2edges[key[1]].append(key[0])
                        self.pmi_mat[(key[0], edgelabel)] = pmi
                        self.pmi_mat[(edgelabel, key[1])] = pmi
                    # could restore else here:
                    self.pmi_mat[key] = pmi
        self.node2ix = {node: ix for ix, node in enumerate(self.all_nodes)}
        self.save()
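
    # A small illustrative check of the PMI formula above (hypothetical counts,
    # not from the corpus): with n_sentences = 1000, wordcounts["bank"] = 50,
    # wordcounts["river"] = 40, and pairs[("bank", "river")] = 10,
    #   pmi = log(10) - log(50) - log(40) + log(1000) = log(10*1000 / (50*40)) = log(5) ~ 1.61,
    # which clears the default thresh of log(2) ~ 0.69.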

    def write_adj_list(self, graph_prefix):
        with open(graph_prefix+"_adjlist", "w") as out:
            for key, val in self.pmi_mat.iteritems():
                out.write("{0} {1} {2}\n".format(self.node2ix[key[0]], self.node2ix[key[1]], val))
        with open(graph_prefix+"_labels", "w") as out:
            json.dump(self.node2ix, out)
        with open(graph_prefix+"_params", 'wb') as out:
            cPickle.dump({"good_words": self.good_words,
                          "targets": self.targets,
                          "targets2edges": self.targets2edges,
                          "n_sentences": self.n_sentences,
                          "topn": self.topn,
                          "thresh": self.thresh,
                          "expand_thresh": self.expand_thresh,
                          "savepath": self.savepath}, out, 2)

if __name__ == "__main__":
    """
    This script reads the corpus, assembles the PMI matrix, and writes information
    about the graph to disk. We make three different graphs with different cutoff
    parameters.
    """
    co = Cooccurrence("dumps/monday_unstemmed.pkl")
    co.read2010train()
    co.make_pmi(topn=20000)
    co.write_adj_list("dumps/monday_unstemmed")
    co.make_pmi(topn=20000, thresh=np.log(4), expand_thresh=np.log(6))
    co.write_adj_list('dumps/tuesday_unstemmed_BOTHhighthresh')
    co.make_pmi(topn=40000, thresh=np.log(6), expand_thresh=np.log(9))
    co.write_adj_list("dumps/tuesday_unstemmed_highN")