-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautomatic_topic_to_domain_map.py
168 lines (130 loc) · 6.71 KB
/
automatic_topic_to_domain_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import pickle
from pandas import *
import json
import argparse
from collections import defaultdict
import numpy as np
def word_match(topword, keyword):
    """Return True iff *topword* equals *keyword*, case-insensitively.

    Earlier substring-based variants (e.g. letting 'oct' match 'october')
    were abandoned in favour of strict exact matching, which avoids false
    positives such as 'octopus' matching 'october'.

    Parameters
    ----------
    topword : str
        A top word taken from a topic model.
    keyword : str
        A domain keyword.

    Returns
    -------
    bool
        True on a case-insensitive exact match, False otherwise.  (The
        original fell off the end and implicitly returned None on a
        mismatch; an explicit False is returned now — both are falsy,
        so callers are unaffected.)

    Examples
    --------
    >>> word_match('Ai', 'ai')
    True
    >>> word_match('octopus', 'october')
    False
    """
    # Exact match regardless of word length.
    return topword.lower() == keyword.lower()
def find_first_matched_dm(topic, topwords, topword_index, dm_keywords_dict,
                          automatic_topic_to_domain_map, topword_index_list):
    """Map *topic* to the domain of its first keyword-matching top word.

    Scans *topwords* in order; the first top word that matches any keyword
    of any domain decides the mapping, and the position of that top word is
    recorded so the distribution of match positions can be analysed later.

    Parameters
    ----------
    topic : hashable
        Topic identifier; used as the key in the mapping.
    topwords : sequence of str
        Ordered top words for the topic (most relevant first).
    topword_index : int
        Starting value of the position counter. Callers pass 0, so the
        recorded positions are 1-based.
    dm_keywords_dict : dict
        Mapping of domain name -> list of (lowercase) keywords.
    automatic_topic_to_domain_map : dict
        Mutated in place: ``topic -> domain`` is inserted on a match.
    topword_index_list : list of int
        Mutated in place: the matching top word's position is appended.

    Returns
    -------
    tuple
        ``(automatic_topic_to_domain_map, topword_index_list)`` — returned
        unchanged if none of the top words matches any keyword.
    """
    for topword in topwords:
        topword_index += 1
        for dm, keywords in dm_keywords_dict.items():
            for keyword in keywords:
                if word_match(topword, keyword):
                    # First match wins: record the domain and the position
                    # of the matching top word, then stop searching.
                    automatic_topic_to_domain_map[topic] = dm
                    topword_index_list.append(topword_index)
                    return automatic_topic_to_domain_map, topword_index_list
    # Still have to return something (the same dict with no modification)
    # if no match is found among the supplied top words.
    return automatic_topic_to_domain_map, topword_index_list
# NOTE(review): dead code — an earlier module-level version of the topic
# loop kept as a triple-quoted string; superseded by the __main__ block
# below (which also limits matching to the top 10 words and threads
# topword_index_list through the call). Safe to delete.
'''
for topic, topwords in words_per_topic_dict.items():
    #automatic_topic_to_domain_map[topic] = []
    topword_index = 0
    find_first_matched_dm(topic, topwords, topword_index, dm_keywords_dict, automatic_topic_to_domain_map)
print('automatic_topic_to_domain_map:', automatic_topic_to_domain_map)
with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.json', 'w') as f:
    json.dump(automatic_topic_to_domain_map, f)
with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.json', 'r') as f:
    automatic_topic_to_domain_map = json.load(f)
with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.txt', "w") as f:
    n = f.write(str(automatic_topic_to_domain_map))
'''
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Make an automatic topic to domain map")
    parser.add_argument('--num_topics', type=int, default=50)
    flags = parser.parse_args()
    num_topics = flags.num_topics

    # Build domain -> keyword-list dict from the CSV.  Keywords are
    # lowercased once here, so word_match's comparisons are effectively
    # case-insensitive on both sides.
    data_compact = read_csv("domain_keywords_both_senates.csv")
    dm_keywords_dict = {}
    # Plain dict: the original used defaultdict() with *no* default_factory,
    # which behaves identically to {} (missing keys raise KeyError).
    automatic_topic_to_domain_map = {}
    for index, row in data_compact.iterrows():
        dm = row['Docket (domain)']
        keywords = row['Keyword(s)'].split(', ')
        keyword_list = [keyword.lower() for keyword in keywords]
        dm_keywords_dict[dm] = keyword_list
        print('dm:', dm)
        print('keywords:', keywords)
        print('dm_keywords_dict:', dm_keywords_dict)

    # NOTE(review): despite the .json extension, this file is read as a
    # pickle — presumably written elsewhere with pickle.dump; confirm
    # against the producer before renaming.
    with open('WardNJU_words_per_topic_num_topics=' + str(num_topics) + '.json', 'rb') as f:
        words_per_topic_dict = pickle.load(f)

    # Map every topic to the domain of its first keyword-matching top word,
    # keeping track of the (1-based) position of the matching word so the
    # quartile statistics below can be computed.
    topword_index_list = []
    for topic, topwords in words_per_topic_dict.items():
        topword_index = 0
        # Only consider the top 10 top words for the topic-to-domain map.
        topwords = topwords[:10]
        automatic_topic_to_domain_map, topword_index_list = find_first_matched_dm(
            topic, topwords, topword_index, dm_keywords_dict,
            automatic_topic_to_domain_map, topword_index_list)
    print('topword_index_list:', sorted(topword_index_list))
    print('len: ', len(topword_index_list))

    # Quartiles of the match positions.  Guard against the case where no
    # topic matched at all, which would make np.percentile raise on an
    # empty list.
    if topword_index_list:
        topword_index_lower_quartile = np.percentile(topword_index_list, 25, method='closest_observation')
        print('topword_index_lower_quartile:', topword_index_lower_quartile)
        topword_index_med = np.percentile(topword_index_list, 50, method='closest_observation')
        print('topword_index_med:', topword_index_med)
        topword_index_upper_quartile = np.percentile(topword_index_list, 75, method='closest_observation')
        print('topword_index_upper_quartile:', topword_index_upper_quartile)

    # Persist the map as JSON (note: json.dump stringifies non-string topic
    # keys, and the round-trip below reloads it in that stringified form)
    # and as a plain-text repr for quick inspection.
    with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.json', 'w') as f:
        json.dump(automatic_topic_to_domain_map, f)
    with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.json', 'r') as f:
        automatic_topic_to_domain_map = json.load(f)
    with open('automatic_topic_to_domain_map_num_topics=' + str(num_topics) + '.txt', "w") as f:
        f.write(str(automatic_topic_to_domain_map))
#Keep track of indices of top_word that matches a keyword for every topic
#num_topics = 10
#topword_index_list: [1, 1, 1, 2, 2, 2, 3, 5]
#len: 8
#num_topics = 50
#topword_index_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 10]
#len: 32
#num_topics = 100
#topword_index_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 10]
#len: 59
#num_topics = 200
#topword_index_list: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10]
#len: 92