forked from Yiangos01/ADE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreducerFeature.py
212 lines (192 loc) · 6.49 KB
/
reducerFeature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env python
from operator import itemgetter
import sys
import math
import re
# Hadoop-streaming reducer: aggregates per-tweet features into one row per
# topic.  Input lines (tab separated, sorted by topic by the Hadoop shuffle):
#   topic, cat, lang, text, user, retweeted, reDepth, reRatio, hashtags,
#   length, exla, quest, link, topicRep, neg, pos, neu, compound
# Output row (tab separated, floats with 3 decimals):
#   topic, cat, 8 averaged metrics, 5 diversity indices,
#   avg neg, avg neu, avg pos, avg compound
# The code below avoids the ``print`` statement and other version-specific
# syntax so that it runs unchanged under Python 2 and Python 3.

# Characters removed from the tokenised tweet text before splitting on commas.
STRIP_CHARS = frozenset([
    '"', "'", '[', ']', ' ', '@', ':', '_', '-',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
    '!', '.', '(', ')',
])

# Words that are not counted towards the word-diversity index.
COMMON_WORDS = frozenset([
    'a', 'for', 'what', 'like', 'me', 'you', 'we', 'do', 'have', 'had', 'did',
    'who', 'how', 'good', 'fine', 'morning', 'night', 'now', 'too',
    'i', 'you', 'if', 'of', 'it', 'the', 'to', 'on', 'this', 'with', 'is',
    'off', 'not', 'its', 'be', 'best', 'every', 'no', 'but', 'by', 'our',
    'when', 'up', 'out', 'so',
    'my', 'more', 'from', 'is', 'are', 'in', 'that', 'does', 'where', 'could',
    'us', 'just', 'can', 'thank', 'thanks',
])

FIELD_COUNT = 18
# Names of the numeric columns, in input order (columns 6..18).
NUMERIC_FIELDS = (
    'retweeted', 'reDepth', 'reRatio', 'hashtags', 'length', 'exla', 'quest',
    'link', 'topicRep', 'neg', 'pos', 'neu', 'compound',
)
# Averaged metrics, in output order (printed before the diversity indices).
AVERAGED_FIELDS = (
    'reDepth', 'reRatio', 'hashtags', 'length', 'exla', 'quest', 'link',
    'topicRep',
)
# Averaged sentiment scores, in output order (printed after the diversities).
SENTIMENT_FIELDS = ('neg', 'neu', 'pos', 'compound')

ROW_FORMAT = '%s\t%s' + '\t%.3f' * 17

# The original script never emitted a row for a topic holding a single tweet;
# that cut-off is kept here as an explicit threshold.
MIN_TWEETS_PER_TOPIC = 2


def parse_line(line):
    """Parse one mapper output line into a dict, or return None if malformed.

    A line is rejected when it does not have exactly FIELD_COUNT tab-separated
    columns or when any numeric column cannot be converted to float.  URLs are
    removed from the text and single quotes are removed from the topic.
    """
    fields = line.strip().split('\t')
    if len(fields) != FIELD_COUNT:
        return None
    try:
        numbers = [float(value) for value in fields[5:]]
    except ValueError:
        return None
    tweet = dict(zip(NUMERIC_FIELDS, numbers))
    text = re.sub(r"https\S+", "", fields[3])
    text = re.sub(r"http\S+", "", text)
    tweet['topic'] = fields[0].replace("'", "")
    tweet['cat'] = fields[1]
    tweet['lang'] = fields[2]
    tweet['text'] = text
    tweet['user'] = fields[4]
    return tweet


def tokenize(text):
    """Yield ``(token, is_hashtag)`` pairs extracted from a tweet text field.

    The text is stripped of STRIP_CHARS and split on commas.  A leading ``u``
    is dropped from every piece (presumably the unicode-literal prefix of a
    Python 2 list repr produced by the mapper -- confirm against the mapper).
    Pieces that end up empty are skipped.  Hashtags (tokens starting with
    ``#``) are lower-cased; other tokens are returned unchanged.
    """
    cleaned = ''.join(ch for ch in text if ch not in STRIP_CHARS)
    for piece in cleaned.split(','):
        if piece[:1] == 'u':
            piece = piece[1:]
        piece = piece.strip()
        if not piece:
            continue
        if piece[0] == '#':
            yield piece.lower(), True
        else:
            yield piece, False


def diversity_index(counts):
    """Return 1 plus the base-2 Shannon entropy of a ``{key: count}`` dict.

    An empty dict yields 1.0 (the baseline the original script started from).
    """
    total = float(sum(counts.values()))
    index = 1.0
    for count in counts.values():
        share = count / total
        index -= share * math.log(share, 2)
    return index


def _bump(counts, key):
    """Increment the occurrence count of *key* in *counts*."""
    counts[key] = counts.get(key, 0) + 1


class TopicStats(object):
    """Running sums and frequency tables for the tweets of a single topic."""

    def __init__(self, topic, category):
        self.topic = topic
        self.category = category
        self.tweets = 0
        self.sums = dict.fromkeys(AVERAGED_FIELDS + SENTIMENT_FIELDS, 0.0)
        self.users = {}
        self.retweeted_users = {}
        self.hashtags = {}
        self.words = {}
        self.langs = {}

    def add(self, tweet):
        """Fold one parsed tweet (see parse_line) into the statistics."""
        self.tweets += 1
        for name in AVERAGED_FIELDS + SENTIMENT_FIELDS:
            self.sums[name] += tweet[name]
        _bump(self.users, tweet['user'])
        if tweet['retweeted'] == 1:
            _bump(self.retweeted_users, tweet['user'])
        for token, is_hashtag in tokenize(tweet['text']):
            if is_hashtag:
                _bump(self.hashtags, token)
            elif token not in COMMON_WORDS:
                _bump(self.words, token)
        _bump(self.langs, tweet['lang'])

    def format_row(self):
        """Return the tab-separated output row for this topic (no newline)."""
        count = float(self.tweets)
        values = [self.sums[name] / count for name in AVERAGED_FIELDS]
        values += [
            diversity_index(table)
            for table in (self.users, self.retweeted_users, self.hashtags,
                          self.words, self.langs)
        ]
        values += [self.sums[name] / count for name in SENTIMENT_FIELDS]
        return ROW_FORMAT % tuple([self.topic, self.category] + values)


def reduce_lines(lines):
    """Yield one output row per topic from an iterable of mapper lines.

    Lines must arrive grouped by topic (Hadoop sorts the map output by key).
    Malformed lines are ignored.  Every valid tweet -- including the first one
    of each topic -- contributes to both the averages and the diversity
    tables, and each row carries the topic/category of the tweets it
    summarises.  Topics with fewer than MIN_TWEETS_PER_TOPIC tweets are
    dropped.
    """
    stats = None
    for line in lines:
        tweet = parse_line(line)
        if tweet is None:
            continue
        if stats is None or stats.topic != tweet['topic']:
            # Topic boundary: flush the finished topic, start a fresh one.
            if stats is not None and stats.tweets >= MIN_TWEETS_PER_TOPIC:
                yield stats.format_row()
            stats = TopicStats(tweet['topic'], tweet['cat'])
        stats.add(tweet)
    # Flush the last topic; nothing to do when the input was empty.
    if stats is not None and stats.tweets >= MIN_TWEETS_PER_TOPIC:
        yield stats.format_row()


def main():
    """Read mapper output from STDIN and write one row per topic to STDOUT."""
    for row in reduce_lines(sys.stdin):
        sys.stdout.write(row + '\n')


if __name__ == '__main__':
    main()