Skip to content

Commit 07ed83e

Browse files
committed
style(black): format twitter acquisition
1 parent b9ceb40 commit 07ed83e

File tree

3 files changed

+329
-233
lines changed

3 files changed

+329
-233
lines changed

src/acquisition/twtr/healthtweets.py

Lines changed: 215 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
'''
1+
"""
22
===============
33
=== Purpose ===
44
===============
@@ -20,7 +20,7 @@
2020
* Fetching daily values instead of weekly values
2121
2015-03-??
2222
* Original version
23-
'''
23+
"""
2424

2525
# standard library
2626
import argparse
@@ -36,132 +36,220 @@
3636

3737
class HealthTweets:
    """Scraper for daily influenza tweet counts published by healthtweets.org.

    An instance logs in on construction and keeps the authenticated
    `requests.Session` in `self.session`. All HTTP traffic goes through
    `_go`, which prints diagnostics when `self.debug` is true.
    """

    # mapping from state abbreviations to location codes used by healthtweets.org
    STATE_CODES = {
        "AL": 3024,
        "AK": 3025,
        "AZ": 3026,
        "AR": 3027,
        "CA": 440,
        "CO": 3029,
        "CT": 3030,
        "DE": 3031,
        "DC": 3032,
        "FL": 3033,
        "GA": 3034,
        "HI": 3035,
        "ID": 3036,
        "IL": 3037,
        "IN": 3038,
        "IA": 3039,
        "KS": 3040,
        "KY": 3041,
        "LA": 2183,
        "ME": 3043,
        "MD": 3044,
        "MA": 450,
        "MI": 3046,
        "MN": 3047,
        "MS": 3048,
        "MO": 3049,
        "MT": 3050,
        "NE": 3051,
        "NV": 3052,
        "NH": 3053,
        "NJ": 478,
        "NM": 2225,
        "NY": 631,
        "NC": 3057,
        "ND": 3058,
        "OH": 3059,
        "OK": 3060,
        "OR": 281,
        "PA": 3062,
        "RI": 3063,
        "SC": 3064,
        "SD": 3065,
        "TN": 3066,
        "TX": 3067,
        "UT": 2272,
        "VT": 3069,
        "VA": 3070,
        "WA": 3071,
        "WV": 3072,
        "WI": 3073,
        "WY": 3074,
    }

    # endpoints used by this client
    _LOGIN_URL = "https://www.healthtweets.org/accounts/login"
    # single source of truth for the plot query string; the from/to values
    # must already be URL-encoded (slashes written as %2F)
    _PLOT_URL_TEMPLATE = (
        "https://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d"
        "&from=%s&to=%s&plot1_disease=65&location_plot1=%d"
    )

    # javascript prefixes that mark the lines of interest in the plot page
    _CHART_DATA_PREFIX = "var chartData = "
    _PLOT_NAMES_PREFIX = 'var plotNames = ["Influenza ('

    def __init__(self, username, password, debug=False):
        """Open a session and log in to healthtweets.org.

        username: healthtweets.org username
        password: healthtweets.org password
        debug: when true, print diagnostics for every request
        raises Exception('login failed') when the login response is not a
          200 or contains the site's bad-credentials message
        """
        self.debug = debug
        self.session = requests.Session()
        # spoof a web browser
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            }
        )
        # get the login token
        response = self._go(self._LOGIN_URL)
        token = self._get_token(response.text)
        if self.debug:
            print(f"token={token}")
        data = {
            "csrfmiddlewaretoken": token,
            "username": username,
            "password": password,
            "next": "/",
        }
        # login to the site
        response = self._go(self._LOGIN_URL, data=data)
        if response.status_code != 200 or "Your username and password" in response.text:
            raise Exception("login failed")

    def get_values(self, state, date1, date2):
        """
        state: two-letter state abbreviation (see STATE_CODES)
        date1: the first date in the range, inclusive (format: YYYY-MM-DD)
        date2: the last date in the range, inclusive (format: YYYY-MM-DD)
        returns a dictionary (by date) of number of flu tweets (num) and total tweets (total)

        Dates whose normalized value is zero or missing are omitted, because
        the total cannot be derived for them.
        """
        # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets)
        raw_values = self._get_values(state, date1, date2, False)
        normalized_values = self._get_values(state, date1, date2, True)
        values = {}
        # save the raw number and calculate the total
        for date, raw in raw_values.items():
            normalized = normalized_values.get(date)
            if not normalized:
                # zero would divide by zero; a missing date has no percentage at all
                continue
            values[date] = {
                "num": round(raw),
                "total": round(100 * raw / normalized),
            }
            if self.debug:
                print(date, raw, normalized)
        return values

    def _get_values(self, state, date1, date2, normalized):
        """Fetch one daily series for a state from the plot page.

        state: two-letter state abbreviation (see STATE_CODES)
        date1, date2: range endpoints (format: YYYY-MM-DD)
        normalized: request the 'normalized' series when true, else 'raw'
        returns a dictionary mapping YYYY-MM-DD to a float
        raises Exception on an unknown state, a non-200 response, or when the
          page does not contain exactly one chart data line
        """
        state_code = self._state_code(state)
        d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d")
        # the site takes MM/DD/YYYY with the slashes URL-encoded as %2F
        s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y")
        count_type = "normalized" if normalized else "raw"
        # NOTE(review): dayNum is the plain day difference, not an inclusive
        # count -- kept exactly as before; confirm against the site if it matters
        url = self._PLOT_URL_TEMPLATE % (count_type, (d2 - d1).days, s1, s2, state_code)
        response = self._go(url)
        if response.status_code != 200:
            raise Exception(
                f"plot status is {response.status_code} (when was data last updated?)"
            )
        data_line = self._lines_with_prefix(response.text, self._CHART_DATA_PREFIX)
        if len(data_line) != 1:
            raise Exception("lookup failed")
        # drop the prefix and the final character (presumably the trailing
        # semicolon of the javascript statement) to leave a JSON array
        values = json.loads(data_line[0][len(self._CHART_DATA_PREFIX) : -1])
        return {
            datetime.strptime(v[0], "%m/%d/%Y").strftime("%Y-%m-%d"): float(v[1]) for v in values
        }

    def check_state(self, state):
        """
        Sanity checks state code mapping.
        state: two-letter state abbreviation (see STATE_CODES)
        returns the full state name associated with the state abbreviation
        raises Exception on an unknown state or when the plot page has no
          plot-name line
        """
        state_code = self._state_code(state)
        # any fixed one-week window will do; only the plot name is read
        url = self._PLOT_URL_TEMPLATE % (
            "normalized",
            7,
            "01%2F01%2F2015",
            "01%2F07%2F2015",
            state_code,
        )
        response = self._go(url)
        data_line = self._lines_with_prefix(response.text, self._PLOT_NAMES_PREFIX)
        if not data_line:
            raise Exception("check failed")
        # the name is whatever follows the prefix, up to the next parenthesis
        name = data_line[0][len(self._PLOT_NAMES_PREFIX) :]
        name = name.split("(")[0]
        return name.strip()

    @staticmethod
    def _state_code(state):
        """Return the location code for a state, or raise Exception('invalid state')."""
        if state not in HealthTweets.STATE_CODES:
            raise Exception("invalid state")
        return HealthTweets.STATE_CODES[state]

    @staticmethod
    def _lines_with_prefix(text, prefix):
        """Return the whitespace-stripped lines of text that start with prefix."""
        stripped = (line.strip() for line in text.split("\n"))
        return [line for line in stripped if line.startswith(prefix)]

    def _get_token(self, html):
        """Extract the value used as the login form's csrfmiddlewaretoken.

        The value is read from the element found by PageParser at
        html > body > div > div > div > form > input.
        """
        page = PageParser.parse(html)
        hidden = PageParser.filter_all(
            page, [("html",), ("body",), ("div",), ("div",), ("div",), ("form",), ("input",)]
        )
        return hidden["attrs"]["value"]

    def _go(self, url, method=None, referer=None, data=None):
        """Issue one HTTP request through the session and return the response.

        url: address to request
        method: callable used to send the request; defaults to the session's
          get when data is None and to its post otherwise
        referer: value for the referer header (None by default)
        data: form data for the request body
        """
        if self.debug:
            print(f"{url}")
        if method is None:
            method = self.session.get if data is None else self.session.post
        response = method(url, headers={"referer": referer}, data=data)
        if self.debug:
            for item in response.history:
                print(" [%d to %s]" % (item.status_code, item.headers["Location"]))
            print(" %d (%d bytes)" % (response.status_code, len(response.text)))
        return response
146214

147215

148216
def main():
    """Command-line entry point: log in, fetch daily counts for one state, print them.

    Positional arguments are username, password, state, date1 and date2;
    -d/--debug turns on request diagnostics.
    """
    # args and usage
    parser = argparse.ArgumentParser()
    parser.add_argument("username", action="store", type=str, help="healthtweets.org username")
    parser.add_argument("password", action="store", type=str, help="healthtweets.org password")
    parser.add_argument(
        "state",
        action="store",
        type=str,
        choices=list(HealthTweets.STATE_CODES),
        help="U.S. state (ex: TX)",
    )
    parser.add_argument(
        "date1", action="store", type=str, help="first date, inclusive (ex: 2015-01-01)"
    )
    parser.add_argument(
        "date2", action="store", type=str, help="last date, inclusive (ex: 2015-01-01)"
    )
    parser.add_argument("-d", "--debug", action="store_true", help="enable debug mode")
    args = parser.parse_args()

    ht = HealthTweets(args.username, args.password, debug=args.debug)
    values = ht.get_values(args.state, args.date1, args.date2)
    print(f"Daily counts in {ht.check_state(args.state)} from {args.date1} to {args.date2}:")
    for date in sorted(values):
        num, total = values[date]["num"], values[date]["total"]
        # a total that rounded down to zero would otherwise crash the report
        percent = 100 * num / total if total else 0.0
        print(f"{date}: num={num:<4d} total={total:<5d} ({percent:.3f}%)")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)