|
1 |
| -''' |
| 1 | +""" |
2 | 2 | ===============
|
3 | 3 | === Purpose ===
|
4 | 4 | ===============
|
|
20 | 20 | * Fetching daily values instead of weekly values
|
21 | 21 | 2015-03-??
|
22 | 22 | * Original version
|
23 |
| -''' |
| 23 | +""" |
24 | 24 |
|
25 | 25 | # standard library
|
26 | 26 | import argparse
|
|
36 | 36 |
|
class HealthTweets:
    """Minimal scraping client for daily flu-tweet counts on healthtweets.org.

    Constructing an instance logs in to the site with the given credentials;
    the resulting `requests` session is then reused for every plot request.
    """

    # mapping from state abbreviations to location codes used by healthtweets.org
    STATE_CODES = {
        "AL": 3024,
        "AK": 3025,
        "AZ": 3026,
        "AR": 3027,
        "CA": 440,
        "CO": 3029,
        "CT": 3030,
        "DE": 3031,
        "DC": 3032,
        "FL": 3033,
        "GA": 3034,
        "HI": 3035,
        "ID": 3036,
        "IL": 3037,
        "IN": 3038,
        "IA": 3039,
        "KS": 3040,
        "KY": 3041,
        "LA": 2183,
        "ME": 3043,
        "MD": 3044,
        "MA": 450,
        "MI": 3046,
        "MN": 3047,
        "MS": 3048,
        "MO": 3049,
        "MT": 3050,
        "NE": 3051,
        "NV": 3052,
        "NH": 3053,
        "NJ": 478,
        "NM": 2225,
        "NY": 631,
        "NC": 3057,
        "ND": 3058,
        "OH": 3059,
        "OK": 3060,
        "OR": 281,
        "PA": 3062,
        "RI": 3063,
        "SC": 3064,
        "SD": 3065,
        "TN": 3066,
        "TX": 3067,
        "UT": 2272,
        "VT": 3069,
        "VA": 3070,
        "WA": 3071,
        "WV": 3072,
        "WI": 3073,
        "WY": 3074,
    }

    # site endpoints used by this client
    _LOGIN_URL = "https://www.healthtweets.org/accounts/login"
    _PLOT_URL = "https://www.healthtweets.org/trends/plot"

    # JavaScript assignments that are scraped out of the plot page
    _CHART_DATA_PREFIX = "var chartData = "
    _PLOT_NAMES_PREFIX = 'var plotNames = ["Influenza ('

    def __init__(self, username, password, debug=False):
        """
        Opens a session and logs in to healthtweets.org.
          username: healthtweets.org username
          password: healthtweets.org password
          debug: when True, print request/response diagnostics to stdout
        raises Exception('login failed') when the login response is not a 200
        or contains the site's bad-credentials message
        """
        self.debug = debug
        self.session = requests.Session()
        # spoof a web browser
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
            }
        )
        # get the login token
        response = self._go(self._LOGIN_URL)
        token = self._get_token(response.text)
        if self.debug:
            print(f"token={token}")
        data = {
            "csrfmiddlewaretoken": token,
            "username": username,
            "password": password,
            "next": "/",
        }
        # login to the site
        response = self._go(self._LOGIN_URL, data=data)
        if response.status_code != 200 or "Your username and password" in response.text:
            raise Exception("login failed")

    def get_values(self, state, date1, date2):
        """
        state: two-letter state abbreviation (see STATE_CODES)
        date1: the first date in the range, inclusive (format: YYYY-MM-DD)
        date2: the last date in the range, inclusive (format: YYYY-MM-DD)
        returns a dictionary (by date) of number of flu tweets (num) and total tweets (total)

        Dates whose normalized value is zero or absent are omitted, because the
        total cannot be derived for them.
        """
        # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets)
        raw_values = self._get_values(state, date1, date2, False)
        normalized_values = self._get_values(state, date1, date2, True)
        values = {}
        # save the raw number and calculate the total
        for date, raw in raw_values.items():
            # .get() so that a date missing from the normalized series is
            # skipped instead of raising KeyError
            normalized = normalized_values.get(date)
            if not normalized:
                continue
            values[date] = {
                "num": round(raw),
                # normalized is a percentage: total = 100 * raw / normalized
                "total": round(100 * raw / normalized),
            }
            if self.debug:
                print(date, raw, normalized)
        return values

    def _get_values(self, state, date1, date2, normalized):
        """
        Fetches one daily series for a state from the plot page.
          state: two-letter state abbreviation (see STATE_CODES)
          date1, date2: range endpoints (format: YYYY-MM-DD)
          normalized: True for the 'normalized' series, False for 'raw'
        returns a dictionary mapping YYYY-MM-DD date strings to float values
        raises Exception on an unknown state, a non-200 response, or when the
        page does not contain exactly one chartData line
        """
        state_code = self._state_code(state)
        d1, d2 = datetime.strptime(date1, "%Y-%m-%d"), datetime.strptime(date2, "%Y-%m-%d")
        # dates go into the query string as MM/DD/YYYY with the slashes URL-encoded
        s1, s2 = d1.strftime("%m%%2F%d%%2F%Y"), d2.strftime("%m%%2F%d%%2F%Y")
        count_type = "normalized" if normalized else "raw"
        # NOTE(review): dayNum is the difference in days (end date not counted),
        # whereas check_state sends dayNum=7 for 01/01-01/07 -- confirm which
        # convention the site expects before changing either one.
        url = self._plot_url(count_type, (d2 - d1).days, s1, s2, state_code)
        response = self._go(url)
        if response.status_code != 200:
            raise Exception(
                f"plot status is {response.status_code} (when was data last updated?)"
            )
        data_line = self._lines_with_prefix(response.text, self._CHART_DATA_PREFIX)
        if len(data_line) != 1:
            raise Exception("lookup failed")
        # drop the `var chartData = ` prefix and the final character of the
        # line (assumed to be the statement's trailing `;`), leaving JSON
        values = json.loads(data_line[0][len(self._CHART_DATA_PREFIX):-1])
        return {
            datetime.strptime(v[0], "%m/%d/%Y").strftime("%Y-%m-%d"): float(v[1]) for v in values
        }

    def check_state(self, state):
        """
        Sanity checks state code mapping.
        state: two-letter state abbreviation (see STATE_CODES)
        returns the full state name associated with the state abbreviation
        raises Exception on an unknown state or when the plot page has no
        plotNames line
        """
        state_code = self._state_code(state)
        # any fixed one-week window will do; only the plot's name is read
        url = self._plot_url("normalized", 7, "01%2F01%2F2015", "01%2F07%2F2015", state_code)
        response = self._go(url)
        data_line = self._lines_with_prefix(response.text, self._PLOT_NAMES_PREFIX)
        if len(data_line) == 0:
            raise Exception("check failed")
        # the name is whatever follows the prefix, up to the next '('
        name = data_line[0][len(self._PLOT_NAMES_PREFIX):]
        name = name.split("(")[0]
        return name.strip()

    @classmethod
    def _state_code(cls, state):
        """Returns the site location code for `state`; raises Exception('invalid state') if unknown."""
        if state not in cls.STATE_CODES:
            raise Exception("invalid state")
        return cls.STATE_CODES[state]

    @classmethod
    def _plot_url(cls, count_type, day_num, date_from, date_to, state_code):
        """Builds the plot-page URL; `date_from`/`date_to` must already be URL-encoded."""
        return (
            f"{cls._PLOT_URL}?resolution=Day&count_type={count_type}&dayNum={day_num}"
            f"&from={date_from}&to={date_to}&plot1_disease=65&location_plot1={state_code}"
        )

    @staticmethod
    def _lines_with_prefix(text, prefix):
        """Returns the whitespace-stripped lines of `text` that start with `prefix`."""
        stripped = (line.strip() for line in text.split("\n"))
        return [line for line in stripped if line.startswith(prefix)]

    def _get_token(self, html):
        """
        Extracts the login token from the login page HTML.
        Returns the `value` attribute of the node that PageParser.filter_all
        selects along html/body/div/div/div/form/input -- presumably the
        hidden CSRF input; PageParser is defined outside this class.
        """
        page = PageParser.parse(html)
        hidden = PageParser.filter_all(
            page, [("html",), ("body",), ("div",), ("div",), ("div",), ("form",), ("input",)]
        )
        return hidden["attrs"]["value"]

    def _go(self, url, method=None, referer=None, data=None):
        """
        Issues a request through the session and returns the response.
          url: the address to request
          method: callable used to send the request; defaults to the session's
            GET when `data` is None and POST otherwise
          referer: value sent as the `referer` header
          data: form data for the request body
        """
        if self.debug:
            print(f"{url}")
        if method is None:
            method = self.session.get if data is None else self.session.post
        response = method(url, headers={"referer": referer}, data=data)
        if self.debug:
            # the body is only read here because its size is only reported in debug mode
            html = response.text
            for item in response.history:
                print(f" [{item.status_code:d} to {item.headers['Location']}]")
            print(f" {response.status_code:d} ({len(html):d} bytes)")
        return response
146 | 214 |
|
147 | 215 |
|
def main():
    """
    Command-line entry point: logs in to healthtweets.org, fetches daily flu
    tweet counts for one state over a date range, and prints one line per day.
    """
    # args and usage
    # NOTE(review): the password is taken from argv, so it is visible in the
    # process list and shell history.
    parser = argparse.ArgumentParser()
    parser.add_argument("username", help="healthtweets.org username")
    parser.add_argument("password", help="healthtweets.org password")
    parser.add_argument(
        "state",
        choices=list(HealthTweets.STATE_CODES),
        help="U.S. state (ex: TX)",
    )
    parser.add_argument("date1", help="first date, inclusive (ex: 2015-01-01)")
    parser.add_argument("date2", help="last date, inclusive (ex: 2015-01-01)")
    parser.add_argument("-d", "--debug", action="store_true", help="enable debug mode")
    args = parser.parse_args()

    ht = HealthTweets(args.username, args.password, debug=args.debug)
    values = ht.get_values(args.state, args.date1, args.date2)
    print(f"Daily counts in {ht.check_state(args.state)} from {args.date1} to {args.date2}:")
    # dates are YYYY-MM-DD strings, so lexicographic order is chronological
    for date in sorted(values):
        num, total = values[date]["num"], values[date]["total"]
        # total is a rounded estimate and can be 0; report 0% rather than
        # dividing by zero
        percent = 100 * num / total if total else 0.0
        print(f"{date}: num={num:<4d} total={total:<5d} ({percent:.3f}%)")


if __name__ == "__main__":
    main()
0 commit comments