"""
|
2 |
| -=============== |
3 |
| -=== Purpose === |
4 |
| -=============== |
5 |
| -
|
6 |
| -Wrapper for the entire wiki data collection process: |
7 |
| - 1. Uses wiki_update.py to fetch metadata for new access logs |
8 |
| - 2. Uses wiki_download.py to download the access logs |
9 |
| - 3. Uses wiki_extract.py to store article access counts |
10 |
| -
|
===============
=== Purpose ===
===============

Wrapper for the entire wiki data collection process:
 1. Uses wiki_update.py to fetch metadata for new access logs
 2. Uses wiki_download.py to download the access logs
 3. Uses wiki_extract.py to store article access counts

See also: master.php


=======================
=== Data Dictionary ===
=======================

`wiki_raw` is a staging table where extracted access log data is stored for
further processing. When wiki_update.py finds a new log, it saves the name and
hash to this table, with a status of 0. This table is read by master.php, which
then hands out "jobs" (independently and in parallel) to wiki_download.py.
After wiki_download.py downloads the log and extracts the counts, it submits
the data (as JSON) to master.php, which then stores the "raw" JSON counts in
this table.
+----------+---------------+------+-----+---------+----------------+
| Field    | Type          | Null | Key | Default | Extra          |
+----------+---------------+------+-----+---------+----------------+
| id       | int(11)       | NO   | PRI | NULL    | auto_increment |
| name     | varchar(64)   | NO   | UNI | NULL    |                |
| hash     | char(32)      | NO   |     | NULL    |                |
| status   | int(11)       | NO   | MUL | 0       |                |
| size     | int(11)       | YES  |     | NULL    |                |
| datetime | datetime      | YES  |     | NULL    |                |
| worker   | varchar(256)  | YES  |     | NULL    |                |
| elapsed  | float         | YES  |     | NULL    |                |
| data     | varchar(2048) | YES  |     | NULL    |                |
+----------+---------------+------+-----+---------+----------------+
id: unique identifier for each record
name: name of the access log
hash: md5 hash of the file, as reported by the dumps site (all zeroes if no
  hash is provided)
status: the status of the job, using the following values:
  0: queued for download
  1: download in progress
  2: queued for extraction
  3: extracted to `wiki` table
  (any negative value indicates failure)
size: the size, in bytes, of the downloaded file
datetime: the timestamp of the most recent status update
worker: name (user@hostname) of the machine working on the job
elapsed: time, in seconds, taken to complete the job
data: a JSON string containing counts for selected articles in the access log

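For illustration, the status lifecycle above can be written as a small state
machine. This is only a sketch of the documented codes; the constant and
function names below are hypothetical and not part of this codebase.

  # hypothetical sketch of the `wiki_raw.status` lifecycle documented above
  QUEUED_FOR_DOWNLOAD = 0
  DOWNLOAD_IN_PROGRESS = 1
  QUEUED_FOR_EXTRACTION = 2
  EXTRACTED_TO_WIKI = 3

  def is_failed(status):
      # any negative status value indicates failure
      return status < 0

  def next_status(status):
      # a healthy job advances linearly: 0 -> 1 -> 2 -> 3
      if is_failed(status) or status >= EXTRACTED_TO_WIKI:
          raise ValueError("job cannot advance from status {}".format(status))
      return status + 1
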
`wiki` is the table where access counts are stored (parsed from wiki_raw). The
"raw" JSON counts are parsed by wiki_extract.py and stored directly in this
table.
+----------+-------------+------+-----+---------+----------------+
| Field    | Type        | Null | Key | Default | Extra          |
+----------+-------------+------+-----+---------+----------------+
| id       | int(11)     | NO   | PRI | NULL    | auto_increment |
| datetime | datetime    | NO   | MUL | NULL    |                |
| article  | varchar(64) | NO   | MUL | NULL    |                |
| count    | int(11)     | NO   |     | NULL    |                |
+----------+-------------+------+-----+---------+----------------+
id: unique identifier for each record
datetime: UTC timestamp (rounded to the nearest hour) of article access
article: name of the article
count: number of times the article was accessed in the hour

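To make the hand-off concrete, here is a minimal sketch of flattening one
`wiki_raw.data` payload into the (datetime, article, count) rows stored in
`wiki`. The payload shape and values shown are assumptions for illustration,
not a documented format.

  import json

  # hypothetical example of a staged row from `wiki_raw`
  raw_row = {
      "name": "pagecounts-20150101-000000.gz",
      "datetime": "2015-01-01 00:00:00",
      "data": '{"influenza": 97, "fever": 41}',
  }

  # flatten the JSON counts into one (datetime, article, count) row per article
  counts = json.loads(raw_row["data"])
  rows = [(raw_row["datetime"], article, count) for article, count in counts.items()]
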
`wiki_meta` is a metadata table for this dataset. It contains pre-calculated
date and epiweek fields, and more importantly, the total number of English
article hits (denominator) for each `datetime` in the `wiki` table. This table
is populated in parallel with `wiki` by the wiki_extract.py script.
+----------+----------+------+-----+---------+----------------+
| Field    | Type     | Null | Key | Default | Extra          |
+----------+----------+------+-----+---------+----------------+
| id       | int(11)  | NO   | PRI | NULL    | auto_increment |
| datetime | datetime | NO   | UNI | NULL    |                |
| date     | date     | NO   |     | NULL    |                |
| epiweek  | int(11)  | NO   |     | NULL    |                |
| total    | int(11)  | NO   |     | NULL    |                |
+----------+----------+------+-----+---------+----------------+
id: unique identifier for each record
datetime: UTC timestamp (rounded to the nearest hour) of article access
date: the date portion of `datetime`
epiweek: the year and week containing `datetime`
total: total number of English article hits in the hour

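As a quick illustration of the derived fields, assuming the `epiweek` column
packs the MMWR year and week as YYYYWW (an assumption; the packing is not
spelled out above):

  from datetime import datetime

  dt = datetime(2015, 1, 18, 7, 0, 0)  # an hourly `datetime` value
  date_field = dt.date()               # 2015-01-18, the `date` column
  epiweek_field = 2015 * 100 + 3       # 201503, assuming week 3 contains this date
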

=================
=== Changelog ===
=================

2017-02-24
  * secrets and small improvements
2016-08-14
  * Increased job limit (6 -> 12) (pageviews files are ~2x smaller)
2015-08-26
  * Reduced job limit (8 -> 6)
2015-08-14
  * Reduced job limit (10 -> 8)
2015-08-11
  + New table `wiki_meta`
2015-05-22
  * Updated status codes for `wiki_raw` table
2015-05-21
  * Original version
"""

# first party
from . import wiki_update
from . import wiki_download
from . import wiki_extract
import delphi.operations.secrets as secrets  # import path assumed; `secrets.wiki.hmac` is used below


def main():
    # step 1: find new access logs (aka "jobs")
    print("looking for new jobs...")
    try:
        wiki_update.run()
    except Exception:
        print("wiki_update failed")

    # step 2: run a few jobs (cap each run at 1 GiB of downloads across at most 12 jobs)
    print("running jobs...")
    try:
        wiki_download.run(secrets.wiki.hmac, download_limit=1024 * 1024 * 1024, job_limit=12)
    except Exception:
        print("wiki_download failed")

    # step 3: extract counts from the staging data
    print("extracting counts...")
    try:
        wiki_extract.run(job_limit=100)
    except Exception:
        print("wiki_extract failed")


if __name__ == "__main__":
    main()