"""
===============
=== Purpose ===
===============

Wrapper for the entire wiki data collection process:
 1. Uses wiki_update.py to fetch metadata for new access logs
 2. Uses wiki_download.py to download the access logs
 3. Uses wiki_extract.py to store article access counts

See also: master.php
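
Because the sibling scripts are imported with relative imports (see the bottom
of this file), this wrapper is normally run as a module rather than as a
standalone script; for example (the exact package path is an assumption and
may differ):

  python3 -m <package>.wiki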


=======================
=== Data Dictionary ===
=======================

`wiki_raw` is a staging table where extracted access log data is stored for
further processing. When wiki_update.py finds a new log, it saves the name and
hash to this table, with a status of 0. This table is read by master.php, which
then hands out "jobs" (independently and in parallel) to wiki_download.py.
After wiki_download.py downloads the log and extracts the counts, it submits
the data (as JSON) to master.php, which then stores the "raw" JSON counts in
this table.
+----------+---------------+------+-----+---------+----------------+
| Field    | Type          | Null | Key | Default | Extra          |
+----------+---------------+------+-----+---------+----------------+
| id       | int(11)       | NO   | PRI | NULL    | auto_increment |
| name     | varchar(64)   | NO   | UNI | NULL    |                |
| hash     | char(32)      | NO   |     | NULL    |                |
| status   | int(11)       | NO   | MUL | 0       |                |
| size     | int(11)       | YES  |     | NULL    |                |
| datetime | datetime      | YES  |     | NULL    |                |
| worker   | varchar(256)  | YES  |     | NULL    |                |
| elapsed  | float         | YES  |     | NULL    |                |
| data     | varchar(2048) | YES  |     | NULL    |                |
+----------+---------------+------+-----+---------+----------------+
id: unique identifier for each record
name: name of the access log
hash: md5 hash of the file, as reported by the dumps site (all zeroes if no
  hash is provided)
status: the status of the job, using the following values:
  0: queued for download
  1: download in progress
  2: queued for extraction
  3: extracted to `wiki` table
  (any negative value indicates failure)
size: the size, in bytes, of the downloaded file
datetime: the timestamp of the most recent status update
worker: name (user@hostname) of the machine working on the job
elapsed: time, in seconds, taken to complete the job
data: a JSON string containing counts for selected articles in the access log
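
As a rough illustration only (the article names, counts, and exact JSON layout
here are hypothetical; wiki_download.py defines the real format), the `data`
field of a completed job might hold something like:

  {"influenza": 1234, "fever": 567}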

`wiki` is the table where access counts are stored (parsed from wiki_raw). The
"raw" JSON counts are parsed by wiki_extract.py and stored directly in this
table.
+----------+-------------+------+-----+---------+----------------+
| Field    | Type        | Null | Key | Default | Extra          |
+----------+-------------+------+-----+---------+----------------+
| id       | int(11)     | NO   | PRI | NULL    | auto_increment |
| datetime | datetime    | NO   | MUL | NULL    |                |
| article  | varchar(64) | NO   | MUL | NULL    |                |
| count    | int(11)     | NO   |     | NULL    |                |
+----------+-------------+------+-----+---------+----------------+
id: unique identifier for each record
datetime: UTC timestamp (rounded to the nearest hour) of article access
article: name of the article
count: number of times the article was accessed in the hour
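
For example (continuing the hypothetical payload above), extraction would
produce one row per article per hour, such as (datetime='2015-08-11 13:00:00',
article='influenza', count=1234).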

`wiki_meta` is a metadata table for this dataset. It contains pre-calculated
date and epiweek fields and, more importantly, the total number of English
article hits (the denominator) for each `datetime` in the `wiki` table. This
table is populated in parallel with `wiki` by the wiki_extract.py script.
+----------+----------+------+-----+---------+----------------+
| Field    | Type     | Null | Key | Default | Extra          |
+----------+----------+------+-----+---------+----------------+
| id       | int(11)  | NO   | PRI | NULL    | auto_increment |
| datetime | datetime | NO   | UNI | NULL    |                |
| date     | date     | NO   |     | NULL    |                |
| epiweek  | int(11)  | NO   |     | NULL    |                |
| total    | int(11)  | NO   |     | NULL    |                |
+----------+----------+------+-----+---------+----------------+
id: unique identifier for each record
datetime: UTC timestamp (rounded to the nearest hour) of article access
date: the date portion of `datetime`
epiweek: the year and week containing `datetime`
total: total number of English article hits in the hour
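
As an illustrative sketch (not part of the pipeline itself), a consumer could
normalize hourly article counts by this denominator with a join on `datetime`.
The article name and epiweek value below are hypothetical, and the epiweek
assumes the usual YYYYWW encoding:

  SELECT w.`datetime`, w.article, w.`count` / m.total AS fraction
  FROM wiki w
  JOIN wiki_meta m ON m.`datetime` = w.`datetime`
  WHERE w.article = 'influenza' AND m.epiweek = 201532;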


=================
=== Changelog ===
=================

2017-02-24
* secrets and small improvements
2016-08-14
* Increased job limit (6 -> 12) (pageviews files are ~2x smaller)
2015-08-26
* Reduced job limit (8 -> 6)
2015-08-14
* Reduced job limit (10 -> 8)
2015-08-11
+ New table `wiki_meta`
2015-05-22
* Updated status codes for `wiki_raw` table
2015-05-21
* Original version
"""

# first party
from . import wiki_update
from . import wiki_download
from . import wiki_extract
# (assumed) import path for the first-party secrets module; main() needs secrets.wiki.hmac
import delphi.operations.secrets as secrets


def main():
    # step 1: find new access logs (aka "jobs")
    print("looking for new jobs...")
    try:
        wiki_update.run()
    except Exception:
        print("wiki_update failed")

    # step 2: run a few jobs
    print("running jobs...")
    try:
        wiki_download.run(secrets.wiki.hmac, download_limit=1024 * 1024 * 1024, job_limit=12)
    except Exception:
        print("wiki_download failed")

    # step 3: extract counts from the staging data
    print("extracting counts...")
    try:
        wiki_extract.run(job_limit=100)
    except Exception:
        print("wiki_extract failed")


if __name__ == "__main__":
    main()