-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathreport_scraper.py
More file actions
94 lines (85 loc) · 2.88 KB
/
report_scraper.py
File metadata and controls
94 lines (85 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
import urllib2
import re
import os
def fetch(url):
    """Download *url* and return the raw response body as a string."""
    # Identify the crawler politely instead of using the default urllib2 agent.
    crawler = urllib2.build_opener()
    crawler.addheaders = [("User-agent", "report-crawler")]
    return crawler.open(url).read()
def fetch_extended_report(report_url):
    """Collect the optional index page and numbered section pages of a report.

    Some reports are split into an ``_index`` page plus ``_sec1``, ``_sec2``,
    ... pages alongside the main ``.asp`` page. Returns the concatenated text
    of every extra page found, or an empty string when none exist.
    """
    combined = ''
    index_url = report_url.replace('.asp', '_index.asp')
    try:
        combined = fetch(index_url).strip()
    except urllib2.HTTPError:
        # Not a problem — this report simply has no alternate index page.
        pass
    section = 1
    while True:
        section_url = report_url.replace('.asp', '_sec' + str(section) + '.asp')
        try:
            combined += '\n\n' + fetch(section_url).strip()
        except urllib2.HTTPError:
            # 404: no such section. Only give up once we are past section 2,
            # because _sec2 sometimes follows the standard report page with
            # no _sec1 in between.
            if section > 2:
                break
        section += 1
    return combined
def store_report(report_url, save_dir):
    """Fetch one report (plus any extended pages) and save it under *save_dir*.

    The saved file keeps the report's base name, with the .asp extension
    swapped for .html. *save_dir* is created on demand.
    """
    # Fix: the original computed an unused `alt_url` here; removed as dead code.
    name = report_url.rpartition('/')[2].replace('.asp', '.html')
    # Parenthesized single-argument print behaves identically on Python 2 and 3.
    print('Fetching ' + name)
    doc = fetch(report_url).strip()
    extra = fetch_extended_report(report_url)
    if extra:
        doc += '\n\n' + extra
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    with open(os.path.join(save_dir, name), 'w') as f:
        f.write(doc)
def extract_report_urls(index):
    """Return the href of every anchor in *index* whose link text is 'HTML'."""
    soup = BeautifulSoup(index)
    return [anchor['href']
            for anchor in soup.findAll('a')
            if anchor.string == 'HTML']
def crawl_reports(url, year, output_dir):
    """Scrape every report listed on the index page for *year*.

    Each report is stored under ``output_dir/year``.
    """
    year_url = url + year + '/'
    listing = fetch(year_url).strip()
    target_dir = output_dir + '/' + year
    for relative in extract_report_urls(listing):
        store_report(year_url + relative, target_dir)
def fetch_rail():
    """Download all rail investigation reports.

    Covers the years 1994-2010 plus the 'earlier' and 'studies' categories.
    """
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/rail/'
    data_dir = '../data/rir/reports/'
    categories = ['precedentes-earlier', 'etudes-studies']
    for year in range(1994, 2011):
        categories.append(str(year))
    for category in categories:
        crawl_reports(base_url, category, data_dir)
def fetch_marine():
    """Download all marine investigation reports.

    Covers the years 1994-2009 plus the 'earlier' and 'studies' categories.
    """
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/marine/'
    data_dir = '../data/mir/reports/'
    categories = ['precedentes-earlier', 'etudes-studies']
    for year in range(1994, 2010):
        categories.append(str(year))
    for category in categories:
        crawl_reports(base_url, category, data_dir)
def fetch_pipeline():
    """Download pipeline investigation reports.

    Only the listed years have published reports, so they are enumerated
    explicitly rather than generated as a range.
    """
    years = [1994, 1995, 1996, 1997, 1999, 2000,
             2001, 2002, 2005, 2006, 2007, 2009]
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/pipeline/'
    data_dir = '../data/pir/reports/'
    for year in years:
        crawl_reports(base_url, str(year), data_dir)
if __name__ == '__main__':
    # Crawl all three transport modes in sequence: marine, rail, pipeline.
    for job in (fetch_marine, fetch_rail, fetch_pipeline):
        job()