-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathreport_scraper.py
More file actions
94 lines (85 loc) · 2.88 KB
/
report_scraper.py
File metadata and controls
94 lines (85 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
import urllib2
import re
import os
def fetch(url):
    """Download *url* and return the raw response body as a string."""
    # Identify the crawler politely instead of using the default urllib2 agent.
    crawler = urllib2.build_opener()
    crawler.addheaders = [("User-agent", "report-crawler")]
    return crawler.open(url).read()
def fetch_extended_report(report_url):
    """Collect the optional index page and numbered section pages of a report.

    Some reports are split into an ``_index`` page plus ``_sec1``, ``_sec2``,
    ... pages alongside the main ``.asp`` page. Returns the concatenated text
    of every extra page found, or an empty string when none exist.
    """
    combined = ''
    index_url = report_url.replace('.asp', '_index.asp')
    try:
        combined = fetch(index_url).strip()
    except urllib2.HTTPError:
        # Not a problem — this report simply has no alternate index page.
        pass
    section = 1
    while True:
        section_url = report_url.replace('.asp', '_sec' + str(section) + '.asp')
        try:
            combined += '\n\n' + fetch(section_url).strip()
        except urllib2.HTTPError:
            # 404: no such section. Only give up once we are past section 2,
            # because _sec2 sometimes follows the standard report page with
            # no _sec1 in between.
            if section > 2:
                break
        section += 1
    return combined
def store_report(report_url, save_dir):
    """Fetch one report (plus any extended pages) and save it under *save_dir*.

    The saved file keeps the report's base name, with the .asp extension
    swapped for .html. *save_dir* is created on demand.
    """
    # Fix: the original computed an unused `alt_url` here; removed as dead code.
    name = report_url.rpartition('/')[2].replace('.asp', '.html')
    # Parenthesized single-argument print behaves identically on Python 2 and 3.
    print('Fetching ' + name)
    doc = fetch(report_url).strip()
    extra = fetch_extended_report(report_url)
    if extra:
        doc += '\n\n' + extra
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    with open(os.path.join(save_dir, name), 'w') as f:
        f.write(doc)
def extract_report_urls(index):
    """Return the href of every anchor in *index* whose link text is 'HTML'."""
    soup = BeautifulSoup(index)
    return [anchor['href']
            for anchor in soup.findAll('a')
            if anchor.string == 'HTML']
def crawl_reports(url, year, output_dir):
    """Scrape every report listed on the index page for *year*.

    Each report is stored under ``output_dir/year``.
    """
    year_url = url + year + '/'
    listing = fetch(year_url).strip()
    target_dir = output_dir + '/' + year
    for relative in extract_report_urls(listing):
        store_report(year_url + relative, target_dir)
def fetch_rail():
    """Download all rail investigation reports.

    Covers the years 1994-2010 plus the 'earlier' and 'studies' categories.
    """
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/rail/'
    data_dir = '../data/rir/reports/'
    categories = ['precedentes-earlier', 'etudes-studies']
    for year in range(1994, 2011):
        categories.append(str(year))
    for category in categories:
        crawl_reports(base_url, category, data_dir)
def fetch_marine():
    """Download all marine investigation reports.

    Covers the years 1994-2009 plus the 'earlier' and 'studies' categories.
    """
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/marine/'
    data_dir = '../data/mir/reports/'
    categories = ['precedentes-earlier', 'etudes-studies']
    for year in range(1994, 2010):
        categories.append(str(year))
    for category in categories:
        crawl_reports(base_url, category, data_dir)
def fetch_pipeline():
    """Download pipeline investigation reports.

    Only the listed years have published reports, so they are enumerated
    explicitly rather than generated as a range.
    """
    years = [1994, 1995, 1996, 1997, 1999, 2000,
             2001, 2002, 2005, 2006, 2007, 2009]
    base_url = 'http://www.tsb.gc.ca/eng/rapports-reports/pipeline/'
    data_dir = '../data/pir/reports/'
    for year in years:
        crawl_reports(base_url, str(year), data_dir)
if __name__ == '__main__':
    # Crawl all three transport modes in sequence: marine, rail, pipeline.
    for job in (fetch_marine, fetch_rail, fetch_pipeline):
        job()