# scraper.py
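"""Download files linked from a web page.

Fetch a page, collect the <a href> links whose URLs contain a filter
string, and save each linked file to a local directory.
"""
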
import argparse
import time
from pathlib import Path
from typing import Callable, List, Optional, Tuple
from urllib.parse import urlparse, urlunparse

import httpx
from bs4 import BeautifulSoup


def assemble_links(url: str, contains: str, prefix: Optional[str] = None) -> List[str]:
    """
    Get URLs of links on the page at `url` that contain the filter string.

    Args:
        url: valid URL string.
        contains: string to filter results, e.g. 't_and_d' in 'cat_and_dog'.
        prefix: string prepended to each href to form an absolute URL;
            defaults to the scheme and netloc of `url`.

    Returns:
        List of links matching the content filter.
    """
    if not prefix:
        url_parts = urlparse(url)
        prefix = urlunparse([url_parts.scheme, url_parts.netloc, "", "", "", ""])
    page = httpx.get(url).text
    soup = BeautifulSoup(page, "html.parser")
    links = [
        prefix + node.get("href") for node in soup.find_all("a") if node.get("href")
    ]
    return [i for i in links if contains in i]
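
# Example (hypothetical page): if https://example.com/data links to
# "/files/a.csv" and "/files/b.txt", then
# assemble_links("https://example.com/data", contains=".csv") returns
# ["https://example.com/files/a.csv"].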


def download_file(url: str) -> Optional[bytes]:
    """
    Download the file at the provided URL.

    Args:
        url: string URL indicating the remote file to download.

    Returns:
        Bytes from the URL, or None if the request failed.
    """
    try:
        r = httpx.get(url, follow_redirects=True, timeout=10)
        r.raise_for_status()
        return r.content
    except httpx.HTTPError as e:
        print(f"Error while requesting {e.request.url!r}.\n{e}")
        return None
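
# Sketch of direct use (hypothetical URL; yields None on HTTP errors):
# payload = download_file("https://example.com/files/report.pdf")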


def write_file(content: bytes, local_path: Path) -> str:
    """
    Write the provided bytes to file.

    Args:
        content: bytes to write.
        local_path: pathlib.Path object referencing the local write destination.

    Returns:
        String indicating success/failure of the write.
    """
    try:
        with open(local_path, "wb") as out_file:
            out_file.write(content)
        return f"Completed write to: {local_path}"
    except IOError as e:
        print(f"Cannot write to file: {e}")
        return f"Failed to write to: {local_path}"


def get_link_data(url: str, contains: str, out_dir: Path) -> Tuple[int, int]:
    """
    Download every file linked from `url` whose URL contains the filter string.

    Args:
        url: valid URL string.
        contains: string to filter results, e.g. 't_and_d' in 'cat_and_dog'.
        out_dir: pathlib.Path object referencing the local directory to save to.

    Returns:
        Tuple of integers: files successfully downloaded, and links attempted.
    """
    links = assemble_links(url, contains)
    fnames = [Path(i).name for i in links]
    fcount = 0
    for link, fname in zip(links, fnames):
        dl_bytes = download_file(link)
        if dl_bytes:
            out_path = out_dir.joinpath(fname)
            print(write_file(dl_bytes, out_path))
            fcount += 1
    return fcount, len(links)
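
# Sketch (hypothetical listing page; out_dir must already exist):
# n_ok, n_attempted = get_link_data("https://example.com/data", ".csv", Path("downloads"))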


def main(
    downloader: Callable[[str, str, Path], Tuple[int, int]],
    url: str,
    contains: str,
    out_dir: Path,
) -> None:
    """Create the save directory, call the downloader, and time it."""
    try:
        out_dir.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        print(f"Output directory ({out_dir.as_posix()}) already exists.")
    t0 = time.perf_counter()
    count, total = downloader(url, contains, out_dir)
    elapsed = time.perf_counter() - t0
    print(f"\n{count} downloads in {elapsed:.2f} seconds.")
    print(f"\n{total - count} downloads failed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="valid URL to retrieve links within", type=str)
    parser.add_argument(
        "contains", help="string to filter links containing this input", type=str
    )
    parser.add_argument(
        "out_directory", help="directory to save downloaded data", type=str
    )
    args = parser.parse_args()

    url = args.url
    contains = args.contains
    out_dir = Path(args.out_directory)
    main(get_link_data, url, contains, out_dir)


# Example targets from the archived NYU math wiki (left commented so that
# importing this module does not trigger network requests):
# url_prefix = "https://web.archive.org/web/20171017231511/http://math.nyu.edu/student_resources/wwiki/index.php/"  # noqa: E501
# url_gfd_cat = url_prefix + "Category:Geophysical_Fluid_Dynamics"
# gfd_links = assemble_links(url_gfd_cat, contains="_Problem_")
# adv_calc_cat = url_prefix + "Category:Advanced_Calculus"