Skip to content

Commit 1f9bd5c

Browse files
authored
testing multiprocessing for faster finds! (#63)
* testing multiprocessing for faster finds! * remove extra verbose logging of task info Signed-off-by: vsoch <vsoch@users.noreply.github.com> Co-authored-by: vsoch <vsoch@users.noreply.github.com>
1 parent 7d919bf commit 1f9bd5c

File tree

17 files changed

+191
-53
lines changed

17 files changed

+191
-53
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and **Merged pull requests**. Critical items to know are:
1212
Referenced versions in headers are tagged on Github, in parentheses are for pypi.
1313

1414
## [vxx](https://github.com/urlstechie/urlschecker-python/tree/master) (master)
15+
- multiprocessing to speed up checks (0.0.26)
1516
- bug fix for verbose option to only print file names that have failures (0.0.25)
1617
- adding option to print a summary that contains file names and urls (0.0.24)
1718
- updating container base to use debian buster and adding certifi (0.0.23)

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
This is a python module to collect urls over static files (code and documentation)
99
and then test for and report broken links. If you are interested in using
1010
this as a GitHub action, see [urlchecker-action](https://github.com/urlstechie/urlchecker-action). There are also container
11-
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags).
11+
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags). As of version
12+
0.0.26, we use multiprocessing so the checks run a lot faster, and you can set `URLCHECKER_WORKERS` to change the number of workers
13+
(defaults to 9). If you don't want multiprocessing, use version 0.0.25 or earlier.
1214

1315
## Module Documentation
1416

docs/source/fileproc.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
urlchecker.core.fileproc
2-
==========================
2+
========================
33

44

55
.. automodule:: urlchecker.core.fileproc

urlchecker/__init__.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
"""
2-
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
4-
5-
This source code is licensed under the terms of the MIT license.
6-
For a copy, see <https://opensource.org/licenses/MIT>.
7-
8-
"""
9-
101
from urlchecker.version import __version__
2+
3+
assert __version__

urlchecker/client/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
"""
44
5-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
5+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
66
77
This source code is licensed under the terms of the MIT license.
88
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/client/check.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
client/github.py: entrypoint for interaction with a GitHub repository.
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
"""
55

66
import re
@@ -106,9 +106,9 @@ def main(args, extra):
106106
if args.verbose:
107107
print("\n\U0001F914 Uh oh... The following urls did not pass:")
108108
for file_name, result in checker.checks.items():
109-
if result.failed:
109+
if result["failed"]:
110110
print_failure(file_name + ":")
111-
for url in result.failed:
111+
for url in result["failed"]:
112112
print_failure(" " + url)
113113
else:
114114
print("\n\U0001F914 Uh oh... The following urls did not pass:")

urlchecker/core/check.py

Lines changed: 61 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.
@@ -12,6 +12,7 @@
1212
import re
1313
import sys
1414
from urlchecker.core import fileproc
15+
from urlchecker.core.worker import Workers
1516
from urlchecker.core.urlproc import UrlCheckResult
1617

1718

@@ -41,6 +42,8 @@ def __init__(
4142
"""
4243
# Initiate results object, and checks lookup (holds UrlCheck) for each file
4344
self.results = {"passed": set(), "failed": set(), "excluded": set()}
45+
46+
# Results organized by filename
4447
self.checks = {}
4548

4649
# Save run parameters
@@ -123,12 +126,18 @@ def save_results(self, file_path, sep=",", header=None, relative_paths=True):
123126
else:
124127
file_name = os.path.relpath(file_name)
125128

126-
[writer.writerow([url, "failed", file_name]) for url in result.failed]
129+
[
130+
writer.writerow([url, "failed", file_name])
131+
for url in result["failed"]
132+
]
127133
[
128134
writer.writerow([url, "excluded", file_name])
129-
for url in result.excluded
135+
for url in result["excluded"]
136+
]
137+
[
138+
writer.writerow([url, "passed", file_name])
139+
for url in result["passed"]
130140
]
131-
[writer.writerow([url, "passed", file_name]) for url in result.passed]
132141

133142
return file_path
134143

@@ -161,27 +170,56 @@ def run(
161170
exclude_urls = exclude_urls or []
162171
exclude_patterns = exclude_patterns or []
163172

164-
# loop through files files
165-
for file_name in file_paths:
166-
167-
# Instantiate a checker to extract urls
168-
checker = UrlCheckResult(
169-
file_name=file_name,
170-
exclude_patterns=exclude_patterns,
171-
exclude_urls=exclude_urls,
172-
print_all=self.print_all,
173-
)
174-
175-
# Check the urls
176-
checker.check_urls(retry_count=retry_count, timeout=timeout)
173+
# Run with multiprocessing
174+
tasks = {}
175+
funcs = {}
176+
workers = Workers()
177177

178-
# Update flattened results
179-
self.results["failed"].update(checker.failed)
180-
self.results["passed"].update(checker.passed)
181-
self.results["excluded"].update(checker.excluded)
178+
# loop through files
179+
for file_name in file_paths:
182180

183-
# Save the checker in the lookup
184-
self.checks[file_name] = checker
181+
# Export parameters and functions, use the same check task for all
182+
tasks[file_name] = {
183+
"file_name": file_name,
184+
"exclude_patterns": exclude_patterns,
185+
"exclude_urls": exclude_urls,
186+
"print_all": self.print_all,
187+
"retry_count": retry_count,
188+
"timeout": timeout,
189+
}
190+
funcs[file_name] = check_task
191+
192+
results = workers.run(funcs, tasks)
193+
for file_name, result in results.items():
194+
self.checks[file_name] = result
195+
self.results["failed"].update(result["failed"])
196+
self.results["passed"].update(result["passed"])
197+
self.results["excluded"].update(result["excluded"])
185198

186199
# A flattened dict of passed and failed
187200
return self.results
201+
202+
203+
def check_task(*args, **kwargs):
204+
"""
205+
A checking task, the default we use
206+
"""
207+
# Instantiate a checker to extract urls
208+
checker = UrlCheckResult(
209+
file_name=kwargs["file_name"],
210+
exclude_patterns=kwargs.get("exclude_patterns", []),
211+
exclude_urls=kwargs.get("exclude_urls", []),
212+
print_all=kwargs.get("print_all", True),
213+
)
214+
215+
# Check the urls
216+
checker.check_urls(
217+
retry_count=kwargs.get("retry_count", 2), timeout=kwargs.get("timeout", 5)
218+
)
219+
220+
# Update flattened results
221+
return {
222+
"failed": checker.failed,
223+
"passed": checker.passed,
224+
"excluded": checker.excluded,
225+
}

urlchecker/core/exclude.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/core/fileproc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

0 commit comments

Comments
 (0)