From f7ba3a29776a90da80e167f22aba719756dd89a0 Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Tue, 8 Jul 2025 10:32:53 +0100 Subject: [PATCH 01/12] Replace linklist["object"].id with generalized linklist["object"].pk, for cases where PK is not .id. --- linkcheck/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkcheck/utils.py b/linkcheck/utils.py index 0b4e6d7..b40dbb5 100644 --- a/linkcheck/utils.py +++ b/linkcheck/utils.py @@ -183,7 +183,7 @@ def find_all_links(linklists=None): linklists = linklist_cls().get_linklist() for linklist in linklists: - object_id = linklist["object"].id + object_id = linklist["object"].pk urls = linklist["urls"] + linklist["images"] if urls: new = update_urls(urls, content_type, object_id) From 97d6bfca1575dda73347029e8c6d6479c7bc9163 Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Tue, 8 Jul 2025 16:24:22 +0100 Subject: [PATCH 02/12] Added PROXIES setting. --- linkcheck/linkcheck_settings.py | 1 + linkcheck/models.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/linkcheck/linkcheck_settings.py b/linkcheck/linkcheck_settings.py index 566be34..bedbc2f 100644 --- a/linkcheck/linkcheck_settings.py +++ b/linkcheck/linkcheck_settings.py @@ -59,3 +59,4 @@ SITE_DOMAINS = getattr(settings, 'LINKCHECK_SITE_DOMAINS', []) DISABLE_LISTENERS = getattr(settings, 'LINKCHECK_DISABLE_LISTENERS', False) TOLERATE_BROKEN_ANCHOR = getattr(settings, 'LINKCHECK_TOLERATE_BROKEN_ANCHOR', True) +PROXIES = getattr(settings, 'LINKCHECK_PROXIES', {}) diff --git a/linkcheck/models.py b/linkcheck/models.py index 536fb50..d01f80f 100644 --- a/linkcheck/models.py +++ b/linkcheck/models.py @@ -31,6 +31,7 @@ LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT, MAX_URL_LENGTH, MEDIA_PREFIX, + PROXIES, SITE_DOMAINS, TOLERATE_BROKEN_ANCHOR, ) @@ -386,6 +387,9 @@ def check_external(self, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL): "timeout": LINKCHECK_CONNECTION_ATTEMPT_TIMEOUT, "verify": True, } + if PROXIES: + 
request_params["proxies"] = PROXIES + try: try: # At first try a HEAD request From 2fff177e98d2f7c6cd72d0f6360de58d81381d9e Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Wed, 9 Jul 2025 16:22:12 +0100 Subject: [PATCH 03/12] Added TRUST_PROXY_SSL setting. --- linkcheck/linkcheck_settings.py | 1 + linkcheck/models.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/linkcheck/linkcheck_settings.py b/linkcheck/linkcheck_settings.py index bedbc2f..617aef4 100644 --- a/linkcheck/linkcheck_settings.py +++ b/linkcheck/linkcheck_settings.py @@ -60,3 +60,4 @@ DISABLE_LISTENERS = getattr(settings, 'LINKCHECK_DISABLE_LISTENERS', False) TOLERATE_BROKEN_ANCHOR = getattr(settings, 'LINKCHECK_TOLERATE_BROKEN_ANCHOR', True) PROXIES = getattr(settings, 'LINKCHECK_PROXIES', {}) +TRUST_PROXY_SSL = getattr(settings, 'LINKCHECK_TRUST_PROXY_SSL', False) diff --git a/linkcheck/models.py b/linkcheck/models.py index d01f80f..d7c189f 100644 --- a/linkcheck/models.py +++ b/linkcheck/models.py @@ -34,6 +34,7 @@ PROXIES, SITE_DOMAINS, TOLERATE_BROKEN_ANCHOR, + TRUST_PROXY_SSL, ) logger = logging.getLogger(__name__) @@ -388,6 +389,7 @@ def check_external(self, external_recheck_interval=EXTERNAL_RECHECK_INTERVAL): "verify": True, } if PROXIES: + request_params["verify"] = not TRUST_PROXY_SSL request_params["proxies"] = PROXIES try: From 3bb391f93aa549bc673f72ad92045ce7326fb688 Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Fri, 11 Jul 2025 14:19:54 +0100 Subject: [PATCH 04/12] Added test_external_proxy_request. 
--- linkcheck/tests/test_linkcheck.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 8a3f1e1..1c36704 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,7 +1,7 @@ import os from datetime import datetime, timedelta from io import StringIO -from unittest.mock import patch +from unittest.mock import Mock, patch import requests_mock import urllib3 @@ -672,6 +672,25 @@ def test_external_check_blocked_user_agent_blocked_head(self): self.assertEqual(uv.redirect_to, '') self.assertEqual(uv.type, 'external') + @patch('linkcheck.models.PROXIES', {'http': 'http://proxy.example.com:8080'}) + @patch('requests.head') + def test_external_proxy_request(self, mock_head): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.reason = 'OK' + mock_response.history = [] + mock_head.return_value = mock_response + request_url = 'http://test.com' + uv = Url(url=request_url) + uv.check_url() + self.assertEqual(uv.status, True) + self.assertEqual(uv.message, '200 OK') + self.assertEqual(uv.type, 'external') + mock_head.assert_called_once() + (call_url,), call_kwargs = mock_head.call_args + self.assertEqual(call_url, request_url) + self.assertEqual(call_kwargs.get('proxies'), {'http': 'http://proxy.example.com:8080'}) + def test_external_check_timedout(self): uv = Url(url=f"{self.live_server_url}/timeout/") uv.check_url() From 1a0071eafbfad7120cb0f0b3d5e926ebca8179ac Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Fri, 11 Jul 2025 16:38:23 +0100 Subject: [PATCH 05/12] Linting. 
--- linkcheck/tests/test_linkcheck.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 1c36704..c67f46f 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -672,7 +672,10 @@ def test_external_check_blocked_user_agent_blocked_head(self): self.assertEqual(uv.redirect_to, '') self.assertEqual(uv.type, 'external') - @patch('linkcheck.models.PROXIES', {'http': 'http://proxy.example.com:8080'}) + @patch( + 'linkcheck.models.PROXIES', + {'http': 'http://proxy.example.com:8080'}, + ) @patch('requests.head') def test_external_proxy_request(self, mock_head): mock_response = Mock() @@ -689,7 +692,10 @@ def test_external_proxy_request(self, mock_head): mock_head.assert_called_once() (call_url,), call_kwargs = mock_head.call_args self.assertEqual(call_url, request_url) - self.assertEqual(call_kwargs.get('proxies'), {'http': 'http://proxy.example.com:8080'}) + self.assertEqual( + call_kwargs.get('proxies'), + {'http': 'http://proxy.example.com:8080'}, + ) def test_external_check_timedout(self): uv = Url(url=f"{self.live_server_url}/timeout/") From c648c159d9ba56ca0ae3a1c478bd6c967e4b5bae Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Tue, 29 Jul 2025 13:24:14 +0100 Subject: [PATCH 06/12] Added environment option to skip compilation of translation files. 
--- linkcheck/build_meta.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/linkcheck/build_meta.py b/linkcheck/build_meta.py index 435f55d..a75ee56 100644 --- a/linkcheck/build_meta.py +++ b/linkcheck/build_meta.py @@ -1,3 +1,4 @@ +import os import subprocess from setuptools import build_meta as default @@ -5,17 +6,29 @@ def compile_translation_files(): - print("Compile translation files") + print("Compiling translation files...") subprocess.run(["django-admin", "compilemessages"], cwd="linkcheck") +def should_compile_translation_files(): + skip_translations = os.environ.get("LINKCHECK_SKIP_TRANSLATIONS") + if skip_translations and skip_translations.lower() in ("1", "true", "yes", "t", "y"): + return False + + return True + + def build_sdist(sdist_directory, config_settings=None): - compile_translation_files() + if should_compile_translation_files(): + compile_translation_files() + return default.build_sdist(sdist_directory, config_settings) def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): - compile_translation_files() + if should_compile_translation_files(): + compile_translation_files() + return default.build_wheel( wheel_directory, config_settings=config_settings, From 236ace0182b1a531ae00b27711c44757996d114a Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Tue, 29 Jul 2025 13:24:21 +0100 Subject: [PATCH 07/12] Typo fix. --- linkcheck/templates/linkcheck/report.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkcheck/templates/linkcheck/report.html b/linkcheck/templates/linkcheck/report.html index a65d577..1734170 100644 --- a/linkcheck/templates/linkcheck/report.html +++ b/linkcheck/templates/linkcheck/report.html @@ -171,7 +171,7 @@

{{report_type}} in '{{object.obj {% if link.url.redirect_to %} - R{% translate "Redirects to" %}: {{ link.url.redirect_to }} + {% translate "Redirects to" %}: {{ link.url.redirect_to }} {% endif %} {% endfor %} From 3dcce7d64d9d24bd7affe856455b2755720ad901 Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Thu, 31 Jul 2025 09:10:19 +0100 Subject: [PATCH 08/12] Added a note into the README regarding LINKCHECK_SKIP_TRANSLATIONS. --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 4b9163f..7e686ec 100644 --- a/README.rst +++ b/README.rst @@ -33,7 +33,9 @@ Basic usage ----------- #. Install app to somewhere on your Python path (e.g. ``pip install - django-linkcheck``). + django-linkcheck``). If you do not need multilingual support, you can skip + the compilation of the translation files with an environment variable, e.g. + (``LINKCHECK_SKIP_TRANSLATIONS=true pip install django-linkcheck``). #. Add ``'linkcheck'`` to your ``settings.INSTALLED_APPS``. From c1f78a32803f977ad50ad14d4b76d59c4f2b808d Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Tue, 5 Aug 2025 16:45:18 +0100 Subject: [PATCH 09/12] Added a check for instance._state.adding to instance_pre_save as checking not instance.pk is insufficient. It can have a PK but not be saved in the DB yet, which will cause this to fail. 
--- linkcheck/listeners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkcheck/listeners.py b/linkcheck/listeners.py index f82784f..0642aeb 100644 --- a/linkcheck/listeners.py +++ b/linkcheck/listeners.py @@ -132,7 +132,7 @@ def delete_instance_links(sender, instance, **kwargs): def instance_pre_save(sender, instance, raw=False, **kwargs): - if not instance.pk or raw: + if instance._state.adding or not instance.pk or raw: # Ignore unsaved instances or raw imports return current_url = instance.get_absolute_url() From fb2378f65be64469d2865a9fc1f28fc9290ca4fa Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 12:43:34 -0700 Subject: [PATCH 10/12] tests: add some basic coverage of the `check_links` function --- linkcheck/tests/test_linkcheck.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index c67f46f..fb349fc 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,5 +1,6 @@ import os from datetime import datetime, timedelta +from http import HTTPStatus from io import StringIO from unittest.mock import Mock, patch @@ -13,6 +14,7 @@ from django.test import LiveServerTestCase, TestCase from django.test.utils import override_settings from django.urls import reverse +from django.utils import timezone from requests.exceptions import ConnectionError from linkcheck.linkcheck_settings import MAX_URL_LENGTH @@ -25,6 +27,7 @@ unregister_listeners, ) from linkcheck.models import Link, Url +from linkcheck.utils import check_links from linkcheck.views import get_jquery_min_js from .sampleapp.models import Author, Book, Journal, Page @@ -1228,6 +1231,33 @@ def test_filter_callable(self): ) +class TestCheckLinks(TestCase): + + @requests_mock.Mocker() + def test_check_links(self, mocker): + good_url = 'https://example.com/good' + mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, 
reason='OK') + Url.objects.create(url=good_url) + + bad_url = 'https://example.com/bad' + mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND') + Url.objects.create(url=bad_url) + + exception_url = 'https://example.com/exception' + mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong")) + Url.objects.create(url=exception_url) + + recently_checked_url = 'https://example.com/recent' + # Shouldn't be requested + Url.objects.create(url=recently_checked_url, last_checked=timezone.now() - timedelta(days=1)) + + self.assertEqual(check_links(), 3) + self.assertEqual(Url.objects.get(url=good_url).status, True) + self.assertEqual(Url.objects.get(url=bad_url).status, False) + self.assertEqual(Url.objects.get(url=exception_url).status, False) + self.assertEqual(Url.objects.get(url=recently_checked_url).status, None) + + def get_command_output(command, *args, **kwargs): """ Helper function for running a management command and checking its output From 7377a6913361d3ef51169705ca8e4a719790a185 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 14:30:30 -0700 Subject: [PATCH 11/12] tests: fix naive datetime warning Was raising a `RuntimeWarning: DateTimeField Url.last_checked received a naive datetime ... 
while time zone support is active.` --- linkcheck/tests/test_linkcheck.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index fb349fc..52d6577 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,5 +1,5 @@ import os -from datetime import datetime, timedelta +from datetime import timedelta from http import HTTPStatus from io import StringIO from unittest.mock import Mock, patch @@ -898,7 +898,7 @@ def test_checklinks_command(self): "1 internal URLs and 0 external URLs have been checked.\n" ) - yesterday = datetime.now() - timedelta(days=1) + yesterday = timezone.now() - timedelta(days=1) Url.objects.all().update(last_checked=yesterday) out = StringIO() call_command('checklinks', externalinterval=20, stdout=out) From d17e2921fdb2d76e8be86337626bea07a4814b48 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 14:51:52 -0700 Subject: [PATCH 12/12] check_links: add concurrent version of the `check_links` helper function Since we're using sqlite as the test db backend, we can't effectively test the `Url` objects being saved in the `ThreadPoolExecutor` futures, but there's enough test coverage that under the hood `Url.check_url` is doing the right thing --- linkcheck/tests/test_linkcheck.py | 37 +++++++++++++++-- linkcheck/utils.py | 66 +++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 52d6577..4d2f95e 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -27,7 +27,7 @@ unregister_listeners, ) from linkcheck.models import Link, Url -from linkcheck.utils import check_links +from linkcheck.utils import check_links, concurrent_check_links from linkcheck.views import get_jquery_min_js from .sampleapp.models import Author, Book, Journal, Page @@ -1233,8 +1233,10 @@ def 
test_filter_callable(self): class TestCheckLinks(TestCase): - @requests_mock.Mocker() - def test_check_links(self, mocker): + def _setup_mock_urls(self, mocker): + """ + Set up common mock URLs for link checking tests. + """ good_url = 'https://example.com/good' mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK') Url.objects.create(url=good_url) @@ -1249,7 +1251,13 @@ def test_check_links(self, mocker): recently_checked_url = 'https://example.com/recent' # Shouldn't be requested - Url.objects.create(url=recently_checked_url, last_checked=timezone.now() - timedelta(days=1)) + Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1)) + + return (good_url, bad_url, exception_url, recently_checked_url) + + @requests_mock.Mocker() + def test_check_links(self, mocker): + good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker) self.assertEqual(check_links(), 3) self.assertEqual(Url.objects.get(url=good_url).status, True) @@ -1257,6 +1265,27 @@ def test_check_links(self, mocker): self.assertEqual(Url.objects.get(url=exception_url).status, False) self.assertEqual(Url.objects.get(url=recently_checked_url).status, None) + @requests_mock.Mocker() + def test_concurrent_check_links(self, mocker): + self._setup_mock_urls(mocker) + + # Since the tests are running in sqlite, we can't insert data via our threaded code + # there's enough other test coverage that we can use `Url.save` as a proxy + with patch.object(Url, "save") as patched_save: + self.assertEqual(concurrent_check_links(), 3) + self.assertEqual(patched_save.call_count, 3) + + def test_concurrent_check_links_error_handling(self): + Url.objects.create(url='https://example.com/good') + with ( + patch("linkcheck.utils.logger.exception") as patched_logged_exception, + patch.object(Url, "check_external", side_effect=ValueError("oops")), + ): + self.assertEqual(concurrent_check_links(), 0) + 
self.assertEqual(patched_logged_exception.call_count, 1) + msg, *args = patched_logged_exception.call_args[0] + self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops") + def get_command_output(command, *args, **kwargs): """ diff --git a/linkcheck/utils.py b/linkcheck/utils.py index b40dbb5..ba00e90 100644 --- a/linkcheck/utils.py +++ b/linkcheck/utils.py @@ -1,4 +1,6 @@ import logging +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import timedelta from django.apps import apps @@ -120,6 +122,70 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True, return check_count +def concurrent_check_links( + external_recheck_interval=10080, + limit=-1, + check_internal=True, + check_external=True, + max_workers=20, +): + """ + Return the number of links effectively checked. + A concurrent version of `check_links` + + Args: + external_recheck_interval: Minutes before rechecking external links + limit: Maximum number of URLs to check (-1 for unlimited) + check_internal: Whether to check internal links + check_external: Whether to check external links + max_workers: Maximum number of concurrent threads + """ + + urls = Url.objects.all() + + # An optimization for when check_internal is False + if not check_internal: + recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval) + urls = urls.exclude(last_checked__gt=recheck_datetime) + + url_list = list(urls[:limit] if limit > 0 else urls) + + if not url_list: + return 0 + + # Thread-safe counter + check_count = 0 + count_lock = threading.Lock() + + def check_single_url(url_obj): + """Check a single URL and return 1 if checked, 0 if not""" + try: + status = url_obj.check_url(check_internal=check_internal, check_external=check_external) + return 1 if status is not None else 0 + except Exception as e: + logger.exception( + "%s while checking %s: %s", + type(e).__name__, + url_obj.url, + e + ) + 
return 0 + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_url = { + executor.submit(check_single_url, url): url + for url in url_list + } + # Process completed futures + for future in as_completed(future_to_url): + result = future.result() + with count_lock: + check_count += result + + return check_count + + def update_urls(urls, content_type, object_id): # Structure of urls param is [(field, link text, url), ... ]