diff --git a/CHANGES.rst b/CHANGES.rst index 845302312b2..12c7236c498 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -71,6 +71,11 @@ Features added * #14023: Add the new :confval:`mathjax_config_path` option to load MathJax configuration from a file. Patch by Randolf Scholz and Adam Turner. +* #14046: linkcheck: Add the :confval:`linkcheck_case_insensitive_urls` option + to allow case-insensitive URL comparison for specific URL patterns. + This is useful for links to websites that normalise URL casing (e.g. GitHub) + or case-insensitive servers. + Patch by Fazeel Usmani and James Addison. Bugs fixed ---------- diff --git a/doc/usage/configuration.rst b/doc/usage/configuration.rst index ff903fa4f6c..e9f4d37c1de 100644 --- a/doc/usage/configuration.rst +++ b/doc/usage/configuration.rst @@ -3813,6 +3813,42 @@ and the number of workers to use. .. versionadded:: 7.3 +.. confval:: linkcheck_case_insensitive_urls + :type: :code-py:`Set[str] | Sequence[str]` + :default: :code-py:`()` + + A collection of regular expressions that match URLs for which the *linkcheck* + builder should perform case-insensitive comparisons. This is useful for + links to websites that are case-insensitive or normalise URL casing. + + By default, *linkcheck* requires the destination URL to match the + documented URL case-sensitively. + For example, a link to ``http://example.org/PATH`` that redirects to + ``http://example.org/path`` will be reported as ``redirected``. + + If the URL matches a pattern contained in + :confval:`!linkcheck_case_insensitive_urls`, + it would instead be reported as ``working``. + + For example, to treat all GitHub URLs as case-insensitive: + + .. code-block:: python + + linkcheck_case_insensitive_urls = [ + r'https://github\.com/.*', + ] + + Or, to treat all URLs as case-insensitive: + + .. code-block:: python + + linkcheck_case_insensitive_urls = ['.*'] + + .. note:: URI fragments (HTML anchors) are not affected by this option. + They are always checked with case-sensitive comparisons. + + .. versionadded:: 8.3 + .. confval:: linkcheck_rate_limit_timeout :type: :code-py:`int` :default: :code-py:`300` diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index d3ce638fea4..91e8c753943 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -34,7 +34,7 @@ from sphinx.util.nodes import get_node_line if TYPE_CHECKING: - from collections.abc import Callable, Iterator + from collections.abc import Callable, Iterator, Sequence from typing import Any, Literal, TypeAlias from requests import Response @@ -385,6 +385,9 @@ def __init__( self.documents_exclude: list[re.Pattern[str]] = list( map(re.compile, config.linkcheck_exclude_documents) ) + self.ignore_case: Sequence[re.Pattern[str]] = tuple( + map(re.compile, config.linkcheck_case_insensitive_urls) + ) self.auth = [ (re.compile(pattern), auth_info) for pattern, auth_info in config.linkcheck_auth @@ -629,8 +632,15 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: netloc = urlsplit(req_url).netloc self.rate_limits.pop(netloc, None) + # Check if URL should be normalised case-insensitively + ignore_case = any(pat.match(req_url) for pat in self.ignore_case) + normalised_req_url = self._normalise_url(req_url, ignore_case=ignore_case) + normalised_response_url = self._normalise_url( + response_url, ignore_case=ignore_case + ) + if ( - (response_url.rstrip('/') == req_url.rstrip('/')) + normalised_response_url == normalised_req_url or _allowed_redirect(req_url, response_url, self.allowed_redirects) ): # fmt: skip return _Status.WORKING, '', 0 @@ -676,6 +686,17 @@ def limit_rate(self, response_url: str, retry_after: str | None) -> float | None self.rate_limits[netloc] = RateLimit(delay, next_check) return next_check + @staticmethod + def _normalise_url(url: str, *, ignore_case: bool) -> str: + normalised_url = url.rstrip('/') + if not ignore_case: + return normalised_url + # URI fragments are case-sensitive + url_part, sep, fragment = normalised_url.partition('#') + if sep: + return f'{url_part.casefold()}#{fragment}' + return url_part.casefold() + def _get_request_headers( uri: str, @@ -816,6 +837,12 @@ def setup(app: Sphinx) -> ExtensionMetadata: app.add_config_value( 'linkcheck_report_timeouts_as_broken', False, '', types=frozenset({bool}) ) + app.add_config_value( + 'linkcheck_case_insensitive_urls', + (), + '', + types=frozenset({frozenset, list, set, tuple}), + ) app.add_event('linkcheck-process-uri') diff --git a/tests/roots/test-linkcheck-case-check/conf.py b/tests/roots/test-linkcheck-case-check/conf.py new file mode 100644 index 00000000000..71319b6d4a5 --- /dev/null +++ b/tests/roots/test-linkcheck-case-check/conf.py @@ -0,0 +1 @@ +# Empty config for linkcheck case sensitivity tests diff --git a/tests/roots/test-linkcheck-case-check/index.rst b/tests/roots/test-linkcheck-case-check/index.rst new file mode 100644 index 00000000000..1747d27ebcd --- /dev/null +++ b/tests/roots/test-linkcheck-case-check/index.rst @@ -0,0 +1,5 @@ +`path1 `_ + +`path2 `_ + +`PATH3 `_ diff --git a/tests/test_builders/test_build_linkcheck.py b/tests/test_builders/test_build_linkcheck.py index a09a4a42216..3755f5aa84a 100644 --- a/tests/test_builders/test_build_linkcheck.py +++ b/tests/test_builders/test_build_linkcheck.py @@ -1439,3 +1439,70 @@ def test_linkcheck_exclude_documents(app: SphinxTestApp) -> None: 'uri': 'https://www.sphinx-doc.org/this-is-another-broken-link', 'info': 'br0ken_link matched br[0-9]ken_link from linkcheck_exclude_documents', } in content + + +class CapitalisePathHandler(BaseHTTPRequestHandler): + """Test server that uppercases URL paths via redirects.""" + + protocol_version = 'HTTP/1.1' + + def do_GET(self): + if self.path.islower(): + # Redirect lowercase paths to uppercase versions + self.send_response(301, 'Moved Permanently') + self.send_header('Location', self.path.upper()) + self.send_header('Content-Length', '0') + self.end_headers() + else: + # Serve uppercase paths + content = b'ok\n\n' + self.send_response(200, 'OK') + self.send_header('Content-Length', str(len(content))) + self.end_headers() + self.wfile.write(content) + + +@pytest.mark.sphinx( + 'linkcheck', + testroot='linkcheck-case-check', + freshenv=True, +) +@pytest.mark.parametrize( + ('case_insensitive_pattern', 'expected_path1', 'expected_path2', 'expected_path3'), + [ + ([], 'redirected', 'redirected', 'working'), # default: case-sensitive + ( + [r'http://localhost:\d+/.*'], + 'working', + 'working', + 'working', + ), # all URLs case-insensitive + ( + [r'http://localhost:\d+/path1'], + 'working', + 'redirected', + 'working', + ), # only path1 case-insensitive + ], +) +def test_linkcheck_case_sensitivity( + app: SphinxTestApp, + case_insensitive_pattern: list[str], + expected_path1: str, + expected_path2: str, + expected_path3: str, +) -> None: + """Test case-sensitive and case-insensitive URL checking.""" + app.config.linkcheck_case_insensitive_urls = case_insensitive_pattern + + with serve_application(app, CapitalisePathHandler) as address: + app.build() + + content = (app.outdir / 'output.json').read_text(encoding='utf8') + rows = [json.loads(x) for x in content.splitlines()] + rowsby = {row['uri']: row for row in rows} + + # Verify expected status for each path + assert rowsby[f'http://{address}/path1']['status'] == expected_path1 + assert rowsby[f'http://{address}/path2']['status'] == expected_path2 + assert rowsby[f'http://{address}/PATH3']['status'] == expected_path3