From 10b87324466cd6197f87944588ace1909e5edbd4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 13:03:53 -0400 Subject: [PATCH 01/20] Add more redirects for sphinx->mkdocs migration --- mkdocs.yml | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index c9edf338af..c66a58940d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -159,6 +159,62 @@ plugins: 'roadmap.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' 'installation.md': 'user-guide/installation.md' 'release.md': 'release-notes.md' + # Legacy Sphinx URL redirects (mkdocs migration) + 'quickstart.html': 'quick-start/index.html' + 'about.html': 'index.html' + 'developers/contributing.html': 'contributing/index.html' + 'developers/index.html': 'contributing/index.html' + 'developers/roadmap.html': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' + 'genindex.html': 'index.html' + 'py-modindex.html': 'index.html' + 'search.html': 'search/index.html' + 'release-notes.html': 'release-notes/index.html' + # User guide: .html -> /index.html + 'user-guide/arrays.html': 'user-guide/arrays/index.html' + 'user-guide/attributes.html': 'user-guide/attributes/index.html' + 'user-guide/cli.html': 'user-guide/cli/index.html' + 'user-guide/config.html': 'user-guide/config/index.html' + 'user-guide/consolidated_metadata.html': 'user-guide/consolidated_metadata/index.html' + 'user-guide/data_types.html': 'user-guide/data_types/index.html' + 'user-guide/extending.html': 'user-guide/extending/index.html' + 'user-guide/experimental.html': 'user-guide/experimental/index.html' + 'user-guide/gpu.html': 'user-guide/gpu/index.html' + 'user-guide/groups.html': 'user-guide/groups/index.html' + 'user-guide/installation.html': 'user-guide/installation/index.html' + 'user-guide/performance.html': 'user-guide/performance/index.html' + 'user-guide/storage.html': 'user-guide/storage/index.html' + 'user-guide/v3_migration.html': 'user-guide/v3_migration/index.html' + # API structure changes: api/zarr/* -> api/* + 'api/zarr/index.html': 'api/index.html' + 'api/zarr/abc/index.html': 'api/abc/buffer/index.html' + 'api/zarr/abc/buffer/index.html': 'api/abc/buffer/index.html' + 'api/zarr/abc/codec/index.html': 'api/abc/codec/index.html' + 'api/zarr/abc/metadata/index.html': 'api/abc/metadata/index.html' + 'api/zarr/abc/numcodec/index.html': 'api/abc/codec/index.html' + 'api/zarr/abc/store/index.html': 'api/abc/store/index.html' + 'api/zarr/api/index.html': 'api/api_async/index.html' + 'api/zarr/api/asynchronous/index.html': 'api/api_async/index.html' + 'api/zarr/api/synchronous/index.html': 'api/api_sync/index.html' + 'api/zarr/buffer/index.html': 'api/buffer/index.html' + 'api/zarr/buffer/cpu/index.html': 'api/buffer/index.html' + 'api/zarr/buffer/gpu/index.html': 'api/buffer/index.html' + 'api/zarr/codecs/index.html': 'api/codecs/index.html' + 'api/zarr/codecs/numcodecs/index.html': 'api/codecs/index.html' + 'api/zarr/convenience/index.html': 'api/convenience/index.html' + 'api/zarr/creation/index.html': 'api/create/index.html' + 'api/zarr/dtype/index.html': 'api/dtype/index.html' + 'api/zarr/errors/index.html': 'api/errors/index.html' + 'api/zarr/metadata/index.html': 'api/group/index.html' + 'api/zarr/metadata/migrate_v3/index.html': 'user-guide/v3_migration/index.html' + 'api/zarr/registry/index.html': 'api/registry/index.html' + 'api/zarr/storage/index.html': 'api/storage/index.html' + 
'api/zarr/testing/index.html': 'api/testing/index.html' + 'api/zarr/testing/buffer/index.html': 'api/testing/index.html' + 'api/zarr/testing/conftest/index.html': 'api/testing/index.html' + 'api/zarr/testing/stateful/index.html': 'api/testing/index.html' + 'api/zarr/testing/store/index.html': 'api/testing/index.html' + 'api/zarr/testing/strategies/index.html': 'api/testing/index.html' + 'api/zarr/testing/utils/index.html': 'api/testing/index.html' # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: From 9aed1428cc2184196fe2495bf0938e790d4ee02c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 13:46:38 -0400 Subject: [PATCH 02/20] Add entry point for top-level API --- docs/api/index.md | 6 +++++ mkdocs.yml | 56 ----------------------------------------------- 2 files changed, 6 insertions(+), 56 deletions(-) diff --git a/docs/api/index.md b/docs/api/index.md index 8e6be1058e..d70e8d7099 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -2,6 +2,12 @@ Complete reference documentation for the Zarr-Python API. +::: zarr + options: + members: false + show_root_heading: true + show_root_toc_entry: true + ## Core API ### Essential Classes and Functions diff --git a/mkdocs.yml b/mkdocs.yml index c66a58940d..c9edf338af 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -159,62 +159,6 @@ plugins: 'roadmap.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' 'installation.md': 'user-guide/installation.md' 'release.md': 'release-notes.md' - # Legacy Sphinx URL redirects (mkdocs migration) - 'quickstart.html': 'quick-start/index.html' - 'about.html': 'index.html' - 'developers/contributing.html': 'contributing/index.html' - 'developers/index.html': 'contributing/index.html' - 'developers/roadmap.html': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' - 'genindex.html': 'index.html' - 'py-modindex.html': 'index.html' - 'search.html': 'search/index.html' - 'release-notes.html': 'release-notes/index.html' - # User guide: .html -> /index.html - 'user-guide/arrays.html': 'user-guide/arrays/index.html' - 'user-guide/attributes.html': 'user-guide/attributes/index.html' - 'user-guide/cli.html': 'user-guide/cli/index.html' - 'user-guide/config.html': 'user-guide/config/index.html' - 'user-guide/consolidated_metadata.html': 'user-guide/consolidated_metadata/index.html' - 'user-guide/data_types.html': 'user-guide/data_types/index.html' - 'user-guide/extending.html': 'user-guide/extending/index.html' - 'user-guide/experimental.html': 'user-guide/experimental/index.html' - 'user-guide/gpu.html': 'user-guide/gpu/index.html' - 'user-guide/groups.html': 'user-guide/groups/index.html' - 'user-guide/installation.html': 'user-guide/installation/index.html' - 'user-guide/performance.html': 'user-guide/performance/index.html' - 'user-guide/storage.html': 'user-guide/storage/index.html' - 'user-guide/v3_migration.html': 'user-guide/v3_migration/index.html' - # API structure changes: api/zarr/* -> api/* - 'api/zarr/index.html': 'api/index.html' - 'api/zarr/abc/index.html': 'api/abc/buffer/index.html' - 'api/zarr/abc/buffer/index.html': 'api/abc/buffer/index.html' - 'api/zarr/abc/codec/index.html': 'api/abc/codec/index.html' - 'api/zarr/abc/metadata/index.html': 'api/abc/metadata/index.html' - 'api/zarr/abc/numcodec/index.html': 'api/abc/codec/index.html' - 'api/zarr/abc/store/index.html': 'api/abc/store/index.html' - 'api/zarr/api/index.html': 
'api/api_async/index.html' - 'api/zarr/api/asynchronous/index.html': 'api/api_async/index.html' - 'api/zarr/api/synchronous/index.html': 'api/api_sync/index.html' - 'api/zarr/buffer/index.html': 'api/buffer/index.html' - 'api/zarr/buffer/cpu/index.html': 'api/buffer/index.html' - 'api/zarr/buffer/gpu/index.html': 'api/buffer/index.html' - 'api/zarr/codecs/index.html': 'api/codecs/index.html' - 'api/zarr/codecs/numcodecs/index.html': 'api/codecs/index.html' - 'api/zarr/convenience/index.html': 'api/convenience/index.html' - 'api/zarr/creation/index.html': 'api/create/index.html' - 'api/zarr/dtype/index.html': 'api/dtype/index.html' - 'api/zarr/errors/index.html': 'api/errors/index.html' - 'api/zarr/metadata/index.html': 'api/group/index.html' - 'api/zarr/metadata/migrate_v3/index.html': 'user-guide/v3_migration/index.html' - 'api/zarr/registry/index.html': 'api/registry/index.html' - 'api/zarr/storage/index.html': 'api/storage/index.html' - 'api/zarr/testing/index.html': 'api/testing/index.html' - 'api/zarr/testing/buffer/index.html': 'api/testing/index.html' - 'api/zarr/testing/conftest/index.html': 'api/testing/index.html' - 'api/zarr/testing/stateful/index.html': 'api/testing/index.html' - 'api/zarr/testing/store/index.html': 'api/testing/index.html' - 'api/zarr/testing/strategies/index.html': 'api/testing/index.html' - 'api/zarr/testing/utils/index.html': 'api/testing/index.html' # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: From 7605c7634bd57e7e9759c75b66f0579ae73f78dd Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 14:01:33 -0400 Subject: [PATCH 03/20] Explicit setting --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 151ecbbaa0..224b157e0a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -6,6 +6,7 @@ site_description: An implementation of chunked, compressed, N-dimensional arrays site_author: Alistair Miles site_url: !ENV [READTHEDOCS_CANONICAL_URL, 'https://zarr.readthedocs.io/'] docs_dir: docs +use_directory_urls: true nav: - "index.md" From 042a250465c2c3a060bfe8f0049532dac305d255 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:10:07 -0400 Subject: [PATCH 04/20] Add missing pages --- docs/api/index.md | 6 ------ docs/api/metadata.md | 5 +++++ docs/api/testing.md | 4 ++++ 3 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 docs/api/metadata.md diff --git a/docs/api/index.md b/docs/api/index.md index d70e8d7099..8e6be1058e 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -2,12 +2,6 @@ Complete reference documentation for the Zarr-Python API. 
-::: zarr - options: - members: false - show_root_heading: true - show_root_toc_entry: true - ## Core API ### Essential Classes and Functions diff --git a/docs/api/metadata.md b/docs/api/metadata.md new file mode 100644 index 0000000000..a7e64c3389 --- /dev/null +++ b/docs/api/metadata.md @@ -0,0 +1,5 @@ +--- +title: metadata +--- + +::: zarr.metadata diff --git a/docs/api/testing.md b/docs/api/testing.md index 1412950ee3..f34f0ad3d5 100644 --- a/docs/api/testing.md +++ b/docs/api/testing.md @@ -21,3 +21,7 @@ title: testing ## Utils ::: zarr.testing.utils + +## Conftest + +::: zarr.testing.conftest From fbeb8ea47e22972f32b38217601d9f766c14f90a Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:10:31 -0400 Subject: [PATCH 05/20] Add redirects --- mkdocs.yml | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index 224b157e0a..bbcedd0563 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,6 +43,7 @@ nav: - api/config.md - api/codecs.md - api/errors.md + - api/metadata.md - api/registry.md - api/storage.md - api/testing.md @@ -160,6 +161,68 @@ plugins: 'roadmap.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' 'installation.md': 'user-guide/installation.md' 'release.md': 'release-notes.md' + 'about.html.md': 'index.md' + 'arrays.html.md': 'user-guide/arrays.md' + 'attributes.html.md': 'user-guide/attributes.md' + 'cli.html.md': 'user-guide/cli.md' + 'config.html.md': 'user-guide/config.md' + 'consolidated_metadata.html.md': 'user-guide/consolidated_metadata.md' + 'data_types.html.md': 'user-guide/data_types.md' + 'extending.html.md': 'user-guide/extending.md' + 'gpu.html.md': 'user-guide/gpu.md' + 'groups.html.md': 'user-guide/groups.md' + 'installation.html.md': 'user-guide/installation.md' + 'performance.html.md': 'user-guide/performance.md' + 'quickstart.html.md': 'quick-start.md' + 'release-notes.html.md': 'release-notes.md' + 'storage.html.md': 'user-guide/storage.md' + 'v3_migration.html.md': 'user-guide/v3_migration.md' + 'user-guide/arrays.html.md': 'user-guide/arrays.md' + 'user-guide/attributes.html.md': 'user-guide/attributes.md' + 'user-guide/cli.html.md': 'user-guide/cli.md' + 'user-guide/config.html.md': 'user-guide/config.md' + 'user-guide/consolidated_metadata.html.md': 'user-guide/consolidated_metadata.md' + 'user-guide/data_types.html.md': 'user-guide/data_types.md' + 'user-guide/extending.html.md': 'user-guide/extending.md' + 'user-guide/gpu.html.md': 'user-guide/gpu.md' + 'user-guide/groups.html.md': 'user-guide/groups.md' + 'user-guide/installation.html.md': 'user-guide/installation.md' + 'user-guide/performance.html.md': 'user-guide/performance.md' + 'user-guide/storage.html.md': 'user-guide/storage.md' + 'user-guide/v3_migration.html.md': 'user-guide/v3_migration.md' + 'developers/contributing.html.md': 'contributing.md' + 'developers/index.html.md': 'contributing.md' + 'developers/roadmap.html.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' + 'api/zarr/index.html.md': 'api/index.md' + 'api/zarr/abc/index.html.md': 'api/abc/buffer.md' + 'api/zarr/abc/buffer/index.html.md': 'api/abc/buffer.md' + 'api/zarr/abc/codec/index.html.md': 'api/abc/codec.md' + 'api/zarr/abc/metadata/index.html.md': 'api/abc/metadata.md' + 'api/zarr/abc/numcodec/index.html.md': 'api/abc/codec.md' + 'api/zarr/abc/store/index.html.md': 'api/abc/store.md' + 'api/zarr/api/index.html.md': 'api/index.md' + 
'api/zarr/api/asynchronous/index.html.md': 'api/api_async.md' + 'api/zarr/api/synchronous/index.html.md': 'api/api_sync.md' + 'api/zarr/buffer/index.html.md': 'api/buffer.md' + 'api/zarr/buffer/cpu/index.html.md': 'api/buffer.md' + 'api/zarr/buffer/gpu/index.html.md': 'api/buffer.md' + 'api/zarr/codecs/index.html.md': 'api/codecs.md' + 'api/zarr/codecs/numcodecs/index.html.md': 'api/codecs.md' + 'api/zarr/convenience/index.html.md': 'api/convenience.md' + 'api/zarr/creation/index.html.md': 'api/deprecated/creation.md' + 'api/zarr/dtype/index.html.md': 'api/dtype.md' + 'api/zarr/errors/index.html.md': 'api/errors.md' + 'api/zarr/metadata/index.html.md': 'api/metadata.md' + 'api/zarr/metadata/migrate_v3/index.html.md': 'api/metadata.md' + 'api/zarr/registry/index.html.md': 'api/registry.md' + 'api/zarr/storage/index.html.md': 'api/storage.md' + 'api/zarr/testing/index.html.md': 'api/testing.md' + 'api/zarr/testing/buffer/index.html.md': 'api/testing.md' + 'api/zarr/testing/conftest/index.html.md': 'api/testing.md' + 'api/zarr/testing/stateful/index.html.md': 'api/testing.md' + 'api/zarr/testing/store/index.html.md': 'api/testing.md' + 'api/zarr/testing/strategies/index.html.md': 'api/testing.md' + 'api/zarr/testing/utils/index.html.md': 'api/testing.md' # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: From ae4d47374a1c6a4b759b3305b3d4f434b7e7d380 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:17:36 -0400 Subject: [PATCH 06/20] Add script for checking links --- ci/check_links.py | 200 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 ci/check_links.py diff --git a/ci/check_links.py b/ci/check_links.py new file mode 100644 index 0000000000..364e0ceac5 --- /dev/null +++ b/ci/check_links.py @@ -0,0 +1,200 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "requests", +# "beautifulsoup4", +# "urllib3", +# ] +# /// + +""" +Validate that all subpages from zarr stable docs exist in latest docs. +This script crawls the stable documentation and checks if each page +has a corresponding valid page in the latest documentation. 
+ +Generated using Claude +""" + +import time +from collections import deque +from urllib.parse import urljoin, urlparse + +import requests +from bs4 import BeautifulSoup + +STABLE_BASE = "https://zarr.readthedocs.io/en/stable" +LATEST_BASE = "https://zarr.readthedocs.io/en/latest" + +class DocumentationValidator: + def __init__(self, stable_base: str, latest_base: str) -> None: + self.stable_base = stable_base.rstrip('/') + self.latest_base = latest_base.rstrip('/') + self.session = requests.Session() + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Documentation Validator)' + }) + + def get_relative_path(self, url: str, base: str) -> str: + """Extract the relative path from a full URL.""" + if url.startswith(base): + path = url[len(base):] + # Remove fragment identifiers + if '#' in path: + path = path.split('#')[0] + return path + return "" + + def is_valid_doc_url(self, url: str, base: str) -> bool: + """Check if URL is part of the documentation.""" + if not url.startswith(('http://', 'https://')): + return False + parsed = urlparse(url) + base_parsed = urlparse(base) + # Must be same domain and start with base path + return (parsed.netloc == base_parsed.netloc and + url.startswith(base)) + + def fetch_page(self, url: str) -> tuple[int, str]: + """Fetch a page and return status code and content.""" + try: + response = self.session.get(url, timeout=10, allow_redirects=True) + return response.status_code, response.text + except requests.RequestException as e: + print(f" āœ— Error fetching {url}: {e}") + return 0, "" + + def extract_links(self, html: str, base_url: str) -> set[str]: + """Extract all documentation links from HTML.""" + soup = BeautifulSoup(html, 'html.parser') + links = set() + + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + full_url = urljoin(base_url, href) + + # Remove fragment identifiers for deduplication + if '#' in full_url: + full_url = full_url.split('#')[0] + + if self.is_valid_doc_url(full_url, self.stable_base): + links.add(full_url) + + return links + + def crawl_stable_docs(self) -> set[str]: + """Crawl all pages in stable documentation.""" + print(f"šŸ” Crawling stable documentation: {self.stable_base}") + visited = set() + to_visit = deque([self.stable_base + "/"]) + + while to_visit: + url = to_visit.popleft() + + if url in visited: + continue + + visited.add(url) + print(f" Crawling: {url}") + + status_code, html = self.fetch_page(url) + + if status_code != 200 or not html: + continue + + # Extract and queue new links + links = self.extract_links(html, url) + for link in links: + if link not in visited: + to_visit.append(link) + + # Be respectful with rate limiting + time.sleep(0.1) + + print(f"āœ“ Found {len(visited)} pages in stable docs\n") + return visited + + def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: + """Check if all stable URLs exist in latest docs.""" + print(f"šŸ” Validating pages in latest documentation: {self.latest_base}") + + results = { + 'valid': [], + 'missing': [], + 'error': [] + } + + for stable_url in sorted(stable_urls): + relative_path = self.get_relative_path(stable_url, self.stable_base) + latest_url = self.latest_base + relative_path + + print(f" Checking: {relative_path}") + status_code, _ = self.fetch_page(latest_url) + + if status_code == 200: + results['valid'].append(relative_path) + print(" āœ“ Valid (200)") + elif status_code == 404: + results['missing'].append(relative_path) + print(" āœ— Missing (404)") + else: + 
results['error'].append(f"{relative_path} (status: {status_code})") + print(f" ⚠ Error (status: {status_code})") + + time.sleep(0.1) + + return results + + def print_summary(self, results: dict[str, list[str]]) -> None: + """Print validation summary.""" + print("\n" + "="*70) + print("VALIDATION SUMMARY") + print("="*70) + + total = len(results['valid']) + len(results['missing']) + len(results['error']) + + print(f"\nāœ“ Valid pages: {len(results['valid'])}/{total}") + print(f"āœ— Missing pages: {len(results['missing'])}/{total}") + print(f"⚠ Error pages: {len(results['error'])}/{total}") + + if results['missing']: + print("\n" + "-"*70) + print("MISSING PAGES:") + print("-"*70) + for path in results['missing']: + print(f" • {path}") + + if results['error']: + print("\n" + "-"*70) + print("ERROR PAGES:") + print("-"*70) + for info in results['error']: + print(f" • {info}") + + print("\n" + "="*70) + + if not results['missing'] and not results['error']: + print("šŸŽ‰ All pages validated successfully!") + else: + print(f"āš ļø {len(results['missing']) + len(results['error'])} issues found") + print("="*70) + +def main() -> None: + validator = DocumentationValidator(STABLE_BASE, LATEST_BASE) + + # Step 1: Crawl stable documentation + stable_urls = validator.crawl_stable_docs() + + # Step 2: Validate against latest documentation + results = validator.validate_latest_docs(stable_urls) + + # Step 3: Print summary + validator.print_summary(results) + + # Exit with error code if there are missing pages + if results['missing'] or results['error']: + exit(1) + else: + exit(0) + +if __name__ == "__main__": + main() From fe028609c7df2de75c3f355d5df88ba3fa2d262b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:26:48 -0400 Subject: [PATCH 07/20] Lint --- ci/check_links.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/check_links.py b/ci/check_links.py index 364e0ceac5..0ac9fe4b24 100644 --- a/ci/check_links.py +++ b/ci/check_links.py @@ -58,10 +58,10 @@ def fetch_page(self, url: str) -> tuple[int, str]: """Fetch a page and return status code and content.""" try: response = self.session.get(url, timeout=10, allow_redirects=True) - return response.status_code, response.text except requests.RequestException as e: print(f" āœ— Error fetching {url}: {e}") return 0, "" + return response.status_code, response.text def extract_links(self, html: str, base_url: str) -> set[str]: """Extract all documentation links from HTML.""" From a39d130b4c6952dc11e85e8c0077a4560ee6dda8 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:30:33 -0400 Subject: [PATCH 08/20] Lint --- ci/check_links.py | 74 ++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/ci/check_links.py b/ci/check_links.py index 0ac9fe4b24..53c06ce157 100644 --- a/ci/check_links.py +++ b/ci/check_links.py @@ -25,34 +25,32 @@ STABLE_BASE = "https://zarr.readthedocs.io/en/stable" LATEST_BASE = "https://zarr.readthedocs.io/en/latest" + class DocumentationValidator: def __init__(self, stable_base: str, latest_base: str) -> None: - self.stable_base = stable_base.rstrip('/') - self.latest_base = latest_base.rstrip('/') + self.stable_base = stable_base.rstrip("/") + self.latest_base = latest_base.rstrip("/") self.session = requests.Session() - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Documentation Validator)' - }) + 
self.session.headers.update({"User-Agent": "Mozilla/5.0 (Documentation Validator)"}) def get_relative_path(self, url: str, base: str) -> str: """Extract the relative path from a full URL.""" if url.startswith(base): - path = url[len(base):] + path = url[len(base) :] # Remove fragment identifiers - if '#' in path: - path = path.split('#')[0] + if "#" in path: + path = path.split("#")[0] return path return "" def is_valid_doc_url(self, url: str, base: str) -> bool: """Check if URL is part of the documentation.""" - if not url.startswith(('http://', 'https://')): + if not url.startswith(("http://", "https://")): return False parsed = urlparse(url) base_parsed = urlparse(base) # Must be same domain and start with base path - return (parsed.netloc == base_parsed.netloc and - url.startswith(base)) + return parsed.netloc == base_parsed.netloc and url.startswith(base) def fetch_page(self, url: str) -> tuple[int, str]: """Fetch a page and return status code and content.""" @@ -65,16 +63,16 @@ def fetch_page(self, url: str) -> tuple[int, str]: def extract_links(self, html: str, base_url: str) -> set[str]: """Extract all documentation links from HTML.""" - soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, "html.parser") links = set() - for a_tag in soup.find_all('a', href=True): - href = a_tag['href'] + for a_tag in soup.find_all("a", href=True): + href = a_tag["href"] full_url = urljoin(base_url, href) # Remove fragment identifiers for deduplication - if '#' in full_url: - full_url = full_url.split('#')[0] + if "#" in full_url: + full_url = full_url.split("#")[0] if self.is_valid_doc_url(full_url, self.stable_base): links.add(full_url) @@ -117,11 +115,7 @@ def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: """Check if all stable URLs exist in latest docs.""" print(f"šŸ” Validating pages in latest documentation: {self.latest_base}") - results = { - 'valid': [], - 'missing': [], - 'error': [] - } + results = {"valid": [], "missing": [], "error": []} for stable_url in sorted(stable_urls): relative_path = self.get_relative_path(stable_url, self.stable_base) @@ -131,13 +125,13 @@ def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: status_code, _ = self.fetch_page(latest_url) if status_code == 200: - results['valid'].append(relative_path) + results["valid"].append(relative_path) print(" āœ“ Valid (200)") elif status_code == 404: - results['missing'].append(relative_path) + results["missing"].append(relative_path) print(" āœ— Missing (404)") else: - results['error'].append(f"{relative_path} (status: {status_code})") + results["error"].append(f"{relative_path} (status: {status_code})") print(f" ⚠ Error (status: {status_code})") time.sleep(0.1) @@ -146,37 +140,38 @@ def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: def print_summary(self, results: dict[str, list[str]]) -> None: """Print validation summary.""" - print("\n" + "="*70) + print("\n" + "=" * 70) print("VALIDATION SUMMARY") - print("="*70) + print("=" * 70) - total = len(results['valid']) + len(results['missing']) + len(results['error']) + total = len(results["valid"]) + len(results["missing"]) + len(results["error"]) print(f"\nāœ“ Valid pages: {len(results['valid'])}/{total}") print(f"āœ— Missing pages: {len(results['missing'])}/{total}") print(f"⚠ Error pages: {len(results['error'])}/{total}") - if results['missing']: - print("\n" + "-"*70) + if results["missing"]: + print("\n" + "-" * 70) print("MISSING PAGES:") - print("-"*70) - for path in 
results['missing']: + print("-" * 70) + for path in results["missing"]: print(f" • {path}") - if results['error']: - print("\n" + "-"*70) + if results["error"]: + print("\n" + "-" * 70) print("ERROR PAGES:") - print("-"*70) - for info in results['error']: + print("-" * 70) + for info in results["error"]: print(f" • {info}") - print("\n" + "="*70) + print("\n" + "=" * 70) - if not results['missing'] and not results['error']: + if not results["missing"] and not results["error"]: print("šŸŽ‰ All pages validated successfully!") else: print(f"āš ļø {len(results['missing']) + len(results['error'])} issues found") - print("="*70) + print("=" * 70) + def main() -> None: validator = DocumentationValidator(STABLE_BASE, LATEST_BASE) @@ -191,10 +186,11 @@ def main() -> None: validator.print_summary(results) # Exit with error code if there are missing pages - if results['missing'] or results['error']: + if results["missing"] or results["error"]: exit(1) else: exit(0) + if __name__ == "__main__": main() From 47c0b818881fe0d3d7339234eae387b2dfc39054 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 24 Oct 2025 13:09:53 -0400 Subject: [PATCH 09/20] Add another script checking object inventory --- ci/check_links.py | 2 +- ci/check_object_inventory.py | 342 +++++++++++++++++++++++++++++++++++ mkdocs.yml | 4 + 3 files changed, 347 insertions(+), 1 deletion(-) create mode 100644 ci/check_object_inventory.py diff --git a/ci/check_links.py b/ci/check_links.py index 53c06ce157..729f881b79 100644 --- a/ci/check_links.py +++ b/ci/check_links.py @@ -12,7 +12,7 @@ This script crawls the stable documentation and checks if each page has a corresponding valid page in the latest documentation. -Generated using Claude +Generated by Claude. """ import time diff --git a/ci/check_object_inventory.py b/ci/check_object_inventory.py new file mode 100644 index 0000000000..7cfa3629ab --- /dev/null +++ b/ci/check_object_inventory.py @@ -0,0 +1,342 @@ +# /// script +# dependencies = [ +# "requests", +# ] +# /// +""" +Validate that all URLs from old docs (stable) resolve correctly in new docs (latest). + +This script: +1. Downloads and parses objects.inv from stable docs +2. Parses objects.inv from newly built local docs +3. Scans the built site directory for all HTML files +4. Loads redirect mappings from mkdocs.yml +5. Validates that all old URLs are accessible in the new build +6. Generates a comprehensive report + +Generated by Claude. +""" + +import re +import zlib +from pathlib import Path +from typing import Dict, Set, List, Tuple +from collections import defaultdict +from io import BytesIO + +import requests + +# Configuration +STABLE_OBJECTS_INV_URL = "https://zarr.readthedocs.io/en/stable/objects.inv" +LOCAL_OBJECTS_INV = Path("site/objects.inv") +SITE_DIR = Path("site") +MKDOCS_CONFIG = Path("mkdocs.yml") + + +def parse_objects_inv(file_obj) -> Dict[str, str]: + """ + Parse a Sphinx objects.inv file and extract object name -> URL mappings. 
+ + Args: + file_obj: File-like object containing objects.inv data + + Returns: + Dictionary mapping object names to relative URLs + """ + # Read header (4 lines) + project_line = file_obj.readline() + version_line = file_obj.readline() + compression_line = file_obj.readline() + empty_line = file_obj.readline() + + # Decompress the rest + compressed = file_obj.read() + decompressed = zlib.decompress(compressed) + + # Parse entries + inventory = {} + lines = decompressed.decode('utf-8').split('\n') + + for line in lines: + line = line.strip() + if not line: + continue + + # Format: name domain:role priority uri dispname + # Example: Array py:class 1 api/array.html#zarr.Array - + parts = line.split(None, 4) + if len(parts) >= 4: + name = parts[0] + uri = parts[3] + + # Remove anchor if present (we care about the page, not the anchor) + if '#' in uri: + uri = uri.split('#')[0] + + # Handle special placeholder + if uri == '-': + continue + + inventory[name] = uri + + return inventory + + +def download_stable_inventory() -> Dict[str, str]: + """Download and parse the stable docs objects.inv.""" + print(f"Downloading stable inventory from {STABLE_OBJECTS_INV_URL}...") + try: + # Use requests with User-Agent header + response = requests.get( + STABLE_OBJECTS_INV_URL, + headers={'User-Agent': 'Mozilla/5.0 (compatible; zarr-docs-validator/1.0)'}, + timeout=30 + ) + response.raise_for_status() + + # Convert bytes to file-like object for parsing + file_obj = BytesIO(response.content) + return parse_objects_inv(file_obj) + except Exception as e: + print(f"Error downloading stable inventory: {e}") + return {} + + +def load_local_inventory() -> Dict[str, str]: + """Parse the locally built objects.inv.""" + print(f"Loading local inventory from {LOCAL_OBJECTS_INV}...") + if not LOCAL_OBJECTS_INV.exists(): + print(f"ERROR: Local inventory not found at {LOCAL_OBJECTS_INV}") + return {} + + with open(LOCAL_OBJECTS_INV, 'rb') as f: + return parse_objects_inv(f) + + +def scan_site_files() -> Set[str]: + """Scan the site directory for all HTML files and return relative paths.""" + print(f"Scanning {SITE_DIR} for HTML files...") + html_files = set() + + if not SITE_DIR.exists(): + print(f"ERROR: Site directory not found at {SITE_DIR}") + return html_files + + for html_file in SITE_DIR.rglob("*.html"): + # Get relative path from site directory + rel_path = html_file.relative_to(SITE_DIR) + html_files.add(str(rel_path)) + + print(f"Found {len(html_files)} HTML files") + return html_files + + +def load_redirect_maps() -> Dict[str, str]: + """Load redirect mappings from mkdocs.yml.""" + print(f"Loading redirects from {MKDOCS_CONFIG}...") + redirects = {} + + if not MKDOCS_CONFIG.exists(): + print(f"WARNING: mkdocs.yml not found at {MKDOCS_CONFIG}") + return redirects + + content = MKDOCS_CONFIG.read_text() + + # Find the redirect_maps section + in_redirect_section = False + for line in content.split('\n'): + if 'redirect_maps:' in line: + in_redirect_section = True + continue + + if in_redirect_section: + # Check if we've left the redirect section + if line and not line.startswith(' ') and not line.startswith('\t'): + break + + # Parse redirect mapping: 'old.md': 'new.md' or 'old.md': 'https://...' 
+ match = re.search(r"'([^']+)':\s*'([^']+)'", line) + if match: + old_path = match.group(1) + new_path = match.group(2) + + # Convert .md to .html for URL comparison + if old_path.endswith('.md'): + old_path = old_path[:-3] + '.html' + + redirects[old_path] = new_path + + print(f"Found {len(redirects)} redirect mappings") + return redirects + + +def validate_urls( + stable_inv: Dict[str, str], + local_inv: Dict[str, str], + site_files: Set[str], + redirects: Dict[str, str] +) -> Tuple[List[dict], Dict[str, int]]: + """ + Validate that all stable URLs are accessible in the new build. + + Returns: + Tuple of (results list, statistics dict) + """ + print("\nValidating URLs...") + results = [] + stats = { + 'total': 0, + 'matched': 0, + 'redirected': 0, + 'missing': 0, + 'external': 0 + } + + # Get all unique URLs from stable inventory + stable_urls = set(stable_inv.values()) + + for url in sorted(stable_urls): + stats['total'] += 1 + result = { + 'url': url, + 'status': 'UNKNOWN', + 'new_url': None, + 'note': '' + } + + # Check if it's an external URL + if url.startswith('http://') or url.startswith('https://'): + result['status'] = 'EXTERNAL' + result['note'] = 'External URL, skipped' + stats['external'] += 1 + results.append(result) + continue + + # Check direct match in site files + if url in site_files: + result['status'] = 'OK' + result['new_url'] = url + stats['matched'] += 1 + results.append(result) + continue + + # Check if it's in redirects + if url in redirects: + redirect_target = redirects[url] + result['status'] = 'REDIRECT' + result['new_url'] = redirect_target + result['note'] = f'Redirects to {redirect_target}' + stats['redirected'] += 1 + results.append(result) + continue + + + # Not found + result['status'] = 'MISSING' + result['note'] = 'URL not found in new build' + stats['missing'] += 1 + results.append(result) + + return results, stats + + +def print_report(results: List[dict], stats: Dict[str, int]): + """Print a comprehensive validation report.""" + print("\n" + "=" * 80) + print("VALIDATION REPORT") + print("=" * 80) + + # Summary statistics + print("\nšŸ“Š SUMMARY STATISTICS") + print("-" * 80) + print(f"Total URLs checked: {stats['total']}") + print(f" āœ… Matched (OK): {stats['matched']}") + print(f" šŸ”€ Redirected: {stats['redirected']}") + print(f" āŒ Missing: {stats['missing']}") + print(f" 🌐 External (skipped): {stats['external']}") + + # Calculate coverage + accessible = stats['matched'] + stats['redirected'] + internal_total = stats['total'] - stats['external'] + if internal_total > 0: + coverage = (accessible / internal_total) * 100 + print(f"\nšŸ“ˆ Coverage: {coverage:.1f}% ({accessible}/{internal_total})") + + # Show missing URLs + missing = [r for r in results if r['status'] == 'MISSING'] + if missing: + print(f"\nāŒ MISSING URLS ({len(missing)})") + print("-" * 80) + for r in missing: + print(f" • {r['url']}") + + # Show redirects + redirected = [r for r in results if r['status'] == 'REDIRECT'] + if redirected: + print(f"\nšŸ”€ REDIRECTED URLS ({len(redirected)})") + print("-" * 80) + for r in redirected[:10]: # Show first 10 + print(f" • {r['url']} → {r['new_url']}") + if len(redirected) > 10: + print(f" ... 
and {len(redirected) - 10} more") + + # Show format changes + format_changes = [r for r in results if r['status'] == 'OK' and 'format changed' in r['note']] + if format_changes: + print(f"\nšŸ”„ URL FORMAT CHANGES ({len(format_changes)})") + print("-" * 80) + for r in format_changes[:10]: # Show first 10 + print(f" • {r['url']} → {r['new_url']}") + if len(format_changes) > 10: + print(f" ... and {len(format_changes) - 10} more") + + print("\n" + "=" * 80) + + # Final verdict + if stats['missing'] == 0: + print("āœ… SUCCESS: All URLs from stable docs are accessible in new build!") + else: + print(f"āš ļø WARNING: {stats['missing']} URLs are not accessible in new build") + + print("=" * 80 + "\n") + + +def main(): + """Main execution function.""" + print("Documentation URL Validation Tool") + print("=" * 80 + "\n") + + # Step 1: Download stable inventory + stable_inv = download_stable_inventory() + if not stable_inv: + print("ERROR: Could not load stable inventory. Aborting.") + return 1 + print(f" Loaded {len(stable_inv)} objects from stable docs") + + # Step 2: Load local inventory + local_inv = load_local_inventory() + if not local_inv: + print("WARNING: Could not load local inventory") + else: + print(f" Loaded {len(local_inv)} objects from local docs") + + # Step 3: Scan site files + site_files = scan_site_files() + if not site_files: + print("WARNING: No site files found") + + # Step 4: Load redirects + redirects = load_redirect_maps() + + # Step 5: Validate URLs + results, stats = validate_urls(stable_inv, local_inv, site_files, redirects) + + # Step 6: Print report + print_report(results, stats) + + # Return exit code based on results + return 0 if stats['missing'] == 0 else 1 + + +if __name__ == '__main__': + exit(main()) diff --git a/mkdocs.yml b/mkdocs.yml index bbcedd0563..2bfabc45d0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -156,6 +156,9 @@ plugins: 'spec/v2.md': 'https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html' 'spec/v3.md': 'https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html' 'license.md': 'https://github.com/zarr-developers/zarr-python/blob/main/LICENSE.txt' + 'genindex.html.md': 'index.md' + 'py-modindex.html.md': 'index.md' + 'search.html.md': 'index.md' 'tutorial.md': 'user-guide/installation.md' 'getting-started.md': 'quick-start.md' 'roadmap.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' @@ -223,6 +226,7 @@ plugins: 'api/zarr/testing/store/index.html.md': 'api/testing.md' 'api/zarr/testing/strategies/index.html.md': 'api/testing.md' 'api/zarr/testing/utils/index.html.md': 'api/testing.md' + # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: From d2d67c304eb2ce09e94e1ae5b159203dca021586 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 24 Oct 2025 13:11:23 -0400 Subject: [PATCH 10/20] Lint --- ci/check_object_inventory.py | 112 +++++++++++++++-------------------- mkdocs.yml | 2 +- 2 files changed, 50 insertions(+), 64 deletions(-) diff --git a/ci/check_object_inventory.py b/ci/check_object_inventory.py index 7cfa3629ab..a5a04000ed 100644 --- a/ci/check_object_inventory.py +++ b/ci/check_object_inventory.py @@ -19,10 +19,8 @@ import re import zlib -from pathlib import Path -from typing import Dict, Set, List, Tuple -from collections import defaultdict from io import BytesIO +from pathlib import Path import requests @@ -33,7 +31,7 @@ MKDOCS_CONFIG = Path("mkdocs.yml") -def 
parse_objects_inv(file_obj) -> Dict[str, str]: +def parse_objects_inv(file_obj) -> dict[str, str]: """ Parse a Sphinx objects.inv file and extract object name -> URL mappings. @@ -55,7 +53,7 @@ def parse_objects_inv(file_obj) -> Dict[str, str]: # Parse entries inventory = {} - lines = decompressed.decode('utf-8').split('\n') + lines = decompressed.decode("utf-8").split("\n") for line in lines: line = line.strip() @@ -70,11 +68,11 @@ def parse_objects_inv(file_obj) -> Dict[str, str]: uri = parts[3] # Remove anchor if present (we care about the page, not the anchor) - if '#' in uri: - uri = uri.split('#')[0] + if "#" in uri: + uri = uri.split("#")[0] # Handle special placeholder - if uri == '-': + if uri == "-": continue inventory[name] = uri @@ -82,15 +80,15 @@ def parse_objects_inv(file_obj) -> Dict[str, str]: return inventory -def download_stable_inventory() -> Dict[str, str]: +def download_stable_inventory() -> dict[str, str]: """Download and parse the stable docs objects.inv.""" print(f"Downloading stable inventory from {STABLE_OBJECTS_INV_URL}...") try: # Use requests with User-Agent header response = requests.get( STABLE_OBJECTS_INV_URL, - headers={'User-Agent': 'Mozilla/5.0 (compatible; zarr-docs-validator/1.0)'}, - timeout=30 + headers={"User-Agent": "Mozilla/5.0 (compatible; zarr-docs-validator/1.0)"}, + timeout=30, ) response.raise_for_status() @@ -102,18 +100,18 @@ def download_stable_inventory() -> Dict[str, str]: return {} -def load_local_inventory() -> Dict[str, str]: +def load_local_inventory() -> dict[str, str]: """Parse the locally built objects.inv.""" print(f"Loading local inventory from {LOCAL_OBJECTS_INV}...") if not LOCAL_OBJECTS_INV.exists(): print(f"ERROR: Local inventory not found at {LOCAL_OBJECTS_INV}") return {} - with open(LOCAL_OBJECTS_INV, 'rb') as f: + with open(LOCAL_OBJECTS_INV, "rb") as f: return parse_objects_inv(f) -def scan_site_files() -> Set[str]: +def scan_site_files() -> set[str]: """Scan the site directory for all HTML files and return relative paths.""" print(f"Scanning {SITE_DIR} for HTML files...") html_files = set() @@ -131,7 +129,7 @@ def scan_site_files() -> Set[str]: return html_files -def load_redirect_maps() -> Dict[str, str]: +def load_redirect_maps() -> dict[str, str]: """Load redirect mappings from mkdocs.yml.""" print(f"Loading redirects from {MKDOCS_CONFIG}...") redirects = {} @@ -144,14 +142,14 @@ def load_redirect_maps() -> Dict[str, str]: # Find the redirect_maps section in_redirect_section = False - for line in content.split('\n'): - if 'redirect_maps:' in line: + for line in content.split("\n"): + if "redirect_maps:" in line: in_redirect_section = True continue if in_redirect_section: # Check if we've left the redirect section - if line and not line.startswith(' ') and not line.startswith('\t'): + if line and not line.startswith(" ") and not line.startswith("\t"): break # Parse redirect mapping: 'old.md': 'new.md' or 'old.md': 'https://...' 
@@ -161,8 +159,8 @@ def load_redirect_maps() -> Dict[str, str]: new_path = match.group(2) # Convert .md to .html for URL comparison - if old_path.endswith('.md'): - old_path = old_path[:-3] + '.html' + if old_path.endswith(".md"): + old_path = old_path[:-3] + ".html" redirects[old_path] = new_path @@ -171,11 +169,11 @@ def load_redirect_maps() -> Dict[str, str]: def validate_urls( - stable_inv: Dict[str, str], - local_inv: Dict[str, str], - site_files: Set[str], - redirects: Dict[str, str] -) -> Tuple[List[dict], Dict[str, int]]: + stable_inv: dict[str, str], + local_inv: dict[str, str], + site_files: set[str], + redirects: dict[str, str], +) -> tuple[list[dict], dict[str, int]]: """ Validate that all stable URLs are accessible in the new build. @@ -184,63 +182,51 @@ def validate_urls( """ print("\nValidating URLs...") results = [] - stats = { - 'total': 0, - 'matched': 0, - 'redirected': 0, - 'missing': 0, - 'external': 0 - } + stats = {"total": 0, "matched": 0, "redirected": 0, "missing": 0, "external": 0} # Get all unique URLs from stable inventory stable_urls = set(stable_inv.values()) for url in sorted(stable_urls): - stats['total'] += 1 - result = { - 'url': url, - 'status': 'UNKNOWN', - 'new_url': None, - 'note': '' - } + stats["total"] += 1 + result = {"url": url, "status": "UNKNOWN", "new_url": None, "note": ""} # Check if it's an external URL - if url.startswith('http://') or url.startswith('https://'): - result['status'] = 'EXTERNAL' - result['note'] = 'External URL, skipped' - stats['external'] += 1 + if url.startswith("http://") or url.startswith("https://"): + result["status"] = "EXTERNAL" + result["note"] = "External URL, skipped" + stats["external"] += 1 results.append(result) continue # Check direct match in site files if url in site_files: - result['status'] = 'OK' - result['new_url'] = url - stats['matched'] += 1 + result["status"] = "OK" + result["new_url"] = url + stats["matched"] += 1 results.append(result) continue # Check if it's in redirects if url in redirects: redirect_target = redirects[url] - result['status'] = 'REDIRECT' - result['new_url'] = redirect_target - result['note'] = f'Redirects to {redirect_target}' - stats['redirected'] += 1 + result["status"] = "REDIRECT" + result["new_url"] = redirect_target + result["note"] = f"Redirects to {redirect_target}" + stats["redirected"] += 1 results.append(result) continue - # Not found - result['status'] = 'MISSING' - result['note'] = 'URL not found in new build' - stats['missing'] += 1 + result["status"] = "MISSING" + result["note"] = "URL not found in new build" + stats["missing"] += 1 results.append(result) return results, stats -def print_report(results: List[dict], stats: Dict[str, int]): +def print_report(results: list[dict], stats: dict[str, int]): """Print a comprehensive validation report.""" print("\n" + "=" * 80) print("VALIDATION REPORT") @@ -256,14 +242,14 @@ def print_report(results: List[dict], stats: Dict[str, int]): print(f" 🌐 External (skipped): {stats['external']}") # Calculate coverage - accessible = stats['matched'] + stats['redirected'] - internal_total = stats['total'] - stats['external'] + accessible = stats["matched"] + stats["redirected"] + internal_total = stats["total"] - stats["external"] if internal_total > 0: coverage = (accessible / internal_total) * 100 print(f"\nšŸ“ˆ Coverage: {coverage:.1f}% ({accessible}/{internal_total})") # Show missing URLs - missing = [r for r in results if r['status'] == 'MISSING'] + missing = [r for r in results if r["status"] == "MISSING"] if missing: 
print(f"\nāŒ MISSING URLS ({len(missing)})") print("-" * 80) @@ -271,7 +257,7 @@ def print_report(results: List[dict], stats: Dict[str, int]): print(f" • {r['url']}") # Show redirects - redirected = [r for r in results if r['status'] == 'REDIRECT'] + redirected = [r for r in results if r["status"] == "REDIRECT"] if redirected: print(f"\nšŸ”€ REDIRECTED URLS ({len(redirected)})") print("-" * 80) @@ -281,7 +267,7 @@ def print_report(results: List[dict], stats: Dict[str, int]): print(f" ... and {len(redirected) - 10} more") # Show format changes - format_changes = [r for r in results if r['status'] == 'OK' and 'format changed' in r['note']] + format_changes = [r for r in results if r["status"] == "OK" and "format changed" in r["note"]] if format_changes: print(f"\nšŸ”„ URL FORMAT CHANGES ({len(format_changes)})") print("-" * 80) @@ -293,7 +279,7 @@ def print_report(results: List[dict], stats: Dict[str, int]): print("\n" + "=" * 80) # Final verdict - if stats['missing'] == 0: + if stats["missing"] == 0: print("āœ… SUCCESS: All URLs from stable docs are accessible in new build!") else: print(f"āš ļø WARNING: {stats['missing']} URLs are not accessible in new build") @@ -335,8 +321,8 @@ def main(): print_report(results, stats) # Return exit code based on results - return 0 if stats['missing'] == 0 else 1 + return 0 if stats["missing"] == 0 else 1 -if __name__ == '__main__': +if __name__ == "__main__": exit(main()) diff --git a/mkdocs.yml b/mkdocs.yml index 2bfabc45d0..350833f3a6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -226,7 +226,7 @@ plugins: 'api/zarr/testing/store/index.html.md': 'api/testing.md' 'api/zarr/testing/strategies/index.html.md': 'api/testing.md' 'api/zarr/testing/utils/index.html.md': 'api/testing.md' - + # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: From 552ecb6ab30f332dd9c445117e335efa76e9c2d0 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:07:37 -0400 Subject: [PATCH 11/20] Try adding section label --- docs/user-guide/arrays.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 25a1347fe3..20a959864d 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -540,7 +540,7 @@ bar[:] = np.arange(100) print(root.tree()) ``` -## Sharding +## Sharding {#user-guide-sharding} Using small chunk shapes in very large arrays can lead to a very large number of chunks. This can become a performance issue for file systems and object storage. From 172439e335e2d86fc75209064cd32d7c8b1aeb77 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:15:40 -0400 Subject: [PATCH 12/20] Simplify script --- ci/check_object_inventory.py | 283 +++++++---------------------------- 1 file changed, 50 insertions(+), 233 deletions(-) diff --git a/ci/check_object_inventory.py b/ci/check_object_inventory.py index a5a04000ed..10c558cbf6 100644 --- a/ci/check_object_inventory.py +++ b/ci/check_object_inventory.py @@ -1,58 +1,59 @@ # /// script # dependencies = [ # "requests", +# "pandas", # ] # /// """ -Validate that all URLs from old docs (stable) resolve correctly in new docs (latest). +Compare objects.inv from stable docs with local docs. This script: 1. Downloads and parses objects.inv from stable docs 2. Parses objects.inv from newly built local docs -3. 
Scans the built site directory for all HTML files -4. Loads redirect mappings from mkdocs.yml -5. Validates that all old URLs are accessible in the new build -6. Generates a comprehensive report +3. Compares the two inventories to identify differences +4. Generates a comprehensive report Generated by Claude. + +One can also manually inspect differences using: +`uv run --with sphinx python -m sphinx.ext.intersphinx "https://zarr.readthedocs.io/en/stable/objects.inv"` +`uv run --with sphinx python -m sphinx.ext.intersphinx "site/objects.inv"` """ -import re import zlib from io import BytesIO from pathlib import Path +import pandas as pd import requests # Configuration STABLE_OBJECTS_INV_URL = "https://zarr.readthedocs.io/en/stable/objects.inv" LOCAL_OBJECTS_INV = Path("site/objects.inv") -SITE_DIR = Path("site") -MKDOCS_CONFIG = Path("mkdocs.yml") -def parse_objects_inv(file_obj) -> dict[str, str]: +def parse_objects_inv(file_obj: bytes) -> pd.DataFrame: """ - Parse a Sphinx objects.inv file and extract object name -> URL mappings. + Parse a Sphinx objects.inv file and extract object information. Args: file_obj: File-like object containing objects.inv data Returns: - Dictionary mapping object names to relative URLs + DataFrame with columns: name, type, url """ # Read header (4 lines) - project_line = file_obj.readline() - version_line = file_obj.readline() - compression_line = file_obj.readline() - empty_line = file_obj.readline() + file_obj.readline() # project line + file_obj.readline() # version line + file_obj.readline() # compression line + file_obj.readline() # empty line # Decompress the rest compressed = file_obj.read() decompressed = zlib.decompress(compressed) # Parse entries - inventory = {} + data = [] lines = decompressed.decode("utf-8").split("\n") for line in lines: @@ -65,24 +66,26 @@ def parse_objects_inv(file_obj) -> dict[str, str]: parts = line.split(None, 4) if len(parts) >= 4: name = parts[0] + obj_type = parts[1] # e.g., "py:class", "std:doc" uri = parts[3] - # Remove anchor if present (we care about the page, not the anchor) - if "#" in uri: - uri = uri.split("#")[0] - # Handle special placeholder if uri == "-": continue - inventory[name] = uri + # Store full URI (with anchor) and page-only URI + uri_full = uri + uri_page = uri.split("#")[0] if "#" in uri else uri - return inventory + data.append( + {"name": name, "type": obj_type, "url_full": uri_full, "url_page": uri_page} + ) + return pd.DataFrame(data) -def download_stable_inventory() -> dict[str, str]: + +def download_stable_inventory() -> pd.DataFrame: """Download and parse the stable docs objects.inv.""" - print(f"Downloading stable inventory from {STABLE_OBJECTS_INV_URL}...") try: # Use requests with User-Agent header response = requests.get( @@ -93,236 +96,50 @@ def download_stable_inventory() -> dict[str, str]: response.raise_for_status() # Convert bytes to file-like object for parsing - file_obj = BytesIO(response.content) + file_obj: bytes = BytesIO(response.content) return parse_objects_inv(file_obj) except Exception as e: print(f"Error downloading stable inventory: {e}") - return {} + return pd.DataFrame() -def load_local_inventory() -> dict[str, str]: +def load_local_inventory() -> pd.DataFrame: """Parse the locally built objects.inv.""" - print(f"Loading local inventory from {LOCAL_OBJECTS_INV}...") if not LOCAL_OBJECTS_INV.exists(): - print(f"ERROR: Local inventory not found at {LOCAL_OBJECTS_INV}") - return {} + return pd.DataFrame() with open(LOCAL_OBJECTS_INV, "rb") as f: return parse_objects_inv(f) 
-def scan_site_files() -> set[str]: - """Scan the site directory for all HTML files and return relative paths.""" - print(f"Scanning {SITE_DIR} for HTML files...") - html_files = set() - - if not SITE_DIR.exists(): - print(f"ERROR: Site directory not found at {SITE_DIR}") - return html_files - - for html_file in SITE_DIR.rglob("*.html"): - # Get relative path from site directory - rel_path = html_file.relative_to(SITE_DIR) - html_files.add(str(rel_path)) - - print(f"Found {len(html_files)} HTML files") - return html_files - - -def load_redirect_maps() -> dict[str, str]: - """Load redirect mappings from mkdocs.yml.""" - print(f"Loading redirects from {MKDOCS_CONFIG}...") - redirects = {} - - if not MKDOCS_CONFIG.exists(): - print(f"WARNING: mkdocs.yml not found at {MKDOCS_CONFIG}") - return redirects - - content = MKDOCS_CONFIG.read_text() - - # Find the redirect_maps section - in_redirect_section = False - for line in content.split("\n"): - if "redirect_maps:" in line: - in_redirect_section = True - continue - - if in_redirect_section: - # Check if we've left the redirect section - if line and not line.startswith(" ") and not line.startswith("\t"): - break - - # Parse redirect mapping: 'old.md': 'new.md' or 'old.md': 'https://...' - match = re.search(r"'([^']+)':\s*'([^']+)'", line) - if match: - old_path = match.group(1) - new_path = match.group(2) - - # Convert .md to .html for URL comparison - if old_path.endswith(".md"): - old_path = old_path[:-3] + ".html" - - redirects[old_path] = new_path - - print(f"Found {len(redirects)} redirect mappings") - return redirects - - -def validate_urls( - stable_inv: dict[str, str], - local_inv: dict[str, str], - site_files: set[str], - redirects: dict[str, str], -) -> tuple[list[dict], dict[str, int]]: +def compare_inventories( + stable_inv: pd.DataFrame, + local_inv: pd.DataFrame, +) -> pd.DataFrame: """ - Validate that all stable URLs are accessible in the new build. + Compare stable and local inventories to find differences. 
Returns: - Tuple of (results list, statistics dict) + DataFrame with comparison results """ - print("\nValidating URLs...") - results = [] - stats = {"total": 0, "matched": 0, "redirected": 0, "missing": 0, "external": 0} - - # Get all unique URLs from stable inventory - stable_urls = set(stable_inv.values()) - - for url in sorted(stable_urls): - stats["total"] += 1 - result = {"url": url, "status": "UNKNOWN", "new_url": None, "note": ""} - - # Check if it's an external URL - if url.startswith("http://") or url.startswith("https://"): - result["status"] = "EXTERNAL" - result["note"] = "External URL, skipped" - stats["external"] += 1 - results.append(result) - continue - # Check direct match in site files - if url in site_files: - result["status"] = "OK" - result["new_url"] = url - stats["matched"] += 1 - results.append(result) - continue + # Prepare stable inventory for comparison + stable_urls = stable_inv[["name", "type", "url_page"]].copy() + stable_urls = stable_urls.rename(columns={"url_page": "stable_url"}) - # Check if it's in redirects - if url in redirects: - redirect_target = redirects[url] - result["status"] = "REDIRECT" - result["new_url"] = redirect_target - result["note"] = f"Redirects to {redirect_target}" - stats["redirected"] += 1 - results.append(result) - continue + # Prepare local inventory for comparison + local_urls = local_inv[["name", "type", "url_page"]].copy() + local_urls = local_urls.rename(columns={"url_page": "local_url"}) - # Not found - result["status"] = "MISSING" - result["note"] = "URL not found in new build" - stats["missing"] += 1 - results.append(result) - - return results, stats - - -def print_report(results: list[dict], stats: dict[str, int]): - """Print a comprehensive validation report.""" - print("\n" + "=" * 80) - print("VALIDATION REPORT") - print("=" * 80) - - # Summary statistics - print("\nšŸ“Š SUMMARY STATISTICS") - print("-" * 80) - print(f"Total URLs checked: {stats['total']}") - print(f" āœ… Matched (OK): {stats['matched']}") - print(f" šŸ”€ Redirected: {stats['redirected']}") - print(f" āŒ Missing: {stats['missing']}") - print(f" 🌐 External (skipped): {stats['external']}") - - # Calculate coverage - accessible = stats["matched"] + stats["redirected"] - internal_total = stats["total"] - stats["external"] - if internal_total > 0: - coverage = (accessible / internal_total) * 100 - print(f"\nšŸ“ˆ Coverage: {coverage:.1f}% ({accessible}/{internal_total})") - - # Show missing URLs - missing = [r for r in results if r["status"] == "MISSING"] - if missing: - print(f"\nāŒ MISSING URLS ({len(missing)})") - print("-" * 80) - for r in missing: - print(f" • {r['url']}") - - # Show redirects - redirected = [r for r in results if r["status"] == "REDIRECT"] - if redirected: - print(f"\nšŸ”€ REDIRECTED URLS ({len(redirected)})") - print("-" * 80) - for r in redirected[:10]: # Show first 10 - print(f" • {r['url']} → {r['new_url']}") - if len(redirected) > 10: - print(f" ... and {len(redirected) - 10} more") - - # Show format changes - format_changes = [r for r in results if r["status"] == "OK" and "format changed" in r["note"]] - if format_changes: - print(f"\nšŸ”„ URL FORMAT CHANGES ({len(format_changes)})") - print("-" * 80) - for r in format_changes[:10]: # Show first 10 - print(f" • {r['url']} → {r['new_url']}") - if len(format_changes) > 10: - print(f" ... 
and {len(format_changes) - 10} more") - - print("\n" + "=" * 80) - - # Final verdict - if stats["missing"] == 0: - print("āœ… SUCCESS: All URLs from stable docs are accessible in new build!") - else: - print(f"āš ļø WARNING: {stats['missing']} URLs are not accessible in new build") - - print("=" * 80 + "\n") - - -def main(): - """Main execution function.""" - print("Documentation URL Validation Tool") - print("=" * 80 + "\n") - - # Step 1: Download stable inventory - stable_inv = download_stable_inventory() - if not stable_inv: - print("ERROR: Could not load stable inventory. Aborting.") - return 1 - print(f" Loaded {len(stable_inv)} objects from stable docs") + # Full outer merge to find all entries + return stable_urls.merge(local_urls, on=["name", "type"], how="outer", indicator=True) - # Step 2: Load local inventory - local_inv = load_local_inventory() - if not local_inv: - print("WARNING: Could not load local inventory") - else: - print(f" Loaded {len(local_inv)} objects from local docs") - - # Step 3: Scan site files - site_files = scan_site_files() - if not site_files: - print("WARNING: No site files found") - - # Step 4: Load redirects - redirects = load_redirect_maps() - - # Step 5: Validate URLs - results, stats = validate_urls(stable_inv, local_inv, site_files, redirects) - # Step 6: Print report - print_report(results, stats) - - # Return exit code based on results - return 0 if stats["missing"] == 0 else 1 +def main() -> None: + stable_inv = download_stable_inventory() + local_inv = load_local_inventory() + compare_inventories(stable_inv, local_inv) if __name__ == "__main__": - exit(main()) + main() From 002f2f62abc85bbaf075a6dda85f2b9abe7976a4 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:07:13 -0400 Subject: [PATCH 13/20] Show if no docstring for consistency --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 350833f3a6..647f211240 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -132,6 +132,7 @@ plugins: show_source: true show_symbol_type_toc: true signature_crossrefs: true + show_if_no_docstring: true extensions: - griffe_inherited_docstrings From f170fbf312d93391bca03e87ec1a6c114e172f55 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:07:27 -0400 Subject: [PATCH 14/20] Add numcodecs to API docs --- docs/api/codecs.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/api/codecs.md b/docs/api/codecs.md index 5cf66b304e..151efeac32 100644 --- a/docs/api/codecs.md +++ b/docs/api/codecs.md @@ -2,4 +2,6 @@ title: codecs --- -::: zarr.codecs \ No newline at end of file +::: zarr.codecs + +::: zarr.codecs.numcodecs From e635a64fadb607bb07a49f3264ab9e3f1c9277ea Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:08:00 -0400 Subject: [PATCH 15/20] Add migrate_v3 namespace --- docs/api/metadata.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api/metadata.md b/docs/api/metadata.md index a7e64c3389..12eb909086 100644 --- a/docs/api/metadata.md +++ b/docs/api/metadata.md @@ -3,3 +3,4 @@ title: metadata --- ::: zarr.metadata +::: zarr.metadata.migrate_v3 From 8324c0c7f103a9ca98d71118239979781a0a5cd2 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:41:35 -0400 Subject: [PATCH 16/20] Add module entry points --- docs/api/abc/buffer.md | 7 +++++++ docs/api/api_sync.md 
| 6 ++++++ docs/api/index.md | 6 ++++++ docs/api/testing.md | 6 ++++++ 4 files changed, 25 insertions(+) diff --git a/docs/api/abc/buffer.md b/docs/api/abc/buffer.md index ac814d20b6..d1ace2c899 100644 --- a/docs/api/abc/buffer.md +++ b/docs/api/abc/buffer.md @@ -2,4 +2,11 @@ title: buffer --- +::: zarr.abc + options: + show_root_heading: true + show_root_toc_entry: true + members: false + + ::: zarr.abc.buffer diff --git a/docs/api/api_sync.md b/docs/api/api_sync.md index 83ff118db5..63a4aec537 100644 --- a/docs/api/api_sync.md +++ b/docs/api/api_sync.md @@ -2,4 +2,10 @@ title: synchronous --- +::: zarr.api + options: + show_root_heading: true + show_root_toc_entry: true + members: false + ::: zarr.api.synchronous \ No newline at end of file diff --git a/docs/api/index.md b/docs/api/index.md index 8e6be1058e..6160230ac0 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -2,6 +2,12 @@ Complete reference documentation for the Zarr-Python API. +::: zarr + options: + show_root_heading: true + show_root_toc_entry: true + members: false + ## Core API ### Essential Classes and Functions diff --git a/docs/api/testing.md b/docs/api/testing.md index f34f0ad3d5..eef48614b5 100644 --- a/docs/api/testing.md +++ b/docs/api/testing.md @@ -2,6 +2,12 @@ title: testing --- +::: zarr.testing + options: + show_root_heading: true + show_root_toc_entry: true + members: false + ## Buffer ::: zarr.testing.buffer From dcd24ebcd583fca45cc2e0cf9535a8aa473ceedc Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:51:51 -0400 Subject: [PATCH 17/20] Ignore sphinx specific source files from link checking --- ci/check_links.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ci/check_links.py b/ci/check_links.py index 729f881b79..d57956a632 100644 --- a/ci/check_links.py +++ b/ci/check_links.py @@ -43,6 +43,10 @@ def get_relative_path(self, url: str, base: str) -> str: return path return "" + def should_ignore_path(self, path: str) -> bool: + """Check if path should be ignored (contains _sources or _modules).""" + return "/_sources" in path or "/_modules" in path + def is_valid_doc_url(self, url: str, base: str) -> bool: """Check if URL is part of the documentation.""" if not url.startswith(("http://", "https://")): @@ -50,7 +54,12 @@ def is_valid_doc_url(self, url: str, base: str) -> bool: parsed = urlparse(url) base_parsed = urlparse(base) # Must be same domain and start with base path - return parsed.netloc == base_parsed.netloc and url.startswith(base) + if not (parsed.netloc == base_parsed.netloc and url.startswith(base)): + return False + + # Ignore paths containing _sources or _modules + relative_path = self.get_relative_path(url, base) + return not self.should_ignore_path(relative_path) def fetch_page(self, url: str) -> tuple[int, str]: """Fetch a page and return status code and content.""" @@ -119,6 +128,11 @@ def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: for stable_url in sorted(stable_urls): relative_path = self.get_relative_path(stable_url, self.stable_base) + + # Skip ignored paths + if self.should_ignore_path(relative_path): + continue + latest_url = self.latest_base + relative_path print(f" Checking: {relative_path}") From 77144283bcd7bcb86199f38297f8f51c8c9ecdd5 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 18:53:19 -0400 Subject: [PATCH 18/20] Add numcodec abc --- docs/api/abc/codec.md | 1 + 1 file changed, 1 
insertion(+) diff --git a/docs/api/abc/codec.md b/docs/api/abc/codec.md index d4eaecabe9..7d808fbb54 100644 --- a/docs/api/abc/codec.md +++ b/docs/api/abc/codec.md @@ -3,3 +3,4 @@ title: codec --- ::: zarr.abc.codec +::: zarr.abc.numcodec From ed4320daafca3e62aeb8ccc6051d16dde3d80d87 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 27 Oct 2025 19:17:56 -0400 Subject: [PATCH 19/20] Remove unused label --- docs/user-guide/arrays.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 20a959864d..25a1347fe3 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -540,7 +540,7 @@ bar[:] = np.arange(100) print(root.tree()) ``` -## Sharding {#user-guide-sharding} +## Sharding Using small chunk shapes in very large arrays can lead to a very large number of chunks. This can become a performance issue for file systems and object storage. From 61e845a3746a87d335ac78ce13b64166057d0369 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 28 Oct 2025 08:42:25 -0400 Subject: [PATCH 20/20] Remove scripts --- ci/check_links.py | 210 ----------------------------------- ci/check_object_inventory.py | 145 ------------------------ 2 files changed, 355 deletions(-) delete mode 100644 ci/check_links.py delete mode 100644 ci/check_object_inventory.py diff --git a/ci/check_links.py b/ci/check_links.py deleted file mode 100644 index d57956a632..0000000000 --- a/ci/check_links.py +++ /dev/null @@ -1,210 +0,0 @@ -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "requests", -# "beautifulsoup4", -# "urllib3", -# ] -# /// - -""" -Validate that all subpages from zarr stable docs exist in latest docs. -This script crawls the stable documentation and checks if each page -has a corresponding valid page in the latest documentation. - -Generated by Claude. 
-""" - -import time -from collections import deque -from urllib.parse import urljoin, urlparse - -import requests -from bs4 import BeautifulSoup - -STABLE_BASE = "https://zarr.readthedocs.io/en/stable" -LATEST_BASE = "https://zarr.readthedocs.io/en/latest" - - -class DocumentationValidator: - def __init__(self, stable_base: str, latest_base: str) -> None: - self.stable_base = stable_base.rstrip("/") - self.latest_base = latest_base.rstrip("/") - self.session = requests.Session() - self.session.headers.update({"User-Agent": "Mozilla/5.0 (Documentation Validator)"}) - - def get_relative_path(self, url: str, base: str) -> str: - """Extract the relative path from a full URL.""" - if url.startswith(base): - path = url[len(base) :] - # Remove fragment identifiers - if "#" in path: - path = path.split("#")[0] - return path - return "" - - def should_ignore_path(self, path: str) -> bool: - """Check if path should be ignored (contains _sources or _modules).""" - return "/_sources" in path or "/_modules" in path - - def is_valid_doc_url(self, url: str, base: str) -> bool: - """Check if URL is part of the documentation.""" - if not url.startswith(("http://", "https://")): - return False - parsed = urlparse(url) - base_parsed = urlparse(base) - # Must be same domain and start with base path - if not (parsed.netloc == base_parsed.netloc and url.startswith(base)): - return False - - # Ignore paths containing _sources or _modules - relative_path = self.get_relative_path(url, base) - return not self.should_ignore_path(relative_path) - - def fetch_page(self, url: str) -> tuple[int, str]: - """Fetch a page and return status code and content.""" - try: - response = self.session.get(url, timeout=10, allow_redirects=True) - except requests.RequestException as e: - print(f" āœ— Error fetching {url}: {e}") - return 0, "" - return response.status_code, response.text - - def extract_links(self, html: str, base_url: str) -> set[str]: - """Extract all documentation links from HTML.""" - soup = BeautifulSoup(html, "html.parser") - links = set() - - for a_tag in soup.find_all("a", href=True): - href = a_tag["href"] - full_url = urljoin(base_url, href) - - # Remove fragment identifiers for deduplication - if "#" in full_url: - full_url = full_url.split("#")[0] - - if self.is_valid_doc_url(full_url, self.stable_base): - links.add(full_url) - - return links - - def crawl_stable_docs(self) -> set[str]: - """Crawl all pages in stable documentation.""" - print(f"šŸ” Crawling stable documentation: {self.stable_base}") - visited = set() - to_visit = deque([self.stable_base + "/"]) - - while to_visit: - url = to_visit.popleft() - - if url in visited: - continue - - visited.add(url) - print(f" Crawling: {url}") - - status_code, html = self.fetch_page(url) - - if status_code != 200 or not html: - continue - - # Extract and queue new links - links = self.extract_links(html, url) - for link in links: - if link not in visited: - to_visit.append(link) - - # Be respectful with rate limiting - time.sleep(0.1) - - print(f"āœ“ Found {len(visited)} pages in stable docs\n") - return visited - - def validate_latest_docs(self, stable_urls: set[str]) -> dict[str, list[str]]: - """Check if all stable URLs exist in latest docs.""" - print(f"šŸ” Validating pages in latest documentation: {self.latest_base}") - - results = {"valid": [], "missing": [], "error": []} - - for stable_url in sorted(stable_urls): - relative_path = self.get_relative_path(stable_url, self.stable_base) - - # Skip ignored paths - if self.should_ignore_path(relative_path): 
- continue - - latest_url = self.latest_base + relative_path - - print(f" Checking: {relative_path}") - status_code, _ = self.fetch_page(latest_url) - - if status_code == 200: - results["valid"].append(relative_path) - print(" āœ“ Valid (200)") - elif status_code == 404: - results["missing"].append(relative_path) - print(" āœ— Missing (404)") - else: - results["error"].append(f"{relative_path} (status: {status_code})") - print(f" ⚠ Error (status: {status_code})") - - time.sleep(0.1) - - return results - - def print_summary(self, results: dict[str, list[str]]) -> None: - """Print validation summary.""" - print("\n" + "=" * 70) - print("VALIDATION SUMMARY") - print("=" * 70) - - total = len(results["valid"]) + len(results["missing"]) + len(results["error"]) - - print(f"\nāœ“ Valid pages: {len(results['valid'])}/{total}") - print(f"āœ— Missing pages: {len(results['missing'])}/{total}") - print(f"⚠ Error pages: {len(results['error'])}/{total}") - - if results["missing"]: - print("\n" + "-" * 70) - print("MISSING PAGES:") - print("-" * 70) - for path in results["missing"]: - print(f" • {path}") - - if results["error"]: - print("\n" + "-" * 70) - print("ERROR PAGES:") - print("-" * 70) - for info in results["error"]: - print(f" • {info}") - - print("\n" + "=" * 70) - - if not results["missing"] and not results["error"]: - print("šŸŽ‰ All pages validated successfully!") - else: - print(f"āš ļø {len(results['missing']) + len(results['error'])} issues found") - print("=" * 70) - - -def main() -> None: - validator = DocumentationValidator(STABLE_BASE, LATEST_BASE) - - # Step 1: Crawl stable documentation - stable_urls = validator.crawl_stable_docs() - - # Step 2: Validate against latest documentation - results = validator.validate_latest_docs(stable_urls) - - # Step 3: Print summary - validator.print_summary(results) - - # Exit with error code if there are missing pages - if results["missing"] or results["error"]: - exit(1) - else: - exit(0) - - -if __name__ == "__main__": - main() diff --git a/ci/check_object_inventory.py b/ci/check_object_inventory.py deleted file mode 100644 index 10c558cbf6..0000000000 --- a/ci/check_object_inventory.py +++ /dev/null @@ -1,145 +0,0 @@ -# /// script -# dependencies = [ -# "requests", -# "pandas", -# ] -# /// -""" -Compare objects.inv from stable docs with local docs. - -This script: -1. Downloads and parses objects.inv from stable docs -2. Parses objects.inv from newly built local docs -3. Compares the two inventories to identify differences -4. Generates a comprehensive report - -Generated by Claude. - -One can also manually inspect differences using: -`uv run --with sphinx python -m sphinx.ext.intersphinx "https://zarr.readthedocs.io/en/stable/objects.inv"` -`uv run --with sphinx python -m sphinx.ext.intersphinx "site/objects.inv"` -""" - -import zlib -from io import BytesIO -from pathlib import Path - -import pandas as pd -import requests - -# Configuration -STABLE_OBJECTS_INV_URL = "https://zarr.readthedocs.io/en/stable/objects.inv" -LOCAL_OBJECTS_INV = Path("site/objects.inv") - - -def parse_objects_inv(file_obj: bytes) -> pd.DataFrame: - """ - Parse a Sphinx objects.inv file and extract object information. 
- - Args: - file_obj: File-like object containing objects.inv data - - Returns: - DataFrame with columns: name, type, url - """ - # Read header (4 lines) - file_obj.readline() # project line - file_obj.readline() # version line - file_obj.readline() # compression line - file_obj.readline() # empty line - - # Decompress the rest - compressed = file_obj.read() - decompressed = zlib.decompress(compressed) - - # Parse entries - data = [] - lines = decompressed.decode("utf-8").split("\n") - - for line in lines: - line = line.strip() - if not line: - continue - - # Format: name domain:role priority uri dispname - # Example: Array py:class 1 api/array.html#zarr.Array - - parts = line.split(None, 4) - if len(parts) >= 4: - name = parts[0] - obj_type = parts[1] # e.g., "py:class", "std:doc" - uri = parts[3] - - # Handle special placeholder - if uri == "-": - continue - - # Store full URI (with anchor) and page-only URI - uri_full = uri - uri_page = uri.split("#")[0] if "#" in uri else uri - - data.append( - {"name": name, "type": obj_type, "url_full": uri_full, "url_page": uri_page} - ) - - return pd.DataFrame(data) - - -def download_stable_inventory() -> pd.DataFrame: - """Download and parse the stable docs objects.inv.""" - try: - # Use requests with User-Agent header - response = requests.get( - STABLE_OBJECTS_INV_URL, - headers={"User-Agent": "Mozilla/5.0 (compatible; zarr-docs-validator/1.0)"}, - timeout=30, - ) - response.raise_for_status() - - # Convert bytes to file-like object for parsing - file_obj: bytes = BytesIO(response.content) - return parse_objects_inv(file_obj) - except Exception as e: - print(f"Error downloading stable inventory: {e}") - return pd.DataFrame() - - -def load_local_inventory() -> pd.DataFrame: - """Parse the locally built objects.inv.""" - if not LOCAL_OBJECTS_INV.exists(): - return pd.DataFrame() - - with open(LOCAL_OBJECTS_INV, "rb") as f: - return parse_objects_inv(f) - - -def compare_inventories( - stable_inv: pd.DataFrame, - local_inv: pd.DataFrame, -) -> pd.DataFrame: - """ - Compare stable and local inventories to find differences. - - Returns: - DataFrame with comparison results - """ - - # Prepare stable inventory for comparison - stable_urls = stable_inv[["name", "type", "url_page"]].copy() - stable_urls = stable_urls.rename(columns={"url_page": "stable_url"}) - - # Prepare local inventory for comparison - local_urls = local_inv[["name", "type", "url_page"]].copy() - local_urls = local_urls.rename(columns={"url_page": "local_url"}) - - # Full outer merge to find all entries - return stable_urls.merge(local_urls, on=["name", "type"], how="outer", indicator=True) - - -def main() -> None: - stable_inv = download_stable_inventory() - local_inv = load_local_inventory() - compare_inventories(stable_inv, local_inv) - - -if __name__ == "__main__": - main()
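
In the final version of ci/check_object_inventory.py removed above, main() discards the DataFrame returned by compare_inventories(). Because that merge is built with indicator=True, pandas tags each merged row as 'left_only', 'right_only', or 'both', which is enough to reconstruct a report of missing or relocated objects. The sketch below shows one way that result could be summarized; report_differences is a hypothetical helper written for illustration and is not part of either script, though the column names it reads (name, type, stable_url, local_url, _merge) match those produced by compare_inventories().

import pandas as pd


def report_differences(merged: pd.DataFrame) -> None:
    """Summarize a compare_inventories() result (hypothetical helper, not in the original script)."""
    # merge(..., indicator=True) labels each row 'left_only', 'right_only', or 'both'
    missing = merged[merged["_merge"] == "left_only"]  # documented in stable, absent locally
    added = merged[merged["_merge"] == "right_only"]  # new in the local build
    moved = merged[(merged["_merge"] == "both") & (merged["stable_url"] != merged["local_url"])]

    print(f"Missing from local build: {len(missing)}")
    print(f"New in local build: {len(added)}")
    print(f"Moved to a different page: {len(moved)}")
    for _, row in missing.iterrows():
        print(f"  • {row['name']} ({row['type']}) was at {row['stable_url']}")

In main(), the call could then become report_differences(compare_inventories(stable_inv, local_inv)), and the script could exit non-zero whenever missing is non-empty, mirroring how check_links.py exits with status 1 when pages are missing.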