10 changes: 10 additions & 0 deletions .markdownlint.yaml
@@ -0,0 +1,10 @@
---
# Example file:
# https://github.com/DavidAnson/markdownlint/blob/main/schema/.markdownlint.yaml
# MD013/line-length : Line length
# https://github.com/DavidAnson/markdownlint/blob/v0.40.0/doc/md013.md
MD013:
# Number of characters
line_length: 200
# Exclude citations
blockquotes: false
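The MD013 configuration above allows lines up to 200 characters and, with `blockquotes: false`, exempts quoted citation lines from the check. A minimal Python sketch of that rule's behavior (an illustration with a hypothetical helper, not markdownlint's actual implementation):

```python
def find_long_lines(text: str, line_length: int = 200, check_blockquotes: bool = True):
    """Return (line_number, length) pairs for lines exceeding line_length.

    Mimics markdownlint's MD013: with check_blockquotes=False,
    blockquote lines (starting with '>') are exempt, as configured above.
    """
    violations = []
    for number, line in enumerate(text.splitlines(), start=1):
        if not check_blockquotes and line.lstrip().startswith(">"):
            continue  # citations in blockquotes are exempt
        if len(line) > line_length:
            violations.append((number, len(line)))
    return violations
```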
124 changes: 78 additions & 46 deletions .pre-commit-config.yaml
@@ -1,53 +1,85 @@
---
# Install pre-commit hooks with:
# uv run prek install
# Update hooks:
# uv run prek auto-update
exclude: "scripts/*|tmp/*"
repos:

- repo: https://github.com/pre-commit/pre-commit-hooks
# Out-of-the-box hooks for pre-commit.
rev: v6.0.0
hooks:
# Makes sure files end in a newline and only a newline.
- id: end-of-file-fixer
# Replaces or checks mixed line ending.
- id: mixed-line-ending
# Trims trailing whitespace.
- id: trailing-whitespace
# Attempts to load all json files to verify syntax.
- id: check-json
# Attempts to load all yaml files to verify syntax.
- id: check-yaml
# Attempts to load all TOML files to verify syntax.
- id: check-toml
# Prevent giant files from being committed.
- id: check-added-large-files
# Set max file size to 500 KB.
args: ["--maxkb=500"]

- repo: https://github.com/asottile/pyupgrade
# Automatically upgrade syntax for newer versions of the Python language.
rev: v3.21.2
hooks:
- id: pyupgrade

- repo: https://github.com/astral-sh/ruff-pre-commit
# Linter and formatter for Python code.
rev: v0.14.14
hooks:
# Run the linter.
# Automatically fix issues where possible.
- id: ruff-check
types: [python]
args: [ --fix ]
# Run the formatter.
- id: ruff-format
types: [python]

- repo: https://github.com/PyCQA/bandit
# Security linter for Python code.
rev: 1.9.3
hooks:
- id: bandit
args: ["--exclude", "tests"]
- repo: https://github.com/pre-commit/pre-commit-hooks
# Out-of-the-box hooks for pre-commit.
rev: v6.0.0
hooks:
# Makes sure files end in a newline and only a newline.
- id: end-of-file-fixer
# Replaces or checks mixed line ending.
- id: mixed-line-ending
# Trims trailing whitespace.
- id: trailing-whitespace
# Attempts to load all json files to verify syntax.
- id: check-json
# Attempts to load all yaml files to verify syntax.
- id: check-yaml
# Attempts to load all TOML files to verify syntax.
- id: check-toml
# Prevent giant files from being committed.
- id: check-added-large-files
# Set max file size to 500 KB.
args: ["--maxkb=500"]

- repo: https://github.com/asottile/pyupgrade
# Automatically upgrade syntax for newer versions of the Python language.
rev: v3.21.2
hooks:
- id: pyupgrade

- repo: https://github.com/astral-sh/ruff-pre-commit
# Linter and formatter for Python code.
rev: v0.15.1
hooks:
# Run the linter.
# Automatically fix issues where possible.
- id: ruff-check
types: [python]
args: [--fix]
# Run the formatter.
- id: ruff-format
types: [python]

- repo: https://github.com/PyCQA/bandit
# Security linter for Python code.
rev: 1.9.3
hooks:
- id: bandit
args: ["--exclude", "tests"]

- repo: https://github.com/crate-ci/typos
# Source code spell checker.
rev: v1.43.4
hooks:
- id: typos

- repo: https://github.com/owenlamont/uv-secure
# Scan your uv.lock file for dependencies with known vulnerabilities.
rev: 0.15.4
hooks:
- id: uv-secure

# YAML
- repo: https://github.com/adrienverge/yamllint
rev: v1.38.0
hooks:
- id: yamllint

# Markdown
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.47.0
hooks:
- id: markdownlint
args: ['--fix']

# Shell
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.11.0.1
hooks:
- id: shellcheck
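Several of the hooks configured above are simple text normalizers. For instance, `end-of-file-fixer` makes sure every file ends in a newline and only a newline; roughly, it does something like this (a sketch for illustration, not the hook's actual code):

```python
def fix_end_of_file(content: str) -> str:
    """Ensure content ends with exactly one trailing newline.

    Mirrors the behavior of the end-of-file-fixer pre-commit hook:
    trailing blank lines are collapsed, a missing final newline is added.
    """
    if not content:
        return content  # empty files are left untouched
    return content.rstrip("\n") + "\n"
```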
16 changes: 9 additions & 7 deletions README.md
@@ -55,7 +55,7 @@ The scraping takes some time (about 5 hours). A mechanism has been set up to avo

Eventually, the scraper will produce two files: `zenodo_datasets.parquet` and `zenodo_files.parquet` :sparkles:

Note that "[false positives](docs/false_positives.md)" have been removed in the scraping proccess.
Note that "[false positives](docs/false_positives.md)" have been removed in the scraping process.

## Scrape Figshare

@@ -155,7 +155,7 @@ git clone https://github.com/NMRLipids/BilayerData.git

> All metadata are stored in `README.yaml` files under the `Simulations` directory.

2. Extract metadata from simulations
1. Extract metadata from simulations

```bash
uv run scripts/scrape_nmrlipids.py \
@@ -212,13 +212,15 @@ uv run scripts/download_files.py --input data/osf_files.tsv \

Option `--withzipfiles` will also get files packaged in zip archives. It means that the script will first download the entire zip archive and then extract the mdp and gro files.

This step will take a couple of hours to complete. Depending on the stability of your internet connection and the availability of the data repository servers, the download might fail for a couple of files. Re-rerun previous commands to resume the download. Files already retrieved will not be downloaded again.
This step will take a couple of hours to complete.
Depending on the stability of your internet connection and the availability of the data repository servers, the download might fail for a couple of files.
Re-run previous commands to resume the download.
Files already retrieved will not be downloaded again.

Expect about 640 GB of data with the `--withzipfiles` option (~8,800 gro files and 9,500 mdp files).

Numbers are indicative only and may vary depending on when you run this command (databases tend to grow over time).


### Parse .mdp files

```bash
@@ -231,7 +233,8 @@ This step will take a couple of seconds to run. Results will be saved in `data/g

### Parse .gro files

A rough molecular composition is deduced from the file `params/residue_name.yml` that contains a partial list of residues names organized in categories *protein*, *lipid*, *nucleic*, *glucid* and *water & ion*.
A rough molecular composition is deduced from the file `params/residue_name.yml`
that contains a partial list of residue names organized into the categories *protein*, *lipid*, *nucleic*, *glucid* and *water & ion*.
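The categorization described above can be sketched roughly as follows (the residue-to-category mapping here is a small hypothetical subset for illustration, not the actual contents of `params/residue_name.yml`):

```python
from collections import Counter

# Hypothetical subset of the residue -> category mapping in params/residue_name.yml.
RESIDUE_CATEGORIES = {
    "ALA": "protein", "GLY": "protein",
    "POPC": "lipid", "POPE": "lipid",
    "SOL": "water & ion", "NA": "water & ion", "CL": "water & ion",
}


def deduce_composition(residue_names):
    """Count residues per category; residues not listed fall into 'other'."""
    composition = Counter()
    for name in residue_names:
        composition[RESIDUE_CATEGORIES.get(name, "other")] += 1
    return dict(composition)
```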

```bash
uv run scripts/parse_gro_files.py \
@@ -259,7 +262,6 @@ data/gromacs_gro_files.parquet
data/gromacs_mdp_files.parquet
```


## Run all script

You can run all commands above with the `run_all.sh` script:
@@ -271,7 +273,6 @@ bash run_all.sh
> [!WARNING]
> Be sure you have **sufficient** time, bandwidth and disk space to run this command.


## Upload data on Zenodo (for MDverse maintainers only)

*For the owner of the Zenodo record only. Zenodo token requires `deposit:actions` and `deposit:write` scopes.*
@@ -283,6 +284,7 @@ uv run scripts/upload_datasets_to_zenodo.py --record 7856524 --metadata params/z
```

Update files:

```bash
uv run scripts/upload_datasets_to_zenodo.py --record 7856524 \
--file data/datasets.parquet \
14 changes: 9 additions & 5 deletions docs/mddb.md
@@ -1,6 +1,8 @@
# MDDB

> The [MDDB (Molecular Dynamics Data Bank) project](https://mddbr.eu/about/) is an initiative to collect, preserve, and share molecular dynamics (MD) simulation data. As part of this project, **MDposit** is an open platform that provides web access to atomistic MD simulations. Its goal is to facilitate and promote data sharing within the global scientific community to advance research.
> The [MDDB (Molecular Dynamics Data Bank) project](https://mddbr.eu/about/) is an initiative to collect, preserve, and share molecular dynamics (MD) simulation data.
As part of this project, **MDposit** is an open platform that provides web access to atomistic MD simulations.
Its goal is to facilitate and promote data sharing within the global scientific community to advance research.

The MDposit infrastructure is distributed across several MDposit nodes. All metadata are accessible through the global node:

@@ -29,9 +31,9 @@ A project can contain multiple replicas, each identified by `project_id`.`replic

For example, the project [MD-A003ZP](https://mdposit.mddbr.eu/#/id/MD-A003ZP/overview) contains ten replicas:

- `MD-A003ZP.1`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/overview
- `MD-A003ZP.2`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.2/overview
- `MD-A003ZP.3`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.3/overview
- `MD-A003ZP.1`: <https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/overview>
- `MD-A003ZP.2`: <https://mdposit.mddbr.eu/#/id/MD-A003ZP.2/overview>
- `MD-A003ZP.3`: <https://mdposit.mddbr.eu/#/id/MD-A003ZP.3/overview>
- ...

API entrypoint to get all datasets at once:
@@ -76,7 +78,9 @@ Title:

Description:

> The trajectories of all-atom MD simulations were obtained based on 4 starting representative conformations from the CG simulation. For each starting structure, there are six trajectories of the E protein: 3 with the protein embedded in the membrane containing POPC, and 3 with the membrane mimicking the natural ERGIC membrane (Mix: 50% POPC, 25% POPE, 10% POPI, 5% POPS, 10% cholesterol).
> The trajectories of all-atom MD simulations were obtained based on 4 starting representative conformations from the CG simulation.
For each starting structure, there are six trajectories of the E protein: 3 with the protein embedded in the membrane containing POPC, and 3 with the membrane mimicking the natural ERGIC membrane
(Mix: 50% POPC, 25% POPE, 10% POPI, 5% POPS, 10% cholesterol).

- [project on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A001T1/overview)
- [project on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A001T1)
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -51,14 +51,14 @@ dependencies = [

[dependency-groups]
dev = [
"bandit>=1.9.2",
"bandit>=1.9.3",
"jupyterlab>=4.4.9",
"matplotlib>=3.10.1",
"plotly>=6.0.0",
"prek>=0.2.29",
"prek>=0.3.2",
"pysankeybeta>=1.4.2",
"pytest>=9.0.2",
"ruff>=0.11.0",
"ruff>=0.15.1",
"upsetplot>=0.9.0",
"watermark>=2.5.0",
]
58 changes: 30 additions & 28 deletions ruff.toml
@@ -8,39 +8,41 @@ docstring-code-line-length = 72
[lint]
preview = true # Allow preview rules.
extend-select = [
"F", # pyflakes
"E", # Pycodestyle errors
"W", # Pycodestyle warnings
"I", # isort - import ordering
"A", # Flake8-builtins - misuse of Python built-in names
"B", # bugbear
"BLE", # Flake8-blind-except - flags bare excepts
"C4", # Flake8-comprehensions - best practices in comprehensions
"C90", # McCabe - code complexity metric for functions
"N", # PEP8 Naming
"COM", # Flake8-commas - trailing/comma issues
"D", # Pydocstyle - docstring formatting
"DOC", # Pydoclint - docstring linting and consistency
"E", # Pycodestyle errors
"EM", # Flake8-errmsg - error message style
"F", # pyflakes
"FAST", # FastAPI - FastAPI-specific linting rules
"FBT", # Flake8-boolean-trap - potential pitfalls with booleans
"FURB", # Refurb - rules for code refurbishment
"G", # Flake8-logging-format - logging format string issues
"I", # isort - import ordering
"ICN", # Flake8-import-conventions - enforces conventional import aliases
"LOG", # Flake8-logging - proper logging usage
"N", # PEP8 Naming
"NPY", # NumPy-specific rules - ensures NumPy coding standards
"PD", # Pandas-vet - checks pandas-specific code practices
"PERF", # Perflint - performance-related checks
"PL", # Pylint rules
> **Copilot AI commented on Feb 13, 2026:** The configuration enables "PL" (Pylint rules) on line 33 but then ignores "PLR" (Pylint recommendations) on line 45. This is contradictory because "PLR" is a subset of "PL". Either remove "PL" from the extended select list, or remove "PLR" from the ignore list to maintain consistency.
"PT", # Flake8-pytest-style - pytest best practices
"Q", # Flake8-quotes - enforces quote style consistency
"RUF", # Ruff-specific rules - additional Ruff checks
"S", # Flake8-bandit - security issues
"SIM", # Flake8-simplify - code simplification hints
"UP", # Pyupgrade - upgraded syntax to newer Python versions
"S", # Flake8-bandit – security issues
"BLE", # Flake8-blind-except – flags bare excepts
"FBT", # Flake8-boolean-trap – potential pitfalls with booleans
"A", # Flake8-builtins – misuse of Python built-in names
"COM", # Flake8-commas – trailing/comma issues
"C4", # Flake8-comprehensions – best practices in comprehensions
"EM", # Flake8-errmsg – error message style
"ICN", # Flake8-import-conventions – enforces conventional import aliases
"LOG", # Flake8-logging – proper logging usage
"G", # Flake8-logging-format – logging format string issues
"PT", # Flake8-pytest-style – pytest best practices
"Q", # Flake8-quotes – enforces quote style consistency
"SIM", # Flake8-simplify – code simplification hints
"PD", # Pandas-vet – checks pandas-specific code practices
"NPY", # NumPy-specific rules – ensures NumPy coding standards
"FAST", # FastAPI – FastAPI-specific linting rules
"PERF", # Perflint – performance-related checks
"FURB", # Refurb – rules for code refurbishment
"DOC", # Pydoclint – docstring linting and consistency
"RUF", # Ruff-specific rules – additional Ruff checks
"W", # Pycodestyle warnings
]
ignore = [
"COM812", # Redundant with ruff formatter. See: https://docs.astral.sh/ruff/rules/missing-trailing-comma/
"G004", # f-strings are allowed with the loguru module. See https://docs.astral.sh/ruff/rules/logging-f-string/
"G004", # f-strings are allowed with the loguru module. See https://docs.astral.sh/ruff/rules/logging-f-string/
"PLR", # No Pylint recommendations
]

# Force numpy-style for docstrings
@@ -53,6 +55,6 @@ convention = "numpy"
"tests/**/*.py" = [
# at least these three should be fine in tests:
"S101", # asserts allowed in tests...
"ARG", # Unused function args -> fixtures nevertheless are functionally relevant...
"FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
"ARG", # Unused function args -> fixtures nevertheless are functionally relevant...
"FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
]
12 changes: 10 additions & 2 deletions src/mdverse_scrapers/scrapers/mddb.py
@@ -53,6 +53,7 @@
"base_url": "https://cineca.mddbr.eu",
},
}
MAX_NUMBER_OF_DATASETS_IN_DEBUG = 100


def scrape_all_datasets(
@@ -140,8 +141,15 @@ def scrape_all_datasets(
logger.debug("First dataset metadata on this page:")
logger.debug(datasets[0] if datasets else "No datasets on this page")

if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 100:
logger.warning("Debug mode is ON: stopping after 100 datasets.")
if (
scraper
and scraper.is_in_debug_mode
and len(all_datasets) >= MAX_NUMBER_OF_DATASETS_IN_DEBUG
):
logger.warning(
"Debug mode is ON: "
f"stopping after {MAX_NUMBER_OF_DATASETS_IN_DEBUG} datasets."
)
return all_datasets

logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.")
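The change above replaces the magic number 100 with the named constant `MAX_NUMBER_OF_DATASETS_IN_DEBUG`. In isolation, the early-stop pattern it implements looks roughly like this (simplified; the real `scrape_all_datasets` also handles paging, the scraper object, and logging):

```python
MAX_NUMBER_OF_DATASETS_IN_DEBUG = 100


def collect_datasets(pages, debug: bool = False):
    """Accumulate datasets page by page, stopping early in debug mode.

    In debug mode, a capped sample is enough to exercise the pipeline
    without scraping the full repository.
    """
    all_datasets = []
    for page in pages:
        all_datasets.extend(page)
        if debug and len(all_datasets) >= MAX_NUMBER_OF_DATASETS_IN_DEBUG:
            return all_datasets  # stop after the first page that crosses the cap
    return all_datasets
```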