From 0a37b89e547465885bf79eac8ce231159ef63704 Mon Sep 17 00:00:00 2001 From: essmaw Date: Tue, 20 Jan 2026 00:04:36 +0100 Subject: [PATCH 01/43] feat: add MDposit dataset scraping script. --- src/mdverse_scrapers/scrapers/mdposit.py | 453 +++++++++++++++++++++++ 1 file changed, 453 insertions(+) create mode 100644 src/mdverse_scrapers/scrapers/mdposit.py diff --git a/src/mdverse_scrapers/scrapers/mdposit.py b/src/mdverse_scrapers/scrapers/mdposit.py new file mode 100644 index 0000000..45afdd6 --- /dev/null +++ b/src/mdverse_scrapers/scrapers/mdposit.py @@ -0,0 +1,453 @@ +"""Scrape molecular dynamics simulation datasets and files from MDposit. + +This script scrapes molecular dynamics datasets from the MDposit repository +https://mmb-dev.mddbr.eu/#/browse +""" + +import json +import sys +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any + +import click +import httpx +import loguru + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + make_http_request_with_retries, +) +from ..core.toolbox import export_list_of_models_to_parquet +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetProjectName, DatasetRepositoryName, DataType +from ..models.file import FileMetadata +from ..models.utils import validate_metadata_against_model + +BASE_MDPOSIT_URL = "https://mmb-dev.mddbr.eu/api/rest/v1" + + +def is_mdposit_connection_working( + client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger +) -> bool | None: + """Test connection to the MDposit API. + + Returns + ------- + bool + True if the connection is successful, False otherwise. 
+ """ + logger.debug("Testing connection to MDposit API...") + response = make_http_request_with_retries(client, url, method=HttpMethod.GET) + if not response: + logger.error("Cannot connect to the MDposit API.") + return False + if response and hasattr(response, "headers"): + logger.debug(response.headers) + return True + + +def scrape_all_datasets( + client: httpx.Client, + query_entry_point: str, + page_size: int = 50, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Scrape Molecular Dynamics-related datasets from the MDposit API. + + Within the MDposit terminology, datasets are referred to as "projects". + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + query_entry_point : str + The entry point of the API request. + page_size : int + Number of entries to fetch per page. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[dict]: + A list of MDposit entries. + """ + logger.info("Scraping molecular dynamics datasets from MDposit.") + logger.info(f"Using batches of {page_size} datasets.") + all_datasets = [] + + # Start by requesting the first page to get total number of datasets. 
+ logger.info("Requesting first page to get total number of datasets...") + page = 0 # start with first page + + while True: + response = make_http_request_with_retries( + client, + f"{BASE_MDPOSIT_URL}/{query_entry_point}?limit={page_size}&page={page}", + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.2, + ) + + if not response: + logger.error("Failed to fetch data from MDposit API.") + logger.error("Jumping to next iteration.") + page += 1 + continue + + try: + response_json = response.json() + datasets = response_json.get("projects", []) + total_datasets = response_json.get("filteredCount") + + if page == 0 and total_datasets is not None: + logger.info(f"Found a total of {total_datasets:,} datasets in MDposit.") + + if not datasets: + logger.info("No more datasets returned by API. Stopping pagination.") + break + + all_datasets.extend(datasets) + + logger.info(f"Scraped page {page} with {len(datasets)} datasets.") + if total_datasets: + logger.info( + f"Scraped {len(all_datasets)} datasets " + f"({len(all_datasets):,}/{total_datasets:,} " + f"{len(all_datasets) / total_datasets:.0%})" + ) + logger.debug("First dataset metadata on this page:") + logger.debug(datasets[0] if datasets else "No datasets on this page") + + except (json.decoder.JSONDecodeError, ValueError) as exc: + logger.error(f"Error while parsing MDposit response: {exc}") + logger.error("Jumping to next iteration.") + + page += 1 # increment page for next iteration + + logger.success(f"Scraped {len(all_datasets)} datasets in MDposit.") + return all_datasets + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets: list[DatasetMetadata], + logger: "loguru.Logger" = loguru.logger, +) -> list[FileMetadata]: + """Scrape files metadata for all datasets in MDposit. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + datasets : list[DatasetMetadata] + List of datasets to scrape files metadata for. 
+ logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[FileMetadata] + List of successfully validated `FileMetadata` objects. + """ + all_files_metadata = [] + for dataset_count, dataset in enumerate(datasets, start=1): + dataset_id = dataset.dataset_id_in_repository + files_metadata = scrape_files_for_one_dataset( + client, + url=f"{BASE_MDPOSIT_URL}/projects/{dataset_id}/filenotes", + dataset_id=dataset_id, + logger=logger, + ) + if not files_metadata: + continue + # Extract relevant files metadata. + files_selected_metadata = extract_files_metadata(files_metadata, dataset_id, logger=logger) + # Normalize files metadata with pydantic model (FileMetadata) + logger.info(f"Validating files metadata for dataset: {dataset_id}") + for file_metadata in files_selected_metadata: + normalized_metadata = validate_metadata_against_model( + file_metadata, + FileMetadata, + logger=logger, + ) + if not normalized_metadata: + logger.error( + f"Normalization failed for metadata of file " + f"{file_metadata.get('file_name')} " + f"in dataset {dataset_id}" + ) + continue + all_files_metadata.append(normalized_metadata) + logger.info("Done.") + logger.info(f"Total files: {len(all_files_metadata):,}") + logger.info( + "Extracted and validated files metadata for " + f"{dataset_count:,}/{len(datasets):,} " + f"({dataset_count / len(datasets):.0%}) datasets." + ) + return all_files_metadata + + +def scrape_files_for_one_dataset( + client: httpx.Client, + url: str, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> dict | None: + """ + Scrape files metadata for a given MDposit dataset. + + Doc: https://nomad-lab.eu/prod/v1/api/v1/extensions/docs#/entries/metadata + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + url : str + The URL endpoint. + dataset_id : str + The unique identifier of the dataset in MDposit. + logger: "loguru.Logger" + Logger for logging messages. 
+ + Returns + ------- + dict | None + File metadata dictionary for the dataset. + """ + logger.info(f"Scraping files for dataset ID: {dataset_id}") + response = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + ) + if not response: + logger.error("Failed to fetch files metadata.") + return None + return response.json() + + +def extract_datasets_metadata( + datasets: list[dict[str, Any]], + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw MDposit datasets metadata. + + Parameters + ---------- + datasets : List[Dict[str, Any]] + List of raw MDposit datasets metadata. + + Returns + ------- + list[dict] + List of dataset metadata dictionaries. + """ + datasets_metadata = [] + for dataset in datasets: + dataset_id = dataset.get("accession") + logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") + entry_url = ( + f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" + ) + dataset_metadata = dataset.get("metadata", {}) + links = dataset_metadata.get("CITATION") + links_list = [links] if links else None + a = dataset_metadata.get("AUTHORS") + author_names = a if isinstance(a, list) else [a] if a else None + metadata = { + "dataset_repository_name": DatasetRepositoryName.MDPOSIT, + "dataset_project_name": DatasetProjectName.MDDB, + "dataset_id_in_repository": dataset_id, + "dataset_id_in_project": dataset_id, # idk? Maybe None + "dataset_url_in_repository": entry_url, + "dataset_url_in_project": entry_url, # idk? 
Maybe None + "external_links": links_list, + "title": dataset_metadata.get("NAME"), + "date_created": dataset.get("creationDate"), + "date_last_updated": dataset.get("updateDate"), + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), + "nb_files": len(dataset.get("files", [])), + "author_names": author_names, + "license": dataset_metadata.get("LICENSE"), + "description": dataset_metadata.get("DESCRIPTION"), + "software_name": dataset_metadata.get("PROGRAM"), + "software_version": str(dataset_metadata.get("VERSION")), + "nb_atoms": dataset_metadata.get("atomCount"), + "forcefield_model_name": ", ".join( + filter(None, dataset_metadata.get("FF") or [])), + "simulation_temperature": [str(dataset_metadata.get("TEMP"))], + "molecule_names": dataset_metadata.get("SEQUENCES"), + } + datasets_metadata.append(metadata) + logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") + return datasets_metadata + + +def normalize_datasets_metadata( + datasets: list[dict], + logger: "loguru.Logger" = loguru.logger, +) -> list[DatasetMetadata]: + """ + Normalize dataset metadata with a Pydantic model. + + Parameters + ---------- + datasets : list[dict] + List of dataset metadata dictionaries. + + Returns + ------- + list[DatasetMetadata] + List of successfully validated `DatasetMetadata` objects. + """ + datasets_metadata = [] + for dataset in datasets: + logger.info( + f"Normalizing metadata for dataset: {dataset['dataset_id_in_repository']}" + ) + normalized_metadata = validate_metadata_against_model( + dataset, DatasetMetadata, logger=logger + ) + if not normalized_metadata: + logger.error( + f"Normalization failed for metadata of dataset " + f"{dataset['dataset_id_in_repository']}" + ) + continue + datasets_metadata.append(normalized_metadata) + logger.info( + "Normalized metadata for " + f"{len(datasets_metadata)}/{len(datasets)} " + f"({len(datasets_metadata) / len(datasets):.0%}) datasets." 
+ ) + return datasets_metadata + + +def extract_files_metadata( + raw_metadata: dict, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw MDposit files metadata. + + Parameters + ---------- + raw_metadata: dict + Raw files metadata. + dataset_id : str + The unique identifier of the dataset in MDposit. + + Returns + ------- + list[dict] + List of select files metadata. + """ + logger.info("Extracting files metadata...") + files_metadata = [] + for mdposit_file in raw_metadata: + file_name = Path(mdposit_file.get("filename")) + file_type = file_name.suffix.lstrip(".") + file_path_url = ( + f"https://mmb-dev.mddbr.eu/api/rest/current/projects/{dataset_id}/files/{file_name}") + + parsed_file = { + "dataset_repository_name": DatasetRepositoryName.MDPOSIT, + "dataset_id_in_repository": dataset_id, + "file_name": str(file_name), + "file_type": file_type, + "file_size_in_bytes": mdposit_file.get("length", None), + "file_md5": mdposit_file.get("md5", None), + "file_url_in_repository": file_path_url, + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), + } + files_metadata.append(parsed_file) + logger.info(f"Extracted metadata for {len(files_metadata)} files.") + return files_metadata + + +@click.command( + help="Command line interface for MDverse scrapers", + epilog="Happy scraping!", +) +@click.option( + "--output-dir", + "output_dir_path", + type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), + required=True, + help="Output directory path to save results.", +) +def main(output_dir_path: Path) -> None: + """Scrape molecular dynamics datasets and files from MDposit.""" + # Create directories and logger. 
+ output_dir_path = output_dir_path / DatasetProjectName.MDDB.value + output_dir_path.mkdir(parents=True, exist_ok=True) + logfile_path = output_dir_path / f"{DatasetProjectName.MDDB.value}_scraper.log" + logger = create_logger(logpath=logfile_path, level="INFO") + logger.info("Starting MDposit data scraping...") + start_time = time.perf_counter() + # Create HTTPX client + client = create_httpx_client() + # Check connection to MDposit API + if is_mdposit_connection_working(client, f"{BASE_MDPOSIT_URL}/projects/summary"): + logger.success("Connection to MDposit API successful!") + else: + logger.critical("Connection to MDposit API failed.") + logger.critical("Aborting.") + sys.exit(1) + + # Scrape MDposit datasets metadata. + datasets_raw_metadata = scrape_all_datasets( + client, + query_entry_point="/projects", + logger=logger, + ) + if not datasets_raw_metadata: + logger.critical("No datasets found in MDposit.") + logger.critical("Aborting.") + sys.exit(1) + # Select datasets metadata + datasets_selected_metadata = extract_datasets_metadata( + datasets_raw_metadata, logger=logger + ) + # Parse and validate MDposit dataset metadata with a pydantic model (DatasetMetadata) + datasets_normalized_metadata = normalize_datasets_metadata( + datasets_selected_metadata, logger=logger + ) + # Save datasets metadata to parquet file. + export_list_of_models_to_parquet( + output_dir_path + / f"{DatasetProjectName.MDDB.value}_{DataType.DATASETS.value}.parquet", + datasets_normalized_metadata, + logger=logger, + ) + # Scrape MDposit files metadata. + files_normalized_metadata = scrape_files_for_all_datasets( + client, datasets_normalized_metadata, logger=logger + ) + + # Save files metadata to parquet file. + export_list_of_models_to_parquet( + output_dir_path + / f"{DatasetProjectName.MDDB.value}_{DataType.FILES.value}.parquet", + files_normalized_metadata, + logger=logger, + ) + + # Print script duration. 
+ elapsed_time = int(time.perf_counter() - start_time) + logger.success(f"Scraped MDposit in: {timedelta(seconds=elapsed_time)} 🎉") + + +if __name__ == "__main__": + main() From caf28655a87c7ec9f2043d667ab5276d53f040e8 Mon Sep 17 00:00:00 2001 From: essmaw Date: Tue, 20 Jan 2026 00:06:17 +0100 Subject: [PATCH 02/43] feat(models): add MDPOSIT repository and MDDB project fields. --- src/mdverse_scrapers/models/dataset.py | 10 ++++++---- src/mdverse_scrapers/models/enums.py | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/mdverse_scrapers/models/dataset.py b/src/mdverse_scrapers/models/dataset.py index b03df73..8e2e61a 100644 --- a/src/mdverse_scrapers/models/dataset.py +++ b/src/mdverse_scrapers/models/dataset.py @@ -50,14 +50,14 @@ class DatasetMetadata(BaseModel): ..., description=( "Name of the source repository. " - "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD." + "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD, MDPOSIT." ), ) dataset_project_name: DatasetProjectName | None = Field( None, description=( "Name of the project." - "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD." + "Allowed values: ZENODO, FIGSHARE, OSF, NOMAD, ATLAS, GPCRMD, MDDB." ), ) dataset_id_in_repository: str = Field( @@ -190,12 +190,12 @@ class DatasetMetadata(BaseModel): @field_validator( "date_created", "date_last_updated", "date_last_fetched", mode="before" ) - def format_dates(cls, v: datetime | str) -> str: # noqa: N805 + def format_dates(cls, v: datetime | str | None) -> str | None: # noqa: N805 """Convert datetime objects or ISO strings to '%Y-%m-%dT%H:%M:%S' format. Parameters ---------- - cls : type[BaseDataset] + cls : type[DatasetMetadata] The Pydantic model class being validated. v : str The input value of the 'date' field to validate. @@ -205,6 +205,8 @@ def format_dates(cls, v: datetime | str) -> str: # noqa: N805 str: The date in '%Y-%m-%dT%H:%M:%S' format. 
""" + if v is None: + return None if isinstance(v, datetime): return v.strftime("%Y-%m-%dT%H:%M:%S") return datetime.fromisoformat(v).strftime("%Y-%m-%dT%H:%M:%S") diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index ba2393f..f225ae9 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -19,6 +19,7 @@ class DatasetRepositoryName(StrEnum): NOMAD = "nomad" ATLAS = "atlas" GPCRMD = "gpcrmd" + MDPOSIT = "mdposit" class DatasetProjectName(StrEnum): @@ -31,3 +32,4 @@ class DatasetProjectName(StrEnum): ATLAS = "atlas" GPCRMD = "gpcrmd" NMRLIPIDS = "nmrlipids" + MDDB = "mddb" From 9147f32e040456784a2770e2d881141e8c0338ce Mon Sep 17 00:00:00 2001 From: essmaw Date: Tue, 20 Jan 2026 00:07:14 +0100 Subject: [PATCH 03/43] feat(cli): add README command and scrape-mdposit entry point. --- README.md | 20 ++++++++++++++++++++ pyproject.toml | 2 ++ 2 files changed, 22 insertions(+) diff --git a/README.md b/README.md index b727ea2..73c27c4 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,26 @@ This command will: 5. Save the extracted metadata to Parquet files +## Scrape MDposit + +Have a look to the notes regarding [MDposit](docs/mdposit.md) and its API. + +Scrape MDposit to collect molecular dynamics (MD) datasets and files: + +```bash +uv run scrape-mdposit --output-dir data +``` + +This command will: + +1. Search for molecular dynamics entries and files through the MDposit API. +2. Parse metadata and validate them using the Pydantic models + `DatasetMetadata` and `FileMetadata`. +3. Save validated files and datasets metadata. + +The scraping takes about 13 minutes. 
+ + ## Analyze Gromacs mdp and gro files ### Download files diff --git a/pyproject.toml b/pyproject.toml index 69c1405..c694f31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,3 +49,5 @@ build-backend = "uv_build" scrape-zenodo = "mdverse_scrapers.scrapers.zenodo:main" scrape-figshare = "mdverse_scrapers.scrapers.figshare:main" scrape-nomad = "mdverse_scrapers.scrapers.nomad:main" +scrape-mdposit = "mdverse_scrapers.scrapers.mdposit:main" + From e1a4e9d5aa5db07410daedbed06cc3f93c272708 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 29 Jan 2026 16:44:47 +0100 Subject: [PATCH 04/43] refactor(simulation-model): add molecule type field (protein, lipid, etc) as discussed in #73 --- src/mdverse_scrapers/models/enums.py | 13 +++++++++++++ src/mdverse_scrapers/models/simulation.py | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index bb8e668..9ce2739 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -21,3 +21,16 @@ class DatasetSourceName(StrEnum): GPCRMD = "gpcrmd" NMRLIPIDS = "nmrlipids" MDDB = "mddb" + MDPOSIT_INRIA_NODE = "mdposit_inria_node" + MDPOSIT_MMB_NODE = "mdposit_mmb_node" + + +class MoleculeType(StrEnum): + """Common molecular types found in molecular dynamics simulations.""" + + PROTEIN = "protein" + NUCLEIC_ACID = "nucleic_acid" + ION = "ion" + LIPID = "lipid" + CARBOHYDRATE = "carbohydrate" + SOLVENT = "solvent" diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index e6455ed..dcc0854 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ b/src/mdverse_scrapers/models/simulation.py @@ -5,6 +5,8 @@ from pydantic import BaseModel, Field, StringConstraints, field_validator +from .enums import MoleculeType + DOI = Annotated[ str, StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-.]+$"), @@ -15,6 +17,12 @@ class Molecule(BaseModel): """Molecule in a simulation.""" name: 
str = Field(..., description="Name of the molecule.") + type: MoleculeType | None = Field( + None, + description="Type of the molecule." + "Allowed values in the MoleculeType enum. " + "Examples: PROTEIN, ION, LIPID...", + ) number_of_atoms: int | None = Field( None, ge=0, description="Number of atoms in the molecule, if known." ) From fb283e102f31d42ea8a471148bfc9579f60730ea Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 29 Jan 2026 16:59:17 +0100 Subject: [PATCH 05/43] chore(ruff): disable PERF401 for model instance appends --- ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ruff.toml b/ruff.toml index 436c4ca..9bf13a6 100644 --- a/ruff.toml +++ b/ruff.toml @@ -41,6 +41,7 @@ extend-select = [ ignore = [ "COM812", # Redundant with ruff formatter. See: https://docs.astral.sh/ruff/rules/missing-trailing-comma/ "G004", # f-strings are allowed with the loguru module. See https://docs.astral.sh/ruff/rules/logging-f-string/ + "PERF401", # list.extend suggestion is not applicable when appending model instances. ] # Force numpy-style for docstrings From 064d94b00b9236e0d2f77bf57680978253cfeefb Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 29 Jan 2026 16:59:24 +0100 Subject: [PATCH 06/43] refactor(mdposit-scraper): update to scrape using both nodes of MDDB project and new models --- src/mdverse_scrapers/scrapers/mdposit.py | 557 ++++++++++++++--------- 1 file changed, 337 insertions(+), 220 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mdposit.py b/src/mdverse_scrapers/scrapers/mdposit.py index 45afdd6..b06a63f 100644 --- a/src/mdverse_scrapers/scrapers/mdposit.py +++ b/src/mdverse_scrapers/scrapers/mdposit.py @@ -1,13 +1,15 @@ -"""Scrape molecular dynamics simulation datasets and files from MDposit. +"""Scrape molecular dynamics simulation datasets and files from the MDDB. 
-This script scrapes molecular dynamics datasets from the MDposit repository -https://mmb-dev.mddbr.eu/#/browse +This script extracts molecular dynamics datasets produced within the +MDDB (Molecular Dynamics Data Bank) project, which is distributed across +two nodes: + +- MDPOSIT MMB node (https://mmb-dev.mddbr.eu/#/browse) +- MDPOSIT INRIA node https://dynarepo.inria.fr/#/ """ import json import sys -import time -from datetime import datetime, timedelta from pathlib import Path from typing import Any @@ -19,42 +21,33 @@ from ..core.network import ( HttpMethod, create_httpx_client, + is_connection_to_server_working, make_http_request_with_retries, ) -from ..core.toolbox import export_list_of_models_to_parquet +from ..core.toolbox import print_statistics from ..models.dataset import DatasetMetadata -from ..models.enums import DatasetProjectName, DatasetRepositoryName, DataType -from ..models.file import FileMetadata -from ..models.utils import validate_metadata_against_model - -BASE_MDPOSIT_URL = "https://mmb-dev.mddbr.eu/api/rest/v1" - - -def is_mdposit_connection_working( - client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger -) -> bool | None: - """Test connection to the MDposit API. +from ..models.enums import DatasetSourceName, MoleculeType +from ..models.scraper import ScraperContext +from ..models.simulation import ForceFieldModel, Molecule, Software +from ..models.utils import ( + export_list_of_models_to_parquet, + normalize_datasets_metadata, + normalize_files_metadata, +) - Returns - ------- - bool - True if the connection is successful, False otherwise. 
- """ - logger.debug("Testing connection to MDposit API...") - response = make_http_request_with_retries(client, url, method=HttpMethod.GET) - if not response: - logger.error("Cannot connect to the MDposit API.") - return False - if response and hasattr(response, "headers"): - logger.debug(response.headers) - return True +MDDB_REPOSITORIES = { + DatasetSourceName.MDPOSIT_MMB_NODE: "https://mmb-dev.mddbr.eu/api/rest/v1", + DatasetSourceName.MDPOSIT_INRIA_NODE: "https://inria.mddbr.eu/api/rest/v1", +} def scrape_all_datasets( client: httpx.Client, query_entry_point: str, + node_name: DatasetSourceName, page_size: int = 50, logger: "loguru.Logger" = loguru.logger, + scraper: ScraperContext | None = None, ) -> list[dict]: """ Scrape Molecular Dynamics-related datasets from the MDposit API. @@ -63,21 +56,27 @@ def scrape_all_datasets( Parameters ---------- - client : httpx.Client + client: httpx.Client The HTTPX client to use for making requests. - query_entry_point : str + query_entry_point: str The entry point of the API request. - page_size : int + node_name: DatasetSourceName + MDDB node name for logging. + page_size: int Number of entries to fetch per page. logger: "loguru.Logger" Logger for logging messages. + scraper: ScraperContext | None + Optional scraper context. When provided and running in debug mode, + dataset scraping is intentionally stopped early to limit the amount + of retrieved data. Returns ------- list[dict]: A list of MDposit entries. 
""" - logger.info("Scraping molecular dynamics datasets from MDposit.") + logger.info(f"Scraping molecular dynamics datasets from {node_name}.") logger.info(f"Using batches of {page_size} datasets.") all_datasets = [] @@ -88,7 +87,7 @@ def scrape_all_datasets( while True: response = make_http_request_with_retries( client, - f"{BASE_MDPOSIT_URL}/{query_entry_point}?limit={page_size}&page={page}", + f"{query_entry_point}?limit={page_size}&page={page}", method=HttpMethod.GET, timeout=60, delay_before_request=0.2, @@ -124,119 +123,138 @@ def scrape_all_datasets( logger.debug("First dataset metadata on this page:") logger.debug(datasets[0] if datasets else "No datasets on this page") + if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 120: + logger.warning("Debug mode is ON: stopping after 120 datasets.") + return all_datasets + except (json.decoder.JSONDecodeError, ValueError) as exc: logger.error(f"Error while parsing MDposit response: {exc}") logger.error("Jumping to next iteration.") page += 1 # increment page for next iteration - logger.success(f"Scraped {len(all_datasets)} datasets in MDposit.") + logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.") return all_datasets -def scrape_files_for_all_datasets( - client: httpx.Client, - datasets: list[DatasetMetadata], - logger: "loguru.Logger" = loguru.logger, -) -> list[FileMetadata]: - """Scrape files metadata for all datasets in MDposit. +def extract_software_and_version( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[Software] | None: + """ + Extract software names and versions from the nested dataset dictionary. Parameters ---------- - client : httpx.Client - The HTTPX client to use for making requests. - datasets : list[DatasetMetadata] - List of datasets to scrape files metadata for. + dataset_metadata: dict + The dataset dictionnary from which to extract molecules information. 
+ dataset_id: str + Identifier of the dataset, used for logging. logger: "loguru.Logger" Logger for logging messages. Returns ------- - list[FileMetadata] - List of successfully validated `FileMetadata` objects. + list[Software] | None + A list of Software instances with `name` and `version` fields, None otherwise. """ - all_files_metadata = [] - for dataset_count, dataset in enumerate(datasets, start=1): - dataset_id = dataset.dataset_id_in_repository - files_metadata = scrape_files_for_one_dataset( - client, - url=f"{BASE_MDPOSIT_URL}/projects/{dataset_id}/filenotes", - dataset_id=dataset_id, - logger=logger, - ) - if not files_metadata: - continue - # Extract relevant files metadata. - files_selected_metadata = extract_files_metadata(files_metadata, dataset_id, logger=logger) - # Normalize files metadata with pydantic model (FileMetadata) - logger.info(f"Validating files metadata for dataset: {dataset_id}") - for file_metadata in files_selected_metadata: - normalized_metadata = validate_metadata_against_model( - file_metadata, - FileMetadata, - logger=logger, - ) - if not normalized_metadata: - logger.error( - f"Normalization failed for metadata of file " - f"{file_metadata.get('file_name')} " - f"in dataset {dataset_id}" - ) - continue - all_files_metadata.append(normalized_metadata) - logger.info("Done.") - logger.info(f"Total files: {len(all_files_metadata):,}") - logger.info( - "Extracted and validated files metadata for " - f"{dataset_count:,}/{len(datasets):,} " - f"({dataset_count / len(datasets):.0%}) datasets." 
- ) - return all_files_metadata + try: + name = dataset_metadata.get("PROGRAM") + version = dataset_metadata.get("VERSION") + if not name: + return None + return [Software(name=name, version=version)] + except (ValueError, KeyError, TypeError) as e: + logger.warning(f"Error parsing software info for dataset {dataset_id}: {e}") + return None -def scrape_files_for_one_dataset( - client: httpx.Client, - url: str, - dataset_id: str, - logger: "loguru.Logger" = loguru.logger, -) -> dict | None: +def extract_forcefields_and_version( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[ForceFieldModel] | None: """ - Scrape files metadata for a given MDposit dataset. + Extract forcefield or model names and versions from the nested dataset dictionary. + + Parameters + ---------- + dataset_metadata: dict + The dataset dictionnary from which to extract molecules information. + dataset_id: str + Identifier of the dataset entry, used for logging. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[ForceFieldModel] | None + A list of forcefield or model instances with `name` and `version` fields, + None otherwise. + """ + try: + names = dataset_metadata.get("FF") + # Adding the water model. + # Exemple: TIP3P. + water_model = dataset_metadata.get("WAT") + if water_model: + names.append(water_model) + if not names: + return None + return [ForceFieldModel(name=name) for name in names] + except (ValueError, KeyError) as e: + logger.warning( + f"Error parsing forcefield or model info for dataset {dataset_id}: {e}" + ) + return None - Doc: https://nomad-lab.eu/prod/v1/api/v1/extensions/docs#/entries/metadata + +def extract_molecules( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[Molecule] | None: + """ + Extract molecule names and types from the nested dataset dictionary. Parameters ---------- - client : httpx.Client - The HTTPX client to use for making requests. 
- url : str - The URL endpoint. - dataset_id : str - The unique identifier of the dataset in MDposit. + dataset_metadata: dict + The dataset dictionnary from which to extract molecules information. + dataset_id: str + Identifier of the dataset, used for logging. logger: "loguru.Logger" Logger for logging messages. Returns ------- - dict | None - File metadata dictionary for the dataset. + list[Molecule] | None + A list of molecules instances with `name` and `type` fields, + None otherwise. """ - logger.info(f"Scraping files for dataset ID: {dataset_id}") - response = make_http_request_with_retries( - client, - url, - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.1, - ) - if not response: - logger.error("Failed to fetch files metadata.") + molecules = [] + try: + prot_seqs = dataset_metadata.get("PROTSEQ") or [] + nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] + ligands = dataset_metadata.get("LIGANDS") or [] + + for seq in prot_seqs: + molecules.append(Molecule(name=seq, type=MoleculeType.PROTEIN)) + + for seq in nucl_seqs: + molecules.append(Molecule(name=seq, type=MoleculeType.NUCLEIC)) + + for ligand in ligands: + molecules.append(Molecule(name=ligand)) + + if not molecules: + logger.warning(f"No molecules found in dataset {dataset_id}.") + return None + return molecules + + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing molecules info for dataset {dataset_id}: {e}") return None - return response.json() def extract_datasets_metadata( datasets: list[dict[str, Any]], + node_name: DatasetSourceName, logger: "loguru.Logger" = loguru.logger, ) -> list[dict]: """ @@ -244,8 +262,12 @@ def extract_datasets_metadata( Parameters ---------- - datasets : List[Dict[str, Any]] + datasets: List[Dict[str, Any]] List of raw MDposit datasets metadata. + node_name: DatasetSourceName + MDDB node name for the dataset url. + logger: "loguru.Logger" + Logger for logging messages. 
Returns ------- @@ -254,88 +276,154 @@ def extract_datasets_metadata( """ datasets_metadata = [] for dataset in datasets: + # Get the dataset id dataset_id = dataset.get("accession") logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") - entry_url = ( - f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" - ) + # Create the dataset url depending on the node + if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: + dataset_url = f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" + elif node_name is DatasetSourceName.MDPOSIT_INRIA_NODE: + dataset_url = f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" + else: + logger.warning( + f"Unknown MDDB node '{node_name}'." + f"Cannot build entry URL for dataset {dataset_id}." + ) + dataset_metadata = dataset.get("metadata", {}) links = dataset_metadata.get("CITATION") links_list = [links] if links else None a = dataset_metadata.get("AUTHORS") author_names = a if isinstance(a, list) else [a] if a else None metadata = { - "dataset_repository_name": DatasetRepositoryName.MDPOSIT, - "dataset_project_name": DatasetProjectName.MDDB, + "dataset_repository_name": node_name.value, "dataset_id_in_repository": dataset_id, - "dataset_id_in_project": dataset_id, # idk? Maybe None - "dataset_url_in_repository": entry_url, - "dataset_url_in_project": entry_url, # idk? 
Maybe None + "dataset_url_in_repository": dataset_url, + "dataset_project_name": DatasetSourceName.MDDB, "external_links": links_list, "title": dataset_metadata.get("NAME"), "date_created": dataset.get("creationDate"), "date_last_updated": dataset.get("updateDate"), - "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), - "nb_files": len(dataset.get("files", [])), + "number_of_files": len(dataset.get("files", [])), "author_names": author_names, "license": dataset_metadata.get("LICENSE"), "description": dataset_metadata.get("DESCRIPTION"), - "software_name": dataset_metadata.get("PROGRAM"), - "software_version": str(dataset_metadata.get("VERSION")), - "nb_atoms": dataset_metadata.get("atomCount"), - "forcefield_model_name": ", ".join( - filter(None, dataset_metadata.get("FF") or [])), - "simulation_temperature": [str(dataset_metadata.get("TEMP"))], - "molecule_names": dataset_metadata.get("SEQUENCES"), + "total_number_of_atoms": dataset_metadata.get("atomCount"), } + # Extract simulation metadata if available. + # Software names with their versions. + metadata["software"] = extract_software_and_version( + dataset_metadata, dataset_id, logger + ) + # Forcefield and model names with their versions. + metadata["forcefields"] = extract_forcefields_and_version( + dataset_metadata, dataset_id, logger + ) + # Molecules with their nb of atoms and number total of atoms. + metadata["molecules"] = extract_molecules(dataset_metadata, dataset_id, logger) + # Time step in fs. 
+ metadata["simulation_timesteps_in_fs"] = [dataset_metadata.get("TIMESTEP")] + # Temperatures in kelvin + metadata["simulation_temperatures_in_kelvin"] = [dataset_metadata.get("TEMP")] datasets_metadata.append(metadata) logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") return datasets_metadata -def normalize_datasets_metadata( - datasets: list[dict], +def scrape_files_for_one_dataset( + client: httpx.Client, + url: str, + dataset_id: str, logger: "loguru.Logger" = loguru.logger, -) -> list[DatasetMetadata]: +) -> dict | None: + """ + Scrape files metadata for a given MDposit dataset. + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + url: str + The URL endpoint. + dataset_id: str + The unique identifier of the dataset in MDposit. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + dict | None + File metadata dictionary for the dataset. """ - Normalize dataset metadata with a Pydantic model. + logger.info(f"Scraping files for dataset ID: {dataset_id}") + response = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + ) + if not response: + logger.error("Failed to fetch files metadata.") + return None + return response.json() + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets: list[DatasetMetadata], + node_base_url: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """Scrape files metadata for all datasets in MDposit API. Parameters ---------- - datasets : list[dict] - List of dataset metadata dictionaries. + client: httpx.Client + The HTTPX client to use for making requests. + datasets: list[DatasetMetadata] + List of datasets to scrape files metadata for. + node_base_url: str + Base url of the specific node of MDposit API. + logger: "loguru.Logger" + Logger for logging messages. 
Returns ------- - list[DatasetMetadata] - List of successfully validated `DatasetMetadata` objects. + list[dict] + List of files metadata dictionaries. """ - datasets_metadata = [] - for dataset in datasets: - logger.info( - f"Normalizing metadata for dataset: {dataset['dataset_id_in_repository']}" - ) - normalized_metadata = validate_metadata_against_model( - dataset, DatasetMetadata, logger=logger + all_files_metadata = [] + for dataset_count, dataset in enumerate(datasets, start=1): + dataset_id = dataset.dataset_id_in_repository + files_metadata = scrape_files_for_one_dataset( + client, + url=f"{node_base_url}/projects/{dataset_id}/filenotes", + dataset_id=dataset_id, + logger=logger, ) - if not normalized_metadata: - logger.error( - f"Normalization failed for metadata of dataset " - f"{dataset['dataset_id_in_repository']}" - ) + if not files_metadata: continue - datasets_metadata.append(normalized_metadata) - logger.info( - "Normalized metadata for " - f"{len(datasets_metadata)}/{len(datasets)} " - f"({len(datasets_metadata) / len(datasets):.0%}) datasets." - ) - return datasets_metadata + # Extract relevant files metadata. + logger.info(f"Getting files metadata for dataset: {dataset_id}") + files_metadata = extract_files_metadata( + files_metadata, node_base_url, dataset, logger=logger + ) + all_files_metadata += files_metadata + # Normalize files metadata with pydantic model (FileMetadata) + logger.info(f"Total files found: {len(all_files_metadata):,}") + logger.info( + "Extracted files metadata for " + f"{dataset_count:,}/{len(datasets):,} " + f"({dataset_count / len(datasets):.0%}) datasets." + ) + return all_files_metadata def extract_files_metadata( raw_metadata: dict, - dataset_id: str, + node_base_url: str, + dataset: DatasetMetadata, logger: "loguru.Logger" = loguru.logger, ) -> list[dict]: """ @@ -345,8 +433,12 @@ def extract_files_metadata( ---------- raw_metadata: dict Raw files metadata. 
- dataset_id : str + node_base_url: str The unique identifier of the dataset in MDposit. + dataset: DatasetMetadata + Normalized dataset to scrape files metadata for. + logger: "loguru.Logger" + Logger for logging messages. Returns ------- @@ -356,20 +448,23 @@ def extract_files_metadata( logger.info("Extracting files metadata...") files_metadata = [] for mdposit_file in raw_metadata: + dataset_id = dataset.dataset_id_in_repository file_name = Path(mdposit_file.get("filename")) file_type = file_name.suffix.lstrip(".") + node_base_url_for_file = node_base_url.replace("/v1", "") file_path_url = ( - f"https://mmb-dev.mddbr.eu/api/rest/current/projects/{dataset_id}/files/{file_name}") + f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" + ) parsed_file = { - "dataset_repository_name": DatasetRepositoryName.MDPOSIT, + "dataset_repository_name": dataset.dataset_repository_name, "dataset_id_in_repository": dataset_id, + "dataset_url_in_repository": dataset.dataset_url_in_repository, "file_name": str(file_name), "file_type": file_type, "file_size_in_bytes": mdposit_file.get("length", None), "file_md5": mdposit_file.get("md5", None), "file_url_in_repository": file_path_url, - "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), } files_metadata.append(parsed_file) logger.info(f"Extracted metadata for {len(files_metadata)} files.") @@ -383,70 +478,92 @@ def extract_files_metadata( @click.option( "--output-dir", "output_dir_path", - type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), required=True, help="Output directory path to save results.", ) -def main(output_dir_path: Path) -> None: - """Scrape molecular dynamics datasets and files from MDposit.""" - # Create directories and logger. 
- output_dir_path = output_dir_path / DatasetProjectName.MDDB.value - output_dir_path.mkdir(parents=True, exist_ok=True) - logfile_path = output_dir_path / f"{DatasetProjectName.MDDB.value}_scraper.log" - logger = create_logger(logpath=logfile_path, level="INFO") - logger.info("Starting MDposit data scraping...") - start_time = time.perf_counter() +@click.option( + "--debug", + "is_in_debug_mode", + is_flag=True, + default=False, + help="Enable debug mode.", +) +def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: + """Scrape molecular dynamics datasets and files from MDDB.""" # Create HTTPX client client = create_httpx_client() - # Check connection to MDposit API - if is_mdposit_connection_working(client, f"{BASE_MDPOSIT_URL}/projects/summary"): - logger.success("Connection to MDposit API successful!") - else: - logger.critical("Connection to MDposit API failed.") - logger.critical("Aborting.") - sys.exit(1) - - # Scrape MDposit datasets metadata. - datasets_raw_metadata = scrape_all_datasets( - client, - query_entry_point="/projects", - logger=logger, - ) - if not datasets_raw_metadata: - logger.critical("No datasets found in MDposit.") - logger.critical("Aborting.") - sys.exit(1) - # Select datasets metadata - datasets_selected_metadata = extract_datasets_metadata( - datasets_raw_metadata, logger=logger - ) - # Parse and validate MDposit dataset metadata with a pydantic model (DatasetMetadata) - datasets_normalized_metadata = normalize_datasets_metadata( - datasets_selected_metadata, logger=logger - ) - # Save datasets metadata to parquet file. - export_list_of_models_to_parquet( - output_dir_path - / f"{DatasetProjectName.MDDB.value}_{DataType.DATASETS.value}.parquet", - datasets_normalized_metadata, - logger=logger, - ) - # Scrape MDposit files metadata. - files_normalized_metadata = scrape_files_for_all_datasets( - client, datasets_normalized_metadata, logger=logger - ) - # Save files metadata to parquet file. 
- export_list_of_models_to_parquet( - output_dir_path - / f"{DatasetProjectName.MDDB.value}_{DataType.FILES.value}.parquet", - files_normalized_metadata, - logger=logger, - ) - - # Print script duration. - elapsed_time = int(time.perf_counter() - start_time) - logger.success(f"Scraped MDposit in: {timedelta(seconds=elapsed_time)} 🎉") + # Iterate over the nodes + for data_source_name, base_url in MDDB_REPOSITORIES.items(): + # Create scraper context. + scraper = ScraperContext( + data_source_name=data_source_name, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. + level = "DEBUG" if scraper.is_in_debug_mode else "INFO" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. + logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) + logger.info(f"Starting {data_source_name.name} data scraping...") + # Check connection to the API + if is_connection_to_server_working( + client, f"{base_url}/projects/summary", logger=logger + ): + logger.success(f"Connection to {data_source_name} API successful!") + else: + logger.critical(f"Connection to {data_source_name} API failed.") + logger.critical("Aborting.") + sys.exit(1) + + # Scrape the datasets metadata. + datasets_raw_metadata = scrape_all_datasets( + client, + query_entry_point=f"{base_url}/projects", + node_name=data_source_name, + logger=logger, + scraper=scraper, + ) + if not datasets_raw_metadata: + logger.critical(f"No datasets found in {data_source_name}.") + logger.critical("Aborting.") + sys.exit(1) + + # Select datasets metadata + datasets_selected_metadata = extract_datasets_metadata( + datasets_raw_metadata, data_source_name, logger=logger + ) + # Validate datasets metadata with the DatasetMetadata Pydantic model. + datasets_normalized_metadata = normalize_datasets_metadata( + datasets_selected_metadata, logger=logger + ) + # Save datasets metadata to parquet file. 
+ scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( + scraper.datasets_parquet_file_path, + datasets_normalized_metadata, + logger=logger, + ) + # Scrape NOMAD files metadata. + files_metadata = scrape_files_for_all_datasets( + client, + datasets_normalized_metadata, + base_url, + logger=logger, + ) + # Validate NOMAD files metadata with the FileMetadata Pydantic model. + files_normalized_metadata = normalize_files_metadata( + files_metadata, logger=logger + ) + # Save files metadata to parquet file. + scraper.number_of_files_scraped = export_list_of_models_to_parquet( + scraper.files_parquet_file_path, + files_normalized_metadata, + logger=logger, + ) + # Print scraping statistics. + print_statistics(scraper, logger=logger) if __name__ == "__main__": From e150d2437e38231f59f1b4575f86550878500fd8 Mon Sep 17 00:00:00 2001 From: essmaw Date: Wed, 4 Feb 2026 23:33:40 +0100 Subject: [PATCH 07/43] docs: adding the mddb documentation + update the readme and command line --- README.md | 10 +- docs/mddb.md | 77 +++ pyproject.toml | 2 +- src/mdverse_scrapers/scrapers/mdposit.py | 570 ----------------------- 4 files changed, 83 insertions(+), 576 deletions(-) create mode 100644 docs/mddb.md delete mode 100644 src/mdverse_scrapers/scrapers/mdposit.py diff --git a/README.md b/README.md index 73c27c4..cb79ce3 100644 --- a/README.md +++ b/README.md @@ -171,19 +171,19 @@ This command will: 5. Save the extracted metadata to Parquet files -## Scrape MDposit +## Scrape MDDB -Have a look to the notes regarding [MDposit](docs/mdposit.md) and its API. +Have a look at the notes regarding [MDDB](docs/mddb.md) and its API. -Scrape MDposit to collect molecular dynamics (MD) datasets and files: +Scrape MDDB (MDposit MMB node and MDposit Inria node) to collect molecular dynamics (MD) datasets and files: ```bash -uv run scrape-mdposit --output-dir data +uv run scrape-mddb --output-dir data ``` This command will: -1. 
Search for molecular dynamics entries and files through the MDposit API. +1. Search for molecular dynamics datasets and files through the MDposit API nodes. 2. Parse metadata and validate them using the Pydantic models `DatasetMetadata` and `FileMetadata`. 3. Save validated files and datasets metadata. diff --git a/docs/mddb.md b/docs/mddb.md new file mode 100644 index 0000000..99f64d3 --- /dev/null +++ b/docs/mddb.md @@ -0,0 +1,77 @@ +# MDDB + +> The [MDDB (Molecular Dynamics Data Bank) project](https://mddbr.eu/about/) is an initiative to collect, preserve, and share molecular dynamics (MD) simulation data. As part of this project, **MDposit** is an open platform that provides web access to atomistic MD simulations. Its goal is to facilitate and promote data sharing within the global scientific community to advance research. + +The MDDB infrastructure is distributed across **two MDposit nodes**. Both nodes expose the same REST API entry points. The only difference is the base URL used to access the API. + +## MDposit MMB node + +- web site: +- documentation: +- API: +- API base URL: + +## MDposit INRIA node + +- web site: +- documentation: +- API: +- API base URL: + + +No account / token is needed to access the MDposit API. + +## Finding molecular dynamics datasets and files + +### Datasets + +In MDposit, a dataset (a simulation and its related files) is called an "[project](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)" and a project can contains multiple replicas, each identified by `project_id`.`replica_id`. 
+ + +For example, the project [A026F](https://mmb.mddbr.eu/#/id/A026F/overview) contains four replicas: + - `A026F.1`: https://mmb.mddbr.eu/#/id/A026F.1/overview + - `A026F.2`: https://mmb.mddbr.eu/#/id/A026F.2/overview + - `A026F.3`: https://mmb.mddbr.eu/#/id/A026F.3/overview + - `A026F.4`: https://mmb.mddbr.eu/#/id/A026F.4/overview + + +API entry point to search for all datasets at once: + +- Endpoint: `/projects` +- HTTP method: GET +- [documentation](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects) + + +### Files + +API endpoint to get files for a given replica of a project: + +- Endpoint: `/projects/{project_id.replica_id}/filenotes` +- HTTP method: GET +- [documentation](https://mmb.mddbr.eu/api/rest/docs/#/filenotes/get_projects__projectAccessionOrID__filenotes) + +## Examples + +### Project `A026F` + +- Project id: `A026F.1` +- [project on MDposit GUI](https://mmb.mddbr.eu/#/id/A026F.1/overview) +- [project on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A026F.1) + +Description: + +> Multi-scale simulation approaches which couple the molecular and neuronal simulations to predict the variation in the membrane potential and the neural spikes. + +- [files on MDposit GUI](https://mmb.mddbr.eu/#/id/A026F.1/files) +- [files on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A026F.1/filenotes) + +### Project `A025U` + +- Project id: `A025U.1` +- [project on MDposit GUI](https://mmb.mddbr.eu/#/id/A025U/overview) +- [project on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A025U.2) + +Remark: no description is provided for this dataset. 
+ +- [files on MDposit GUI](https://mmb.mddbr.eu/#/id/A025U/files) +- [files on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A025U.2/filenotes) diff --git a/pyproject.toml b/pyproject.toml index 6f40827..0fb98da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,4 +72,4 @@ scrape-zenodo = "mdverse_scrapers.scrapers.zenodo:main" scrape-figshare = "mdverse_scrapers.scrapers.figshare:main" scrape-nomad = "mdverse_scrapers.scrapers.nomad:main" scrape-atlas = "mdverse_scrapers.scrapers.atlas:main" -scrape-mdposit = "mdverse_scrapers.scrapers.mdposit:main" +scrape-mddb = "mdverse_scrapers.scrapers.mddb:main" diff --git a/src/mdverse_scrapers/scrapers/mdposit.py b/src/mdverse_scrapers/scrapers/mdposit.py deleted file mode 100644 index b06a63f..0000000 --- a/src/mdverse_scrapers/scrapers/mdposit.py +++ /dev/null @@ -1,570 +0,0 @@ -"""Scrape molecular dynamics simulation datasets and files from the MDDB. - -This script extracts molecular dynamics datasets produced within the -MDDB (Molecular Dynamics Data Bank) project, which is distributed across -two nodes: - -- MDPOSIT MMB node (https://mmb-dev.mddbr.eu/#/browse) -- MDPOSIT INRIA node https://dynarepo.inria.fr/#/ -""" - -import json -import sys -from pathlib import Path -from typing import Any - -import click -import httpx -import loguru - -from ..core.logger import create_logger -from ..core.network import ( - HttpMethod, - create_httpx_client, - is_connection_to_server_working, - make_http_request_with_retries, -) -from ..core.toolbox import print_statistics -from ..models.dataset import DatasetMetadata -from ..models.enums import DatasetSourceName, MoleculeType -from ..models.scraper import ScraperContext -from ..models.simulation import ForceFieldModel, Molecule, Software -from ..models.utils import ( - export_list_of_models_to_parquet, - normalize_datasets_metadata, - normalize_files_metadata, -) - -MDDB_REPOSITORIES = { - DatasetSourceName.MDPOSIT_MMB_NODE: 
"https://mmb-dev.mddbr.eu/api/rest/v1", - DatasetSourceName.MDPOSIT_INRIA_NODE: "https://inria.mddbr.eu/api/rest/v1", -} - - -def scrape_all_datasets( - client: httpx.Client, - query_entry_point: str, - node_name: DatasetSourceName, - page_size: int = 50, - logger: "loguru.Logger" = loguru.logger, - scraper: ScraperContext | None = None, -) -> list[dict]: - """ - Scrape Molecular Dynamics-related datasets from the MDposit API. - - Within the MDposit terminology, datasets are referred to as "projects". - - Parameters - ---------- - client: httpx.Client - The HTTPX client to use for making requests. - query_entry_point: str - The entry point of the API request. - node_name: DatasetSourceName - MDDB node name for logging. - page_size: int - Number of entries to fetch per page. - logger: "loguru.Logger" - Logger for logging messages. - scraper: ScraperContext | None - Optional scraper context. When provided and running in debug mode, - dataset scraping is intentionally stopped early to limit the amount - of retrieved data. - - Returns - ------- - list[dict]: - A list of MDposit entries. - """ - logger.info(f"Scraping molecular dynamics datasets from {node_name}.") - logger.info(f"Using batches of {page_size} datasets.") - all_datasets = [] - - # Start by requesting the first page to get total number of datasets. 
- logger.info("Requesting first page to get total number of datasets...") - page = 0 # start with first page - - while True: - response = make_http_request_with_retries( - client, - f"{query_entry_point}?limit={page_size}&page={page}", - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.2, - ) - - if not response: - logger.error("Failed to fetch data from MDposit API.") - logger.error("Jumping to next iteration.") - page += 1 - continue - - try: - response_json = response.json() - datasets = response_json.get("projects", []) - total_datasets = response_json.get("filteredCount") - - if page == 0 and total_datasets is not None: - logger.info(f"Found a total of {total_datasets:,} datasets in MDposit.") - - if not datasets: - logger.info("No more datasets returned by API. Stopping pagination.") - break - - all_datasets.extend(datasets) - - logger.info(f"Scraped page {page} with {len(datasets)} datasets.") - if total_datasets: - logger.info( - f"Scraped {len(all_datasets)} datasets " - f"({len(all_datasets):,}/{total_datasets:,} " - f"{len(all_datasets) / total_datasets:.0%})" - ) - logger.debug("First dataset metadata on this page:") - logger.debug(datasets[0] if datasets else "No datasets on this page") - - if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 120: - logger.warning("Debug mode is ON: stopping after 120 datasets.") - return all_datasets - - except (json.decoder.JSONDecodeError, ValueError) as exc: - logger.error(f"Error while parsing MDposit response: {exc}") - logger.error("Jumping to next iteration.") - - page += 1 # increment page for next iteration - - logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.") - return all_datasets - - -def extract_software_and_version( - dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger -) -> list[Software] | None: - """ - Extract software names and versions from the nested dataset dictionary. 
- - Parameters - ---------- - dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. - dataset_id: str - Identifier of the dataset, used for logging. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[Software] | None - A list of Software instances with `name` and `version` fields, None otherwise. - """ - try: - name = dataset_metadata.get("PROGRAM") - version = dataset_metadata.get("VERSION") - if not name: - return None - return [Software(name=name, version=version)] - except (ValueError, KeyError, TypeError) as e: - logger.warning(f"Error parsing software info for dataset {dataset_id}: {e}") - return None - - -def extract_forcefields_and_version( - dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger -) -> list[ForceFieldModel] | None: - """ - Extract forcefield or model names and versions from the nested dataset dictionary. - - Parameters - ---------- - dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. - dataset_id: str - Identifier of the dataset entry, used for logging. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[ForceFieldModel] | None - A list of forcefield or model instances with `name` and `version` fields, - None otherwise. - """ - try: - names = dataset_metadata.get("FF") - # Adding the water model. - # Exemple: TIP3P. - water_model = dataset_metadata.get("WAT") - if water_model: - names.append(water_model) - if not names: - return None - return [ForceFieldModel(name=name) for name in names] - except (ValueError, KeyError) as e: - logger.warning( - f"Error parsing forcefield or model info for dataset {dataset_id}: {e}" - ) - return None - - -def extract_molecules( - dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger -) -> list[Molecule] | None: - """ - Extract molecule names and types from the nested dataset dictionary. 
- - Parameters - ---------- - dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. - dataset_id: str - Identifier of the dataset, used for logging. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[Molecule] | None - A list of molecules instances with `name` and `type` fields, - None otherwise. - """ - molecules = [] - try: - prot_seqs = dataset_metadata.get("PROTSEQ") or [] - nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] - ligands = dataset_metadata.get("LIGANDS") or [] - - for seq in prot_seqs: - molecules.append(Molecule(name=seq, type=MoleculeType.PROTEIN)) - - for seq in nucl_seqs: - molecules.append(Molecule(name=seq, type=MoleculeType.NUCLEIC)) - - for ligand in ligands: - molecules.append(Molecule(name=ligand)) - - if not molecules: - logger.warning(f"No molecules found in dataset {dataset_id}.") - return None - return molecules - - except (ValueError, KeyError) as e: - logger.warning(f"Error parsing molecules info for dataset {dataset_id}: {e}") - return None - - -def extract_datasets_metadata( - datasets: list[dict[str, Any]], - node_name: DatasetSourceName, - logger: "loguru.Logger" = loguru.logger, -) -> list[dict]: - """ - Extract relevant metadata from raw MDposit datasets metadata. - - Parameters - ---------- - datasets: List[Dict[str, Any]] - List of raw MDposit datasets metadata. - node_name: DatasetSourceName - MDDB node name for the dataset url. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[dict] - List of dataset metadata dictionaries. 
- """ - datasets_metadata = [] - for dataset in datasets: - # Get the dataset id - dataset_id = dataset.get("accession") - logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") - # Create the dataset url depending on the node - if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: - dataset_url = f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" - elif node_name is DatasetSourceName.MDPOSIT_INRIA_NODE: - dataset_url = f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" - else: - logger.warning( - f"Unknown MDDB node '{node_name}'." - f"Cannot build entry URL for dataset {dataset_id}." - ) - - dataset_metadata = dataset.get("metadata", {}) - links = dataset_metadata.get("CITATION") - links_list = [links] if links else None - a = dataset_metadata.get("AUTHORS") - author_names = a if isinstance(a, list) else [a] if a else None - metadata = { - "dataset_repository_name": node_name.value, - "dataset_id_in_repository": dataset_id, - "dataset_url_in_repository": dataset_url, - "dataset_project_name": DatasetSourceName.MDDB, - "external_links": links_list, - "title": dataset_metadata.get("NAME"), - "date_created": dataset.get("creationDate"), - "date_last_updated": dataset.get("updateDate"), - "number_of_files": len(dataset.get("files", [])), - "author_names": author_names, - "license": dataset_metadata.get("LICENSE"), - "description": dataset_metadata.get("DESCRIPTION"), - "total_number_of_atoms": dataset_metadata.get("atomCount"), - } - # Extract simulation metadata if available. - # Software names with their versions. - metadata["software"] = extract_software_and_version( - dataset_metadata, dataset_id, logger - ) - # Forcefield and model names with their versions. - metadata["forcefields"] = extract_forcefields_and_version( - dataset_metadata, dataset_id, logger - ) - # Molecules with their nb of atoms and number total of atoms. - metadata["molecules"] = extract_molecules(dataset_metadata, dataset_id, logger) - # Time step in fs. 
- metadata["simulation_timesteps_in_fs"] = [dataset_metadata.get("TIMESTEP")] - # Temperatures in kelvin - metadata["simulation_temperatures_in_kelvin"] = [dataset_metadata.get("TEMP")] - datasets_metadata.append(metadata) - logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") - return datasets_metadata - - -def scrape_files_for_one_dataset( - client: httpx.Client, - url: str, - dataset_id: str, - logger: "loguru.Logger" = loguru.logger, -) -> dict | None: - """ - Scrape files metadata for a given MDposit dataset. - - Parameters - ---------- - client: httpx.Client - The HTTPX client to use for making requests. - url: str - The URL endpoint. - dataset_id: str - The unique identifier of the dataset in MDposit. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - dict | None - File metadata dictionary for the dataset. - """ - logger.info(f"Scraping files for dataset ID: {dataset_id}") - response = make_http_request_with_retries( - client, - url, - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.1, - ) - if not response: - logger.error("Failed to fetch files metadata.") - return None - return response.json() - - -def scrape_files_for_all_datasets( - client: httpx.Client, - datasets: list[DatasetMetadata], - node_base_url: str, - logger: "loguru.Logger" = loguru.logger, -) -> list[dict]: - """Scrape files metadata for all datasets in MDposit API. - - Parameters - ---------- - client: httpx.Client - The HTTPX client to use for making requests. - datasets: list[DatasetMetadata] - List of datasets to scrape files metadata for. - node_base_url: str - Base url of the specific node of MDposit API. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[dict] - List of files metadata dictionaries. 
- """ - all_files_metadata = [] - for dataset_count, dataset in enumerate(datasets, start=1): - dataset_id = dataset.dataset_id_in_repository - files_metadata = scrape_files_for_one_dataset( - client, - url=f"{node_base_url}/projects/{dataset_id}/filenotes", - dataset_id=dataset_id, - logger=logger, - ) - if not files_metadata: - continue - # Extract relevant files metadata. - logger.info(f"Getting files metadata for dataset: {dataset_id}") - files_metadata = extract_files_metadata( - files_metadata, node_base_url, dataset, logger=logger - ) - all_files_metadata += files_metadata - # Normalize files metadata with pydantic model (FileMetadata) - logger.info(f"Total files found: {len(all_files_metadata):,}") - logger.info( - "Extracted files metadata for " - f"{dataset_count:,}/{len(datasets):,} " - f"({dataset_count / len(datasets):.0%}) datasets." - ) - return all_files_metadata - - -def extract_files_metadata( - raw_metadata: dict, - node_base_url: str, - dataset: DatasetMetadata, - logger: "loguru.Logger" = loguru.logger, -) -> list[dict]: - """ - Extract relevant metadata from raw MDposit files metadata. - - Parameters - ---------- - raw_metadata: dict - Raw files metadata. - node_base_url: str - The unique identifier of the dataset in MDposit. - dataset: DatasetMetadata - Normalized dataset to scrape files metadata for. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[dict] - List of select files metadata. 
- """ - logger.info("Extracting files metadata...") - files_metadata = [] - for mdposit_file in raw_metadata: - dataset_id = dataset.dataset_id_in_repository - file_name = Path(mdposit_file.get("filename")) - file_type = file_name.suffix.lstrip(".") - node_base_url_for_file = node_base_url.replace("/v1", "") - file_path_url = ( - f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" - ) - - parsed_file = { - "dataset_repository_name": dataset.dataset_repository_name, - "dataset_id_in_repository": dataset_id, - "dataset_url_in_repository": dataset.dataset_url_in_repository, - "file_name": str(file_name), - "file_type": file_type, - "file_size_in_bytes": mdposit_file.get("length", None), - "file_md5": mdposit_file.get("md5", None), - "file_url_in_repository": file_path_url, - } - files_metadata.append(parsed_file) - logger.info(f"Extracted metadata for {len(files_metadata)} files.") - return files_metadata - - -@click.command( - help="Command line interface for MDverse scrapers", - epilog="Happy scraping!", -) -@click.option( - "--output-dir", - "output_dir_path", - type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), - required=True, - help="Output directory path to save results.", -) -@click.option( - "--debug", - "is_in_debug_mode", - is_flag=True, - default=False, - help="Enable debug mode.", -) -def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: - """Scrape molecular dynamics datasets and files from MDDB.""" - # Create HTTPX client - client = create_httpx_client() - - # Iterate over the nodes - for data_source_name, base_url in MDDB_REPOSITORIES.items(): - # Create scraper context. - scraper = ScraperContext( - data_source_name=data_source_name, - output_dir_path=output_dir_path, - is_in_debug_mode=is_in_debug_mode, - ) - # Create logger. 
- level = "DEBUG" if scraper.is_in_debug_mode else "INFO" - logger = create_logger(logpath=scraper.log_file_path, level=level) - # Print scraper configuration. - logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) - logger.info(f"Starting {data_source_name.name} data scraping...") - # Check connection to the API - if is_connection_to_server_working( - client, f"{base_url}/projects/summary", logger=logger - ): - logger.success(f"Connection to {data_source_name} API successful!") - else: - logger.critical(f"Connection to {data_source_name} API failed.") - logger.critical("Aborting.") - sys.exit(1) - - # Scrape the datasets metadata. - datasets_raw_metadata = scrape_all_datasets( - client, - query_entry_point=f"{base_url}/projects", - node_name=data_source_name, - logger=logger, - scraper=scraper, - ) - if not datasets_raw_metadata: - logger.critical(f"No datasets found in {data_source_name}.") - logger.critical("Aborting.") - sys.exit(1) - - # Select datasets metadata - datasets_selected_metadata = extract_datasets_metadata( - datasets_raw_metadata, data_source_name, logger=logger - ) - # Validate datasets metadata with the DatasetMetadata Pydantic model. - datasets_normalized_metadata = normalize_datasets_metadata( - datasets_selected_metadata, logger=logger - ) - # Save datasets metadata to parquet file. - scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( - scraper.datasets_parquet_file_path, - datasets_normalized_metadata, - logger=logger, - ) - # Scrape NOMAD files metadata. - files_metadata = scrape_files_for_all_datasets( - client, - datasets_normalized_metadata, - base_url, - logger=logger, - ) - # Validate NOMAD files metadata with the FileMetadata Pydantic model. - files_normalized_metadata = normalize_files_metadata( - files_metadata, logger=logger - ) - # Save files metadata to parquet file. 
- scraper.number_of_files_scraped = export_list_of_models_to_parquet( - scraper.files_parquet_file_path, - files_normalized_metadata, - logger=logger, - ) - # Print scraping statistics. - print_statistics(scraper, logger=logger) - - -if __name__ == "__main__": - main() From e3c5e3803de8b43a4ebb229c647656586c0464b5 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 00:44:05 +0100 Subject: [PATCH 08/43] feat: refactor the code and resolve AttributeError --- src/mdverse_scrapers/scrapers/mddb.py | 574 ++++++++++++++++++++++++++ 1 file changed, 574 insertions(+) create mode 100644 src/mdverse_scrapers/scrapers/mddb.py diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py new file mode 100644 index 0000000..d57256b --- /dev/null +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -0,0 +1,574 @@ +"""Scrape molecular dynamics simulation datasets and files from the MDDB. + +This script extracts molecular dynamics datasets produced within the +MDDB (Molecular Dynamics Data Bank) project, which is distributed across +two nodes: + +- MDPOSIT MMB node (https://mmb-dev.mddbr.eu/#/browse) +- MDPOSIT INRIA node https://dynarepo.inria.fr/#/ +""" + +import json +import sys +from pathlib import Path +from typing import Any + +import click +import httpx +import loguru + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + is_connection_to_server_working, + make_http_request_with_retries, +) +from ..core.toolbox import print_statistics +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetSourceName, MoleculeType +from ..models.scraper import ScraperContext +from ..models.simulation import ForceFieldModel, Molecule, Software +from ..models.utils import ( + export_list_of_models_to_parquet, + normalize_datasets_metadata, + normalize_files_metadata, +) + +MDDB_REPOSITORIES = { + DatasetSourceName.MDPOSIT_MMB_NODE: "https://mmb-dev.mddbr.eu/api/rest/v1", + 
DatasetSourceName.MDPOSIT_INRIA_NODE: "https://inria.mddbr.eu/api/rest/v1", +} + + +def scrape_all_datasets( + client: httpx.Client, + query_entry_point: str, + node_name: DatasetSourceName, + page_size: int = 50, + logger: "loguru.Logger" = loguru.logger, + scraper: ScraperContext | None = None, +) -> list[dict]: + """ + Scrape Molecular Dynamics-related datasets from the MDposit API. + + Within the MDposit terminology, datasets are referred to as "projects". + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + query_entry_point: str + The entry point of the API request. + node_name: DatasetSourceName + MDDB node name for logging. + page_size: int + Number of entries to fetch per page. + logger: "loguru.Logger" + Logger for logging messages. + scraper: ScraperContext | None + Optional scraper context. When provided and running in debug mode, + dataset scraping is intentionally stopped early to limit the amount + of retrieved data. + + Returns + ------- + list[dict]: + A list of MDposit entries. + """ + logger.info(f"Scraping molecular dynamics datasets from {node_name}.") + logger.info(f"Using batches of {page_size} datasets.") + all_datasets = [] + + # Start by requesting the first page to get total number of datasets. 
+ logger.info("Requesting first page to get total number of datasets...") + page = 0 # start with first page + + while True: + response = make_http_request_with_retries( + client, + f"{query_entry_point}?limit={page_size}&page={page}", + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.2, + ) + + if not response: + logger.error("Failed to fetch data from MDposit API.") + logger.error("Jumping to next iteration.") + page += 1 + continue + + try: + response_json = response.json() + datasets = response_json.get("projects", []) + total_datasets = response_json.get("filteredCount") + + if page == 0 and total_datasets is not None: + logger.info(f"Found a total of {total_datasets:,} datasets in MDposit.") + + if not datasets: + logger.info("No more datasets returned by API. Stopping pagination.") + break + + all_datasets.extend(datasets) + + logger.info(f"Scraped page {page} with {len(datasets)} datasets.") + if total_datasets: + logger.info( + f"Scraped {len(all_datasets)} datasets " + f"({len(all_datasets):,}/{total_datasets:,} " + f"{len(all_datasets) / total_datasets:.0%})" + ) + logger.debug("First dataset metadata on this page:") + logger.debug(datasets[0] if datasets else "No datasets on this page") + + if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 120: + logger.warning("Debug mode is ON: stopping after 120 datasets.") + return all_datasets + + except (json.decoder.JSONDecodeError, ValueError) as exc: + logger.error(f"Error while parsing MDposit response: {exc}") + logger.error("Jumping to next iteration.") + + page += 1 # increment page for next iteration + + logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.") + return all_datasets + + +def extract_software_and_version( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[Software] | None: + """ + Extract software names and versions from the nested dataset dictionary. 
+
+    Parameters
+    ----------
+    dataset_metadata: dict
+        The dataset dictionary from which to extract software information.
+    dataset_id: str
+        Identifier of the dataset, used for logging.
+    logger: "loguru.Logger"
+        Logger for logging messages.
+
+    Returns
+    -------
+    list[Software] | None
+        A list of Software instances with `name` and `version` fields, None otherwise.
+    """
+    try:
+        name = dataset_metadata.get("PROGRAM")
+        version = dataset_metadata.get("VERSION")
+        if not name:
+            return None
+        return [Software(name=name, version=str(version))]
+    except (ValueError, KeyError, TypeError) as e:
+        logger.warning(f"Error parsing software info for dataset {dataset_id}: {e}")
+        return None
+
+
+def extract_forcefield_or_model_and_version(
+    dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger
+) -> list[ForceFieldModel] | None:
+    """
+    Extract forcefield or model names and versions from the nested dataset dictionary.
+
+    Parameters
+    ----------
+    dataset_metadata: dict
+        The dataset dictionary from which to extract forcefield or model information.
+    dataset_id: str
+        Identifier of the dataset entry, used for logging.
+    logger: "loguru.Logger"
+        Logger for logging messages.
+
+    Returns
+    -------
+    list[ForceFieldModel] | None
+        A list of forcefield or model instances with `name` and `version` fields,
+        None otherwise.
+    """
+    ffm_names = []
+    try:
+        # Add the forcefield names.
+        ffm_names.extend(dataset_metadata.get("FF") or [])
+        # Add the water model.
+        # Example: TIP3P.
+ water_model = dataset_metadata.get("WAT") + if water_model: + ffm_names.append(water_model) + return [ForceFieldModel(name=ffm_name) for ffm_name in ffm_names if ffm_name] + except (ValueError, KeyError) as e: + logger.warning( + f"Error parsing forcefield or model info for dataset {dataset_id}: {e}" + ) + return None + + +def extract_molecules( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[Molecule] | None: + """ + Extract molecule names and types from the nested dataset dictionary. + + Parameters + ---------- + dataset_metadata: dict + The dataset dictionnary from which to extract molecules information. + dataset_id: str + Identifier of the dataset, used for logging. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[Molecule] | None + A list of molecules instances with `name` and `type` fields, + None otherwise. + """ + molecules = [] + try: + prot_seqs = dataset_metadata.get("PROTSEQ") or [] + nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] + ligands = dataset_metadata.get("LIGANDS") or [] + + for seq in prot_seqs: + molecules.append(Molecule(name=seq, type=MoleculeType.PROTEIN)) + + for seq in nucl_seqs: + molecules.append(Molecule(name=seq, type=MoleculeType.NUCLEIC_ACID)) + + for ligand in ligands: + molecules.append(Molecule(name=ligand)) + + if not molecules: + logger.warning(f"No molecules found in dataset {dataset_id}.") + return None + return molecules + + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing molecules info for dataset {dataset_id}: {e}") + return None + + +def extract_datasets_metadata( + datasets: list[dict[str, Any]], + node_name: DatasetSourceName, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw MDposit datasets metadata. + + Parameters + ---------- + datasets: List[Dict[str, Any]] + List of raw MDposit datasets metadata. 
+ node_name: DatasetSourceName + MDDB node name for the dataset url. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[dict] + List of dataset metadata dictionaries. + """ + datasets_metadata = [] + for dataset in datasets: + # Get the dataset id + dataset_id = dataset.get("accession") + logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") + # Create the dataset url depending on the node + if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: + dataset_url = f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" + elif node_name is DatasetSourceName.MDPOSIT_INRIA_NODE: + dataset_url = f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" + else: + logger.warning( + f"Unknown MDDB node '{node_name}'." + f"Cannot build entry URL for dataset {dataset_id}." + ) + + dataset_metadata = dataset.get("metadata", {}) + links = dataset_metadata.get("CITATION") + links_list = [links] if links else None + a = dataset_metadata.get("AUTHORS") + author_names = a if isinstance(a, list) else [a] if a else None + metadata = { + "dataset_repository_name": node_name.value, + "dataset_id_in_repository": dataset_id, + "dataset_url_in_repository": dataset_url, + "dataset_project_name": DatasetSourceName.MDDB, + "external_links": links_list, + "title": dataset_metadata.get("NAME"), + "date_created": dataset.get("creationDate"), + "date_last_updated": dataset.get("updateDate"), + "number_of_files": len(dataset.get("files", [])), + "author_names": author_names, + "license": dataset_metadata.get("LICENSE"), + "description": dataset_metadata.get("DESCRIPTION"), + "total_number_of_atoms": dataset_metadata.get("mdAtoms"), + } + # Extract simulation metadata if available. + # Software names with their versions. + metadata["software"] = extract_software_and_version( + dataset_metadata, dataset_id, logger + ) + # Forcefield and model names with their versions. 
+ metadata["forcefields"] = extract_forcefield_or_model_and_version( + dataset_metadata, dataset_id, logger + ) + # Molecules with their nb of atoms and number total of atoms. + metadata["molecules"] = extract_molecules(dataset_metadata, dataset_id, logger) + # Time step in fs. + time_step = dataset_metadata.get("TIMESTEP") + metadata["simulation_timesteps_in_fs"] = [time_step] if time_step else None + # Temperatures in kelvin + temperature = dataset_metadata.get("TEMP") + metadata["simulation_temperatures_in_kelvin"] = ( + [temperature] if temperature else None + ) + datasets_metadata.append(metadata) + logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") + return datasets_metadata + + +def scrape_files_for_one_dataset( + client: httpx.Client, + url: str, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> dict | None: + """ + Scrape files metadata for a given MDposit dataset. + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + url: str + The URL endpoint. + dataset_id: str + The unique identifier of the dataset in MDposit. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + dict | None + File metadata dictionary for the dataset. + """ + logger.info(f"Scraping files for dataset ID: {dataset_id}") + response = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + ) + if not response: + logger.error("Failed to fetch files metadata.") + return None + return response.json() + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets: list[DatasetMetadata], + node_base_url: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """Scrape files metadata for all datasets in MDposit API. + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + datasets: list[DatasetMetadata] + List of datasets to scrape files metadata for. 
+    node_base_url: str
+        Base url of the specific node of MDposit API.
+    logger: "loguru.Logger"
+        Logger for logging messages.
+
+    Returns
+    -------
+    list[dict]
+        List of files metadata dictionaries.
+    """
+    all_files_metadata = []
+    for dataset_count, dataset in enumerate(datasets, start=1):
+        dataset_id = dataset.dataset_id_in_repository
+        files_metadata = scrape_files_for_one_dataset(
+            client,
+            url=f"{node_base_url}/projects/{dataset_id}/filenotes",
+            dataset_id=dataset_id,
+            logger=logger,
+        )
+        if not files_metadata:
+            continue
+        # Extract relevant files metadata.
+        logger.info(f"Getting files metadata for dataset: {dataset_id}")
+        files_metadata = extract_files_metadata(
+            files_metadata, node_base_url, dataset, logger=logger
+        )
+        all_files_metadata += files_metadata
+    # Normalize files metadata with pydantic model (FileMetadata)
+    logger.info(f"Total files found: {len(all_files_metadata):,}")
+    logger.info(
+        "Extracted files metadata for "
+        f"{dataset_count:,}/{len(datasets):,} "
+        f"({dataset_count / len(datasets):.0%}) datasets."
+    )
+    return all_files_metadata
+
+
+def extract_files_metadata(
+    raw_metadata: list[dict[str, Any]],
+    node_base_url: str,
+    dataset: DatasetMetadata,
+    logger: "loguru.Logger" = loguru.logger,
+) -> list[dict[str, Any]]:
+    """
+    Extract relevant metadata from raw MDposit files metadata.
+
+    Parameters
+    ----------
+    raw_metadata: list[dict[str, Any]]
+        Raw files metadata.
+    node_base_url: str
+        Base url of the specific node of MDposit API.
+    dataset: DatasetMetadata
+        Normalized dataset to scrape files metadata for.
+    logger: "loguru.Logger"
+        Logger for logging messages.
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        List of selected files metadata.
+ """ + logger.info("Extracting files metadata...") + files_metadata = [] + for mdposit_file in raw_metadata: + dataset_id = dataset.dataset_id_in_repository + file_name = Path(mdposit_file.get("filename")) + file_type = file_name.suffix.lstrip(".") + node_base_url_for_file = node_base_url.replace("/v1", "") + file_path_url = ( + f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" + ) + + parsed_file = { + "dataset_repository_name": dataset.dataset_repository_name, + "dataset_id_in_repository": dataset_id, + "dataset_url_in_repository": dataset.dataset_url_in_repository, + "file_name": str(file_name), + "file_type": file_type, + "file_size_in_bytes": mdposit_file.get("length", None), + "file_md5": mdposit_file.get("md5", None), + "file_url_in_repository": file_path_url, + } + files_metadata.append(parsed_file) + logger.info(f"Extracted metadata for {len(files_metadata)} files.") + return files_metadata + + +@click.command( + help="Command line interface for MDverse scrapers", + epilog="Happy scraping!", +) +@click.option( + "--output-dir", + "output_dir_path", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + required=True, + help="Output directory path to save results.", +) +@click.option( + "--debug", + "is_in_debug_mode", + is_flag=True, + default=False, + help="Enable debug mode.", +) +def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: + """Scrape molecular dynamics datasets and files from MDDB.""" + # Create HTTPX client + client = create_httpx_client() + + # Iterate over the nodes + for data_source_name, base_url in MDDB_REPOSITORIES.items(): + # Create scraper context. + scraper = ScraperContext( + data_source_name=data_source_name, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. 
+        level = "DEBUG" if scraper.is_in_debug_mode else "INFO"
+        logger = create_logger(logpath=scraper.log_file_path, level=level)
+        # Print scraper configuration.
+        logger.debug(scraper.model_dump_json(indent=4, exclude={"token"}))
+        logger.info(f"Starting {data_source_name.name} data scraping...")
+        # Check connection to the API
+        if is_connection_to_server_working(
+            client, f"{base_url}/projects/summary", logger=logger
+        ):
+            logger.success(f"Connection to {data_source_name} API successful!")
+        else:
+            logger.critical(f"Connection to {data_source_name} API failed.")
+            logger.critical("Aborting.")
+            sys.exit(1)
+
+        # Scrape the datasets metadata.
+        datasets_raw_metadata = scrape_all_datasets(
+            client,
+            query_entry_point=f"{base_url}/projects",
+            node_name=data_source_name,
+            logger=logger,
+            scraper=scraper,
+        )
+        if not datasets_raw_metadata:
+            logger.critical(f"No datasets found in {data_source_name}.")
+            logger.critical("Aborting.")
+            sys.exit(1)
+
+        # Select datasets metadata
+        datasets_selected_metadata = extract_datasets_metadata(
+            datasets_raw_metadata, data_source_name, logger=logger
+        )
+        # Validate datasets metadata with the DatasetMetadata Pydantic model.
+        datasets_normalized_metadata = normalize_datasets_metadata(
+            datasets_selected_metadata, logger=logger
+        )
+        # Save datasets metadata to parquet file.
+        scraper.number_of_datasets_scraped = export_list_of_models_to_parquet(
+            scraper.datasets_parquet_file_path,
+            datasets_normalized_metadata,
+            logger=logger,
+        )
+        # Scrape MDDB files metadata.
+        files_metadata = scrape_files_for_all_datasets(
+            client,
+            datasets_normalized_metadata,
+            base_url,
+            logger=logger,
+        )
+        # Validate MDDB files metadata with the FileMetadata Pydantic model.
+        files_normalized_metadata = normalize_files_metadata(
+            files_metadata, logger=logger
+        )
+        # Save files metadata to parquet file.
+ scraper.number_of_files_scraped = export_list_of_models_to_parquet( + scraper.files_parquet_file_path, + files_normalized_metadata, + logger=logger, + ) + # Print scraping statistics. + print_statistics(scraper, logger=logger) + + +if __name__ == "__main__": + main() From 5b01789c0386304103e8ee6d66db9081cd89b799 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:09:22 +0100 Subject: [PATCH 09/43] feat: add URL computation for ExternalIdentifier based on database name --- src/mdverse_scrapers/models/simulation.py | 49 ++++++++++++++++++----- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index 0ce233a..5cc1775 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ b/src/mdverse_scrapers/models/simulation.py @@ -3,11 +3,16 @@ import re from typing import Annotated -from pydantic import BaseModel, ConfigDict, Field, StringConstraints, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + StringConstraints, + field_validator, + model_validator, +) -from .enums import ExternalDatabaseName - -from .enums import MoleculeType +from .enums import ExternalDatabaseName, MoleculeType DOI = Annotated[ str, @@ -39,6 +44,30 @@ class ExternalIdentifier(BaseModel): None, min_length=1, description="Direct URL to the identifier into the database" ) + @model_validator(mode="after") + def compute_url(self) -> "ExternalIdentifier": + """Compute the URL for the external identifier. + + Parameters + ---------- + self: ExternalIdentifier + The model instance being validated, with all fields already validated. + + Returns + ------- + ExternalIdentifier + The model instance with the URL field computed if it was not provided. 
+ """ + if self.url is not None: + return self + + if self.database_name == ExternalDatabaseName.PDB: + self.url = f"https://www.rcsb.org/structure/{self.identifier}" + elif self.database_name == ExternalDatabaseName.UNIPROT: + self.url = f"https://www.uniprot.org/uniprotkb/{self.identifier}" + + return self + class Molecule(BaseModel): """Molecule in a simulation.""" @@ -53,6 +82,11 @@ class Molecule(BaseModel): "Allowed values in the MoleculeType enum. " "Examples: PROTEIN, ION, LIPID...", ) + number_of_this_molecule_type_in_system: int | None = Field( + None, + ge=0, + description="Number of molecules of this type in the simulation.", + ) number_of_atoms: int | None = Field( None, ge=0, description="Number of atoms in the molecule." ) @@ -60,11 +94,6 @@ class Molecule(BaseModel): sequence: str | None = Field( None, description="Sequence of the molecule for protein and nucleic acid." ) - number_of_molecules: int | None = Field( - None, - ge=0, - description="Number of molecules of this type in the simulation.", - ) external_identifiers: list[ExternalIdentifier] | None = Field( None, description=("List of external database identifiers for this molecule."), @@ -111,7 +140,7 @@ class SimulationMetadata(BaseModel): # Ensure scraped metadata matches the expected schema exactly. 
model_config = ConfigDict(extra="forbid") - software: list[Software] | None = Field( + softwares: list[Software] | None = Field( None, description="List of molecular dynamics tool or software.", ) From 5533d8bcf538d4ded4c8df5e0ff34b62036ac86f Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:12:16 +0100 Subject: [PATCH 10/43] Fix merging of new datasource names into DatasetSourceName instead of ExternalDatabaseName --- src/mdverse_scrapers/models/enums.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index 8fb482f..cba6bd4 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -20,6 +20,9 @@ class DatasetSourceName(StrEnum): ATLAS = "atlas" GPCRMD = "gpcrmd" NMRLIPIDS = "nmrlipids" + MDDB = "mddb" + MDPOSIT_INRIA_NODE = "mdposit_inria_node" + MDPOSIT_MMB_NODE = "mdposit_mmb_node" class ExternalDatabaseName(StrEnum): @@ -27,9 +30,6 @@ class ExternalDatabaseName(StrEnum): PDB = "pdb" UNIPROT = "uniprot" - MDDB = "mddb" - MDPOSIT_INRIA_NODE = "mdposit_inria_node" - MDPOSIT_MMB_NODE = "mdposit_mmb_node" class MoleculeType(StrEnum): From 9ebc838c458d0c7756243aa1a6414c9b3cb5e6e7 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:13:28 +0100 Subject: [PATCH 11/43] feat: enhance molecule extraction to fit the new model and adding UniProt protein name retrieval --- src/mdverse_scrapers/scrapers/mddb.py | 312 +++++++++++++++++++++++--- 1 file changed, 276 insertions(+), 36 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index d57256b..60bc523 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -26,9 +26,9 @@ ) from ..core.toolbox import print_statistics from ..models.dataset import DatasetMetadata -from ..models.enums import DatasetSourceName, MoleculeType +from ..models.enums import DatasetSourceName, 
ExternalDatabaseName, MoleculeType from ..models.scraper import ScraperContext -from ..models.simulation import ForceFieldModel, Molecule, Software +from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software from ..models.utils import ( export_list_of_models_to_parquet, normalize_datasets_metadata, @@ -206,57 +206,297 @@ def extract_forcefield_or_model_and_version( return None -def extract_molecules( - dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger -) -> list[Molecule] | None: +def fetch_uniprot_protein_name( + client, + uniprot_id: str, + logger: "loguru.Logger", +) -> str | None: """ - Extract molecule names and types from the nested dataset dictionary. + Retrieve protein name from UniProt API. + + Parameters + ---------- + client + HTTP client used to perform the request. + uniprot_id: str + UniProt accession identifier. + logger: loguru.Logger + Logger instance. + + Returns + ------- + str | None + Protein full name if available, None otherwise. 
+ """ + logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") + try: + response = make_http_request_with_retries( + client, + f"https://rest.uniprot.org/uniprotkb/{uniprot_id}", + method=HttpMethod.GET, + timeout=30, + delay_before_request=0.1, + ) + data: dict = response.json() + protein_name = ( + data.get("proteinDescription", {}) + .get("recommendedName", {}) + .get("fullName", {}) + .get("value") + ) + if protein_name: + logger.success( + f"Retrieved protein name for UniProt ID {uniprot_id}: {protein_name}" + ) + return protein_name + return f"Protein {uniprot_id}" # Fallback to a generic name if not found + + except (AttributeError, TypeError) as exc: + logger.warning(f"Invalid UniProt response for {uniprot_id}: {exc}") + return None + + +def extract_proteins( + pdb_ids: list[ExternalIdentifier], + references: list[str], + prot_seqs: list[str], + prot_atoms: int, + prot_count: int, + client: "httpx.Client", + dataset_id: str, + logger: "loguru.Logger", +) -> list[Molecule]: + """Extract proteins from dataset metadata. + + Parameters + ---------- + pdb_ids: list[ExternalIdentifier] + List of PDB identifiers to associate with the proteins. + references: list[str] + List of reference identifiers (e.g., UniProt accessions) + to associate with the proteins. + prot_seqs: list[str] + List of protein sequences. + prot_atoms: int + Total number of atoms in the protein. + prot_count: int + Total number of protein molecules in the system. + client: httpx.Client + The HTTP client used for making requests. + dataset_id: str + The ID of the dataset being processed, used for logging. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list[Molecule] + A list of extracted proteins. 
+ """ + molecules = [] + for i, seq in enumerate(prot_seqs): + try: + external_ids = list(pdb_ids) + uniprot_id = references[i] if i < len(references) else None + if uniprot_id: + external_ids.append( + ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier=uniprot_id, + ) + ) + prot_name = ( + fetch_uniprot_protein_name(client, uniprot_id, logger) + if uniprot_id + else f"Protein {i + 1}" + ) + molecules.append( + Molecule( + name=prot_name, + type=MoleculeType.PROTEIN, + sequence=seq, + number_of_atoms=prot_atoms if len(prot_seqs) == 1 else None, + number_of_this_molecule_type_in_system=prot_count, + external_identifiers=external_ids, + ) + ) + except (TypeError, ValueError) as exc: + logger.warning( + f"Skipping protein {i + 1} in dataset {dataset_id} due to {type(exc).__name__}: {exc}" + ) + return molecules + + +def extract_nucleic_acids( + pdb_ids: list[ExternalIdentifier], + nucl_seqs: list[str], + nucl_atoms: list[int], + dataset_id: str, + logger: "loguru.Logger", +) -> list[Molecule]: + """Extract nucleic acids from dataset metadata. + + Parameters + ---------- + pdb_ids: list[ExternalIdentifier] + List of PDB identifiers to associate with the nucleic acids. + nucl_seqs: list[str] + List of nucleic acid sequences. + nucl_atoms: list[int] + List of atom counts for the nucleic acids. + dataset_id: str + The ID of the dataset being processed, used for logging. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list[Molecule] + A list of extracted nucleic acids. 
+ """ + molecules = [] + for i, seq in enumerate(nucl_seqs): + try: + molecules.append( + Molecule( + name=f"Nucleic Acid {i + 1}", + type=MoleculeType.NUCLEIC_ACID, + sequence=seq, + number_of_atoms=nucl_atoms if len(nucl_seqs) == 1 else None, + external_identifiers=pdb_ids, + ) + ) + except (TypeError, ValueError) as exc: + logger.warning( + f"Skipping nucleic acid {i + 1} in dataset {dataset_id} " + f"due to {type(exc).__name__}: {exc}" + ) + return molecules + + +def extract_small_molecules( + dataset_metadata: dict, + dataset_id: str, + logger: "loguru.Logger", +) -> list[Molecule]: + """Extract small molecules (lipids, solvents, ions) from dataset metadata. Parameters ---------- dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. + The dataset metadata containing information about the molecules. dataset_id: str - Identifier of the dataset, used for logging. - logger: "loguru.Logger" + The ID of the dataset being processed, used for logging. + logger: loguru.Logger Logger for logging messages. Returns ------- - list[Molecule] | None - A list of molecules instances with `name` and `type` fields, - None otherwise. + list[Molecule] + A list of extracted small molecules. 
""" molecules = [] - try: - prot_seqs = dataset_metadata.get("PROTSEQ") or [] - nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] - ligands = dataset_metadata.get("LIGANDS") or [] + species_type_map = { + "DPPC": MoleculeType.LIPID, + "SOL": MoleculeType.SOLVENT, + "NA": MoleculeType.ION, + "CL": MoleculeType.ION, + } + for species, mol_type in species_type_map.items(): + try: + count = dataset_metadata.get(species, 0) + if isinstance(count, int) and count > 0: + molecules.append( + Molecule( + name=species, + type=mol_type, + number_of_this_molecule_type_in_system=count, + ) + ) + except (TypeError, ValueError) as exc: + logger.warning( + f"Skipping small molecule {species} in dataset {dataset_id} " + f"due to {type(exc).__name__}: {exc}" + ) + return molecules + - for seq in prot_seqs: - molecules.append(Molecule(name=seq, type=MoleculeType.PROTEIN)) +def extract_molecules( + dataset_metadata: dict, + dataset_id: str, + client: "httpx.Client", + logger: "loguru.Logger" = loguru.logger, +) -> list[Molecule] | None: + """Coordinator function to extract all molecule types from dataset metadata. - for seq in nucl_seqs: - molecules.append(Molecule(name=seq, type=MoleculeType.NUCLEIC_ACID)) + Parameters + ---------- + dataset_metadata: dict + The dataset metadata containing information about the molecules. + dataset_id: str + The ID of the dataset being processed. + client: httpx.Client + The HTTP client used for making requests. + logger: loguru.Logger + The logger used for logging messages. - for ligand in ligands: - molecules.append(Molecule(name=ligand)) + Returns + ------- + list[Molecule] | None + A list of extracted molecules or None if no molecules were found. 
+ """ + molecules: list[Molecule] = [] + + # Normalize common fields + pdbs = dataset_metadata.get("PDBIDS") or [] + references = dataset_metadata.get("REFERENCES") or [] + prot_seqs = dataset_metadata.get("PROTSEQ") or [] + prot_atoms = dataset_metadata.get("PROTATS") or [] + prot_count = dataset_metadata.get("PROT", 0) + nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] + nucl_atoms = (dataset_metadata.get("DNAATS") or []) + ( + dataset_metadata.get("RNAATS") or [] + ) - if not molecules: - logger.warning(f"No molecules found in dataset {dataset_id}.") - return None - return molecules + # Pre-create PDB identifiers + pdb_ids = [ + ExternalIdentifier(database_name=ExternalDatabaseName.PDB, identifier=pdb_id) + for pdb_id in pdbs + ] + + # Extract proteins first + molecules.extend( + extract_proteins( + pdb_ids, + references, + prot_seqs, + prot_atoms, + prot_count, + client, + dataset_id, + logger, + ) + ) + # Then extract nucleic acids + molecules.extend( + extract_nucleic_acids(pdb_ids, nucl_seqs, nucl_atoms, dataset_id, logger) + ) + # Finally extract small molecules like lipids, solvents and ions. + molecules.extend(extract_small_molecules(dataset_metadata, dataset_id, logger)) - except (ValueError, KeyError) as e: - logger.warning(f"Error parsing molecules info for dataset {dataset_id}: {e}") + if not molecules: + logger.warning(f"No molecules found in dataset {dataset_id}.") return None + return molecules + def extract_datasets_metadata( datasets: list[dict[str, Any]], node_name: DatasetSourceName, + client: "httpx.Client", logger: "loguru.Logger" = loguru.logger, -) -> list[dict]: +) -> list[dict[str, Any]]: """ Extract relevant metadata from raw MDposit datasets metadata. @@ -271,7 +511,7 @@ def extract_datasets_metadata( Returns ------- - list[dict] + list[dict[str, Any]] List of dataset metadata dictionaries. """ datasets_metadata = [] @@ -312,15 +552,17 @@ def extract_datasets_metadata( } # Extract simulation metadata if available. 
# Software names with their versions. - metadata["software"] = extract_software_and_version( + metadata["softwares"] = extract_software_and_version( dataset_metadata, dataset_id, logger ) # Forcefield and model names with their versions. - metadata["forcefields"] = extract_forcefield_or_model_and_version( + metadata["forcefields_models"] = extract_forcefield_or_model_and_version( dataset_metadata, dataset_id, logger ) # Molecules with their nb of atoms and number total of atoms. - metadata["molecules"] = extract_molecules(dataset_metadata, dataset_id, logger) + metadata["molecules"] = extract_molecules( + dataset_metadata, dataset_id, client, logger + ) # Time step in fs. time_step = dataset_metadata.get("TIMESTEP") metadata["simulation_timesteps_in_fs"] = [time_step] if time_step else None @@ -454,7 +696,6 @@ def extract_files_metadata( for mdposit_file in raw_metadata: dataset_id = dataset.dataset_id_in_repository file_name = Path(mdposit_file.get("filename")) - file_type = file_name.suffix.lstrip(".") node_base_url_for_file = node_base_url.replace("/v1", "") file_path_url = ( f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" @@ -465,7 +706,6 @@ def extract_files_metadata( "dataset_id_in_repository": dataset_id, "dataset_url_in_repository": dataset.dataset_url_in_repository, "file_name": str(file_name), - "file_type": file_type, "file_size_in_bytes": mdposit_file.get("length", None), "file_md5": mdposit_file.get("md5", None), "file_url_in_repository": file_path_url, @@ -537,7 +777,7 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: # Select datasets metadata datasets_selected_metadata = extract_datasets_metadata( - datasets_raw_metadata, data_source_name, logger=logger + datasets_raw_metadata, data_source_name, client, logger=logger ) # Validate datasets metadata with the DatasetMetadata Pydantic model. 
datasets_normalized_metadata = normalize_datasets_metadata( From 96793e5c705193e3829049b785ee3c244fb2215f Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:14:18 +0100 Subject: [PATCH 12/43] test(simulation): test URL computation for ExternalIdentifier --- tests/models/test_simulation.py | 42 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 25e5d5c..1c9f8e9 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -13,9 +13,9 @@ ) -# ------------------------------------------------------------------- +# -------------------------------------------------- # Test simulation timestep and time positive values -# ------------------------------------------------------------------- +# -------------------------------------------------- @pytest.mark.parametrize( ("values", "should_raise_exception"), [ @@ -35,9 +35,9 @@ def test_positive_simulation_values(values, should_raise_exception): assert metadata.simulation_timesteps_in_fs == values -# ------------------------------------------------------------------- +# ------------------------------ # Test temperature normalization -# ------------------------------------------------------------------- +# ------------------------------ @pytest.mark.parametrize( ("test_temp", "expected_temp_in_kelvin"), [ @@ -54,13 +54,13 @@ def test_temperature_normalization(test_temp, expected_temp_in_kelvin): assert metadata.simulation_temperatures_in_kelvin == expected_temp_in_kelvin -# ------------------------------------------------------------------- +# ---------------------------------------------- # Test software, molecules, forcefields creation -# ------------------------------------------------------------------- +# ---------------------------------------------- def test_structured_fields_creation(): """Test that software, molecules, and forcefields can be created.""" metadata = 
SimulationMetadata( - software=[Software(name="GROMACS", version="2023.1")], + softwares=[Software(name="GROMACS", version="2023.1")], molecules=[ Molecule( name="H2O", @@ -77,7 +77,7 @@ def test_structured_fields_creation(): ], forcefields_models=[ForceFieldModel(name="AMBER", version="ff14SB")], ) - assert metadata.software[0].name == "GROMACS" + assert metadata.softwares[0].name == "GROMACS" assert metadata.molecules[0].number_of_atoms == 3 assert metadata.molecules[0].number_of_molecules == 100 assert metadata.forcefields_models[0].version == "ff14SB" @@ -89,21 +89,39 @@ def test_structured_fields_creation(): assert metadata.molecules[0].external_identifiers[0].identifier == "1ABC" -# ------------------------------------------------------------------- +# ------------------- # Test invalid fields -# ------------------------------------------------------------------- +# ------------------- def test_invalid_fields(): """Test with a non-existing fields.""" with pytest.raises(ValidationError): SimulationMetadata(total_number_of_something=1000) -# ------------------------- +# -------------------------------------- # Test invalid simulation parameter type -# ------------------------- +# -------------------------------------- def test_invalid_simulation_value_type(): """Test that non-numeric strings raise ValidationError.""" with pytest.raises(ValidationError): SimulationMetadata( simulation_timesteps_in_fs=["invalid-value"], # because not a float ) + + +# -------------------------------------- +# Test compute URL in ExternalIdentifier +# -------------------------------------- +def test_compute_url_in_external_identifier(): + """Test that the compute_url method generates the correct URL.""" + identifier = ExternalIdentifier( + database_name=ExternalDatabaseName.PDB, + identifier="1ABC", + ) + assert identifier.url == "https://www.rcsb.org/structure/1ABC" + + identifier = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier="P12345", + ) + 
assert identifier.url == "https://www.uniprot.org/uniprotkb/P12345" From f031e28cfd41fde40aadd25f0034e45cd14eea48 Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:24:38 +0100 Subject: [PATCH 13/43] tests: refactor tests for ExternalIdentifier to account for automatically computed URLs --- tests/models/test_simulation.py | 18 ------------------ tests/models/test_simulation_molecule.py | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 1c9f8e9..599a729 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -107,21 +107,3 @@ def test_invalid_simulation_value_type(): SimulationMetadata( simulation_timesteps_in_fs=["invalid-value"], # because not a float ) - - -# -------------------------------------- -# Test compute URL in ExternalIdentifier -# -------------------------------------- -def test_compute_url_in_external_identifier(): - """Test that the compute_url method generates the correct URL.""" - identifier = ExternalIdentifier( - database_name=ExternalDatabaseName.PDB, - identifier="1ABC", - ) - assert identifier.url == "https://www.rcsb.org/structure/1ABC" - - identifier = ExternalIdentifier( - database_name=ExternalDatabaseName.UNIPROT, - identifier="P12345", - ) - assert identifier.url == "https://www.uniprot.org/uniprotkb/P12345" diff --git a/tests/models/test_simulation_molecule.py b/tests/models/test_simulation_molecule.py index a3e1b55..16ca6da 100644 --- a/tests/models/test_simulation_molecule.py +++ b/tests/models/test_simulation_molecule.py @@ -54,14 +54,12 @@ def test_invalid_number_of_molecules(): "1K79", "https://www.rcsb.org/structure/1K79", ), - (ExternalDatabaseName.PDB, 1234, "1234", None), ( ExternalDatabaseName.UNIPROT, "P06213", "P06213", "https://www.uniprot.org/uniprotkb/P06213/entry", ), - (ExternalDatabaseName.UNIPROT, 123456, "123456", None), ], ) def test_external_identifier_creation( @@ 
-91,3 +89,18 @@ def test_invalid_database_name_in_external_identifiers(): database_name=ExternalDatabaseName.DUMMY, # type: ignore identifier="1ABC", ) + + +def test_compute_url_in_external_identifier(): + """Test that the compute_url method generates the correct URL.""" + identifier = ExternalIdentifier( + database_name=ExternalDatabaseName.PDB, + identifier="1ABC", + ) + assert identifier.url == "https://www.rcsb.org/structure/1ABC" + + identifier = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier="P12345", + ) + assert identifier.url == "https://www.uniprot.org/uniprotkb/P12345" From 6cb949d72008c6be866554fcdcfdb9226df8855e Mon Sep 17 00:00:00 2001 From: essmaw Date: Thu, 5 Feb 2026 20:28:11 +0100 Subject: [PATCH 14/43] refactor: rename number_of_molecules to number_of_this_molecule_type_in_system in tests --- tests/models/test_simulation.py | 4 ++-- tests/models/test_simulation_molecule.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 599a729..7a96112 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -66,7 +66,7 @@ def test_structured_fields_creation(): name="H2O", number_of_atoms=3, formula="H2O", - number_of_molecules=100, + number_of_this_molecule_type_in_system=100, sequence="PEPTIDE", external_identifiers=[ ExternalIdentifier( @@ -79,7 +79,7 @@ def test_structured_fields_creation(): ) assert metadata.softwares[0].name == "GROMACS" assert metadata.molecules[0].number_of_atoms == 3 - assert metadata.molecules[0].number_of_molecules == 100 + assert metadata.molecules[0].number_of_this_molecule_type_in_system == 100 assert metadata.forcefields_models[0].version == "ff14SB" assert metadata.molecules[0].sequence == "PEPTIDE" assert ( diff --git a/tests/models/test_simulation_molecule.py b/tests/models/test_simulation_molecule.py index 16ca6da..73cf6d7 100644 --- 
a/tests/models/test_simulation_molecule.py +++ b/tests/models/test_simulation_molecule.py @@ -36,10 +36,10 @@ def test_invalid_number_of_atoms(): Molecule(name="H2O", number_of_atoms=-10, formula="H2O") -def test_invalid_number_of_molecules(): - """Test that number_of_molecules cannot be negative.""" +def test_invalid_number_of_this_molecule_type_in_system(): + """Test that number_of_this_molecule_type_in_system cannot be negative.""" with pytest.raises(ValidationError): - Molecule(name="H2O", number_of_molecules=-10, formula="H2O") + Molecule(name="H2O", number_of_this_molecule_type_in_system=-10, formula="H2O") # ------------------------------------------------------------------- From 3871d22af63481305c0cadb37d6dc7235d675651 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 6 Feb 2026 15:33:18 +0100 Subject: [PATCH 15/43] refactor: rename number_of_this_molecule_type_in_system to number_of_molecules in Molecule model --- src/mdverse_scrapers/models/simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index 5cc1775..51b9469 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ b/src/mdverse_scrapers/models/simulation.py @@ -82,7 +82,7 @@ class Molecule(BaseModel): "Allowed values in the MoleculeType enum. " "Examples: PROTEIN, ION, LIPID...", ) - number_of_this_molecule_type_in_system: int | None = Field( + number_of_molecules: int | None = Field( None, ge=0, description="Number of molecules of this type in the simulation.", From c9be76f364d34afd674ecf14dc6909c0b49aad40 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 6 Feb 2026 15:39:20 +0100 Subject: [PATCH 16/43] tests: refactor with `number_of_molecules` attribute and adding specific fonction `test_external_identifier_coerces_int_to_str`. 
--- tests/models/test_simulation.py | 4 ++-- tests/models/test_simulation_molecule.py | 27 +++++++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 7a96112..599a729 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -66,7 +66,7 @@ def test_structured_fields_creation(): name="H2O", number_of_atoms=3, formula="H2O", - number_of_this_molecule_type_in_system=100, + number_of_molecules=100, sequence="PEPTIDE", external_identifiers=[ ExternalIdentifier( @@ -79,7 +79,7 @@ def test_structured_fields_creation(): ) assert metadata.softwares[0].name == "GROMACS" assert metadata.molecules[0].number_of_atoms == 3 - assert metadata.molecules[0].number_of_this_molecule_type_in_system == 100 + assert metadata.molecules[0].number_of_molecules == 100 assert metadata.forcefields_models[0].version == "ff14SB" assert metadata.molecules[0].sequence == "PEPTIDE" assert ( diff --git a/tests/models/test_simulation_molecule.py b/tests/models/test_simulation_molecule.py index 73cf6d7..50c4db7 100644 --- a/tests/models/test_simulation_molecule.py +++ b/tests/models/test_simulation_molecule.py @@ -36,10 +36,10 @@ def test_invalid_number_of_atoms(): Molecule(name="H2O", number_of_atoms=-10, formula="H2O") -def test_invalid_number_of_this_molecule_type_in_system(): - """Test that number_of_this_molecule_type_in_system cannot be negative.""" +def test_invalid_number_of_molecules(): + """Test that number_of_molecules cannot be negative.""" with pytest.raises(ValidationError): - Molecule(name="H2O", number_of_this_molecule_type_in_system=-10, formula="H2O") + Molecule(name="H2O", number_of_molecules=-10, formula="H2O") # ------------------------------------------------------------------- @@ -91,6 +91,27 @@ def test_invalid_database_name_in_external_identifiers(): ) +@pytest.mark.parametrize( + ("database_name", "identifier", "expected_identifier"), + [ + 
(ExternalDatabaseName.PDB, 1234, "1234"), + (ExternalDatabaseName.UNIPROT, 123456, "123456"), + ], +) +def test_external_identifier_coerces_int_to_str( + database_name, + identifier, + expected_identifier, +): + """Test that integer identifiers are coerced to strings.""" + ext_id = ExternalIdentifier( + database_name=database_name, + identifier=identifier, + ) + + assert ext_id.identifier == expected_identifier + + def test_compute_url_in_external_identifier(): """Test that the compute_url method generates the correct URL.""" identifier = ExternalIdentifier( From 542f54a1ec04a89607e303f534a067ea09640f07 Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 6 Feb 2026 15:42:17 +0100 Subject: [PATCH 17/43] fixes(mddb scraper): correct spelling errors, improve parameter descriptions and update number_of_molecules attribute. --- src/mdverse_scrapers/scrapers/mddb.py | 37 +++++++++++---------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 60bc523..67c617f 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -146,7 +146,7 @@ def extract_software_and_version( Parameters ---------- dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. + The dataset dictionary from which to extract molecules information. dataset_id: str Identifier of the dataset, used for logging. logger: "loguru.Logger" @@ -177,7 +177,7 @@ def extract_forcefield_or_model_and_version( Parameters ---------- dataset_metadata: dict - The dataset dictionnary from which to extract molecules information. + The dataset dictionary from which to extract molecules information. dataset_id: str Identifier of the dataset entry, used for logging. logger: "loguru.Logger" @@ -194,7 +194,7 @@ def extract_forcefield_or_model_and_version( # Adding ffm_names.extend(dataset_metadata.get("FF") or []) # Adding the water model. - # Exemple: TIP3P. 
+ # Example: TIP3P. water_model = dataset_metadata.get("WAT") if water_model: ffm_names.append(water_model) @@ -260,8 +260,7 @@ def extract_proteins( pdb_ids: list[ExternalIdentifier], references: list[str], prot_seqs: list[str], - prot_atoms: int, - prot_count: int, + prot_atoms: int | None, client: "httpx.Client", dataset_id: str, logger: "loguru.Logger", @@ -277,10 +276,8 @@ def extract_proteins( to associate with the proteins. prot_seqs: list[str] List of protein sequences. - prot_atoms: int + prot_atoms: int | None Total number of atoms in the protein. - prot_count: int - Total number of protein molecules in the system. client: httpx.Client The HTTP client used for making requests. dataset_id: str @@ -316,13 +313,13 @@ def extract_proteins( type=MoleculeType.PROTEIN, sequence=seq, number_of_atoms=prot_atoms if len(prot_seqs) == 1 else None, - number_of_this_molecule_type_in_system=prot_count, external_identifiers=external_ids, ) ) except (TypeError, ValueError) as exc: logger.warning( - f"Skipping protein {i + 1} in dataset {dataset_id} due to {type(exc).__name__}: {exc}" + f"Skipping protein {i + 1} in dataset {dataset_id} due to " + f"{type(exc).__name__}: {exc}" ) return molecules @@ -330,7 +327,7 @@ def extract_proteins( def extract_nucleic_acids( pdb_ids: list[ExternalIdentifier], nucl_seqs: list[str], - nucl_atoms: list[int], + nucl_atoms: int | None, dataset_id: str, logger: "loguru.Logger", ) -> list[Molecule]: @@ -342,8 +339,8 @@ def extract_nucleic_acids( List of PDB identifiers to associate with the nucleic acids. nucl_seqs: list[str] List of nucleic acid sequences. - nucl_atoms: list[int] - List of atom counts for the nucleic acids. + nucl_atoms: int + Total number of atoms in the nucleic acids. dataset_id: str The ID of the dataset being processed, used for logging. 
logger: loguru.Logger @@ -410,7 +407,7 @@ def extract_small_molecules( Molecule( name=species, type=mol_type, - number_of_this_molecule_type_in_system=count, + number_of_molecules=count, ) ) except (TypeError, ValueError) as exc: @@ -451,12 +448,9 @@ def extract_molecules( pdbs = dataset_metadata.get("PDBIDS") or [] references = dataset_metadata.get("REFERENCES") or [] prot_seqs = dataset_metadata.get("PROTSEQ") or [] - prot_atoms = dataset_metadata.get("PROTATS") or [] - prot_count = dataset_metadata.get("PROT", 0) + prot_atoms = dataset_metadata.get("PROTATS") nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] - nucl_atoms = (dataset_metadata.get("DNAATS") or []) + ( - dataset_metadata.get("RNAATS") or [] - ) + nucl_atoms = dataset_metadata.get("DNAATS", 0) + (dataset_metadata.get("RNAATS", 0)) # Pre-create PDB identifiers pdb_ids = [ @@ -471,7 +465,6 @@ def extract_molecules( references, prot_seqs, prot_atoms, - prot_count, client, dataset_id, logger, @@ -789,14 +782,14 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: datasets_normalized_metadata, logger=logger, ) - # Scrape NOMAD files metadata. + # Scrape MDDB files metadata. files_metadata = scrape_files_for_all_datasets( client, datasets_normalized_metadata, base_url, logger=logger, ) - # Validate NOMAD files metadata with the FileMetadata Pydantic model. + # Validate MDDB files metadata with the FileMetadata Pydantic model. files_normalized_metadata = normalize_files_metadata( files_metadata, logger=logger ) From 21943fc970477db45bfb9d91071151916af6c39f Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 6 Feb 2026 15:43:14 +0100 Subject: [PATCH 18/43] docs: correct spelling errors --- docs/mddb.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mddb.md b/docs/mddb.md index 99f64d3..ede832a 100644 --- a/docs/mddb.md +++ b/docs/mddb.md @@ -25,10 +25,10 @@ No account / token is needed to access the MDposit API. 
### Datasets -In MDposit, a dataset (a simulation and its related files) is called an "[project](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)" and a project can contains multiple replicas, each identified by `project_id`.`replica_id`. +In MDposit, a dataset (a simulation and its related files) is called an "[project](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)" and a project can contain multiple replicas, each identified by `project_id`.`replica_id`. -For exemple, the project [A026F](https://mmb.mddbr.eu/#/id/A026F/overview) contains four replicas: +For example, the project [A026F](https://mmb.mddbr.eu/#/id/A026F/overview) contains four replicas: - `A026F.1`: https://mmb.mddbr.eu/#/id/A026F.1/overview - `A026F.2`: https://mmb.mddbr.eu/#/id/A026F.2/overview - `A026F.3`: https://mmb.mddbr.eu/#/id/A026F.3/overview From d826989f36e4cdcd0da3029325e6115054eb6562 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Fri, 6 Feb 2026 18:49:24 +0100 Subject: [PATCH 19/43] fix: Revert to 'software' field --- src/mdverse_scrapers/models/simulation.py | 2 +- src/mdverse_scrapers/scrapers/mddb.py | 4 ++-- tests/models/test_simulation.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index 51b9469..2883041 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ b/src/mdverse_scrapers/models/simulation.py @@ -140,7 +140,7 @@ class SimulationMetadata(BaseModel): # Ensure scraped metadata matches the expected schema exactly. 
model_config = ConfigDict(extra="forbid") - softwares: list[Software] | None = Field( + software: list[Software] | None = Field( None, description="List of molecular dynamics tool or software.", ) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 67c617f..fbc7a2d 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -510,7 +510,7 @@ def extract_datasets_metadata( datasets_metadata = [] for dataset in datasets: # Get the dataset id - dataset_id = dataset.get("accession") + dataset_id = str(dataset.get("accession")) logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") # Create the dataset url depending on the node if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: @@ -545,7 +545,7 @@ def extract_datasets_metadata( } # Extract simulation metadata if available. # Software names with their versions. - metadata["softwares"] = extract_software_and_version( + metadata["software"] = extract_software_and_version( dataset_metadata, dataset_id, logger ) # Forcefield and model names with their versions. 
diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 599a729..860ee1d 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -60,7 +60,7 @@ def test_temperature_normalization(test_temp, expected_temp_in_kelvin): def test_structured_fields_creation(): """Test that software, molecules, and forcefields can be created.""" metadata = SimulationMetadata( - softwares=[Software(name="GROMACS", version="2023.1")], + software=[Software(name="GROMACS", version="2023.1")], molecules=[ Molecule( name="H2O", @@ -77,7 +77,7 @@ def test_structured_fields_creation(): ], forcefields_models=[ForceFieldModel(name="AMBER", version="ff14SB")], ) - assert metadata.softwares[0].name == "GROMACS" + assert metadata.software[0].name == "GROMACS" assert metadata.molecules[0].number_of_atoms == 3 assert metadata.molecules[0].number_of_molecules == 100 assert metadata.forcefields_models[0].version == "ff14SB" From 671008c3c9dd256df1b183c7a0c90d73ab9c8780 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Fri, 6 Feb 2026 20:21:31 +0100 Subject: [PATCH 20/43] refactor: Reduce usage and scope of try/except blocks --- src/mdverse_scrapers/scrapers/mddb.py | 198 ++++++++++++++------------ 1 file changed, 105 insertions(+), 93 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index fbc7a2d..1cf6419 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -8,7 +8,6 @@ - MDPOSIT INRIA node https://dynarepo.inria.fr/#/ """ -import json import sys from pathlib import Path from typing import Any @@ -82,56 +81,57 @@ def scrape_all_datasets( # Start by requesting the first page to get total number of datasets. 
logger.info("Requesting first page to get total number of datasets...") - page = 0 # start with first page - - while True: + params = {"limit": 10, "page": 1} + response = make_http_request_with_retries( + client, + query_entry_point, + method=HttpMethod.GET, + params=params, + timeout=60, + delay_before_request=0.2, + ) + if not response: + logger.error("Failed to fetch data from MDposit API.") + return all_datasets + total_datasets = int(response.json().get("filteredCount", 0)) + logger.success(f"Found a total of {total_datasets:,} datasets in {node_name}.") + # Compute total number of pages to scrape based on total datasets and page size. + page_total = total_datasets // page_size + if total_datasets % page_size != 0: + page_total += 1 + + for page in range(1, page_total + 1): + params = {"limit": page_size, "page": page} response = make_http_request_with_retries( client, - f"{query_entry_point}?limit={page_size}&page={page}", + query_entry_point, method=HttpMethod.GET, + params=params, timeout=60, delay_before_request=0.2, ) - if not response: logger.error("Failed to fetch data from MDposit API.") logger.error("Jumping to next iteration.") - page += 1 continue - try: - response_json = response.json() - datasets = response_json.get("projects", []) - total_datasets = response_json.get("filteredCount") - - if page == 0 and total_datasets is not None: - logger.info(f"Found a total of {total_datasets:,} datasets in MDposit.") - - if not datasets: - logger.info("No more datasets returned by API. 
Stopping pagination.") - break - - all_datasets.extend(datasets) - - logger.info(f"Scraped page {page} with {len(datasets)} datasets.") - if total_datasets: - logger.info( - f"Scraped {len(all_datasets)} datasets " - f"({len(all_datasets):,}/{total_datasets:,} " - f"{len(all_datasets) / total_datasets:.0%})" - ) - logger.debug("First dataset metadata on this page:") - logger.debug(datasets[0] if datasets else "No datasets on this page") - - if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 120: - logger.warning("Debug mode is ON: stopping after 120 datasets.") - return all_datasets + response_json = response.json() + datasets = response_json.get("projects", []) + all_datasets.extend(datasets) - except (json.decoder.JSONDecodeError, ValueError) as exc: - logger.error(f"Error while parsing MDposit response: {exc}") - logger.error("Jumping to next iteration.") + logger.info(f"Scraped page {page}/{page_total} with {len(datasets)} datasets.") + if total_datasets: + logger.info( + f"Scraped {len(all_datasets)} datasets " + f"({len(all_datasets):,}/{total_datasets:,} " + f":{len(all_datasets) / total_datasets:.0%})" + ) + logger.debug("First dataset metadata on this page:") + logger.debug(datasets[0] if datasets else "No datasets on this page") - page += 1 # increment page for next iteration + if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 100: + logger.warning("Debug mode is ON: stopping after 100 datasets.") + return all_datasets logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.") return all_datasets @@ -207,53 +207,56 @@ def extract_forcefield_or_model_and_version( def fetch_uniprot_protein_name( - client, + client: httpx.Client, uniprot_id: str, - logger: "loguru.Logger", -) -> str | None: + logger: "loguru.Logger" = loguru.logger, +) -> str: """ Retrieve protein name from UniProt API. Parameters ---------- - client + client: httpx.Client HTTP client used to perform the request. 
uniprot_id: str UniProt accession identifier. - logger: loguru.Logger - Logger instance. + logger: "loguru.Logger" + Logger for logging messages. Returns ------- - str | None + str Protein full name if available, None otherwise. """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") - try: - response = make_http_request_with_retries( - client, - f"https://rest.uniprot.org/uniprotkb/{uniprot_id}", - method=HttpMethod.GET, - timeout=30, - delay_before_request=0.1, - ) - data: dict = response.json() - protein_name = ( - data.get("proteinDescription", {}) - .get("recommendedName", {}) - .get("fullName", {}) - .get("value") + if uniprot_id == "noref": + logger.warning("UniProt ID is 'noref', cannot fetch protein name.") + return "Unknow protein" + # Defaut value for protein name: + default_protein_name = f"Protein {uniprot_id}" + response = make_http_request_with_retries( + client, + f"https://rest.uniprot.org/uniprotkb/{uniprot_id}", + method=HttpMethod.GET, + timeout=30, + delay_before_request=0.1, + ) + if not response: + logger.error(f"Failed to fetch data from UniProt API for ID {uniprot_id}.") + return default_protein_name + protein_name = ( + response.json() + .get("proteinDescription", {}) + .get("recommendedName", {}) + .get("fullName", {}) + .get("value") + ) + if protein_name: + logger.success( + f"Retrieved protein name for UniProt ID {uniprot_id}: {protein_name}" ) - if protein_name: - logger.success( - f"Retrieved protein name for UniProt ID {uniprot_id}: {protein_name}" - ) - return protein_name - return f"Protein {uniprot_id}" # Fallback to a generic name if not found - - except (AttributeError, TypeError) as exc: - logger.warning(f"Invalid UniProt response for {uniprot_id}: {exc}") - return None + return protein_name + return default_protein_name def extract_proteins( @@ -261,9 +264,9 @@ def extract_proteins( references: list[str], prot_seqs: list[str], prot_atoms: int | None, - client: "httpx.Client", + client: httpx.Client, dataset_id: 
str, - logger: "loguru.Logger", + logger: "loguru.Logger" = loguru.logger, ) -> list[Molecule]: """Extract proteins from dataset metadata. @@ -291,10 +294,11 @@ def extract_proteins( A list of extracted proteins. """ molecules = [] - for i, seq in enumerate(prot_seqs): + for counter, seq in enumerate(prot_seqs): + external_ids = list(pdb_ids) + prot_name = f"Protein {counter + 1}" try: - external_ids = list(pdb_ids) - uniprot_id = references[i] if i < len(references) else None + uniprot_id = references[counter] if counter < len(references) else None if uniprot_id: external_ids.append( ExternalIdentifier( @@ -302,25 +306,23 @@ def extract_proteins( identifier=uniprot_id, ) ) - prot_name = ( - fetch_uniprot_protein_name(client, uniprot_id, logger) - if uniprot_id - else f"Protein {i + 1}" - ) - molecules.append( - Molecule( - name=prot_name, - type=MoleculeType.PROTEIN, - sequence=seq, - number_of_atoms=prot_atoms if len(prot_seqs) == 1 else None, - external_identifiers=external_ids, + prot_name = fetch_uniprot_protein_name( + client, uniprot_id, logger=logger ) - ) except (TypeError, ValueError) as exc: logger.warning( - f"Skipping protein {i + 1} in dataset {dataset_id} due to " + f"Skipping protein {counter + 1} in dataset {dataset_id} due to " f"{type(exc).__name__}: {exc}" ) + molecules.append( + Molecule( + name=prot_name, + type=MoleculeType.PROTEIN, + sequence=seq, + number_of_atoms=prot_atoms if len(prot_seqs) == 1 else None, + external_identifiers=external_ids, + ) + ) return molecules @@ -421,7 +423,7 @@ def extract_small_molecules( def extract_molecules( dataset_metadata: dict, dataset_id: str, - client: "httpx.Client", + client: httpx.Client, logger: "loguru.Logger" = loguru.logger, ) -> list[Molecule] | None: """Coordinator function to extract all molecule types from dataset metadata. @@ -435,7 +437,7 @@ def extract_molecules( client: httpx.Client The HTTP client used for making requests. logger: loguru.Logger - The logger used for logging messages. 
+ Logger for logging messages. Returns ------- @@ -467,7 +469,7 @@ def extract_molecules( prot_atoms, client, dataset_id, - logger, + logger=logger, ) ) # Then extract nucleic acids @@ -511,7 +513,7 @@ def extract_datasets_metadata( for dataset in datasets: # Get the dataset id dataset_id = str(dataset.get("accession")) - logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") + logger.info(f"Extracting metadata for dataset: {dataset_id}") # Create the dataset url depending on the node if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: dataset_url = f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" @@ -565,7 +567,11 @@ def extract_datasets_metadata( [temperature] if temperature else None ) datasets_metadata.append(metadata) - logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") + logger.info( + f"Scraped metadata for {len(datasets_metadata)} datasets " + f"({len(datasets_metadata):,}/{len(datasets):,}" + f":{len(datasets_metadata) / len(datasets):.0%})" + ) return datasets_metadata @@ -782,6 +788,9 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: datasets_normalized_metadata, logger=logger, ) + # Output first dataset metadata for debugging purposes. + logger.debug("First dataset metadata:") + logger.debug(datasets_normalized_metadata[0]) # Scrape MDDB files metadata. files_metadata = scrape_files_for_all_datasets( client, @@ -799,6 +808,9 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: files_normalized_metadata, logger=logger, ) + # Output first file metadata for debugging purposes. + logger.debug("First file metadata:") + logger.debug(files_normalized_metadata[0]) # Print scraping statistics. 
    print_statistics(scraper, logger=logger)


From f987ea72c7d15860988964cc4bbc1872126a379f Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Sat, 7 Feb 2026 01:20:42 +0100
Subject: [PATCH 21/43] feat: Add default DatasetSourceName

---
 src/mdverse_scrapers/models/enums.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py
index cba6bd4..a8f23ab 100644
--- a/src/mdverse_scrapers/models/enums.py
+++ b/src/mdverse_scrapers/models/enums.py
@@ -13,6 +13,7 @@ class DataType(StrEnum):
 class DatasetSourceName(StrEnum):
     """Molecular dynamics sources: data repositories and projects."""
 
+    UNKNOWN = "unknown"
     ZENODO = "zenodo"
     FIGSHARE = "figshare"
     OSF = "osf"

From 059d51f1b6bd64ede4fd3602fbe275dd5b114df2 Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Sat, 7 Feb 2026 01:21:20 +0100
Subject: [PATCH 22/43] feat: Coerce version to str

---
 src/mdverse_scrapers/models/simulation.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py
index 2883041..b194fdc 100644
--- a/src/mdverse_scrapers/models/simulation.py
+++ b/src/mdverse_scrapers/models/simulation.py
@@ -103,8 +103,9 @@ class Molecule(BaseModel):
 class ForceFieldModel(BaseModel):
     """Forcefield or Model used in a simulation."""
 
-    # Ensure scraped metadata matches the expected schema exactly.
-    model_config = ConfigDict(extra="forbid")
+    # Ensure scraped metadata matches the expected schema exactly
+    # and version is coerced to string when needed.
+    model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True)
 
     name: str = Field(
         ...,
@@ -118,8 +119,9 @@ class ForceFieldModel(BaseModel):
 class Software(BaseModel):
     """Simulation software or tool used in a simulation."""
 
-    # Ensure scraped metadata matches the expected schema exactly.
- model_config = ConfigDict(extra="forbid") + # Ensure scraped metadata matches the expected schema exactly + # and version is coerced to string when needed. + model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True) name: str = Field( ..., From ebf4470e001d67aa8e338381ab092e7ba652d406 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 01:21:55 +0100 Subject: [PATCH 23/43] docs: Update MDDB documentation and examples --- docs/mddb.md | 88 +++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 39 deletions(-) diff --git a/docs/mddb.md b/docs/mddb.md index ede832a..ee060bd 100644 --- a/docs/mddb.md +++ b/docs/mddb.md @@ -2,45 +2,43 @@ > The [MDDB (Molecular Dynamics Data Bank) project](https://mddbr.eu/about/) is an initiative to collect, preserve, and share molecular dynamics (MD) simulation data. As part of this project, **MDposit** is an open platform that provides web access to atomistic MD simulations. Its goal is to facilitate and promote data sharing within the global scientific community to advance research. -The MDDB infrastructure is distributed across **two MDposit nodes**. Both nodes expose the same REST API entry points. The only difference is the base URL used to access the API. +The MDposit infrastructure is distributed across several MDposit nodes. All metadata are accessible through the global node: -## MDposit MMB node - -- web site: -- documentation: -- API: -- API base URL: - -## MDposit INRIA node - -- web site: -- documentation: -- API: -- API base URL: +MDposit MMB node: +- web site: +- documentation: +- API: +- API base URL: No account / token is needed to access the MDposit API. 
-## Finding molecular dynamics datasets and files
+## Getting metadata
 
 ### Datasets
 
-In MDposit, a dataset (a simulation and its related files) is called an "[project](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)" and a project can contain multiple replicas, each identified by `project_id`.`replica_id`.
+In MDposit, a dataset (a simulation and its related files) is called a "[project](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)".
+
+API entrypoint to get the total number of projects:
+- Endpoint: `/projects/summary`
+- HTTP method: GET
+- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)
 
-For example, the project [A026F](https://mmb.mddbr.eu/#/id/A026F/overview) contains four replicas:
- - `A026F.1`: https://mmb.mddbr.eu/#/id/A026F.1/overview
- - `A026F.2`: https://mmb.mddbr.eu/#/id/A026F.2/overview
- - `A026F.3`: https://mmb.mddbr.eu/#/id/A026F.3/overview
- - `A026F.4`: https://mmb.mddbr.eu/#/id/A026F.4/overview
+A project can contain multiple replicas, each identified by `project_id`.`replica_id`.
+For example, the project [MD-A003ZP](https://mdposit.mddbr.eu/#/id/MD-A003ZP/overview) contains ten replicas:
 
-API entrypoint to search for all datasets at once:
+- `MD-A003ZP.1`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/overview
+- `MD-A003ZP.2`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.2/overview
+- `MD-A003ZP.3`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.3/overview
+- ...
+ +API entrypoint to get all datasets at once: - Endpoint: `/projects` - HTTP method: GET -- [documentation](https://mmb.mddbr.eu/api/rest/docs/#/projects/get_projects) - +- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects) ### Files @@ -48,30 +46,42 @@ API endpoint to get files for a given replica of a project: - Endpoint: `/projects/{project_id.replica_id}/filenotes` - HTTP method: GET -- [documentation](https://mmb.mddbr.eu/api/rest/docs/#/filenotes/get_projects__projectAccessionOrID__filenotes) +- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/filenotes/get_projects__projectAccessionOrID__filenotes) ## Examples -### Project `A026F` +### Project `MD-A003ZP` + +Title: -- Project id: `A026F.1` -- [project on MDposit GUI](https://mmb.mddbr.eu/#/id/A026F.1/overview) -- [project on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A026F.1) +> MDBind 3x1k Description: -> Multi-scale simulation approaches which couple the molecular and neuronal simulations to predict the variation in the membrane potential and the neural spikes. 
+> 10 ns simulation of 1ma4m pdb structure from MDBind dataset, a dynamic view of the PDBBind database -- [files on MDposit GUI](https://mmb.mddbr.eu/#/id/A026F.1/files) -- [files on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A026F.1/filenotes) +- [project on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A003ZP/overview) +- [project on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A003ZP) + +Files for replica 1: + +- [files on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/files) +- [files on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A003ZP.1/filenotes) + +### Project `MD-A001T1` + +Title: + +> All-atom molecular dynamics simulations of SARS-CoV-2 envelope protein E in the monomeric form, C4 popc + +Description: -### Project `A025U` +> The trajectories of all-atom MD simulations were obtained based on 4 starting representative conformations from the CG simulation. For each starting structure, there are six trajectories of the E protein: 3 with the protein embedded in the membrane containing POPC, and 3 with the membrane mimicking the natural ERGIC membrane (Mix: 50% POPC, 25% POPE, 10% POPI, 5% POPS, 10% cholesterol). -- Project id: `A025U.1` -- [project on MDposit GUI](https://mmb.mddbr.eu/#/id/A025U/overview) -- [project on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A025U.2) +- [project on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A001T1/overview) +- [project on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A001T1) -Remark: no description is provided for this dataset. 
+Files for replica 1: -- [files on MDposit GUI](https://mmb.mddbr.eu/#/id/A025U/files) -- [files on MDposit API](https://mmb.mddbr.eu/api/rest/current/projects/A025U.2/filenotes) +- [files on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A001T1.1/files) +- [files on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A001T1.1/filenotes) From 63181faf9db0680b9dba98136dbed465429fe7f4 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 01:22:12 +0100 Subject: [PATCH 24/43] refactor: Remove more try/except --- src/mdverse_scrapers/scrapers/mddb.py | 551 +++++++++++++------------- 1 file changed, 283 insertions(+), 268 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 1cf6419..65089c6 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -1,11 +1,8 @@ """Scrape molecular dynamics simulation datasets and files from the MDDB. -This script extracts molecular dynamics datasets produced within the -MDDB (Molecular Dynamics Data Bank) project, which is distributed across -two nodes: - -- MDPOSIT MMB node (https://mmb-dev.mddbr.eu/#/browse) -- MDPOSIT INRIA node https://dynarepo.inria.fr/#/ +This script extracts molecular dynamics datasets managed by the +MDDB (Molecular Dynamics Data Bank) project +and the MDposit platform. """ import sys @@ -34,16 +31,10 @@ normalize_files_metadata, ) -MDDB_REPOSITORIES = { - DatasetSourceName.MDPOSIT_MMB_NODE: "https://mmb-dev.mddbr.eu/api/rest/v1", - DatasetSourceName.MDPOSIT_INRIA_NODE: "https://inria.mddbr.eu/api/rest/v1", -} - def scrape_all_datasets( client: httpx.Client, query_entry_point: str, - node_name: DatasetSourceName, page_size: int = 50, logger: "loguru.Logger" = loguru.logger, scraper: ScraperContext | None = None, @@ -59,8 +50,6 @@ def scrape_all_datasets( The HTTPX client to use for making requests. query_entry_point: str The entry point of the API request. 
- node_name: DatasetSourceName - MDDB node name for logging. page_size: int Number of entries to fetch per page. logger: "loguru.Logger" @@ -75,7 +64,7 @@ def scrape_all_datasets( list[dict]: A list of MDposit entries. """ - logger.info(f"Scraping molecular dynamics datasets from {node_name}.") + logger.info("Scraping molecular dynamics datasets from MDposit.") logger.info(f"Using batches of {page_size} datasets.") all_datasets = [] @@ -94,7 +83,7 @@ def scrape_all_datasets( logger.error("Failed to fetch data from MDposit API.") return all_datasets total_datasets = int(response.json().get("filteredCount", 0)) - logger.success(f"Found a total of {total_datasets:,} datasets in {node_name}.") + logger.success(f"Found a total of {total_datasets:,} datasets in MDposit.") # Compute total number of pages to scrape based on total datasets and page size. page_total = total_datasets // page_size if total_datasets % page_size != 0: @@ -157,15 +146,12 @@ def extract_software_and_version( list[Software] | None A list of Software instances with `name` and `version` fields, None otherwise. """ - try: - name = dataset_metadata.get("PROGRAM") - version = dataset_metadata.get("VERSION") - if not name: - return None - return [Software(name=name, version=str(version))] - except (ValueError, KeyError, TypeError) as e: - logger.warning(f"Error parsing software info for dataset {dataset_id}: {e}") + name = dataset_metadata.get("PROGRAM", "").strip() + version = dataset_metadata.get("VERSION") + if not name: + logger.warning(f"No software found for dataset {dataset_id}.") return None + return [Software(name=name, version=version)] def extract_forcefield_or_model_and_version( @@ -189,21 +175,16 @@ def extract_forcefield_or_model_and_version( A list of forcefield or model instances with `name` and `version` fields, None otherwise. """ - ffm_names = [] - try: - # Adding - ffm_names.extend(dataset_metadata.get("FF") or []) - # Adding the water model. - # Example: TIP3P. 
- water_model = dataset_metadata.get("WAT") - if water_model: - ffm_names.append(water_model) - return [ForceFieldModel(name=ffm_name) for ffm_name in ffm_names if ffm_name] - except (ValueError, KeyError) as e: - logger.warning( - f"Error parsing forcefield or model info for dataset {dataset_id}: {e}" - ) - return None + forcefields_and_models = [] + # Add forcefield names. + for forcefield in dataset_metadata.get("FF", []): + if isinstance(forcefield, str): + forcefields_and_models.append(ForceFieldModel(name=forcefield.strip())) + # Add water model. + water_model = dataset_metadata.get("WAT", "") + if water_model: + forcefields_and_models.append(ForceFieldModel(name=water_model.strip())) + return forcefields_and_models def fetch_uniprot_protein_name( @@ -229,9 +210,9 @@ def fetch_uniprot_protein_name( Protein full name if available, None otherwise. """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") - if uniprot_id == "noref": - logger.warning("UniProt ID is 'noref', cannot fetch protein name.") - return "Unknow protein" + if uniprot_id in ("noref", "notfound"): + logger.warning(f"Cannot fetch protein name for UniProt ID '{uniprot_id}'.") + return "Unknown protein" # Defaut value for protein name: default_protein_name = f"Protein {uniprot_id}" response = make_http_request_with_retries( @@ -242,7 +223,7 @@ def fetch_uniprot_protein_name( delay_before_request=0.1, ) if not response: - logger.error(f"Failed to fetch data from UniProt API for ID {uniprot_id}.") + logger.error(f"Failed to query the UniProt API for ID {uniprot_id}.") return default_protein_name protein_name = ( response.json() @@ -256,31 +237,32 @@ def fetch_uniprot_protein_name( f"Retrieved protein name for UniProt ID {uniprot_id}: {protein_name}" ) return protein_name - return default_protein_name + else: + logger.warning( + f"Protein name not found in UniProt API response for ID {uniprot_id}." 
+ ) + return default_protein_name def extract_proteins( - pdb_ids: list[ExternalIdentifier], - references: list[str], - prot_seqs: list[str], - prot_atoms: int | None, + pdb_identifiers: list[ExternalIdentifier], + uniprot_identifiers: list[str], + protein_sequences: list[str], client: httpx.Client, dataset_id: str, logger: "loguru.Logger" = loguru.logger, -) -> list[Molecule]: +) -> list: """Extract proteins from dataset metadata. Parameters ---------- - pdb_ids: list[ExternalIdentifier] + pdb_identifiers: list[ExternalIdentifier] List of PDB identifiers to associate with the proteins. - references: list[str] - List of reference identifiers (e.g., UniProt accessions) + uniprot_identifiers: list[str] + List of UniProt accessions. to associate with the proteins. - prot_seqs: list[str] + protein_sequences: list[str] List of protein sequences. - prot_atoms: int | None - Total number of atoms in the protein. client: httpx.Client The HTTP client used for making requests. dataset_id: str @@ -290,59 +272,100 @@ def extract_proteins( Returns ------- - list[Molecule] - A list of extracted proteins. + list + A list of extracted proteins or empty list. """ molecules = [] - for counter, seq in enumerate(prot_seqs): - external_ids = list(pdb_ids) - prot_name = f"Protein {counter + 1}" - try: - uniprot_id = references[counter] if counter < len(references) else None - if uniprot_id: - external_ids.append( - ExternalIdentifier( - database_name=ExternalDatabaseName.UNIPROT, - identifier=uniprot_id, - ) + # Case 1: + # We have protein sequences but no UniProt identifiers. + if protein_sequences and not uniprot_identifiers: + logger.warning( + "Protein sequences found but no UniProt identifier " + f"in dataset {dataset_id}." 
+ ) + for sequence in protein_sequences: + molecules.append( + Molecule( + name="Protein", + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=pdb_identifiers, ) - prot_name = fetch_uniprot_protein_name( - client, uniprot_id, logger=logger + ) + return molecules + # Case 2: + # We have UniProt identifiers but no protein sequences. + if uniprot_identifiers and not protein_sequences: + logger.warning( + "UniProt identifiers found but no protein sequence " + f"in dataset {dataset_id}." + ) + for identifier in uniprot_identifiers: + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, identifier=identifier + ) + protein_name = fetch_uniprot_protein_name(client, identifier, logger=logger) + molecules.append( + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=None, + external_identifiers=[external, *pdb_identifiers], ) - except (TypeError, ValueError) as exc: - logger.warning( - f"Skipping protein {counter + 1} in dataset {dataset_id} due to " - f"{type(exc).__name__}: {exc}" ) + return molecules + # Case 3: + # We have UniProt identifiers and protein sequences, + # but their numbers do not match. + if len(uniprot_identifiers) != len(protein_sequences): + logger.warning( + f"Number of UniProt identifiers ({len(uniprot_identifiers)}) does not " + f"match number of protein sequences ({len(protein_sequences)}) in dataset " + f"{dataset_id}." + ) molecules.append( Molecule( - name=prot_name, + name="Unknown protein", type=MoleculeType.PROTEIN, - sequence=seq, - number_of_atoms=prot_atoms if len(prot_seqs) == 1 else None, - external_identifiers=external_ids, + external_identifiers=pdb_identifiers, + ) + ) + return molecules + # Case 4: + # We have UniProt identifiers and protein sequences, + # and their numbers match. 
+ for identifier, sequence in zip( + uniprot_identifiers, protein_sequences, strict=True + ): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, identifier=identifier + ) + protein_name = fetch_uniprot_protein_name(client, identifier, logger=logger) + molecules.append( + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=[external, *pdb_identifiers], ) ) return molecules def extract_nucleic_acids( - pdb_ids: list[ExternalIdentifier], - nucl_seqs: list[str], - nucl_atoms: int | None, + pdb_identifiers: list[ExternalIdentifier], + nucleic_acid_sequences: list[str], dataset_id: str, - logger: "loguru.Logger", -) -> list[Molecule]: + logger: "loguru.Logger" = loguru.logger, +) -> list: """Extract nucleic acids from dataset metadata. Parameters ---------- - pdb_ids: list[ExternalIdentifier] - List of PDB identifiers to associate with the nucleic acids. - nucl_seqs: list[str] + pdb_identifiers: list[ExternalIdentifier] + List of PDB identifiers to associate with the proteins. + nucleic_acid_sequences: list[str] List of nucleic acid sequences. - nucl_atoms: int - Total number of atoms in the nucleic acids. dataset_id: str The ID of the dataset being processed, used for logging. logger: loguru.Logger @@ -350,34 +373,27 @@ def extract_nucleic_acids( Returns ------- - list[Molecule] + list A list of extracted nucleic acids. 
""" molecules = [] - for i, seq in enumerate(nucl_seqs): - try: - molecules.append( - Molecule( - name=f"Nucleic Acid {i + 1}", - type=MoleculeType.NUCLEIC_ACID, - sequence=seq, - number_of_atoms=nucl_atoms if len(nucl_seqs) == 1 else None, - external_identifiers=pdb_ids, - ) - ) - except (TypeError, ValueError) as exc: - logger.warning( - f"Skipping nucleic acid {i + 1} in dataset {dataset_id} " - f"due to {type(exc).__name__}: {exc}" + for sequence in nucleic_acid_sequences: + molecules.append( + Molecule( + name="Nucleic acid", + type=MoleculeType.NUCLEIC_ACID, + sequence=sequence, + external_identifiers=pdb_identifiers, ) + ) return molecules def extract_small_molecules( dataset_metadata: dict, dataset_id: str, - logger: "loguru.Logger", -) -> list[Molecule]: + logger: "loguru.Logger" = loguru.logger, +) -> list: """Extract small molecules (lipids, solvents, ions) from dataset metadata. Parameters @@ -391,31 +407,24 @@ def extract_small_molecules( Returns ------- - list[Molecule] - A list of extracted small molecules. + list + A list of extracted small molecules or an empty list. 
""" molecules = [] - species_type_map = { - "DPPC": MoleculeType.LIPID, + name_type_maping = { "SOL": MoleculeType.SOLVENT, "NA": MoleculeType.ION, "CL": MoleculeType.ION, } - for species, mol_type in species_type_map.items(): - try: - count = dataset_metadata.get(species, 0) - if isinstance(count, int) and count > 0: - molecules.append( - Molecule( - name=species, - type=mol_type, - number_of_molecules=count, - ) + for name, mol_type in name_type_maping.items(): + count = dataset_metadata.get(name, 0) + if isinstance(count, int) and count > 0: + molecules.append( + Molecule( + name=name, + type=mol_type, + number_of_molecules=count, ) - except (TypeError, ValueError) as exc: - logger.warning( - f"Skipping small molecule {species} in dataset {dataset_id} " - f"due to {type(exc).__name__}: {exc}" ) return molecules @@ -444,52 +453,44 @@ def extract_molecules( list[Molecule] | None A list of extracted molecules or None if no molecules were found. """ - molecules: list[Molecule] = [] - - # Normalize common fields - pdbs = dataset_metadata.get("PDBIDS") or [] - references = dataset_metadata.get("REFERENCES") or [] - prot_seqs = dataset_metadata.get("PROTSEQ") or [] - prot_atoms = dataset_metadata.get("PROTATS") - nucl_seqs = dataset_metadata.get("NUCLSEQ") or [] - nucl_atoms = dataset_metadata.get("DNAATS", 0) + (dataset_metadata.get("RNAATS", 0)) - - # Pre-create PDB identifiers - pdb_ids = [ - ExternalIdentifier(database_name=ExternalDatabaseName.PDB, identifier=pdb_id) - for pdb_id in pdbs - ] - - # Extract proteins first - molecules.extend( - extract_proteins( - pdb_ids, - references, - prot_seqs, - prot_atoms, - client, - dataset_id, - logger=logger, + molecules = [] + # Add PDB identifiers as external identifiers. + pdb_identifiers = [] + for pdb in dataset_metadata.get("PDBIDS", []): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.PDB, identifier=pdb ) + pdb_identifiers.append(external) + # Add UniProt identifiers and protein sequence. 
+ proteins = extract_proteins( + pdb_identifiers, + dataset_metadata.get("REFERENCES", []), + dataset_metadata.get("PROTSEQ", []), + client, + dataset_id, + logger=logger, ) - # Then extract nucleic acids - molecules.extend( - extract_nucleic_acids(pdb_ids, nucl_seqs, nucl_atoms, dataset_id, logger) + if proteins: + molecules.extend(proteins) + # Add nucleic acids + nucleic_acids = extract_nucleic_acids( + pdb_identifiers, dataset_metadata.get("NUCLSEQ", []), dataset_id, logger=logger ) + if nucleic_acids: + molecules.extend(nucleic_acids) # Finally extract small molecules like lipids, solvents and ions. - molecules.extend(extract_small_molecules(dataset_metadata, dataset_id, logger)) - + small_molecules = extract_small_molecules(dataset_metadata, dataset_id, logger) + if small_molecules: + molecules.extend(small_molecules) if not molecules: logger.warning(f"No molecules found in dataset {dataset_id}.") return None - return molecules def extract_datasets_metadata( datasets: list[dict[str, Any]], - node_name: DatasetSourceName, - client: "httpx.Client", + client: httpx.Client, logger: "loguru.Logger" = loguru.logger, ) -> list[dict[str, Any]]: """ @@ -499,8 +500,6 @@ def extract_datasets_metadata( ---------- datasets: List[Dict[str, Any]] List of raw MDposit datasets metadata. - node_name: DatasetSourceName - MDDB node name for the dataset url. logger: "loguru.Logger" Logger for logging messages. @@ -514,28 +513,47 @@ def extract_datasets_metadata( # Get the dataset id dataset_id = str(dataset.get("accession")) logger.info(f"Extracting metadata for dataset: {dataset_id}") + # Extract node name. 
+ node_name = dataset.get("node", "") + node_name_full = f"mdposit_{dataset.get('node', '')}_node" # Create the dataset url depending on the node - if node_name is DatasetSourceName.MDPOSIT_MMB_NODE: - dataset_url = f"https://mmb-dev.mddbr.eu/#/id/{dataset_id}/overview" - elif node_name is DatasetSourceName.MDPOSIT_INRIA_NODE: - dataset_url = f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" - else: - logger.warning( - f"Unknown MDDB node '{node_name}'." - f"Cannot build entry URL for dataset {dataset_id}." + dataset_repository_name = DatasetSourceName.UNKNOWN + dataset_id_in_repository = "" + dataset_url_in_repository = "" + if node_name_full == DatasetSourceName.MDPOSIT_MMB_NODE: + dataset_repository_name = DatasetSourceName.MDPOSIT_MMB_NODE + dataset_id_in_repository = str(dataset.get("local")) + dataset_url_in_repository = ( + f"https://mmb.mddbr.eu/#/id/{dataset_id}/overview" + ) + elif node_name_full == DatasetSourceName.MDPOSIT_INRIA_NODE: + dataset_repository_name = DatasetSourceName.MDPOSIT_INRIA_NODE + dataset_id_in_repository = str(dataset.get("local")) + dataset_url_in_repository = ( + f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" ) + else: + logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}.") + logger.error("Skipping dataset.") + continue dataset_metadata = dataset.get("metadata", {}) - links = dataset_metadata.get("CITATION") - links_list = [links] if links else None - a = dataset_metadata.get("AUTHORS") - author_names = a if isinstance(a, list) else [a] if a else None + citations = dataset_metadata.get("CITATION") + external_links = [citations] if citations else None + authors = dataset_metadata.get("AUTHORS") + author_names = None + if isinstance(authors, list): + author_names = authors + elif isinstance(authors, str): + author_names = [authors.strip()] metadata = { - "dataset_repository_name": node_name.value, - "dataset_id_in_repository": dataset_id, - "dataset_url_in_repository": dataset_url, + 
"dataset_repository_name": dataset_repository_name, + "dataset_id_in_repository": dataset_id_in_repository, + "dataset_url_in_repository": dataset_url_in_repository, "dataset_project_name": DatasetSourceName.MDDB, - "external_links": links_list, + "dataset_id_in_project": dataset_id, + "dataset_url_in_project": f"https://mdposit.mddbr.eu/#/id/{dataset_id}/overview", + "external_links": external_links, "title": dataset_metadata.get("NAME"), "date_created": dataset.get("creationDate"), "date_last_updated": dataset.get("updateDate"), @@ -548,15 +566,15 @@ def extract_datasets_metadata( # Extract simulation metadata if available. # Software names with their versions. metadata["software"] = extract_software_and_version( - dataset_metadata, dataset_id, logger + dataset_metadata, dataset_id, logger=logger ) # Forcefield and model names with their versions. metadata["forcefields_models"] = extract_forcefield_or_model_and_version( - dataset_metadata, dataset_id, logger + dataset_metadata, dataset_id, logger=logger ) # Molecules with their nb of atoms and number total of atoms. metadata["molecules"] = extract_molecules( - dataset_metadata, dataset_id, client, logger + dataset_metadata, dataset_id, client, logger=logger ) # Time step in fs. time_step = dataset_metadata.get("TIMESTEP") @@ -568,7 +586,7 @@ def extract_datasets_metadata( ) datasets_metadata.append(metadata) logger.info( - f"Scraped metadata for {len(datasets_metadata)} datasets " + f"Extracted metadata for {len(datasets_metadata)} datasets " f"({len(datasets_metadata):,}/{len(datasets):,}" f":{len(datasets_metadata) / len(datasets):.0%})" ) @@ -580,9 +598,9 @@ def scrape_files_for_one_dataset( url: str, dataset_id: str, logger: "loguru.Logger" = loguru.logger, -) -> dict | None: +) -> list[dict] | None: """ - Scrape files metadata for a given MDposit dataset. + Scrape files metadata for MDposit. 
Parameters ---------- @@ -597,7 +615,7 @@ def scrape_files_for_one_dataset( Returns ------- - dict | None + list[dict] | None File metadata dictionary for the dataset. """ logger.info(f"Scraping files for dataset ID: {dataset_id}") @@ -641,18 +659,18 @@ def scrape_files_for_all_datasets( all_files_metadata = [] for dataset_count, dataset in enumerate(datasets, start=1): dataset_id = dataset.dataset_id_in_repository - files_metadata = scrape_files_for_one_dataset( + raw_files_metadata = scrape_files_for_one_dataset( client, url=f"{node_base_url}/projects/{dataset_id}/filenotes", dataset_id=dataset_id, logger=logger, ) - if not files_metadata: + if not raw_files_metadata: continue # Extract relevant files metadata. logger.info(f"Getting files metadata for dataset: {dataset_id}") files_metadata = extract_files_metadata( - files_metadata, node_base_url, dataset, logger=logger + raw_files_metadata, node_base_url, dataset, logger=logger ) all_files_metadata += files_metadata # Normalize files metadata with pydantic model (FileMetadata) @@ -666,22 +684,22 @@ def scrape_files_for_all_datasets( def extract_files_metadata( - raw_metadata: list[dict[str, Any]], + raw_metadata: list[dict], node_base_url: str, dataset: DatasetMetadata, logger: "loguru.Logger" = loguru.logger, -) -> list[dict[str, Any]]: +) -> list[dict]: """ Extract relevant metadata from raw MDposit files metadata. Parameters ---------- - raw_metadata: dict + raw_metadata: list[dict] Raw files metadata. node_base_url: str - The unique identifier of the dataset in MDposit. + The unique identifier of the dataset in MDDB. dataset: DatasetMetadata - Normalized dataset to scrape files metadata for. + Normalized dataset to get files metadata for. logger: "loguru.Logger" Logger for logging messages. 
@@ -694,7 +712,7 @@ def extract_files_metadata( files_metadata = [] for mdposit_file in raw_metadata: dataset_id = dataset.dataset_id_in_repository - file_name = Path(mdposit_file.get("filename")) + file_name = Path(mdposit_file.get("filename", "")) node_base_url_for_file = node_base_url.replace("/v1", "") file_path_url = ( f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" @@ -737,82 +755,79 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: # Create HTTPX client client = create_httpx_client() - # Iterate over the nodes - for data_source_name, base_url in MDDB_REPOSITORIES.items(): - # Create scraper context. - scraper = ScraperContext( - data_source_name=data_source_name, - output_dir_path=output_dir_path, - is_in_debug_mode=is_in_debug_mode, - ) - # Create logger. - level = "DEBUG" if scraper.is_in_debug_mode else "INFO" - logger = create_logger(logpath=scraper.log_file_path, level=level) - # Print scraper configuration. - logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) - logger.info(f"Starting {data_source_name.name} data scraping...") - # Check connection to the API - if is_connection_to_server_working( - client, f"{base_url}/projects/summary", logger=logger - ): - logger.success(f"Connection to {data_source_name} API successful!") - else: - logger.critical(f"Connection to {data_source_name} API failed.") - logger.critical("Aborting.") - sys.exit(1) - - # Scrape the datasets metadata. 
- datasets_raw_metadata = scrape_all_datasets( - client, - query_entry_point=f"{base_url}/projects", - node_name=data_source_name, - logger=logger, - scraper=scraper, - ) - if not datasets_raw_metadata: - logger.critical(f"No datasets found in {data_source_name}.") - logger.critical("Aborting.") - sys.exit(1) - - # Select datasets metadata - datasets_selected_metadata = extract_datasets_metadata( - datasets_raw_metadata, data_source_name, client, logger=logger - ) - # Validate datasets metadata with the DatasetMetadata Pydantic model. - datasets_normalized_metadata = normalize_datasets_metadata( - datasets_selected_metadata, logger=logger - ) - # Save datasets metadata to parquet file. - scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( - scraper.datasets_parquet_file_path, - datasets_normalized_metadata, - logger=logger, - ) - # Output first dataset metadata for debugging purposes. - logger.debug("First dataset metadata:") - logger.debug(datasets_normalized_metadata[0]) - # Scrape MDDB files metadata. - files_metadata = scrape_files_for_all_datasets( - client, - datasets_normalized_metadata, - base_url, - logger=logger, - ) - # Validate MDDB files metadata with the FileMetadata Pydantic model. - files_normalized_metadata = normalize_files_metadata( - files_metadata, logger=logger - ) - # Save files metadata to parquet file. - scraper.number_of_files_scraped = export_list_of_models_to_parquet( - scraper.files_parquet_file_path, - files_normalized_metadata, - logger=logger, - ) - # Output first file metadata for debugging purposes. - logger.debug("First file metadata:") - logger.debug(files_normalized_metadata[0]) - # Print scraping statistics. - print_statistics(scraper, logger=logger) + data_source_name = DatasetSourceName.MDDB + base_url = "https://mdposit.mddbr.eu/api/rest/v1" + # Create scraper context. 
+ scraper = ScraperContext( + data_source_name=data_source_name, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. + level = "DEBUG" if scraper.is_in_debug_mode else "INFO" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. + logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) + logger.info(f"Starting {data_source_name.name} data scraping...") + # Check connection to the API + if is_connection_to_server_working( + client, f"{base_url}/projects/summary", logger=logger + ): + logger.success(f"Connection to {data_source_name} API successful!") + else: + logger.critical(f"Connection to {data_source_name} API failed.") + logger.critical("Aborting.") + sys.exit(1) + + # Scrape the datasets metadata. + datasets_raw_metadata = scrape_all_datasets( + client, + query_entry_point=f"{base_url}/projects", + logger=logger, + scraper=scraper, + ) + if not datasets_raw_metadata: + logger.critical(f"No datasets found in {data_source_name}.") + logger.critical("Aborting.") + sys.exit(1) + + # Extract datasets metadata. + datasets_selected_metadata = extract_datasets_metadata( + datasets_raw_metadata, client, logger=logger + ) + # Validate datasets metadata with the DatasetMetadata Pydantic model. + datasets_normalized_metadata = normalize_datasets_metadata( + datasets_selected_metadata, logger=logger + ) + # Save datasets metadata to parquet file. + scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( + scraper.datasets_parquet_file_path, + datasets_normalized_metadata, + logger=logger, + ) + # Output first dataset metadata for debugging purposes. + logger.debug("First dataset metadata:") + logger.debug(datasets_normalized_metadata[0]) + # Scrape MDDB files metadata. 
+    files_metadata = scrape_files_for_all_datasets(
+        client,
+        datasets_normalized_metadata,
+        base_url,
+        logger=logger,
+    )
+    # Validate MDDB files metadata with the FileMetadata Pydantic model.
+    files_normalized_metadata = normalize_files_metadata(files_metadata, logger=logger)
+    # Save files metadata to parquet file.
+    scraper.number_of_files_scraped = export_list_of_models_to_parquet(
+        scraper.files_parquet_file_path,
+        files_normalized_metadata,
+        logger=logger,
+    )
+    # Output first file metadata for debugging purposes.
+    logger.debug("First file metadata:")
+    logger.debug(files_normalized_metadata[0])
+    # Print scraping statistics.
+    print_statistics(scraper, logger=logger)
 
 
 if __name__ == "__main__":

From 7a5f580d4448ac83d5e482392d431d835c056f30 Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Sat, 7 Feb 2026 01:24:41 +0100
Subject: [PATCH 25/43] refactor: Split log message

---
 src/mdverse_scrapers/scrapers/mddb.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py
index 65089c6..be388ed 100644
--- a/src/mdverse_scrapers/scrapers/mddb.py
+++ b/src/mdverse_scrapers/scrapers/mddb.py
@@ -233,9 +233,8 @@ def fetch_uniprot_protein_name(
         .get("value")
     )
     if protein_name:
-        logger.success(
-            f"Retrieved protein name for UniProt ID {uniprot_id}: {protein_name}"
-        )
+        logger.success(f"Retrieved protein name for UniProt ID {uniprot_id}:")
+        logger.success(protein_name)
         return protein_name
     else:
         logger.warning(
From d0324eeb585c23212c3ef0535f60bef793c3cb19 Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Sat, 7 Feb 2026 01:31:37 +0100
Subject: [PATCH 26/43] fix: Fix error when forcefield metadata is undefined

---
 src/mdverse_scrapers/scrapers/mddb.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py
index be388ed..99401b3 100644
---
a/src/mdverse_scrapers/scrapers/mddb.py
+++ b/src/mdverse_scrapers/scrapers/mddb.py
@@ -177,9 +177,11 @@ def extract_forcefield_or_model_and_version(
     """
     forcefields_and_models = []
     # Add forcefield names.
-    for forcefield in dataset_metadata.get("FF", []):
-        if isinstance(forcefield, str):
-            forcefields_and_models.append(ForceFieldModel(name=forcefield.strip()))
+    forcefields = dataset_metadata.get("FF")
+    if forcefields:
+        for forcefield in forcefields:
+            if isinstance(forcefield, str):
+                forcefields_and_models.append(ForceFieldModel(name=forcefield.strip()))
     # Add water model.
     water_model = dataset_metadata.get("WAT", "")
     if water_model:
From 8b57c76e8fbbee088263d1b16df2a92c2ccddb8b Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Sat, 7 Feb 2026 09:45:57 +0100
Subject: [PATCH 27/43] fix: Handle case with no protein sequence nor Uniprot identifier

---
 src/mdverse_scrapers/scrapers/mddb.py | 36 ++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py
index 99401b3..5e5d55b 100644
--- a/src/mdverse_scrapers/scrapers/mddb.py
+++ b/src/mdverse_scrapers/scrapers/mddb.py
@@ -278,6 +278,23 @@ def extract_proteins(
     """
     molecules = []
     # Case 1:
+    # We have no protein sequences and no UniProt identifiers.
+    if not protein_sequences and not uniprot_identifiers:
+        logger.info(
+            "No protein sequences or UniProt identifiers found "
+            f"in dataset {dataset_id}."
+        )
+        if pdb_identifiers:
+            molecules.append(
+                Molecule(
+                    name="Protein",
+                    type=MoleculeType.PROTEIN,
+                    sequence=None,
+                    external_identifiers=pdb_identifiers,
+                )
+            )
+        return molecules
+    # Case 2:
     # We have protein sequences but no UniProt identifiers.
     if protein_sequences and not uniprot_identifiers:
         logger.warning(
@@ -294,7 +311,7 @@ def extract_proteins(
             )
         )
         return molecules
-    # Case 2:
+    # Case 3:
     # We have UniProt identifiers but no protein sequences.
if uniprot_identifiers and not protein_sequences: logger.warning( @@ -315,7 +332,7 @@ def extract_proteins( ) ) return molecules - # Case 3: + # Case 4: # We have UniProt identifiers and protein sequences, # but their numbers do not match. if len(uniprot_identifiers) != len(protein_sequences): @@ -324,15 +341,16 @@ def extract_proteins( f"match number of protein sequences ({len(protein_sequences)}) in dataset " f"{dataset_id}." ) - molecules.append( - Molecule( - name="Unknown protein", - type=MoleculeType.PROTEIN, - external_identifiers=pdb_identifiers, + if pdb_identifiers: + molecules.append( + Molecule( + name="Unknown protein", + type=MoleculeType.PROTEIN, + external_identifiers=pdb_identifiers, + ) ) - ) return molecules - # Case 4: + # Case 5: # We have UniProt identifiers and protein sequences, # and their numbers match. for identifier, sequence in zip( From 024efa98dfdf29a0c3e1edb8ef5016605135b754 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 12:58:49 +0100 Subject: [PATCH 28/43] fix: Handle case when no software is available --- src/mdverse_scrapers/scrapers/mddb.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 5e5d55b..cb33d6d 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -132,6 +132,9 @@ def extract_software_and_version( """ Extract software names and versions from the nested dataset dictionary. + Example of dataset with no software: + https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001R9 + Parameters ---------- dataset_metadata: dict @@ -146,12 +149,12 @@ def extract_software_and_version( list[Software] | None A list of Software instances with `name` and `version` fields, None otherwise. 
""" - name = dataset_metadata.get("PROGRAM", "").strip() + name = dataset_metadata.get("PROGRAM") version = dataset_metadata.get("VERSION") if not name: logger.warning(f"No software found for dataset {dataset_id}.") return None - return [Software(name=name, version=version)] + return [Software(name=name.strip(), version=version)] def extract_forcefield_or_model_and_version( @@ -481,6 +484,8 @@ def extract_molecules( ) pdb_identifiers.append(external) # Add UniProt identifiers and protein sequence. + # Example with no PDBIDS, no PROTSEQ and no REFERENCES: + # https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001M3 proteins = extract_proteins( pdb_identifiers, dataset_metadata.get("REFERENCES", []), @@ -491,7 +496,8 @@ def extract_molecules( ) if proteins: molecules.extend(proteins) - # Add nucleic acids + # Add nucleic acids. + # See for instance: https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001M3 nucleic_acids = extract_nucleic_acids( pdb_identifiers, dataset_metadata.get("NUCLSEQ", []), dataset_id, logger=logger ) From 88b99556ac2567da5925a9f27edac3a6e8ed05ca Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 16:22:19 +0100 Subject: [PATCH 29/43] feat: Add InChIKey field for Molecule model --- src/mdverse_scrapers/models/enums.py | 1 + src/mdverse_scrapers/models/simulation.py | 1 + src/mdverse_scrapers/scrapers/mddb.py | 64 +++++++++++++++++------ 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index a8f23ab..e1282c8 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -42,3 +42,4 @@ class MoleculeType(StrEnum): LIPID = "lipid" CARBOHYDRATE = "carbohydrate" SOLVENT = "solvent" + SMALL_MOLECULE = "small_molecule" diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index b194fdc..5fbef97 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ 
b/src/mdverse_scrapers/models/simulation.py @@ -94,6 +94,7 @@ class Molecule(BaseModel): sequence: str | None = Field( None, description="Sequence of the molecule for protein and nucleic acid." ) + inchikey: str | None = Field(None, description="InChIKey of the molecule.") external_identifiers: list[ExternalIdentifier] | None = Field( None, description=("List of external database identifiers for this molecule."), diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index cb33d6d..bb8f16f 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -111,8 +111,8 @@ def scrape_all_datasets( logger.info(f"Scraped page {page}/{page_total} with {len(datasets)} datasets.") if total_datasets: logger.info( - f"Scraped {len(all_datasets)} datasets " - f"({len(all_datasets):,}/{total_datasets:,} " + f"Scraped {len(all_datasets):,} datasets " + f"({len(all_datasets):,}/{total_datasets:,}" f":{len(all_datasets) / total_datasets:.0%})" ) logger.debug("First dataset metadata on this page:") @@ -189,6 +189,15 @@ def extract_forcefield_or_model_and_version( water_model = dataset_metadata.get("WAT", "") if water_model: forcefields_and_models.append(ForceFieldModel(name=water_model.strip())) + # Print summary of extracted forcefields and models. + if forcefields_and_models: + logger.info( + f"Found {len(forcefields_and_models)} forcefield(s) or model(s) " + f"in dataset {dataset_id}." + ) + else: + logger.warning(f"No forcefield or model found for dataset {dataset_id}.") + return None return forcefields_and_models @@ -448,6 +457,17 @@ def extract_small_molecules( number_of_molecules=count, ) ) + # Get InChIKey for small molecules if available. 
+ inchikeys = dataset_metadata.get("INCHIKEYs") + if inchikeys and isinstance(inchikeys, list): + for inchikey in inchikeys: + molecules.append( + Molecule( + name="Small molecule", + type=MoleculeType.SMALL_MOLECULE, + inchikey=inchikey, + ) + ) return molecules @@ -495,6 +515,7 @@ def extract_molecules( logger=logger, ) if proteins: + logger.info(f"Found {len(proteins)} protein(s)") molecules.extend(proteins) # Add nucleic acids. # See for instance: https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001M3 @@ -502,13 +523,22 @@ def extract_molecules( pdb_identifiers, dataset_metadata.get("NUCLSEQ", []), dataset_id, logger=logger ) if nucleic_acids: + logger.info(f"Found {len(nucleic_acids)} nucleic acid(s)") molecules.extend(nucleic_acids) # Finally extract small molecules like lipids, solvents and ions. - small_molecules = extract_small_molecules(dataset_metadata, dataset_id, logger) + small_molecules = extract_small_molecules( + dataset_metadata, dataset_id, logger=logger + ) if small_molecules: + logger.info(f"Found {len(small_molecules)} small molecule(s)") molecules.extend(small_molecules) - if not molecules: - logger.warning(f"No molecules found in dataset {dataset_id}.") + # Print summary of extracted molecules. + if molecules: + logger.info( + f"Found a total of {len(molecules)} molecule(s) in dataset {dataset_id}" + ) + else: + logger.warning(f"No molecules found in dataset {dataset_id}") return None return molecules @@ -558,8 +588,8 @@ def extract_datasets_metadata( f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" ) else: - logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}.") - logger.error("Skipping dataset.") + logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}") + logger.error("Skipping dataset") continue dataset_metadata = dataset.get("metadata", {}) @@ -604,16 +634,20 @@ def extract_datasets_metadata( # Time step in fs. 
time_step = dataset_metadata.get("TIMESTEP") metadata["simulation_timesteps_in_fs"] = [time_step] if time_step else None - # Temperatures in kelvin + # Temperatures in kelvin. temperature = dataset_metadata.get("TEMP") - metadata["simulation_temperatures_in_kelvin"] = ( - [temperature] if temperature else None - ) + if temperature and isinstance(temperature, (int, float)): + metadata["simulation_temperatures_in_kelvin"] = [temperature] + logger.debug( + f"Found simulation temperature: {temperature} K in dataset {dataset_id}" + ) + else: + logger.warning(f"No simulation temperature found in dataset {dataset_id}") datasets_metadata.append(metadata) logger.info( - f"Extracted metadata for {len(datasets_metadata)} datasets " - f"({len(datasets_metadata):,}/{len(datasets):,}" - f":{len(datasets_metadata) / len(datasets):.0%})" + "Extracted metadata for " + f"{len(datasets_metadata):,}/{len(datasets):,} datasets " + f"({len(datasets_metadata) / len(datasets):.0%})" ) return datasets_metadata @@ -683,7 +717,7 @@ def scrape_files_for_all_datasets( """ all_files_metadata = [] for dataset_count, dataset in enumerate(datasets, start=1): - dataset_id = dataset.dataset_id_in_repository + dataset_id = dataset.dataset_id_in_project raw_files_metadata = scrape_files_for_one_dataset( client, url=f"{node_base_url}/projects/{dataset_id}/filenotes", From dd724a7d2dce73563d6f078e389479e680606cde Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 18:25:07 +0100 Subject: [PATCH 30/43] fix: Fix dataset_url_in_repository field --- src/mdverse_scrapers/scrapers/mddb.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index bb8f16f..8b8ef3d 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -196,7 +196,7 @@ def extract_forcefield_or_model_and_version( f"in dataset {dataset_id}." 
) else: - logger.warning(f"No forcefield or model found for dataset {dataset_id}.") + logger.warning(f"No forcefield or model found for dataset {dataset_id}") return None return forcefields_and_models @@ -571,7 +571,7 @@ def extract_datasets_metadata( # Extract node name. node_name = dataset.get("node", "") node_name_full = f"mdposit_{dataset.get('node', '')}_node" - # Create the dataset url depending on the node + # Create the dataset url depending on the node. dataset_repository_name = DatasetSourceName.UNKNOWN dataset_id_in_repository = "" dataset_url_in_repository = "" @@ -579,14 +579,25 @@ def extract_datasets_metadata( dataset_repository_name = DatasetSourceName.MDPOSIT_MMB_NODE dataset_id_in_repository = str(dataset.get("local")) dataset_url_in_repository = ( - f"https://mmb.mddbr.eu/#/id/{dataset_id}/overview" + f"https://mmb.mddbr.eu/#/id/{dataset_id_in_repository}/overview" ) - elif node_name_full == DatasetSourceName.MDPOSIT_INRIA_NODE: + elif ( + (node_name_full == DatasetSourceName.MDPOSIT_INRIA_NODE) + or (node_name_full == "inr") # For compatibility with error in database + ): dataset_repository_name = DatasetSourceName.MDPOSIT_INRIA_NODE dataset_id_in_repository = str(dataset.get("local")) dataset_url_in_repository = ( - f"https://dynarepo.inria.fr/#/id/{dataset_id}/overview" + f"https://dynarepo.inria.fr/#/id/{dataset_id_in_repository}/overview" ) + if node_name_full == "inr": + logger.warning( + f"Dataset {dataset_id} is associated with node 'inr', " + "which seems to be an error in the database" + ) + logger.warning( + f"Using node name '{DatasetSourceName.MDPOSIT_INRIA_NODE}'" + ) else: logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}") logger.error("Skipping dataset") From 9e0374f820e007eec65a4cb0e5f48afe95fd073a Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 18:33:09 +0100 Subject: [PATCH 31/43] docs: Print dataset URL in API --- src/mdverse_scrapers/scrapers/mddb.py | 13 +++++-------- 1 file 
changed, 5 insertions(+), 8 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 8b8ef3d..8920ca7 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -62,12 +62,11 @@ def scrape_all_datasets( Returns ------- list[dict]: - A list of MDposit entries. + List of MDposit entries metadata. """ logger.info("Scraping molecular dynamics datasets from MDposit.") logger.info(f"Using batches of {page_size} datasets.") all_datasets = [] - # Start by requesting the first page to get total number of datasets. logger.info("Requesting first page to get total number of datasets...") params = {"limit": 10, "page": 1} @@ -152,7 +151,7 @@ def extract_software_and_version( name = dataset_metadata.get("PROGRAM") version = dataset_metadata.get("VERSION") if not name: - logger.warning(f"No software found for dataset {dataset_id}.") + logger.warning("No software found for dataset") return None return [Software(name=name.strip(), version=version)] @@ -191,12 +190,9 @@ def extract_forcefield_or_model_and_version( forcefields_and_models.append(ForceFieldModel(name=water_model.strip())) # Print summary of extracted forcefields and models. if forcefields_and_models: - logger.info( - f"Found {len(forcefields_and_models)} forcefield(s) or model(s) " - f"in dataset {dataset_id}." - ) + logger.info(f"Found {len(forcefields_and_models)} forcefield(s) or model(s)") else: - logger.warning(f"No forcefield or model found for dataset {dataset_id}") + logger.warning("No forcefield or model found") return None return forcefields_and_models @@ -568,6 +564,7 @@ def extract_datasets_metadata( # Get the dataset id dataset_id = str(dataset.get("accession")) logger.info(f"Extracting metadata for dataset: {dataset_id}") + logger.debug(f"https://mdposit.mddbr.eu/api/rest/v1/projects/{dataset_id}") # Extract node name. 
node_name = dataset.get("node", "") node_name_full = f"mdposit_{dataset.get('node', '')}_node" From 6b959da83043397c6f5ae6d1211208cac1f7a2ab Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 18:45:42 +0100 Subject: [PATCH 32/43] feat: Align uniprot identifiers with protein sequences --- src/mdverse_scrapers/scrapers/mddb.py | 43 ++++++++++++++++++++------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 8920ca7..6d5839f 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -221,7 +221,7 @@ def fetch_uniprot_protein_name( """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") if uniprot_id in ("noref", "notfound"): - logger.warning(f"Cannot fetch protein name for UniProt ID '{uniprot_id}'.") + logger.warning(f"Cannot fetch protein name for UniProt ID '{uniprot_id}'") return "Unknown protein" # Defaut value for protein name: default_protein_name = f"Protein {uniprot_id}" @@ -233,7 +233,7 @@ def fetch_uniprot_protein_name( delay_before_request=0.1, ) if not response: - logger.error(f"Failed to query the UniProt API for ID {uniprot_id}.") + logger.error(f"Failed to query the UniProt API for ID {uniprot_id}") return default_protein_name protein_name = ( response.json() @@ -248,12 +248,12 @@ def fetch_uniprot_protein_name( return protein_name else: logger.warning( - f"Protein name not found in UniProt API response for ID {uniprot_id}." 
+ f"Protein name not found in UniProt API response for ID {uniprot_id}" ) return default_protein_name -def extract_proteins( +def extract_proteins( # noqa: C901 pdb_identifiers: list[ExternalIdentifier], uniprot_identifiers: list[str], protein_sequences: list[str], @@ -341,13 +341,33 @@ def extract_proteins( ) return molecules # Case 4: - # We have UniProt identifiers and protein sequences, + # We have one UniProt identifier and several protein sequences, + # we assume all protein sequences are associated with the same UniProt identifier. + if (len(uniprot_identifiers) == 1) and (len(protein_sequences) > 1): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier=uniprot_identifiers[0], + ) + protein_name = fetch_uniprot_protein_name( + client, uniprot_identifiers[0], logger=logger + ) + for sequence in protein_sequences: + molecules.append( + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=[external, *pdb_identifiers], + ) + ) + return molecules + # Case 5: + # We have more than one UniProt identifiers and several protein sequences, # but their numbers do not match. if len(uniprot_identifiers) != len(protein_sequences): logger.warning( f"Number of UniProt identifiers ({len(uniprot_identifiers)}) does not " - f"match number of protein sequences ({len(protein_sequences)}) in dataset " - f"{dataset_id}." + f"match number of protein sequences ({len(protein_sequences)})" ) if pdb_identifiers: molecules.append( @@ -358,7 +378,7 @@ def extract_proteins( ) ) return molecules - # Case 5: + # Case 6: # We have UniProt identifiers and protein sequences, # and their numbers match. for identifier, sequence in zip( @@ -438,12 +458,12 @@ def extract_small_molecules( A list of extracted small molecules or an empty list. 
""" molecules = [] - name_type_maping = { + name_type_mapping = { "SOL": MoleculeType.SOLVENT, "NA": MoleculeType.ION, "CL": MoleculeType.ION, } - for name, mol_type in name_type_maping.items(): + for name, mol_type in name_type_mapping.items(): count = dataset_metadata.get(name, 0) if isinstance(count, int) and count > 0: molecules.append( @@ -454,7 +474,7 @@ def extract_small_molecules( ) ) # Get InChIKey for small molecules if available. - inchikeys = dataset_metadata.get("INCHIKEYs") + inchikeys = dataset_metadata.get("INCHIKEYS") if inchikeys and isinstance(inchikeys, list): for inchikey in inchikeys: molecules.append( @@ -563,6 +583,7 @@ def extract_datasets_metadata( for dataset in datasets: # Get the dataset id dataset_id = str(dataset.get("accession")) + logger.info("-" * 30) logger.info(f"Extracting metadata for dataset: {dataset_id}") logger.debug(f"https://mdposit.mddbr.eu/api/rest/v1/projects/{dataset_id}") # Extract node name. From e3a353cef0de62aae5a1cdd5f76e3cdd522c859f Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sat, 7 Feb 2026 22:48:45 +0100 Subject: [PATCH 33/43] feat: Add replicas logic in file metadata extraction --- src/mdverse_scrapers/scrapers/mddb.py | 181 +++++++++++++------------- 1 file changed, 88 insertions(+), 93 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 6d5839f..8459973 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -7,7 +7,7 @@ import sys from pathlib import Path -from typing import Any +from urllib.parse import urlparse import click import httpx @@ -184,10 +184,12 @@ def extract_forcefield_or_model_and_version( for forcefield in forcefields: if isinstance(forcefield, str): forcefields_and_models.append(ForceFieldModel(name=forcefield.strip())) + logger.debug(f"Found forcefield/model: {forcefield.strip()}") # Add water model. 
water_model = dataset_metadata.get("WAT", "")
     if water_model:
         forcefields_and_models.append(ForceFieldModel(name=water_model.strip()))
+        logger.debug(f"Found water model: {water_model.strip()}")
     # Print summary of extracted forcefields and models.
     if forcefields_and_models:
         logger.info(f"Found {len(forcefields_and_models)} forcefield(s) or model(s)")
@@ -560,30 +562,33 @@ def extract_molecules(
 
 
 def extract_datasets_metadata(
-    datasets: list[dict[str, Any]],
+    datasets: list[dict],
     client: httpx.Client,
     logger: "loguru.Logger" = loguru.logger,
-) -> list[dict[str, Any]]:
+) -> tuple[list[dict], dict]:
     """
     Extract relevant metadata from raw MDposit datasets metadata.
 
     Parameters
     ----------
-    datasets: List[Dict[str, Any]]
+    datasets: list[dict]
         List of raw MDposit datasets metadata.
     logger: "loguru.Logger"
         Logger for logging messages.
 
     Returns
     -------
-    list[dict[str, Any]]
+    list[dict]
         List of dataset metadata dictionaries.
+    dict
+        Dictionary for replicas by dataset.
     """
     datasets_metadata = []
+    replicas = {}
     for dataset in datasets:
         # Get the dataset id
         dataset_id = str(dataset.get("accession"))
-        logger.info("-" * 30)
+        logger.info("-" * 50)
         logger.info(f"Extracting metadata for dataset: {dataset_id}")
         logger.debug(f"https://mdposit.mddbr.eu/api/rest/v1/projects/{dataset_id}")
         # Extract node name.
@@ -621,10 +626,10 @@ def extract_datasets_metadata( logger.error("Skipping dataset") continue - dataset_metadata = dataset.get("metadata", {}) - citations = dataset_metadata.get("CITATION") + simulation_metadata = dataset.get("metadata", {}) + citations = simulation_metadata.get("CITATION") external_links = [citations] if citations else None - authors = dataset_metadata.get("AUTHORS") + authors = simulation_metadata.get("AUTHORS") author_names = None if isinstance(authors, list): author_names = authors @@ -638,91 +643,56 @@ def extract_datasets_metadata( "dataset_id_in_project": dataset_id, "dataset_url_in_project": f"https://mdposit.mddbr.eu/#/id/{dataset_id}/overview", "external_links": external_links, - "title": dataset_metadata.get("NAME"), + "title": simulation_metadata.get("NAME"), "date_created": dataset.get("creationDate"), "date_last_updated": dataset.get("updateDate"), "number_of_files": len(dataset.get("files", [])), "author_names": author_names, - "license": dataset_metadata.get("LICENSE"), - "description": dataset_metadata.get("DESCRIPTION"), - "total_number_of_atoms": dataset_metadata.get("mdAtoms"), + "license": simulation_metadata.get("LICENSE"), + "description": simulation_metadata.get("DESCRIPTION"), + "total_number_of_atoms": simulation_metadata.get("mdAtoms"), } # Extract simulation metadata if available. # Software names with their versions. metadata["software"] = extract_software_and_version( - dataset_metadata, dataset_id, logger=logger + simulation_metadata, dataset_id, logger=logger ) # Forcefield and model names with their versions. metadata["forcefields_models"] = extract_forcefield_or_model_and_version( - dataset_metadata, dataset_id, logger=logger + simulation_metadata, dataset_id, logger=logger ) # Molecules with their nb of atoms and number total of atoms. metadata["molecules"] = extract_molecules( - dataset_metadata, dataset_id, client, logger=logger + simulation_metadata, dataset_id, client, logger=logger ) # Time step in fs. 
- time_step = dataset_metadata.get("TIMESTEP") + time_step = simulation_metadata.get("TIMESTEP") metadata["simulation_timesteps_in_fs"] = [time_step] if time_step else None # Temperatures in kelvin. - temperature = dataset_metadata.get("TEMP") + temperature = simulation_metadata.get("TEMP") if temperature and isinstance(temperature, (int, float)): metadata["simulation_temperatures_in_kelvin"] = [temperature] - logger.debug( - f"Found simulation temperature: {temperature} K in dataset {dataset_id}" - ) + logger.debug(f"Found simulation temperature: {temperature} K") else: - logger.warning(f"No simulation temperature found in dataset {dataset_id}") + logger.warning("No simulation temperature found") + # Extract replicas. + replica_list = dataset.get("mds") + if replica_list: + replicas[dataset_id] = replica_list + # Append extracted metadata. datasets_metadata.append(metadata) logger.info( "Extracted metadata for " f"{len(datasets_metadata):,}/{len(datasets):,} datasets " f"({len(datasets_metadata) / len(datasets):.0%})" ) - return datasets_metadata - - -def scrape_files_for_one_dataset( - client: httpx.Client, - url: str, - dataset_id: str, - logger: "loguru.Logger" = loguru.logger, -) -> list[dict] | None: - """ - Scrape files metadata for MDposit. - - Parameters - ---------- - client: httpx.Client - The HTTPX client to use for making requests. - url: str - The URL endpoint. - dataset_id: str - The unique identifier of the dataset in MDposit. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[dict] | None - File metadata dictionary for the dataset. 
- """ - logger.info(f"Scraping files for dataset ID: {dataset_id}") - response = make_http_request_with_retries( - client, - url, - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.1, - ) - if not response: - logger.error("Failed to fetch files metadata.") - return None - return response.json() + return datasets_metadata, replicas def scrape_files_for_all_datasets( client: httpx.Client, - datasets: list[DatasetMetadata], + datasets_metadata: list[DatasetMetadata], + datasets_replicas: dict, node_base_url: str, logger: "loguru.Logger" = loguru.logger, ) -> list[dict]: @@ -732,8 +702,10 @@ def scrape_files_for_all_datasets( ---------- client: httpx.Client The HTTPX client to use for making requests. - datasets: list[DatasetMetadata] + datasets_metadata: list[DatasetMetadata] List of datasets to scrape files metadata for. + datasets_replicas: dict + Dictionnary for replicas by dataset. node_base_url: str Base url of the specific node of MDposit API. logger: "loguru.Logger" @@ -745,28 +717,46 @@ def scrape_files_for_all_datasets( List of files metadata dictionaries. """ all_files_metadata = [] - for dataset_count, dataset in enumerate(datasets, start=1): + for dataset_count, dataset in enumerate(datasets_metadata, start=1): dataset_id = dataset.dataset_id_in_project - raw_files_metadata = scrape_files_for_one_dataset( - client, - url=f"{node_base_url}/projects/{dataset_id}/filenotes", - dataset_id=dataset_id, - logger=logger, - ) - if not raw_files_metadata: - continue - # Extract relevant files metadata. 
- logger.info(f"Getting files metadata for dataset: {dataset_id}") - files_metadata = extract_files_metadata( - raw_files_metadata, node_base_url, dataset, logger=logger - ) - all_files_metadata += files_metadata - # Normalize files metadata with pydantic model (FileMetadata) - logger.info(f"Total files found: {len(all_files_metadata):,}") + for replica_id, replica_name in enumerate( + datasets_replicas.get(dataset_id, []), start=1 + ): + logger.info(f"Scraping files for dataset: {dataset_id} / {replica_name}") + response = make_http_request_with_retries( + client, + url=f"{node_base_url}/projects/{dataset_id}.{replica_id}/filenotes", + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + logger=logger, + ) + if not response: + logger.error("Failed to fetch files metadata") + continue + raw_files_metadata = response.json() + # Extract relevant files metadata. + logger.info( + f"Extracting files metadata for dataset: {dataset_id} / {replica_name}" + ) + # We integrate replica name and id to distinguish files + # from different replicas of the same dataset, + # as they usually have the same names. + files_metadata = extract_files_metadata( + raw_files_metadata, + node_base_url, + dataset, + replica_id, + replica_name, + logger=logger, + ) + all_files_metadata += files_metadata + # Normalize files metadata with pydantic model (FileMetadata) + logger.info(f"Total files found: {len(all_files_metadata):,}") logger.info( "Extracted files metadata for " - f"{dataset_count:,}/{len(datasets):,} " - f"({dataset_count / len(datasets):.0%}) datasets." 
+ f"{dataset_count:,}/{len(datasets_metadata):,} " + f"({dataset_count / len(datasets_metadata):.0%}) datasets" ) return all_files_metadata @@ -775,6 +765,8 @@ def extract_files_metadata( raw_metadata: list[dict], node_base_url: str, dataset: DatasetMetadata, + replica_id: int, + replica_name: str, logger: "loguru.Logger" = loguru.logger, ) -> list[dict]: """ @@ -788,6 +780,10 @@ def extract_files_metadata( The unique identifier of the dataset in MDDB. dataset: DatasetMetadata Normalized dataset to get files metadata for. + replica_id: int + Identifer of the corresponding replica associated with the files. + replica_name: str + The name of the corresponding replica associated with the files. logger: "loguru.Logger" Logger for logging messages. @@ -801,22 +797,20 @@ def extract_files_metadata( for mdposit_file in raw_metadata: dataset_id = dataset.dataset_id_in_repository file_name = Path(mdposit_file.get("filename", "")) - node_base_url_for_file = node_base_url.replace("/v1", "") - file_path_url = ( - f"{node_base_url_for_file}/current/projects/{dataset_id}/files/{file_name}" - ) - - parsed_file = { + # Extract base url from dataset url. 
+ base_url = urlparse(dataset.dataset_url_in_repository).netloc + file_path_url = f"https://{base_url}/api/rest/current/projects/{dataset_id}.{replica_id}/files/{file_name}" + file_metadata = { "dataset_repository_name": dataset.dataset_repository_name, "dataset_id_in_repository": dataset_id, "dataset_url_in_repository": dataset.dataset_url_in_repository, - "file_name": str(file_name), + "file_name": f"{replica_name.replace(' ', '_')}/{file_name}", "file_size_in_bytes": mdposit_file.get("length", None), "file_md5": mdposit_file.get("md5", None), "file_url_in_repository": file_path_url, } - files_metadata.append(parsed_file) - logger.info(f"Extracted metadata for {len(files_metadata)} files.") + files_metadata.append(file_metadata) + logger.info(f"Extracted metadata for {len(files_metadata)} files") return files_metadata @@ -880,7 +874,7 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: sys.exit(1) # Extract datasets metadata. - datasets_selected_metadata = extract_datasets_metadata( + datasets_selected_metadata, replicas = extract_datasets_metadata( datasets_raw_metadata, client, logger=logger ) # Validate datasets metadata with the DatasetMetadata Pydantic model. @@ -900,6 +894,7 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: files_metadata = scrape_files_for_all_datasets( client, datasets_normalized_metadata, + replicas, base_url, logger=logger, ) From 70685843e92ab122f6ea9fa434adfa9359168aac Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sun, 8 Feb 2026 00:15:54 +0100 Subject: [PATCH 34/43] feat: Add rules to avoid lengthy try / except blocks --- AGENTS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 6813b72..efba1f8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,8 +24,9 @@ When writing code: When writing functions, always: -- Add descriptive docstrings. 
+- Add descriptive docstrings - Use early returns for error conditions +- Limit size of try / except blocks to the strict minimum Never import libraries by yourself. Always ask before adding dependencies. From 9cd0a88864e212b0174e4235010316ba5a8e8c76 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sun, 8 Feb 2026 00:16:33 +0100 Subject: [PATCH 35/43] fix: Add special case for 'inr' (INRIA) node name --- src/mdverse_scrapers/scrapers/mddb.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 8459973..e424c76 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -606,14 +606,14 @@ def extract_datasets_metadata( ) elif ( (node_name_full == DatasetSourceName.MDPOSIT_INRIA_NODE) - or (node_name_full == "inr") # For compatibility with error in database + or (node_name == "inr") # For compatibility with error in database ): dataset_repository_name = DatasetSourceName.MDPOSIT_INRIA_NODE dataset_id_in_repository = str(dataset.get("local")) dataset_url_in_repository = ( f"https://dynarepo.inria.fr/#/id/{dataset_id_in_repository}/overview" ) - if node_name_full == "inr": + if node_name == "inr": logger.warning( f"Dataset {dataset_id} is associated with node 'inr', " "which seems to be an error in the database" @@ -718,6 +718,7 @@ def scrape_files_for_all_datasets( """ all_files_metadata = [] for dataset_count, dataset in enumerate(datasets_metadata, start=1): + logger.info("-" * 50) dataset_id = dataset.dataset_id_in_project for replica_id, replica_name in enumerate( datasets_replicas.get(dataset_id, []), start=1 @@ -752,8 +753,8 @@ def scrape_files_for_all_datasets( ) all_files_metadata += files_metadata # Normalize files metadata with pydantic model (FileMetadata) - logger.info(f"Total files found: {len(all_files_metadata):,}") - logger.info( + logger.success(f"Total number of files found: {len(all_files_metadata):,}") + 
logger.success( "Extracted files metadata for " f"{dataset_count:,}/{len(datasets_metadata):,} " f"({dataset_count / len(datasets_metadata):.0%}) datasets" From 40ea3cad363efb66a5c3b0eb7c7cdc6524c16215 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sun, 8 Feb 2026 01:58:20 +0100 Subject: [PATCH 36/43] feat: Add Cineca MDDB node --- src/mdverse_scrapers/models/enums.py | 2 +- src/mdverse_scrapers/scrapers/mddb.py | 69 +++++++++++++++------------ 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index e1282c8..d8d4264 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -13,7 +13,6 @@ class DataType(StrEnum): class DatasetSourceName(StrEnum): """Molecular dynamics sources: data repositories and projects.""" - UNKNOWN = "unknown" ZENODO = "zenodo" FIGSHARE = "figshare" OSF = "osf" @@ -24,6 +23,7 @@ class DatasetSourceName(StrEnum): MDDB = "mddb" MDPOSIT_INRIA_NODE = "mdposit_inria_node" MDPOSIT_MMB_NODE = "mdposit_mmb_node" + MDPOSIT_CINECA_NODE = "mdposit_cineca_node" class ExternalDatabaseName(StrEnum): diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index e424c76..206e0ee 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -31,6 +31,29 @@ normalize_files_metadata, ) +MDDB_NODES = { + # INRIA node. + "inria": { + "name": DatasetSourceName.MDPOSIT_INRIA_NODE, + "base_url": "https://dynarepo.inria.fr", + }, + # INRIA node, with typo. + "inr": { + "name": DatasetSourceName.MDPOSIT_INRIA_NODE, + "base_url": "https://dynarepo.inria.fr", + }, + # MMB node. + "mmb": { + "name": DatasetSourceName.MDPOSIT_MMB_NODE, + "base_url": "https://mmb.mddbr.eu", + }, + # Cineca node. 
+ "cin": { + "name": DatasetSourceName.MDPOSIT_CINECA_NODE, + "base_url": "https://cineca.mddbr.eu", + }, +} + def scrape_all_datasets( client: httpx.Client, @@ -563,6 +586,7 @@ def extract_molecules( def extract_datasets_metadata( datasets: list[dict], + mddb_nodes: dict, client: httpx.Client, logger: "loguru.Logger" = loguru.logger, ) -> tuple[list[dict], dict]: @@ -573,6 +597,8 @@ def extract_datasets_metadata( ---------- datasets: list[dict] List of raw MDposit datasets metadata. + mddb_nodes: dict + Dictionnary of MDDB nodes. logger: "loguru.Logger" Logger for logging messages. @@ -593,39 +619,22 @@ def extract_datasets_metadata( logger.debug(f"https://mdposit.mddbr.eu/api/rest/v1/projects/{dataset_id}") # Extract node name. node_name = dataset.get("node", "") - node_name_full = f"mdposit_{dataset.get('node', '')}_node" # Create the dataset url depending on the node. - dataset_repository_name = DatasetSourceName.UNKNOWN - dataset_id_in_repository = "" - dataset_url_in_repository = "" - if node_name_full == DatasetSourceName.MDPOSIT_MMB_NODE: - dataset_repository_name = DatasetSourceName.MDPOSIT_MMB_NODE - dataset_id_in_repository = str(dataset.get("local")) - dataset_url_in_repository = ( - f"https://mmb.mddbr.eu/#/id/{dataset_id_in_repository}/overview" - ) - elif ( - (node_name_full == DatasetSourceName.MDPOSIT_INRIA_NODE) - or (node_name == "inr") # For compatibility with error in database - ): - dataset_repository_name = DatasetSourceName.MDPOSIT_INRIA_NODE - dataset_id_in_repository = str(dataset.get("local")) - dataset_url_in_repository = ( - f"https://dynarepo.inria.fr/#/id/{dataset_id_in_repository}/overview" - ) - if node_name == "inr": - logger.warning( - f"Dataset {dataset_id} is associated with node 'inr', " - "which seems to be an error in the database" - ) - logger.warning( - f"Using node name '{DatasetSourceName.MDPOSIT_INRIA_NODE}'" - ) - else: + dataset_id_in_repository = str(dataset.get("local")) + if node_name not in mddb_nodes: 
logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}") logger.error("Skipping dataset") continue - + if node_name == "inr": + logger.warning( + f"MDDB node 'inr' should probably be 'inria' for dataset {dataset_id}" + ) + dataset_repository_name = mddb_nodes[node_name]["name"] + dataset_url_in_repository = ( + f"{mddb_nodes[node_name]['base_url']}" + f"/#/id/{dataset_id_in_repository}/overview" + ) + # Extract simulation metadata. simulation_metadata = dataset.get("metadata", {}) citations = simulation_metadata.get("CITATION") external_links = [citations] if citations else None @@ -876,7 +885,7 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: # Extract datasets metadata. datasets_selected_metadata, replicas = extract_datasets_metadata( - datasets_raw_metadata, client, logger=logger + datasets_raw_metadata, MDDB_NODES, client, logger=logger ) # Validate datasets metadata with the DatasetMetadata Pydantic model. datasets_normalized_metadata = normalize_datasets_metadata( From a8ed77bf6b39dc2e320e7c8c2bf31053c59c56e4 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sun, 8 Feb 2026 17:17:37 +0100 Subject: [PATCH 37/43] feat: Add another way to get protein name from Uniprot --- src/mdverse_scrapers/scrapers/mddb.py | 28 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 206e0ee..08b58cb 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -105,7 +105,7 @@ def scrape_all_datasets( logger.error("Failed to fetch data from MDposit API.") return all_datasets total_datasets = int(response.json().get("filteredCount", 0)) - logger.success(f"Found a total of {total_datasets:,} datasets in MDposit.") + logger.success(f"Found a total of {total_datasets:,} datasets in MDposit") # Compute total number of pages to scrape based on total datasets and page size. 
page_total = total_datasets // page_size if total_datasets % page_size != 0: @@ -174,8 +174,9 @@ def extract_software_and_version( name = dataset_metadata.get("PROGRAM") version = dataset_metadata.get("VERSION") if not name: - logger.warning("No software found for dataset") + logger.warning("No software found") return None + logger.debug(f"Found software: {name.strip()} ({version})") return [Software(name=name.strip(), version=version)] @@ -246,7 +247,7 @@ def fetch_uniprot_protein_name( """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") if uniprot_id in ("noref", "notfound"): - logger.warning(f"Cannot fetch protein name for UniProt ID '{uniprot_id}'") + logger.warning("Uniprot ID is weird. Abording.") return "Unknown protein" # Defaut value for protein name: default_protein_name = f"Protein {uniprot_id}" @@ -258,8 +259,9 @@ def fetch_uniprot_protein_name( delay_before_request=0.1, ) if not response: - logger.error(f"Failed to query the UniProt API for ID {uniprot_id}") + logger.error("Failed to query the UniProt API") return default_protein_name + # First option: try to get the recommended name. protein_name = ( response.json() .get("proteinDescription", {}) @@ -267,14 +269,21 @@ def fetch_uniprot_protein_name( .get("fullName", {}) .get("value") ) + # Second option: try to get the submitted name. 
+ if not protein_name: + protein_name = ( + response.json() + .get("proteinDescription", {}) + .get("submissionNames", {}) + .get("fullName", {}) + .get("value") + ) if protein_name: - logger.success(f"Retrieved protein name for UniProt ID {uniprot_id}:") + logger.success("Retrieved protein name:") logger.success(protein_name) return protein_name else: - logger.warning( - f"Protein name not found in UniProt API response for ID {uniprot_id}" - ) + logger.warning("Cannot extract protein name from UniProt API response") return default_protein_name @@ -314,8 +323,7 @@ def extract_proteins( # noqa: C901 # We have no protein sequences but no UniProt identifiers. if not protein_sequences and not uniprot_identifiers: logger.info( - "No protein sequences or UniProt identifiers found " - f"in dataset {dataset_id}." + f"No protein sequences or UniProt identifiers found in dataset {dataset_id}" ) if pdb_identifiers: molecules.append( From 78842750a168e9b82248d71cac76d66b2e84bfe8 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Sun, 8 Feb 2026 21:13:41 +0100 Subject: [PATCH 38/43] fix: Update logic to fetch protein name from Uniprot --- src/mdverse_scrapers/scrapers/mddb.py | 39 ++++++++++++++------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 08b58cb..8dc404e 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -130,7 +130,7 @@ def scrape_all_datasets( datasets = response_json.get("projects", []) all_datasets.extend(datasets) - logger.info(f"Scraped page {page}/{page_total} with {len(datasets)} datasets.") + logger.info(f"Scraped page {page}/{page_total} with {len(datasets)} datasets") if total_datasets: logger.info( f"Scraped {len(all_datasets):,} datasets " @@ -270,20 +270,27 @@ def fetch_uniprot_protein_name( .get("value") ) # Second option: try to get the submitted name. 
+ # See for instance: https://rest.uniprot.org/uniprotkb/Q51760 if not protein_name: - protein_name = ( - response.json() - .get("proteinDescription", {}) - .get("submissionNames", {}) - .get("fullName", {}) - .get("value") + submission_name = ( + response.json().get("proteinDescription", {}).get("submissionNames") ) + # The "submissionNames" field can be a list. + # See for instance; https://rest.uniprot.org/uniprotkb/Q16968 + if submission_name and isinstance(submission_name, list): + protein_name = submission_name[0].get("fullName", {}).get("value") + # Or a dictionnary. + # See for instance: https://rest.uniprot.org/uniprotkb/Q51760 + elif submission_name and isinstance(submission_name, dict): + protein_name = submission_name.get("fullName", {}).get("value") if protein_name: logger.success("Retrieved protein name:") logger.success(protein_name) return protein_name else: - logger.warning("Cannot extract protein name from UniProt API response") + # Uniprot records are sometimes outdated or discontinued. + # See for instance: https://rest.uniprot.org/uniprotkb/Q9RHW0 + logger.error("Cannot extract protein name from UniProt API response") return default_protein_name @@ -322,9 +329,7 @@ def extract_proteins( # noqa: C901 # Case 1: # We have no protein sequences but no UniProt identifiers. if not protein_sequences and not uniprot_identifiers: - logger.info( - f"No protein sequences or UniProt identifiers found in dataset {dataset_id}" - ) + logger.info("Found no protein sequence nor UniProt identifier") if pdb_identifiers: molecules.append( Molecule( @@ -338,10 +343,7 @@ def extract_proteins( # noqa: C901 # Case 2: # We have protein sequences but no UniProt identifiers. if protein_sequences and not uniprot_identifiers: - logger.warning( - "Protein sequences found but no UniProt identifier " - f"in dataset {dataset_id}." 
- ) + logger.warning("Found protein sequences but no UniProt identifier") for sequence in protein_sequences: molecules.append( Molecule( @@ -355,10 +357,7 @@ def extract_proteins( # noqa: C901 # Case 3: # We have UniProt identifiers but no protein sequences. if uniprot_identifiers and not protein_sequences: - logger.warning( - "UniProt identifiers found but no protein sequence " - f"in dataset {dataset_id}." - ) + logger.warning("Found UniProt identifiers but no protein sequence") for identifier in uniprot_identifiers: external = ExternalIdentifier( database_name=ExternalDatabaseName.UNIPROT, identifier=identifier @@ -397,6 +396,8 @@ def extract_proteins( # noqa: C901 # Case 5: # We have more than one UniProt identifiers and several protein sequences, # but their numbers do not match. + # See for instance: https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A000AE + # with 2 UniProt identifiers and 4 protein sequences. if len(uniprot_identifiers) != len(protein_sequences): logger.warning( f"Number of UniProt identifiers ({len(uniprot_identifiers)}) does not " From 71f7c43d06b55109c1dccff926add8e070a50dc7 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 11 Feb 2026 14:09:27 +0100 Subject: [PATCH 39/43] docs: Fix typos --- docs/mddb.md | 4 ++-- src/mdverse_scrapers/scrapers/mddb.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/mddb.md b/docs/mddb.md index ee060bd..0a040ab 100644 --- a/docs/mddb.md +++ b/docs/mddb.md @@ -19,10 +19,10 @@ No account / token is needed to access the MDposit API. In MDposit, a dataset (a simulation and its related files) is called a "[project](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)". 
-APY entrypoint to get the total number of projects: +API entrypoint to get the total number of projects: - Endpoint: `/projects/summary` -- HTTP methode: GET +- HTTP method: GET - [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary) A project can contain multiple replicas, each identified by `project_id`.`replica_id`. diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index 8dc404e..e073684 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -247,7 +247,7 @@ def fetch_uniprot_protein_name( """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") if uniprot_id in ("noref", "notfound"): - logger.warning("Uniprot ID is weird. Abording.") + logger.warning("UniProt ID is weird. Aborting.") return "Unknown protein" # Defaut value for protein name: default_protein_name = f"Protein {uniprot_id}" @@ -288,7 +288,7 @@ def fetch_uniprot_protein_name( logger.success(protein_name) return protein_name else: - # Uniprot records are sometimes outdated or discontinued. + # UniProt records are sometimes outdated or discontinued. # See for instance: https://rest.uniprot.org/uniprotkb/Q9RHW0 logger.error("Cannot extract protein name from UniProt API response") return default_protein_name @@ -607,7 +607,7 @@ def extract_datasets_metadata( datasets: list[dict] List of raw MDposit datasets metadata. mddb_nodes: dict - Dictionnary of MDDB nodes. + Dictionary of MDDB nodes. logger: "loguru.Logger" Logger for logging messages. @@ -616,7 +616,7 @@ def extract_datasets_metadata( list[dict] List of dataset metadata dictionaries. dict - Dictionnary for replicas by dataset. + Dictionary for replicas by dataset. """ datasets_metadata = [] replicas = {} @@ -723,7 +723,7 @@ def scrape_files_for_all_datasets( datasets_metadata: list[DatasetMetadata] List of datasets to scrape files metadata for. datasets_replicas: dict - Dictionnary for replicas by dataset. 
+ Dictionary for replicas by dataset. node_base_url: str Base url of the specific node of MDposit API. logger: "loguru.Logger" From 3d003b3edd51e77bf36ddbe9a8e348d700178b21 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 11 Feb 2026 14:12:51 +0100 Subject: [PATCH 40/43] docs: Relax scraping time --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 059830d..ca370f9 100644 --- a/README.md +++ b/README.md @@ -170,10 +170,9 @@ This command will: 4. Validate entries using Pydantic models 5. Save the extracted metadata to Parquet files - ## Scrape MDDB -Have a look at the notes regarding [MDDB](docs/mddb.md) and its API. +See [MDDB](docs/mddb.md) to understand how with use scrape metadata from MDDB. Scrape MDDB (MDposit MMB node and MDposit Inria node) to collect molecular dynamics (MD) datasets and files: @@ -188,8 +187,7 @@ This command will: `DatasetMetadata` and `FileMetadata`. 3. Save validated files and datasets metadata. -The scraping takes about 13 minutes. - +The scraping process takes about 2 hours, depending on your network connection and hardware. ## Analyze Gromacs mdp and gro files From cf32a04fb959900cc001d42652eec64ea18416d9 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 11 Feb 2026 14:15:47 +0100 Subject: [PATCH 41/43] chore: Reallow PERF401 rules --- ruff.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 9bf13a6..436c4ca 100644 --- a/ruff.toml +++ b/ruff.toml @@ -41,7 +41,6 @@ extend-select = [ ignore = [ "COM812", # Redundant with ruff formatter. See: https://docs.astral.sh/ruff/rules/missing-trailing-comma/ "G004", # f-strings are allowed with the loguru module. See https://docs.astral.sh/ruff/rules/logging-f-string/ - "PERF401", # list.extend suggestion is not applicable when appending model instances. 
] # Force numpy-style for docstrings From 91595f155b158bf60d301128eaa17e59fc125d76 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 11 Feb 2026 14:16:58 +0100 Subject: [PATCH 42/43] docs: Remove MDDB node names --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ca370f9..8bfd1c9 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ This command will: See [MDDB](docs/mddb.md) to understand how with use scrape metadata from MDDB. -Scrape MDDB (MDposit MMB node and MDposit Inria node) to collect molecular dynamics (MD) datasets and files: +Scrape MDDB to collect molecular dynamics (MD) datasets and files: ```bash uv run scrape-mddb --output-dir data @@ -182,7 +182,7 @@ uv run scrape-mddb --output-dir data This command will: -1. Search for molecular dynamics datasets and files through the MDposit API nodes. +1. List all datasets and files through the main MDposit nodes. 2. Parse metadata and validate them using the Pydantic models `DatasetMetadata` and `FileMetadata`. 3. Save validated files and datasets metadata. From 66589732aba3c19be53b53670cc926d4528985a2 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Wed, 11 Feb 2026 14:31:02 +0100 Subject: [PATCH 43/43] refactor: Clean code --- src/mdverse_scrapers/scrapers/mddb.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py index e073684..cac1ba0 100644 --- a/src/mdverse_scrapers/scrapers/mddb.py +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -243,13 +243,13 @@ def fetch_uniprot_protein_name( Returns ------- str - Protein full name if available, None otherwise. + Protein full name if available, default name otherwise. """ logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") if uniprot_id in ("noref", "notfound"): logger.warning("UniProt ID is weird. 
Aborting.") return "Unknown protein" - # Defaut value for protein name: + # Default value for protein name: default_protein_name = f"Protein {uniprot_id}" response = make_http_request_with_retries( client, @@ -261,10 +261,10 @@ def fetch_uniprot_protein_name( if not response: logger.error("Failed to query the UniProt API") return default_protein_name + json_data = response.json() # First option: try to get the recommended name. protein_name = ( - response.json() - .get("proteinDescription", {}) + json_data.get("proteinDescription", {}) .get("recommendedName", {}) .get("fullName", {}) .get("value") @@ -272,9 +272,7 @@ def fetch_uniprot_protein_name( # Second option: try to get the submitted name. # See for instance: https://rest.uniprot.org/uniprotkb/Q51760 if not protein_name: - submission_name = ( - response.json().get("proteinDescription", {}).get("submissionNames") - ) + submission_name = json_data.get("proteinDescription", {}).get("submissionNames") # The "submissionNames" field can be a list. 
# See for instance; https://rest.uniprot.org/uniprotkb/Q16968 if submission_name and isinstance(submission_name, list): @@ -345,7 +343,7 @@ def extract_proteins( # noqa: C901 if protein_sequences and not uniprot_identifiers: logger.warning("Found protein sequences but no UniProt identifier") for sequence in protein_sequences: - molecules.append( + molecules.append( # noqa: PERF401 Molecule( name="Protein", type=MoleculeType.PROTEIN, @@ -384,7 +382,7 @@ def extract_proteins( # noqa: C901 client, uniprot_identifiers[0], logger=logger ) for sequence in protein_sequences: - molecules.append( + molecules.append( # noqa: PERF401 Molecule( name=protein_name, type=MoleculeType.PROTEIN, @@ -459,7 +457,7 @@ def extract_nucleic_acids( """ molecules = [] for sequence in nucleic_acid_sequences: - molecules.append( + molecules.append( # noqa: PERF401 Molecule( name="Nucleic acid", type=MoleculeType.NUCLEIC_ACID, @@ -511,7 +509,7 @@ def extract_small_molecules( inchikeys = dataset_metadata.get("INCHIKEYS") if inchikeys and isinstance(inchikeys, list): for inchikey in inchikeys: - molecules.append( + molecules.append( # noqa: PERF401 Molecule( name="Small molecule", type=MoleculeType.SMALL_MOLECULE, @@ -763,7 +761,6 @@ def scrape_files_for_all_datasets( # as they usually have the same names. files_metadata = extract_files_metadata( raw_files_metadata, - node_base_url, dataset, replica_id, replica_name, @@ -782,7 +779,6 @@ def scrape_files_for_all_datasets( def extract_files_metadata( raw_metadata: list[dict], - node_base_url: str, dataset: DatasetMetadata, replica_id: int, replica_name: str, @@ -795,12 +791,10 @@ def extract_files_metadata( ---------- raw_metadata: list[dict] Raw files metadata. - node_base_url: str - The unique identifier of the dataset in MDDB. dataset: DatasetMetadata Normalized dataset to get files metadata for. replica_id: int - Identifer of the corresponding replica associated with the files. 
+ Identifier of the corresponding replica associated with the files. replica_name: str The name of the corresponding replica associated with the files. logger: "loguru.Logger"