diff --git a/AGENTS.md b/AGENTS.md index 6813b72..efba1f8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,8 +24,9 @@ When writing code: When writing functions, always: -- Add descriptive docstrings. +- Add descriptive docstrings - Use early returns for error conditions +- Limit size of try / except blocks to the strict minimum Never import libraries by yourself. Always ask before adding dependencies. diff --git a/README.md b/README.md index 368f115..8bfd1c9 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,24 @@ This command will: 4. Validate entries using Pydantic models 5. Save the extracted metadata to Parquet files +## Scrape MDDB + +See [MDDB](docs/mddb.md) to understand how we scrape metadata from MDDB. + +Scrape MDDB to collect molecular dynamics (MD) datasets and files: + +```bash +uv run scrape-mddb --output-dir data +``` + +This command will: + +1. List all datasets and files through the main MDposit nodes. +2. Parse metadata and validate them using the Pydantic models + `DatasetMetadata` and `FileMetadata`. +3. Save validated files and datasets metadata. + +The scraping process takes about 2 hours, depending on your network connection and hardware. ## Analyze Gromacs mdp and gro files diff --git a/docs/mddb.md b/docs/mddb.md new file mode 100644 index 0000000..0a040ab --- /dev/null +++ b/docs/mddb.md @@ -0,0 +1,87 @@ +# MDDB + +> The [MDDB (Molecular Dynamics Data Bank) project](https://mddbr.eu/about/) is an initiative to collect, preserve, and share molecular dynamics (MD) simulation data. As part of this project, **MDposit** is an open platform that provides web access to atomistic MD simulations. Its goal is to facilitate and promote data sharing within the global scientific community to advance research. + +The MDposit infrastructure is distributed across several MDposit nodes. 
All metadata are accessible through the global node: + +MDposit MMB node: + +- web site: +- documentation: +- API: +- API base URL: + +No account / token is needed to access the MDposit API. + +## Getting metadata + +### Datasets + +In MDposit, a dataset (a simulation and its related files) is called a "[project](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary)". + +API entrypoint to get the total number of projects: + +- Endpoint: `/projects/summary` +- HTTP method: GET +- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects_summary) + +A project can contain multiple replicas, each identified by `project_id`.`replica_id`. + +For example, the project [MD-A003ZP](https://mdposit.mddbr.eu/#/id/MD-A003ZP/overview) contains ten replicas: + +- `MD-A003ZP.1`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/overview +- `MD-A003ZP.2`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.2/overview +- `MD-A003ZP.3`: https://mdposit.mddbr.eu/#/id/MD-A003ZP.3/overview +- ... 
+ +API entrypoint to get all datasets at once: + +- Endpoint: `/projects` +- HTTP method: GET +- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/projects/get_projects) + +### Files + +API endpoint to get files for a given replica of a project: + +- Endpoint: `/projects/{project_id.replica_id}/filenotes` +- HTTP method: GET +- [documentation](https://mdposit.mddbr.eu/api/rest/docs/#/filenotes/get_projects__projectAccessionOrID__filenotes) + +## Examples + +### Project `MD-A003ZP` + +Title: + +> MDBind 3x1k + +Description: + +> 10 ns simulation of 1ma4m pdb structure from MDBind dataset, a dynamic view of the PDBBind database + +- [project on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A003ZP/overview) +- [project on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A003ZP) + +Files for replica 1: + +- [files on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A003ZP.1/files) +- [files on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A003ZP.1/filenotes) + +### Project `MD-A001T1` + +Title: + +> All-atom molecular dynamics simulations of SARS-CoV-2 envelope protein E in the monomeric form, C4 popc + +Description: + +> The trajectories of all-atom MD simulations were obtained based on 4 starting representative conformations from the CG simulation. For each starting structure, there are six trajectories of the E protein: 3 with the protein embedded in the membrane containing POPC, and 3 with the membrane mimicking the natural ERGIC membrane (Mix: 50% POPC, 25% POPE, 10% POPI, 5% POPS, 10% cholesterol). 
+ +- [project on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A001T1/overview) +- [project on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A001T1) + +Files for replica 1: + +- [files on MDposit GUI](https://mdposit.mddbr.eu/#/id/MD-A001T1.1/files) +- [files on MDposit API](https://mdposit.mddbr.eu/api/rest/current/projects/MD-A001T1.1/filenotes) diff --git a/pyproject.toml b/pyproject.toml index 48e1a24..a1d1bbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,3 +73,4 @@ scrape-figshare = "mdverse_scrapers.scrapers.figshare:main" scrape-nomad = "mdverse_scrapers.scrapers.nomad:main" scrape-atlas = "mdverse_scrapers.scrapers.atlas:main" scrape-gpcrmd = "mdverse_scrapers.scrapers.gpcrmd:main" +scrape-mddb = "mdverse_scrapers.scrapers.mddb:main" diff --git a/src/mdverse_scrapers/models/dataset.py b/src/mdverse_scrapers/models/dataset.py index 3333e1f..3cedbd2 100644 --- a/src/mdverse_scrapers/models/dataset.py +++ b/src/mdverse_scrapers/models/dataset.py @@ -170,7 +170,7 @@ def format_dates(cls, value: datetime | str | None) -> str | None: Parameters ---------- - cls : type[BaseDataset] + cls : type[DatasetMetadata] The Pydantic model class being validated. value : datetime | str | None The input value of the 'date' field to validate. 
diff --git a/src/mdverse_scrapers/models/enums.py b/src/mdverse_scrapers/models/enums.py index c94e5f4..d8d4264 100644 --- a/src/mdverse_scrapers/models/enums.py +++ b/src/mdverse_scrapers/models/enums.py @@ -20,6 +20,10 @@ class DatasetSourceName(StrEnum): ATLAS = "atlas" GPCRMD = "gpcrmd" NMRLIPIDS = "nmrlipids" + MDDB = "mddb" + MDPOSIT_INRIA_NODE = "mdposit_inria_node" + MDPOSIT_MMB_NODE = "mdposit_mmb_node" + MDPOSIT_CINECA_NODE = "mdposit_cineca_node" class ExternalDatabaseName(StrEnum): @@ -27,3 +31,15 @@ class ExternalDatabaseName(StrEnum): PDB = "pdb" UNIPROT = "uniprot" + + +class MoleculeType(StrEnum): + """Common molecular types found in molecular dynamics simulations.""" + + PROTEIN = "protein" + NUCLEIC_ACID = "nucleic_acid" + ION = "ion" + LIPID = "lipid" + CARBOHYDRATE = "carbohydrate" + SOLVENT = "solvent" + SMALL_MOLECULE = "small_molecule" diff --git a/src/mdverse_scrapers/models/simulation.py b/src/mdverse_scrapers/models/simulation.py index 0a23b2f..5fbef97 100644 --- a/src/mdverse_scrapers/models/simulation.py +++ b/src/mdverse_scrapers/models/simulation.py @@ -3,9 +3,16 @@ import re from typing import Annotated -from pydantic import BaseModel, ConfigDict, Field, StringConstraints, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + StringConstraints, + field_validator, + model_validator, +) -from .enums import ExternalDatabaseName +from .enums import ExternalDatabaseName, MoleculeType DOI = Annotated[ str, @@ -37,6 +44,30 @@ class ExternalIdentifier(BaseModel): None, min_length=1, description="Direct URL to the identifier into the database" ) + @model_validator(mode="after") + def compute_url(self) -> "ExternalIdentifier": + """Compute the URL for the external identifier. + + Parameters + ---------- + self: ExternalIdentifier + The model instance being validated, with all fields already validated. + + Returns + ------- + ExternalIdentifier + The model instance with the URL field computed if it was not provided. 
+ """ + if self.url is not None: + return self + + if self.database_name == ExternalDatabaseName.PDB: + self.url = f"https://www.rcsb.org/structure/{self.identifier}" + elif self.database_name == ExternalDatabaseName.UNIPROT: + self.url = f"https://www.uniprot.org/uniprotkb/{self.identifier}" + + return self + class Molecule(BaseModel): """Molecule in a simulation.""" @@ -45,6 +76,17 @@ class Molecule(BaseModel): model_config = ConfigDict(extra="forbid") name: str = Field(..., description="Name of the molecule.") + type: MoleculeType | None = Field( + None, + description="Type of the molecule." + "Allowed values in the MoleculeType enum. " + "Examples: PROTEIN, ION, LIPID...", + ) + number_of_molecules: int | None = Field( + None, + ge=0, + description="Number of molecules of this type in the simulation.", + ) number_of_atoms: int | None = Field( None, ge=0, description="Number of atoms in the molecule." ) @@ -52,11 +94,7 @@ class Molecule(BaseModel): sequence: str | None = Field( None, description="Sequence of the molecule for protein and nucleic acid." ) - number_of_molecules: int | None = Field( - None, - ge=0, - description="Number of molecules of this type in the simulation.", - ) + inchikey: str | None = Field(None, description="InChIKey of the molecule.") external_identifiers: list[ExternalIdentifier] | None = Field( None, description=("List of external database identifiers for this molecule."), @@ -66,8 +104,9 @@ class Molecule(BaseModel): class ForceFieldModel(BaseModel): """Forcefield or Model used in a simulation.""" - # Ensure scraped metadata matches the expected schema exactly. - model_config = ConfigDict(extra="forbid") + # Ensure scraped metadata matches the expected schema exactly + # and version is coerced to string when needed. 
+ model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True) name: str = Field( ..., @@ -81,8 +120,9 @@ class ForceFieldModel(BaseModel): class Software(BaseModel): """Simulation software or tool used in a simulation.""" - # Ensure scraped metadata matches the expected schema exactly. - model_config = ConfigDict(extra="forbid") + # Ensure scraped metadata matches the expected schema exactly + # and version is coerced to string when needed. + model_config = ConfigDict(extra="forbid", coerce_numbers_to_str=True) name: str = Field( ..., diff --git a/src/mdverse_scrapers/scrapers/mddb.py b/src/mdverse_scrapers/scrapers/mddb.py new file mode 100644 index 0000000..cac1ba0 --- /dev/null +++ b/src/mdverse_scrapers/scrapers/mddb.py @@ -0,0 +1,930 @@ +"""Scrape molecular dynamics simulation datasets and files from the MDDB. + +This script extracts molecular dynamics datasets managed by the +MDDB (Molecular Dynamics Data Bank) project +and the MDposit platform. +""" + +import sys +from pathlib import Path +from urllib.parse import urlparse + +import click +import httpx +import loguru + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + is_connection_to_server_working, + make_http_request_with_retries, +) +from ..core.toolbox import print_statistics +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetSourceName, ExternalDatabaseName, MoleculeType +from ..models.scraper import ScraperContext +from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software +from ..models.utils import ( + export_list_of_models_to_parquet, + normalize_datasets_metadata, + normalize_files_metadata, +) + +MDDB_NODES = { + # INRIA node. + "inria": { + "name": DatasetSourceName.MDPOSIT_INRIA_NODE, + "base_url": "https://dynarepo.inria.fr", + }, + # INRIA node, with typo. 
+ "inr": { + "name": DatasetSourceName.MDPOSIT_INRIA_NODE, + "base_url": "https://dynarepo.inria.fr", + }, + # MMB node. + "mmb": { + "name": DatasetSourceName.MDPOSIT_MMB_NODE, + "base_url": "https://mmb.mddbr.eu", + }, + # Cineca node. + "cin": { + "name": DatasetSourceName.MDPOSIT_CINECA_NODE, + "base_url": "https://cineca.mddbr.eu", + }, +} + + +def scrape_all_datasets( + client: httpx.Client, + query_entry_point: str, + page_size: int = 50, + logger: "loguru.Logger" = loguru.logger, + scraper: ScraperContext | None = None, +) -> list[dict]: + """ + Scrape Molecular Dynamics-related datasets from the MDposit API. + + Within the MDposit terminology, datasets are referred to as "projects". + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + query_entry_point: str + The entry point of the API request. + page_size: int + Number of entries to fetch per page. + logger: "loguru.Logger" + Logger for logging messages. + scraper: ScraperContext | None + Optional scraper context. When provided and running in debug mode, + dataset scraping is intentionally stopped early to limit the amount + of retrieved data. + + Returns + ------- + list[dict]: + List of MDposit entries metadata. + """ + logger.info("Scraping molecular dynamics datasets from MDposit.") + logger.info(f"Using batches of {page_size} datasets.") + all_datasets = [] + # Start by requesting the first page to get total number of datasets. 
+ logger.info("Requesting first page to get total number of datasets...") + params = {"limit": 10, "page": 1} + response = make_http_request_with_retries( + client, + query_entry_point, + method=HttpMethod.GET, + params=params, + timeout=60, + delay_before_request=0.2, + ) + if not response: + logger.error("Failed to fetch data from MDposit API.") + return all_datasets + total_datasets = int(response.json().get("filteredCount", 0)) + logger.success(f"Found a total of {total_datasets:,} datasets in MDposit") + # Compute total number of pages to scrape based on total datasets and page size. + page_total = total_datasets // page_size + if total_datasets % page_size != 0: + page_total += 1 + + for page in range(1, page_total + 1): + params = {"limit": page_size, "page": page} + response = make_http_request_with_retries( + client, + query_entry_point, + method=HttpMethod.GET, + params=params, + timeout=60, + delay_before_request=0.2, + ) + if not response: + logger.error("Failed to fetch data from MDposit API.") + logger.error("Jumping to next iteration.") + continue + + response_json = response.json() + datasets = response_json.get("projects", []) + all_datasets.extend(datasets) + + logger.info(f"Scraped page {page}/{page_total} with {len(datasets)} datasets") + if total_datasets: + logger.info( + f"Scraped {len(all_datasets):,} datasets " + f"({len(all_datasets):,}/{total_datasets:,}" + f":{len(all_datasets) / total_datasets:.0%})" + ) + logger.debug("First dataset metadata on this page:") + logger.debug(datasets[0] if datasets else "No datasets on this page") + + if scraper and scraper.is_in_debug_mode and len(all_datasets) >= 100: + logger.warning("Debug mode is ON: stopping after 100 datasets.") + return all_datasets + + logger.success(f"Scraped {len(all_datasets):,} datasets in MDposit.") + return all_datasets + + +def extract_software_and_version( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[Software] | None: + """ 
+ Extract software names and versions from the nested dataset dictionary. + + Example of dataset with no software: + https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001R9 + + Parameters + ---------- + dataset_metadata: dict + The dataset dictionary from which to extract molecules information. + dataset_id: str + Identifier of the dataset, used for logging. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[Software] | None + A list of Software instances with `name` and `version` fields, None otherwise. + """ + name = dataset_metadata.get("PROGRAM") + version = dataset_metadata.get("VERSION") + if not name: + logger.warning("No software found") + return None + logger.debug(f"Found software: {name.strip()} ({version})") + return [Software(name=name.strip(), version=version)] + + +def extract_forcefield_or_model_and_version( + dataset_metadata: dict, dataset_id: str, logger: "loguru.Logger" = loguru.logger +) -> list[ForceFieldModel] | None: + """ + Extract forcefield or model names and versions from the nested dataset dictionary. + + Parameters + ---------- + dataset_metadata: dict + The dataset dictionary from which to extract molecules information. + dataset_id: str + Identifier of the dataset entry, used for logging. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[ForceFieldModel] | None + A list of forcefield or model instances with `name` and `version` fields, + None otherwise. + """ + forcefields_and_models = [] + # Add forcefield names. + forcefields = dataset_metadata.get("FF") + if forcefields: + for forcefield in forcefields: + if isinstance(forcefield, str): + forcefields_and_models.append(ForceFieldModel(name=forcefield.strip())) + logger.debug(f"Found forcefield/model: {forcefield.strip()}") + # Add water model. 
+ water_model = dataset_metadata.get("WAT", "") + if water_model: + forcefields_and_models.append(ForceFieldModel(name=water_model.strip())) + logger.debug(f"Found water model: {water_model.strip()}") + # Print summary of extracted forcefields and models. + if forcefields_and_models: + logger.info(f"Found {len(forcefields_and_models)} forcefield(s) or model(s)") + else: + logger.warning("No forcefield or model found") + return None + return forcefields_and_models + + +def fetch_uniprot_protein_name( + client: httpx.Client, + uniprot_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> str: + """ + Retrieve protein name from UniProt API. + + Parameters + ---------- + client: httpx.Client + HTTP client used to perform the request. + uniprot_id: str + UniProt accession identifier. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + str + Protein full name if available, default name otherwise. + """ + logger.info(f"Fetching protein name for UniProt ID: {uniprot_id}") + if uniprot_id in ("noref", "notfound"): + logger.warning("UniProt ID is weird. Aborting.") + return "Unknown protein" + # Default value for protein name: + default_protein_name = f"Protein {uniprot_id}" + response = make_http_request_with_retries( + client, + f"https://rest.uniprot.org/uniprotkb/{uniprot_id}", + method=HttpMethod.GET, + timeout=30, + delay_before_request=0.1, + ) + if not response: + logger.error("Failed to query the UniProt API") + return default_protein_name + json_data = response.json() + # First option: try to get the recommended name. + protein_name = ( + json_data.get("proteinDescription", {}) + .get("recommendedName", {}) + .get("fullName", {}) + .get("value") + ) + # Second option: try to get the submitted name. + # See for instance: https://rest.uniprot.org/uniprotkb/Q51760 + if not protein_name: + submission_name = json_data.get("proteinDescription", {}).get("submissionNames") + # The "submissionNames" field can be a list. 
+ # See for instance: https://rest.uniprot.org/uniprotkb/Q16968 + if submission_name and isinstance(submission_name, list): + protein_name = submission_name[0].get("fullName", {}).get("value") + # Or a dictionary. + # See for instance: https://rest.uniprot.org/uniprotkb/Q51760 + elif submission_name and isinstance(submission_name, dict): + protein_name = submission_name.get("fullName", {}).get("value") + if protein_name: + logger.success("Retrieved protein name:") + logger.success(protein_name) + return protein_name + else: + # UniProt records are sometimes outdated or discontinued. + # See for instance: https://rest.uniprot.org/uniprotkb/Q9RHW0 + logger.error("Cannot extract protein name from UniProt API response") + return default_protein_name + + +def extract_proteins( # noqa: C901 + pdb_identifiers: list[ExternalIdentifier], + uniprot_identifiers: list[str], + protein_sequences: list[str], + client: httpx.Client, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> list: + """Extract proteins from dataset metadata. + + Parameters + ---------- + pdb_identifiers: list[ExternalIdentifier] + List of PDB identifiers to associate with the proteins. + uniprot_identifiers: list[str] + List of UniProt accessions + to associate with the proteins. + protein_sequences: list[str] + List of protein sequences. + client: httpx.Client + The HTTP client used for making requests. + dataset_id: str + The ID of the dataset being processed, used for logging. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list + A list of extracted proteins or empty list. + """ + molecules = [] + # Case 1: + # We have no protein sequences and no UniProt identifiers. 
+ if not protein_sequences and not uniprot_identifiers: + logger.info("Found no protein sequence nor UniProt identifier") + if pdb_identifiers: + molecules.append( + Molecule( + name="Protein", + type=MoleculeType.PROTEIN, + sequence=None, + external_identifiers=pdb_identifiers, + ) + ) + return molecules + # Case 2: + # We have protein sequences but no UniProt identifiers. + if protein_sequences and not uniprot_identifiers: + logger.warning("Found protein sequences but no UniProt identifier") + for sequence in protein_sequences: + molecules.append( # noqa: PERF401 + Molecule( + name="Protein", + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=pdb_identifiers, + ) + ) + return molecules + # Case 3: + # We have UniProt identifiers but no protein sequences. + if uniprot_identifiers and not protein_sequences: + logger.warning("Found UniProt identifiers but no protein sequence") + for identifier in uniprot_identifiers: + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, identifier=identifier + ) + protein_name = fetch_uniprot_protein_name(client, identifier, logger=logger) + molecules.append( + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=None, + external_identifiers=[external, *pdb_identifiers], + ) + ) + return molecules + # Case 4: + # We have one UniProt identifier and several protein sequences, + # we assume all protein sequences are associated with the same UniProt identifier. 
+ if (len(uniprot_identifiers) == 1) and (len(protein_sequences) > 1): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier=uniprot_identifiers[0], + ) + protein_name = fetch_uniprot_protein_name( + client, uniprot_identifiers[0], logger=logger + ) + for sequence in protein_sequences: + molecules.append( # noqa: PERF401 + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=[external, *pdb_identifiers], + ) + ) + return molecules + # Case 5: + # We have more than one UniProt identifiers and several protein sequences, + # but their numbers do not match. + # See for instance: https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A000AE + # with 2 UniProt identifiers and 4 protein sequences. + if len(uniprot_identifiers) != len(protein_sequences): + logger.warning( + f"Number of UniProt identifiers ({len(uniprot_identifiers)}) does not " + f"match number of protein sequences ({len(protein_sequences)})" + ) + if pdb_identifiers: + molecules.append( + Molecule( + name="Unknown protein", + type=MoleculeType.PROTEIN, + external_identifiers=pdb_identifiers, + ) + ) + return molecules + # Case 6: + # We have UniProt identifiers and protein sequences, + # and their numbers match. + for identifier, sequence in zip( + uniprot_identifiers, protein_sequences, strict=True + ): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, identifier=identifier + ) + protein_name = fetch_uniprot_protein_name(client, identifier, logger=logger) + molecules.append( + Molecule( + name=protein_name, + type=MoleculeType.PROTEIN, + sequence=sequence, + external_identifiers=[external, *pdb_identifiers], + ) + ) + return molecules + + +def extract_nucleic_acids( + pdb_identifiers: list[ExternalIdentifier], + nucleic_acid_sequences: list[str], + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> list: + """Extract nucleic acids from dataset metadata. 
+ + Parameters + ---------- + pdb_identifiers: list[ExternalIdentifier] + List of PDB identifiers to associate with the proteins. + nucleic_acid_sequences: list[str] + List of nucleic acid sequences. + dataset_id: str + The ID of the dataset being processed, used for logging. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list + A list of extracted nucleic acids. + """ + molecules = [] + for sequence in nucleic_acid_sequences: + molecules.append( # noqa: PERF401 + Molecule( + name="Nucleic acid", + type=MoleculeType.NUCLEIC_ACID, + sequence=sequence, + external_identifiers=pdb_identifiers, + ) + ) + return molecules + + +def extract_small_molecules( + dataset_metadata: dict, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> list: + """Extract small molecules (lipids, solvents, ions) from dataset metadata. + + Parameters + ---------- + dataset_metadata: dict + The dataset metadata containing information about the molecules. + dataset_id: str + The ID of the dataset being processed, used for logging. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list + A list of extracted small molecules or an empty list. + """ + molecules = [] + name_type_mapping = { + "SOL": MoleculeType.SOLVENT, + "NA": MoleculeType.ION, + "CL": MoleculeType.ION, + } + for name, mol_type in name_type_mapping.items(): + count = dataset_metadata.get(name, 0) + if isinstance(count, int) and count > 0: + molecules.append( + Molecule( + name=name, + type=mol_type, + number_of_molecules=count, + ) + ) + # Get InChIKey for small molecules if available. 
+ inchikeys = dataset_metadata.get("INCHIKEYS") + if inchikeys and isinstance(inchikeys, list): + for inchikey in inchikeys: + molecules.append( # noqa: PERF401 + Molecule( + name="Small molecule", + type=MoleculeType.SMALL_MOLECULE, + inchikey=inchikey, + ) + ) + return molecules + + +def extract_molecules( + dataset_metadata: dict, + dataset_id: str, + client: httpx.Client, + logger: "loguru.Logger" = loguru.logger, +) -> list[Molecule] | None: + """Coordinator function to extract all molecule types from dataset metadata. + + Parameters + ---------- + dataset_metadata: dict + The dataset metadata containing information about the molecules. + dataset_id: str + The ID of the dataset being processed. + client: httpx.Client + The HTTP client used for making requests. + logger: loguru.Logger + Logger for logging messages. + + Returns + ------- + list[Molecule] | None + A list of extracted molecules or None if no molecules were found. + """ + molecules = [] + # Add PDB identifiers as external identifiers. + pdb_identifiers = [] + for pdb in dataset_metadata.get("PDBIDS", []): + external = ExternalIdentifier( + database_name=ExternalDatabaseName.PDB, identifier=pdb + ) + pdb_identifiers.append(external) + # Add UniProt identifiers and protein sequence. + # Example with no PDBIDS, no PROTSEQ and no REFERENCES: + # https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001M3 + proteins = extract_proteins( + pdb_identifiers, + dataset_metadata.get("REFERENCES", []), + dataset_metadata.get("PROTSEQ", []), + client, + dataset_id, + logger=logger, + ) + if proteins: + logger.info(f"Found {len(proteins)} protein(s)") + molecules.extend(proteins) + # Add nucleic acids. 
+ # See for instance: https://mdposit.mddbr.eu/api/rest/v1/projects/MD-A001M3 + nucleic_acids = extract_nucleic_acids( + pdb_identifiers, dataset_metadata.get("NUCLSEQ", []), dataset_id, logger=logger + ) + if nucleic_acids: + logger.info(f"Found {len(nucleic_acids)} nucleic acid(s)") + molecules.extend(nucleic_acids) + # Finally extract small molecules like lipids, solvents and ions. + small_molecules = extract_small_molecules( + dataset_metadata, dataset_id, logger=logger + ) + if small_molecules: + logger.info(f"Found {len(small_molecules)} small molecule(s)") + molecules.extend(small_molecules) + # Print summary of extracted molecules. + if molecules: + logger.info( + f"Found a total of {len(molecules)} molecule(s) in dataset {dataset_id}" + ) + else: + logger.warning(f"No molecules found in dataset {dataset_id}") + return None + return molecules + + +def extract_datasets_metadata( + datasets: list[dict], + mddb_nodes: dict, + client: httpx.Client, + logger: "loguru.Logger" = loguru.logger, +) -> tuple[list[dict], dict]: + """ + Extract relevant metadata from raw MDposit datasets metadata. + + Parameters + ---------- + datasets: list[dict] + List of raw MDposit datasets metadata. + mddb_nodes: dict + Dictionary of MDDB nodes. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[dict] + List of dataset metadata dictionaries. + dict + Dictionary for replicas by dataset. + """ + datasets_metadata = [] + replicas = {} + for dataset in datasets: + # Get the dataset id + dataset_id = str(dataset.get("accession")) + logger.info("-" * 50) + logger.info(f"Extracting metadata for dataset: {dataset_id}") + logger.debug(f"https://mdposit.mddbr.eu/api/rest/v1/projects/{dataset_id}") + # Extract node name. + node_name = dataset.get("node", "") + # Create the dataset url depending on the node. 
+ dataset_id_in_repository = str(dataset.get("local")) + if node_name not in mddb_nodes: + logger.error(f"Unknown MDDB node '{node_name}' for dataset {dataset_id}") + logger.error("Skipping dataset") + continue + if node_name == "inr": + logger.warning( + f"MDDB node 'inr' should probably be 'inria' for dataset {dataset_id}" + ) + dataset_repository_name = mddb_nodes[node_name]["name"] + dataset_url_in_repository = ( + f"{mddb_nodes[node_name]['base_url']}" + f"/#/id/{dataset_id_in_repository}/overview" + ) + # Extract simulation metadata. + simulation_metadata = dataset.get("metadata", {}) + citations = simulation_metadata.get("CITATION") + external_links = [citations] if citations else None + authors = simulation_metadata.get("AUTHORS") + author_names = None + if isinstance(authors, list): + author_names = authors + elif isinstance(authors, str): + author_names = [authors.strip()] + metadata = { + "dataset_repository_name": dataset_repository_name, + "dataset_id_in_repository": dataset_id_in_repository, + "dataset_url_in_repository": dataset_url_in_repository, + "dataset_project_name": DatasetSourceName.MDDB, + "dataset_id_in_project": dataset_id, + "dataset_url_in_project": f"https://mdposit.mddbr.eu/#/id/{dataset_id}/overview", + "external_links": external_links, + "title": simulation_metadata.get("NAME"), + "date_created": dataset.get("creationDate"), + "date_last_updated": dataset.get("updateDate"), + "number_of_files": len(dataset.get("files", [])), + "author_names": author_names, + "license": simulation_metadata.get("LICENSE"), + "description": simulation_metadata.get("DESCRIPTION"), + "total_number_of_atoms": simulation_metadata.get("mdAtoms"), + } + # Extract simulation metadata if available. + # Software names with their versions. + metadata["software"] = extract_software_and_version( + simulation_metadata, dataset_id, logger=logger + ) + # Forcefield and model names with their versions. 
+ metadata["forcefields_models"] = extract_forcefield_or_model_and_version( + simulation_metadata, dataset_id, logger=logger + ) + # Molecules with their number of atoms and total number of atoms. + metadata["molecules"] = extract_molecules( + simulation_metadata, dataset_id, client, logger=logger + ) + # Time step in fs. + time_step = simulation_metadata.get("TIMESTEP") + metadata["simulation_timesteps_in_fs"] = [time_step] if time_step else None + # Temperatures in kelvin. + temperature = simulation_metadata.get("TEMP") + if temperature and isinstance(temperature, (int, float)): + metadata["simulation_temperatures_in_kelvin"] = [temperature] + logger.debug(f"Found simulation temperature: {temperature} K") + else: + logger.warning("No simulation temperature found") + # Extract replicas. + replica_list = dataset.get("mds") + if replica_list: + replicas[dataset_id] = replica_list + # Append extracted metadata. + datasets_metadata.append(metadata) + logger.info( + "Extracted metadata for " + f"{len(datasets_metadata):,}/{len(datasets):,} datasets " + f"({len(datasets_metadata) / len(datasets):.0%})" + ) + return datasets_metadata, replicas + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets_metadata: list[DatasetMetadata], + datasets_replicas: dict, + node_base_url: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """Scrape files metadata for all datasets in MDposit API. + + Parameters + ---------- + client: httpx.Client + The HTTPX client to use for making requests. + datasets_metadata: list[DatasetMetadata] + List of datasets to scrape files metadata for. + datasets_replicas: dict + Dictionary for replicas by dataset. + node_base_url: str + Base url of the specific node of MDposit API. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[dict] + List of files metadata dictionaries. 
+ """ + all_files_metadata = [] + for dataset_count, dataset in enumerate(datasets_metadata, start=1): + logger.info("-" * 50) + dataset_id = dataset.dataset_id_in_project + for replica_id, replica_name in enumerate( + datasets_replicas.get(dataset_id, []), start=1 + ): + logger.info(f"Scraping files for dataset: {dataset_id} / {replica_name}") + response = make_http_request_with_retries( + client, + url=f"{node_base_url}/projects/{dataset_id}.{replica_id}/filenotes", + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + logger=logger, + ) + if not response: + logger.error("Failed to fetch files metadata") + continue + raw_files_metadata = response.json() + # Extract relevant files metadata. + logger.info( + f"Extracting files metadata for dataset: {dataset_id} / {replica_name}" + ) + # We integrate replica name and id to distinguish files + # from different replicas of the same dataset, + # as they usually have the same names. + files_metadata = extract_files_metadata( + raw_files_metadata, + dataset, + replica_id, + replica_name, + logger=logger, + ) + all_files_metadata += files_metadata + # Normalize files metadata with pydantic model (FileMetadata) + logger.success(f"Total number of files found: {len(all_files_metadata):,}") + logger.success( + "Extracted files metadata for " + f"{dataset_count:,}/{len(datasets_metadata):,} " + f"({dataset_count / len(datasets_metadata):.0%}) datasets" + ) + return all_files_metadata + + +def extract_files_metadata( + raw_metadata: list[dict], + dataset: DatasetMetadata, + replica_id: int, + replica_name: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw MDposit files metadata. + + Parameters + ---------- + raw_metadata: list[dict] + Raw files metadata. + dataset: DatasetMetadata + Normalized dataset to get files metadata for. + replica_id: int + Identifier of the corresponding replica associated with the files. 
+    replica_name: str
+        The name of the corresponding replica associated with the files.
+    logger: "loguru.Logger"
+        Logger for logging messages.
+
+    Returns
+    -------
+    list[dict]
+        List of selected files metadata.
+    """
+    logger.info("Extracting files metadata...")
+    files_metadata = []
+    for mdposit_file in raw_metadata:
+        dataset_id = dataset.dataset_id_in_repository
+        file_name = Path(mdposit_file.get("filename", ""))
+        # Extract base URL from dataset URL.
+        base_url = urlparse(dataset.dataset_url_in_repository).netloc
+        file_path_url = f"https://{base_url}/api/rest/current/projects/{dataset_id}.{replica_id}/files/{file_name}"
+        file_metadata = {
+            "dataset_repository_name": dataset.dataset_repository_name,
+            "dataset_id_in_repository": dataset_id,
+            "dataset_url_in_repository": dataset.dataset_url_in_repository,
+            "file_name": f"{replica_name.replace(' ', '_')}/{file_name}",
+            "file_size_in_bytes": mdposit_file.get("length", None),
+            "file_md5": mdposit_file.get("md5", None),
+            "file_url_in_repository": file_path_url,
+        }
+        files_metadata.append(file_metadata)
+    logger.info(f"Extracted metadata for {len(files_metadata)} files")
+    return files_metadata
+
+
+@click.command(
+    help="Command line interface for MDverse scrapers",
+    epilog="Happy scraping!",
+)
+@click.option(
+    "--output-dir",
+    "output_dir_path",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
+    required=True,
+    help="Output directory path to save results.",
+)
+@click.option(
+    "--debug",
+    "is_in_debug_mode",
+    is_flag=True,
+    default=False,
+    help="Enable debug mode.",
+)
+def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None:
+    """Scrape molecular dynamics datasets and files from MDDB."""
+    # Create HTTPX client
+    client = create_httpx_client()
+
+    data_source_name = DatasetSourceName.MDDB
+    base_url = "https://mdposit.mddbr.eu/api/rest/v1"
+    # Create scraper context.
+ scraper = ScraperContext( + data_source_name=data_source_name, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. + level = "DEBUG" if scraper.is_in_debug_mode else "INFO" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. + logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) + logger.info(f"Starting {data_source_name.name} data scraping...") + # Check connection to the API + if is_connection_to_server_working( + client, f"{base_url}/projects/summary", logger=logger + ): + logger.success(f"Connection to {data_source_name} API successful!") + else: + logger.critical(f"Connection to {data_source_name} API failed.") + logger.critical("Aborting.") + sys.exit(1) + + # Scrape the datasets metadata. + datasets_raw_metadata = scrape_all_datasets( + client, + query_entry_point=f"{base_url}/projects", + logger=logger, + scraper=scraper, + ) + if not datasets_raw_metadata: + logger.critical(f"No datasets found in {data_source_name}.") + logger.critical("Aborting.") + sys.exit(1) + + # Extract datasets metadata. + datasets_selected_metadata, replicas = extract_datasets_metadata( + datasets_raw_metadata, MDDB_NODES, client, logger=logger + ) + # Validate datasets metadata with the DatasetMetadata Pydantic model. + datasets_normalized_metadata = normalize_datasets_metadata( + datasets_selected_metadata, logger=logger + ) + # Save datasets metadata to parquet file. + scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( + scraper.datasets_parquet_file_path, + datasets_normalized_metadata, + logger=logger, + ) + # Output first dataset metadata for debugging purposes. + logger.debug("First dataset metadata:") + logger.debug(datasets_normalized_metadata[0]) + # Scrape MDDB files metadata. 
+ files_metadata = scrape_files_for_all_datasets( + client, + datasets_normalized_metadata, + replicas, + base_url, + logger=logger, + ) + # Validate MDDB files metadata with the FileMetadata Pydantic model. + files_normalized_metadata = normalize_files_metadata(files_metadata, logger=logger) + # Save files metadata to parquet file. + scraper.number_of_files_scraped = export_list_of_models_to_parquet( + scraper.files_parquet_file_path, + files_normalized_metadata, + logger=logger, + ) + # Output first file metadata for debugging purposes. + logger.debug("First file metadata:") + logger.debug(files_normalized_metadata[0]) + # Print scraping statistics. + print_statistics(scraper, logger=logger) + + +if __name__ == "__main__": + main() diff --git a/tests/models/test_simulation.py b/tests/models/test_simulation.py index 25e5d5c..860ee1d 100644 --- a/tests/models/test_simulation.py +++ b/tests/models/test_simulation.py @@ -13,9 +13,9 @@ ) -# ------------------------------------------------------------------- +# -------------------------------------------------- # Test simulation timestep and time positive values -# ------------------------------------------------------------------- +# -------------------------------------------------- @pytest.mark.parametrize( ("values", "should_raise_exception"), [ @@ -35,9 +35,9 @@ def test_positive_simulation_values(values, should_raise_exception): assert metadata.simulation_timesteps_in_fs == values -# ------------------------------------------------------------------- +# ------------------------------ # Test temperature normalization -# ------------------------------------------------------------------- +# ------------------------------ @pytest.mark.parametrize( ("test_temp", "expected_temp_in_kelvin"), [ @@ -54,9 +54,9 @@ def test_temperature_normalization(test_temp, expected_temp_in_kelvin): assert metadata.simulation_temperatures_in_kelvin == expected_temp_in_kelvin -# 
------------------------------------------------------------------- +# ---------------------------------------------- # Test software, molecules, forcefields creation -# ------------------------------------------------------------------- +# ---------------------------------------------- def test_structured_fields_creation(): """Test that software, molecules, and forcefields can be created.""" metadata = SimulationMetadata( @@ -89,18 +89,18 @@ def test_structured_fields_creation(): assert metadata.molecules[0].external_identifiers[0].identifier == "1ABC" -# ------------------------------------------------------------------- +# ------------------- # Test invalid fields -# ------------------------------------------------------------------- +# ------------------- def test_invalid_fields(): """Test with a non-existing fields.""" with pytest.raises(ValidationError): SimulationMetadata(total_number_of_something=1000) -# ------------------------- +# -------------------------------------- # Test invalid simulation parameter type -# ------------------------- +# -------------------------------------- def test_invalid_simulation_value_type(): """Test that non-numeric strings raise ValidationError.""" with pytest.raises(ValidationError): diff --git a/tests/models/test_simulation_molecule.py b/tests/models/test_simulation_molecule.py index a3e1b55..50c4db7 100644 --- a/tests/models/test_simulation_molecule.py +++ b/tests/models/test_simulation_molecule.py @@ -54,14 +54,12 @@ def test_invalid_number_of_molecules(): "1K79", "https://www.rcsb.org/structure/1K79", ), - (ExternalDatabaseName.PDB, 1234, "1234", None), ( ExternalDatabaseName.UNIPROT, "P06213", "P06213", "https://www.uniprot.org/uniprotkb/P06213/entry", ), - (ExternalDatabaseName.UNIPROT, 123456, "123456", None), ], ) def test_external_identifier_creation( @@ -91,3 +89,39 @@ def test_invalid_database_name_in_external_identifiers(): database_name=ExternalDatabaseName.DUMMY, # type: ignore identifier="1ABC", ) + + 
+@pytest.mark.parametrize(
+    ("database_name", "identifier", "expected_identifier"),
+    [
+        (ExternalDatabaseName.PDB, 1234, "1234"),
+        (ExternalDatabaseName.UNIPROT, 123456, "123456"),
+    ],
+)
+def test_external_identifier_coerces_int_to_str(
+    database_name,
+    identifier,
+    expected_identifier,
+):
+    """Test that integer identifiers are coerced to strings."""
+    ext_id = ExternalIdentifier(
+        database_name=database_name,
+        identifier=identifier,
+    )
+
+    assert ext_id.identifier == expected_identifier
+
+
+def test_compute_url_in_external_identifier():
+    """Test that the compute_url method generates the correct URL."""
+    identifier = ExternalIdentifier(
+        database_name=ExternalDatabaseName.PDB,
+        identifier="1ABC",
+    )
+    assert identifier.url == "https://www.rcsb.org/structure/1ABC"
+
+    identifier = ExternalIdentifier(
+        database_name=ExternalDatabaseName.UNIPROT,
+        identifier="P12345",
+    )
+    assert identifier.url == "https://www.uniprot.org/uniprotkb/P12345/entry"