From 296e3068642e8020411dfade9b2c175a72011620 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 16 Feb 2026 14:04:26 +0100 Subject: [PATCH 1/3] feat: Add simulation metadata --- docs/atlas.md | 7 ++- src/mdverse_scrapers/scrapers/atlas.py | 61 ++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/docs/atlas.md b/docs/atlas.md index f02bb3c..19e0e92 100644 --- a/docs/atlas.md +++ b/docs/atlas.md @@ -1,6 +1,8 @@ # ATLAS -ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. All raw trajectories as well as the results of analysis are available for download. +ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, +accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. +All raw trajectories as well as the results of analysis are available for download. - web site: <https://www.dsimb.inserm.fr/ATLAS/> - publication: [ATLAS: protein flexibility description from atomistic molecular dynamics simulations](https://academic.oup.com/nar/article/52/D1/D384/7438909), Nucleic Acids Research, 2024. @@ -38,7 +40,8 @@ Example with dataset id `1k5n_A`: Remarks: - The title of the dataset is the protein name. -- No comment or description is provided. We used the organism as description. +- No comment or description is provided. We used the organism name as description. +- Parameters of molecular dynamics simulations are provided through the API endpoint <https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters>. 
### Metadata for files diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index e5124fe..89fcc1a 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -19,8 +19,9 @@ ) from ..core.toolbox import print_statistics from ..models.dataset import DatasetMetadata -from ..models.enums import DatasetSourceName +from ..models.enums import DatasetSourceName, ExternalDatabaseName from ..models.scraper import ScraperContext +from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software from ..models.utils import ( export_list_of_models_to_parquet, normalize_datasets_metadata, @@ -40,6 +41,14 @@ ], "doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909 "external_link": ["https://www.dsimb.inserm.fr/ATLAS/"], + "software_name": "GROMACS", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "software_version": "v2019.6", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "forcefied_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "forcefied_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "water_model": "TIP3P", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "simulation_temperature": 300, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "simulation_time": "100 ns", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "simulation_timestep": 2, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters } @@ -115,7 +124,7 @@ def extract_file_sizes_from_html( return files_metadata -def scrape_metadata_for_a_dataset( +def scrape_metadata_for_one_dataset( client: httpx.Client, chain_id: str, logger: "loguru.Logger" = loguru.logger, @@ -165,6 +174,52 @@ def scrape_metadata_for_a_dataset( "doi": ATLAS_METADATA["doi"], "external_links": ATLAS_METADATA["external_link"], } + # Add molecules. 
+ external_identifiers = [] + if meta_json.get("PDB"): + external_identifiers.append( + ExternalIdentifier( + database_name=ExternalDatabaseName.PDB, + identifier=meta_json["PDB"].split("_", maxsplit=1)[0], + ) + ) + if meta_json.get("UniProt"): + external_identifiers.append( + ExternalIdentifier( + database_name=ExternalDatabaseName.UNIPROT, + identifier=meta_json["UniProt"], + ) + ) + metadata["molecules"] = [ + Molecule( + name=meta_json.get("protein_name"), + sequence=meta_json.get("sequence"), + external_identifiers=external_identifiers, + ) + ] + # Add software. + metadata["software"] = [ + Software( + name=ATLAS_METADATA["software_name"], + version=ATLAS_METADATA["software_version"], + ) + ] + # Add forcefields and models. + metadata["forcefields_models"] = [ + ForceFieldModel( + name=ATLAS_METADATA["forcefield_name"], + version=ATLAS_METADATA["forcefield_version"], + ), + ForceFieldModel(name=ATLAS_METADATA["water_model"]), + ] + # Add simulation temperature. + metadata["simulation_temperatures_in_kelvin"] = [ + ATLAS_METADATA["simulation_temperature"] + ] + # Add simulation time. + metadata["simulation_times"] = [ATLAS_METADATA["simulation_time"]] + # Add simulation time step. 
+ metadata["simulation_timesteps_in_fs"] = [ATLAS_METADATA["simulation_timestep"]] logger.info("Done.") return metadata @@ -223,7 +278,7 @@ def scrape_all_datasets( datasets_meta = [] logger.info("Starting scraping of all datasets...") for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1): - metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger) + metadata = scrape_metadata_for_one_dataset(client, pdb_chain, logger=logger) if metadata: datasets_meta.append(metadata) logger.info( From 8cce6feff9bd55272f81309afaf739cf7af9183b Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 16 Feb 2026 14:07:28 +0100 Subject: [PATCH 2/3] fix: Fix typo in dictionary keys Close #86 --- src/mdverse_scrapers/scrapers/atlas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index 89fcc1a..a8ea374 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -43,8 +43,8 @@ "external_link": ["https://www.dsimb.inserm.fr/ATLAS/"], "software_name": "GROMACS", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters "software_version": "v2019.6", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters - "forcefied_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters - "forcefied_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "forcefield_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters + "forcefield_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters "water_model": "TIP3P", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters "simulation_temperature": 300, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters "simulation_time": "100 ns", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters From 6bbfedb5f59e0d7bef6212de372f93f043c5dd96 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 16 Feb 2026 14:13:12 +0100 
Subject: [PATCH 3/3] feat: Add molecular type --- src/mdverse_scrapers/scrapers/atlas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index a8ea374..75fedfe 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -19,7 +19,7 @@ ) from ..core.toolbox import print_statistics from ..models.dataset import DatasetMetadata -from ..models.enums import DatasetSourceName, ExternalDatabaseName +from ..models.enums import DatasetSourceName, ExternalDatabaseName, MoleculeType from ..models.scraper import ScraperContext from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software from ..models.utils import ( @@ -195,6 +195,7 @@ def scrape_metadata_for_one_dataset( name=meta_json.get("protein_name"), sequence=meta_json.get("sequence"), external_identifiers=external_identifiers, + type=MoleculeType.PROTEIN, ) ] # Add software.