Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docs/atlas.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# ATLAS

ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. All raw trajectories as well as the results of analysis are available for download.
ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures,
accompanied by their analysis in the form of interactive diagrams and trajectory visualisation.
All raw trajectories as well as the results of analysis are available for download.

- web site: <https://www.dsimb.inserm.fr/ATLAS/>
- publication: [ATLAS: protein flexibility description from atomistic molecular dynamics simulations](https://academic.oup.com/nar/article/52/D1/D384/7438909), Nucleic Acids Research, 2024.
Expand Down Expand Up @@ -38,7 +40,8 @@ Example with dataset id `1k5n_A`:
Remarks:

- The title of the dataset is the protein name.
- No comment or description is provided. We used the organism as description.
- No comment or description is provided, so we use the organism name as the description.
- Parameters of molecular dynamics simulations are provided through the API endpoint <https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters>.

### Metadata for files

Expand Down
62 changes: 59 additions & 3 deletions src/mdverse_scrapers/scrapers/atlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
)
from ..core.toolbox import print_statistics
from ..models.dataset import DatasetMetadata
from ..models.enums import DatasetSourceName
from ..models.enums import DatasetSourceName, ExternalDatabaseName, MoleculeType
from ..models.scraper import ScraperContext
from ..models.simulation import ExternalIdentifier, ForceFieldModel, Molecule, Software
from ..models.utils import (
export_list_of_models_to_parquet,
normalize_datasets_metadata,
Expand All @@ -40,6 +41,14 @@
],
"doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909
"external_link": ["https://www.dsimb.inserm.fr/ATLAS/"],
"software_name": "GROMACS", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"software_version": "v2019.6", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"forcefield_name": "CHARMM36m", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"forcefield_version": "July 2020", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"water_model": "TIP3P", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"simulation_temperature": 300, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"simulation_time": "100 ns", # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
"simulation_timestep": 2, # https://www.dsimb.inserm.fr/ATLAS/api/MD_parameters
}


Expand Down Expand Up @@ -115,7 +124,7 @@ def extract_file_sizes_from_html(
return files_metadata


def scrape_metadata_for_a_dataset(
def scrape_metadata_for_one_dataset(
client: httpx.Client,
chain_id: str,
logger: "loguru.Logger" = loguru.logger,
Expand Down Expand Up @@ -165,6 +174,53 @@ def scrape_metadata_for_a_dataset(
"doi": ATLAS_METADATA["doi"],
"external_links": ATLAS_METADATA["external_link"],
}
# Add molecules.
external_identifiers = []
if meta_json.get("PDB"):
external_identifiers.append(
ExternalIdentifier(
database_name=ExternalDatabaseName.PDB,
identifier=meta_json["PDB"].split("_", maxsplit=1)[0],
)
)
if meta_json.get("UniProt"):
external_identifiers.append(
ExternalIdentifier(
database_name=ExternalDatabaseName.UNIPROT,
identifier=meta_json["UniProt"],
)
)
metadata["molecules"] = [
Molecule(
name=meta_json.get("protein_name"),
sequence=meta_json.get("sequence"),
external_identifiers=external_identifiers,
type=MoleculeType.PROTEIN,
)
]
# Add software.
metadata["software"] = [
Software(
name=ATLAS_METADATA["software_name"],
version=ATLAS_METADATA["software_version"],
)
]
# Add forcefields and models.
metadata["forcefields_models"] = [
ForceFieldModel(
name=ATLAS_METADATA["forcefield_name"],
version=ATLAS_METADATA["forcefield_version"],
),
ForceFieldModel(name=ATLAS_METADATA["water_model"]),
]
# Add simulation temperature.
metadata["simulation_temperatures_in_kelvin"] = [
ATLAS_METADATA["simulation_temperature"]
]
# Add simulation time.
metadata["simulation_times"] = [ATLAS_METADATA["simulation_time"]]
# Add simulation time step.
metadata["simulation_timesteps_in_fs"] = [ATLAS_METADATA["simulation_timestep"]]
logger.info("Done.")
return metadata

Expand Down Expand Up @@ -223,7 +279,7 @@ def scrape_all_datasets(
datasets_meta = []
logger.info("Starting scraping of all datasets...")
for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1):
metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger)
metadata = scrape_metadata_for_one_dataset(client, pdb_chain, logger=logger)
if metadata:
datasets_meta.append(metadata)
logger.info(
Expand Down