From 463947ddc1fbcc52004c49bace75780536b17af5 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Thu, 16 Mar 2023 19:13:19 +0000 Subject: [PATCH 1/4] Initial ThermoML transform script --- data/thermoml_archive/requirements.txt | 3 + data/thermoml_archive/transform.py | 140 +++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 data/thermoml_archive/requirements.txt create mode 100644 data/thermoml_archive/transform.py diff --git a/data/thermoml_archive/requirements.txt b/data/thermoml_archive/requirements.txt new file mode 100644 index 000000000..7d75ea7c3 --- /dev/null +++ b/data/thermoml_archive/requirements.txt @@ -0,0 +1,3 @@ +git+https://github.com/sustainable-processes/thermopyl +tqdm +pyyaml diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py new file mode 100644 index 000000000..ebbcb5b1b --- /dev/null +++ b/data/thermoml_archive/transform.py @@ -0,0 +1,140 @@ +import hashlib +import pathlib +import tarfile +import warnings + +import pandas as pd +import requests +import tqdm +import yaml +from thermopyl import Parser + +from chemnlp.data_val.model import Dataset + + +def get_and_transform_data(): + """Downloads the archived version of ThermoML, extracts it and + loops through the provided JSON-LD files to construct a dataframe. + + """ + # get raw data + fname = "ThermoML.v2020-09-30.tgz" + download_path = pathlib.Path(__file__).parent / fname + remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}" + sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2" + final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv" + final_expected_csv_checksum = "" + + if not download_path.exists(): + data = requests.get(remote_data_path) + with open(download_path, "wb") as f: + for chunk in tqdm.tqdm( + data.iter_content(chunk_size=8192), desc="Downloading archive" + ): + f.write(chunk) + + # check if checksum is correct + sha256 = hashlib.sha256() + with open(download_path, "rb") as f: + for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"): + sha256.update(chunk) + + if received_hash := sha256.hexdigest() != sha256_checksum: + raise RuntimeError( + "Downloaded file did not match expected checksum -- " + "either a new version has been released or something has gone wrong!\n" + f"Expected: {sha256_checksum}\n" + f"Received: {received_hash}" + ) + + # Extract tar.gz archive + with tarfile.open(download_path, "r:*") as tar: + tar.extractall(pathlib.Path(__file__).parent) + + # Loop through journal DOI folders and scrape files + + if final_csv_path.exists(): + sha256 = hashlib.sha256() + with open(final_csv_path, "rb") as f: + for chunk in tqdm.tqdm( + iter(lambda: f.read(8192), b""), desc="Checking hash" + ): + sha256.update(chunk) + if sha256.hexdigest() != final_expected_csv_checksum: + warnings.warn( + "Old CSV file did not match expected checksum, will try to recreate." + ) + final_csv_path.rename(final_csv_path.with_suffix(".old.csv")) + + root_dois = ("10.1007", "10.1016", "10.1021") + + num_points = 0 + num_failed = 0 + for doi in root_dois: + for path in tqdm.tqdm( + (pathlib.Path(__file__).parent / doi).glob("*.xml"), + desc=f"Looping over files in {doi}", + ): + with open(path, "r") as f: + try: + pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a") + num_points += 1 + except Exception: + num_failed += 1 + + print(f"Ingested {num_points} with {num_failed} failures.") + + sha256 = hashlib.sha256() + with open(final_csv_path, "rb") as f: + for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"): + sha256.update(chunk) + + if csv_hash := sha256.hexdigest() != final_expected_csv_checksum: + warnings.warn( + "Final CSV file did not match expected checksum!\n" + f"Expected: {final_expected_csv_checksum}\n" + f"Received: {csv_hash}" + ) + + # create metadata + meta = Dataset( + **{ + "name": "thermoml_archive", + "description": "ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.", # noqa + "identifiers": [ + { + "id": "", + "type": "inchi", + }, + { + "id": "", + "type": "inchikey", + }, + ], + "license": "https://www.nist.gov/open/license", + "links": [ + { + "url": "https://doi.org/10.18434/mds2-2422", + "description": "data publication", + }, + { + "url": "https://www.nist.gov/publications/towards-improved-fairness-thermoml-archive", + "description": "NIST publication description", + }, + { + "url": "https://trc.nist.gov/ThermoML", + "description": "Live database hosted at NIST Thermodynamics Research Center", + }, + ], + "num_points": num_points, + "bibtex": [ + "@article{Riccardi2022,title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},author = {Riccardi, Demian and Trautt, Zachary and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},year = {2022},journal = {Journal of Computational Chemistry},volume = {43},number = {12},pages = {879--887},doi = {10.1002/jcc.26842},langid = {english}}", # noqa + ], + } + ) + with open("meta.yaml", "w") as f: + yaml.dump(meta.dict(), f, sort_keys=False) + + +if __name__ == "__main__": + get_and_transform_data() From 21ec32f1e24bd4694e3e32d4220fb16e10750cb7 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Thu, 16 Mar 2023 19:14:08 +0000 Subject: [PATCH 2/4] Update docstring --- data/thermoml_archive/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py index ebbcb5b1b..38784f417 100644 --- a/data/thermoml_archive/transform.py +++ b/data/thermoml_archive/transform.py @@ -14,7 +14,7 @@ def get_and_transform_data(): """Downloads the archived version of ThermoML, extracts it and - loops through the provided JSON-LD files to construct a dataframe. + parses the provided XML files with thermopyl to construct a flat csv. """ # get raw data From 0e0bb71b80bc31cb28662d50f4e5d68c2b89f2e0 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Thu, 16 Mar 2023 22:49:47 +0000 Subject: [PATCH 3/4] Wrap sha256 calculator in function and add final csv checksum --- data/thermoml_archive/transform.py | 57 ++++++++++++++++++------------ 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py index 38784f417..8fd21ef02 100644 --- a/data/thermoml_archive/transform.py +++ b/data/thermoml_archive/transform.py @@ -2,12 +2,13 @@ import pathlib import tarfile import warnings +from typing import BinaryIO import pandas as pd import requests import tqdm import yaml -from thermopyl import Parser +from thermopyl import Parser as ThermoPylParser from chemnlp.data_val.model import Dataset @@ -17,13 +18,16 @@ def get_and_transform_data(): parses the provided XML files with thermopyl to construct a flat csv. """ + # get raw data fname = "ThermoML.v2020-09-30.tgz" download_path = pathlib.Path(__file__).parent / fname remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}" sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2" final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv" - final_expected_csv_checksum = "" + final_expected_csv_checksum = ( + "fc296f47c1877b6ace72f7aa4a80c489b80d0eb25ea3a59885d067e554378b08" + ) if not download_path.exists(): data = requests.get(remote_data_path) @@ -34,12 +38,10 @@ def get_and_transform_data(): f.write(chunk) # check if checksum is correct - sha256 = hashlib.sha256() with open(download_path, "rb") as f: - for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"): - sha256.update(chunk) + received_hash = _sha256_chunked_file_digest(f) - if received_hash := sha256.hexdigest() != sha256_checksum: + if received_hash != sha256_checksum: raise RuntimeError( "Downloaded file did not match expected checksum -- " "either a new version has been released or something has gone wrong!\n" @@ -47,24 +49,24 @@ def get_and_transform_data(): f"Received: {received_hash}" ) - # Extract tar.gz archive - with tarfile.open(download_path, "r:*") as tar: - tar.extractall(pathlib.Path(__file__).parent) - # Loop through journal DOI folders and scrape files - if final_csv_path.exists(): - sha256 = hashlib.sha256() with open(final_csv_path, "rb") as f: - for chunk in tqdm.tqdm( - iter(lambda: f.read(8192), b""), desc="Checking hash" - ): - sha256.update(chunk) - if sha256.hexdigest() != final_expected_csv_checksum: + csv_sha256_checksum = _sha256_chunked_file_digest(f) + + if csv_sha256_checksum != final_expected_csv_checksum: warnings.warn( "Old CSV file did not match expected checksum, will try to recreate." ) - final_csv_path.rename(final_csv_path.with_suffix(".old.csv")) + final_csv_path.rename(final_csv_path.with_suffix(".old.csv")) + + else: + print(f"Correct csv file already available at {final_csv_path}, exiting...") + return + + # Extract tar.gz archive + with tarfile.open(download_path, "r:*") as tar: + tar.extractall(pathlib.Path(__file__).parent) root_dois = ("10.1007", "10.1016", "10.1021") @@ -77,19 +79,19 @@ def get_and_transform_data(): ): with open(path, "r") as f: try: - pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a") + pd.DataFrame(ThermoPylParser(path).parse()).to_csv( + final_csv_path, mode="a" + ) num_points += 1 except Exception: num_failed += 1 print(f"Ingested {num_points} with {num_failed} failures.") - sha256 = hashlib.sha256() with open(final_csv_path, "rb") as f: - for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"): - sha256.update(chunk) + csv_hash = _sha256_chunked_file_digest(f) - if csv_hash := sha256.hexdigest() != final_expected_csv_checksum: + if csv_hash != final_expected_csv_checksum: warnings.warn( "Final CSV file did not match expected checksum!\n" f"Expected: {final_expected_csv_checksum}\n" @@ -136,5 +138,14 @@ def get_and_transform_data(): yaml.dump(meta.dict(), f, sort_keys=False) +def _sha256_chunked_file_digest(fp: BinaryIO) -> str: + """Compute the SHA256 digest of a file in chunks.""" + sha256 = hashlib.sha256() + for chunk in tqdm.tqdm(iter(lambda: fp.read(8192), b""), desc="Checking hash"): + sha256.update(chunk) + + return sha256.hexdigest() + + if __name__ == "__main__": get_and_transform_data() From b96adc359f8a3248ae78e950bdc1fc853f9c7161 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Thu, 16 Mar 2023 22:51:44 +0000 Subject: [PATCH 4/4] Add generated meta.yaml [wip] --- data/thermoml_archive/meta.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 data/thermoml_archive/meta.yaml diff --git a/data/thermoml_archive/meta.yaml b/data/thermoml_archive/meta.yaml new file mode 100644 index 000000000..fdd9186a6 --- /dev/null +++ b/data/thermoml_archive/meta.yaml @@ -0,0 +1,16 @@ +--- +name: thermoml_archive +description: ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML + archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals. +targets: +identifiers: [] +license: https://www.nist.gov/open/license +num_points: +bibtex: + - "@article{Riccardi2022,\n title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},\n author = {Riccardi, Demian and Trautt, Zachary\ + \ and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},\n\ + \ year = {2022},\n journal = {Journal of Computational Chemistry},\n volume = {43},\n number = {12},\n pages = {879--887},\n doi\ + \ = {10.1002/jcc.26842},\n langid = {english}\n }" +templates: +fields: +links: