From 463947ddc1fbcc52004c49bace75780536b17af5 Mon Sep 17 00:00:00 2001
From: Matthew Evans <git@ml-evs.science>
Date: Thu, 16 Mar 2023 19:13:19 +0000
Subject: [PATCH 1/4] Initial ThermoML transform script

---
 data/thermoml_archive/requirements.txt |   3 +
 data/thermoml_archive/transform.py     | 140 +++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 data/thermoml_archive/requirements.txt
 create mode 100644 data/thermoml_archive/transform.py

diff --git a/data/thermoml_archive/requirements.txt b/data/thermoml_archive/requirements.txt
new file mode 100644
index 000000000..7d75ea7c3
--- /dev/null
+++ b/data/thermoml_archive/requirements.txt
@@ -0,0 +1,3 @@
+git+https://github.com/sustainable-processes/thermopyl
+tqdm
+pyyaml
diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py
new file mode 100644
index 000000000..ebbcb5b1b
--- /dev/null
+++ b/data/thermoml_archive/transform.py
@@ -0,0 +1,140 @@
+import hashlib
+import pathlib
+import tarfile
+import warnings
+
+import pandas as pd
+import requests
+import tqdm
+import yaml
+from thermopyl import Parser
+
+from chemnlp.data_val.model import Dataset
+
+
+def get_and_transform_data():
+    """Downloads the archived version of ThermoML, extracts it and
+    loops through the provided JSON-LD files to construct a dataframe.
+
+    """
+    # get raw data
+    fname = "ThermoML.v2020-09-30.tgz"
+    download_path = pathlib.Path(__file__).parent / fname
+    remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}"
+    sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2"
+    final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv"
+    final_expected_csv_checksum = ""
+
+    if not download_path.exists():
+        data = requests.get(remote_data_path)
+        with open(download_path, "wb") as f:
+            for chunk in tqdm.tqdm(
+                data.iter_content(chunk_size=8192), desc="Downloading archive"
+            ):
+                f.write(chunk)
+
+    # check if checksum is correct
+    sha256 = hashlib.sha256()
+    with open(download_path, "rb") as f:
+        for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
+            sha256.update(chunk)
+
+    if received_hash := sha256.hexdigest() != sha256_checksum:
+        raise RuntimeError(
+            "Downloaded file did not match expected checksum -- "
+            "either a new version has been released or something has gone wrong!\n"
+            f"Expected: {sha256_checksum}\n"
+            f"Received: {received_hash}"
+        )
+
+    # Extract tar.gz archive
+    with tarfile.open(download_path, "r:*") as tar:
+        tar.extractall(pathlib.Path(__file__).parent)
+
+    # Loop through journal DOI folders and scrape files
+
+    if final_csv_path.exists():
+        sha256 = hashlib.sha256()
+        with open(final_csv_path, "rb") as f:
+            for chunk in tqdm.tqdm(
+                iter(lambda: f.read(8192), b""), desc="Checking hash"
+            ):
+                sha256.update(chunk)
+        if sha256.hexdigest() != final_expected_csv_checksum:
+            warnings.warn(
+                "Old CSV file did not match expected checksum, will try to recreate."
+            )
+        final_csv_path.rename(final_csv_path.with_suffix(".old.csv"))
+
+    root_dois = ("10.1007", "10.1016", "10.1021")
+
+    num_points = 0  # NOTE: counts parsed XML files, not individual rows
+    num_failed = 0
+    for doi in root_dois:
+        for path in tqdm.tqdm(
+            (pathlib.Path(__file__).parent / doi).glob("*.xml"),
+            desc=f"Looping over files in {doi}",
+        ):
+            with open(path, "r") as f:
+                try:
+                    pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a")
+                    num_points += 1
+                except Exception:
+                    num_failed += 1
+
+    print(f"Ingested {num_points} with {num_failed} failures.")
+
+    sha256 = hashlib.sha256()
+    with open(final_csv_path, "rb") as f:
+        for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
+            sha256.update(chunk)
+
+    if csv_hash := sha256.hexdigest() != final_expected_csv_checksum:
+        warnings.warn(
+            "Final CSV file did not match expected checksum!\n"
+            f"Expected: {final_expected_csv_checksum}\n"
+            f"Received: {csv_hash}"
+        )
+
+    # create metadata
+    meta = Dataset(
+        **{
+            "name": "thermoml_archive",
+            "description": "ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.",  # noqa
+            "identifiers": [
+                {
+                    "id": "",
+                    "type": "inchi",
+                },
+                {
+                    "id": "",
+                    "type": "inchikey",
+                },
+            ],
+            "license": "https://www.nist.gov/open/license",
+            "links": [
+                {
+                    "url": "https://doi.org/10.18434/mds2-2422",
+                    "description": "data publication",
+                },
+                {
+                    "url": "https://www.nist.gov/publications/towards-improved-fairness-thermoml-archive",
+                    "description": "NIST publication description",
+                },
+                {
+                    "url": "https://trc.nist.gov/ThermoML",
+                    "description": "Live database hosted at NIST Thermodynamics Research Center",
+                },
+            ],
+            "num_points": num_points,
+            "bibtex": [
+                "@article{Riccardi2022,title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},author = {Riccardi, Demian and Trautt, Zachary and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},year = {2022},journal = {Journal of Computational Chemistry},volume = {43},number = {12},pages = {879--887},doi = {10.1002/jcc.26842},langid = {english}}",  # noqa
+            ],
+        }
+    )
+    with open(pathlib.Path(__file__).parent / "meta.yaml", "w") as f:
+        yaml.dump(meta.dict(), f, sort_keys=False)
+
+
+if __name__ == "__main__":
+    get_and_transform_data()

From 21ec32f1e24bd4694e3e32d4220fb16e10750cb7 Mon Sep 17 00:00:00 2001
From: Matthew Evans <git@ml-evs.science>
Date: Thu, 16 Mar 2023 19:14:08 +0000
Subject: [PATCH 2/4] Update docstring

---
 data/thermoml_archive/transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py
index ebbcb5b1b..38784f417 100644
--- a/data/thermoml_archive/transform.py
+++ b/data/thermoml_archive/transform.py
@@ -14,7 +14,7 @@
 
 def get_and_transform_data():
     """Downloads the archived version of ThermoML, extracts it and
-    loops through the provided JSON-LD files to construct a dataframe.
+    parses the provided XML files with thermopyl to construct a flat csv.
 
     """
     # get raw data

From 0e0bb71b80bc31cb28662d50f4e5d68c2b89f2e0 Mon Sep 17 00:00:00 2001
From: Matthew Evans <git@ml-evs.science>
Date: Thu, 16 Mar 2023 22:49:47 +0000
Subject: [PATCH 3/4] Wrap sha256 calculator in function and add final csv
 checksum

---
 data/thermoml_archive/transform.py | 57 ++++++++++++++++++------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/data/thermoml_archive/transform.py b/data/thermoml_archive/transform.py
index 38784f417..8fd21ef02 100644
--- a/data/thermoml_archive/transform.py
+++ b/data/thermoml_archive/transform.py
@@ -2,12 +2,13 @@
 import pathlib
 import tarfile
 import warnings
+from typing import BinaryIO
 
 import pandas as pd
 import requests
 import tqdm
 import yaml
-from thermopyl import Parser
+from thermopyl import Parser as ThermoPylParser
 
 from chemnlp.data_val.model import Dataset
 
@@ -17,13 +18,16 @@ def get_and_transform_data():
     parses the provided XML files with thermopyl to construct a flat csv.
 
     """
+
     # get raw data
     fname = "ThermoML.v2020-09-30.tgz"
     download_path = pathlib.Path(__file__).parent / fname
     remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}"
     sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2"
     final_csv_path = pathlib.Path(__file__).parent / "thermoml_archive.csv"
-    final_expected_csv_checksum = ""
+    final_expected_csv_checksum = (
+        "fc296f47c1877b6ace72f7aa4a80c489b80d0eb25ea3a59885d067e554378b08"
+    )
 
     if not download_path.exists():
         data = requests.get(remote_data_path)
@@ -34,12 +38,10 @@ def get_and_transform_data():
                 f.write(chunk)
 
     # check if checksum is correct
-    sha256 = hashlib.sha256()
     with open(download_path, "rb") as f:
-        for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
-            sha256.update(chunk)
+        received_hash = _sha256_chunked_file_digest(f)
 
-    if received_hash := sha256.hexdigest() != sha256_checksum:
+    if received_hash != sha256_checksum:
         raise RuntimeError(
             "Downloaded file did not match expected checksum -- "
             "either a new version has been released or something has gone wrong!\n"
@@ -47,24 +49,24 @@ def get_and_transform_data():
             f"Received: {received_hash}"
         )
 
-    # Extract tar.gz archive
-    with tarfile.open(download_path, "r:*") as tar:
-        tar.extractall(pathlib.Path(__file__).parent)
-
     # Loop through journal DOI folders and scrape files
-
     if final_csv_path.exists():
-        sha256 = hashlib.sha256()
         with open(final_csv_path, "rb") as f:
-            for chunk in tqdm.tqdm(
-                iter(lambda: f.read(8192), b""), desc="Checking hash"
-            ):
-                sha256.update(chunk)
-        if sha256.hexdigest() != final_expected_csv_checksum:
+            csv_sha256_checksum = _sha256_chunked_file_digest(f)
+
+        if csv_sha256_checksum != final_expected_csv_checksum:
             warnings.warn(
                 "Old CSV file did not match expected checksum, will try to recreate."
             )
-        final_csv_path.rename(final_csv_path.with_suffix(".old.csv"))
+            final_csv_path.rename(final_csv_path.with_suffix(".old.csv"))
+
+        else:
+            print(f"Correct csv file already available at {final_csv_path}, exiting...")
+            return
+
+    # Extract tar.gz archive
+    with tarfile.open(download_path, "r:*") as tar:
+        tar.extractall(pathlib.Path(__file__).parent)
 
     root_dois = ("10.1007", "10.1016", "10.1021")
 
@@ -77,19 +79,19 @@ def get_and_transform_data():
         ):
             with open(path, "r") as f:
                 try:
-                    pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a")
+                    pd.DataFrame(ThermoPylParser(path).parse()).to_csv(
+                        final_csv_path, mode="a"
+                    )
                     num_points += 1
                 except Exception:
                     num_failed += 1
 
     print(f"Ingested {num_points} with {num_failed} failures.")
 
-    sha256 = hashlib.sha256()
     with open(final_csv_path, "rb") as f:
-        for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc="Checking hash"):
-            sha256.update(chunk)
+        csv_hash = _sha256_chunked_file_digest(f)
 
-    if csv_hash := sha256.hexdigest() != final_expected_csv_checksum:
+    if csv_hash != final_expected_csv_checksum:
         warnings.warn(
             "Final CSV file did not match expected checksum!\n"
             f"Expected: {final_expected_csv_checksum}\n"
@@ -136,5 +138,14 @@ def get_and_transform_data():
         yaml.dump(meta.dict(), f, sort_keys=False)
 
 
+def _sha256_chunked_file_digest(fp: BinaryIO) -> str:
+    """Return the hex SHA256 digest of ``fp``, read in 8192-byte chunks."""
+    digest = hashlib.sha256()
+    chunks = iter(lambda: fp.read(8192), b"")
+    for block in tqdm.tqdm(chunks, desc="Checking hash"):
+        digest.update(block)
+    return digest.hexdigest()
+
+
 if __name__ == "__main__":
     get_and_transform_data()

From b96adc359f8a3248ae78e950bdc1fc853f9c7161 Mon Sep 17 00:00:00 2001
From: Matthew Evans <git@ml-evs.science>
Date: Thu, 16 Mar 2023 22:51:44 +0000
Subject: [PATCH 4/4] Add generated meta.yaml [wip]

---
 data/thermoml_archive/meta.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 data/thermoml_archive/meta.yaml

diff --git a/data/thermoml_archive/meta.yaml b/data/thermoml_archive/meta.yaml
new file mode 100644
index 000000000..fdd9186a6
--- /dev/null
+++ b/data/thermoml_archive/meta.yaml
@@ -0,0 +1,16 @@
+---
+name: thermoml_archive
+description: ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML
+    archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.
+targets:
+identifiers: []
+license: https://www.nist.gov/open/license
+num_points:
+bibtex:
+    - "@article{Riccardi2022,\n    title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},\n    author = {Riccardi, Demian and Trautt, Zachary\
+      \ and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},\n\
+      \    year = {2022},\n    journal = {Journal of Computational Chemistry},\n    volume = {43},\n    number = {12},\n    pages = {879--887},\n    doi\
+      \ = {10.1002/jcc.26842},\n    langid = {english}\n  }"
+templates:
+fields:
+links: