Changes from all commits
74 commits
fc19199
initial commit for Drug_target_interaction dataset
Mar 11, 2023
6a9a72e
remove data
Mar 11, 2023
110119f
deleted .DS_store
Mar 13, 2023
cf5d989
update metadata file type field to continuous
Mar 13, 2023
d73a586
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 13, 2023
7f82491
feat: Add uris field for identifiers (#103)
kjappelbaum Mar 13, 2023
446aab5
docs: add submodule advice (#105)
jackapbutler Mar 14, 2023
c84af9a
feat: add initial env script
jackapbutler Mar 14, 2023
4e9fe2c
feat: separate install scripts
jackapbutler Mar 14, 2023
214d7e2
fix: add branch for main when cloning
jackapbutler Mar 14, 2023
5539700
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 14, 2023
4bd28ea
Merge pull request #106 from jackapbutler/update-submodule-branch
maw501 Mar 14, 2023
c9d46a8
docs: add cluster readme for script
jackapbutler Mar 14, 2023
57f0c1c
fix: update paths used in env script
jackapbutler Mar 14, 2023
9bf00d0
fix: add graceful fail to env creation
jackapbutler Mar 14, 2023
7aa8232
chore: rename linux to stability
jackapbutler Mar 14, 2023
af51931
Revert "feat: Add uris field for identifiers (#103)"
MicPie Mar 14, 2023
e0670a2
feat: Add uris field for identifiers (#103)
kjappelbaum Mar 13, 2023
373446d
fix: force env creation (for each experiment)
jackapbutler Mar 14, 2023
096a4de
make env creation user-specific
jackapbutler Mar 14, 2023
f322194
feat: add ability to run same script from slurm
jackapbutler Mar 15, 2023
e9c615c
feat: add pos arg for env dir
jackapbutler Mar 15, 2023
69a178f
fix: remove duplicate lines
jackapbutler Mar 15, 2023
e6e6eab
docs: update readme with additional info
jackapbutler Mar 15, 2023
f94fc69
feat: add second arg instead of username
jackapbutler Mar 15, 2023
377875c
docs: update docs
jackapbutler Mar 15, 2023
e18db75
Merge pull request #107 from jackapbutler/add-cluster-env-script
maw501 Mar 15, 2023
c4c4dc7
delete bindingdb_kd and data_original
Mar 16, 2023
a3e7261
Merge branch 'Add-Drug-Target-Interaction-dataset' of https://github.…
Mar 16, 2023
c0a1b71
delete bindingdb_kd and data_original
Mar 16, 2023
31a575d
remove .DS_Store
Mar 16, 2023
9776f67
changed type to continous
Mar 16, 2023
e1e82e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2023
57ce6b0
chore: run pre-commit in CI, rework datasets (#98)
kjappelbaum Mar 16, 2023
c17c140
Fix model docstrings (#119)
ml-evs Mar 16, 2023
cef63c8
docs: update dataset & readme docs (#104)
jackapbutler Mar 17, 2023
16337fd
add splits
Mar 19, 2023
7744289
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 19, 2023
ae6c80b
feat: transform.py cleanup to create nice meta.yaml files (#121)
MicPie Mar 21, 2023
e869986
feat: add cluster train script (#112)
jackapbutler Mar 21, 2023
bf304cf
Change url for gpt-neox submodule
Mar 21, 2023
20636c0
Merge pull request #126 from maw501/update-submodule-url
Mar 21, 2023
c30d1c7
fix: flashpoint smiles column name in capital letters. (#125)
MicPie Mar 22, 2023
ff03074
feat: add wandb logging to pipeline configuration (#123)
jackapbutler Mar 23, 2023
b2ae8dc
feat: implement benchmark field 2 (#128)
MicPie Mar 23, 2023
a9af21a
chore: disable pre-commit autocommit (#131)
kjappelbaum Mar 26, 2023
dd78666
docs: update contribution guide to reflect schema changes (#135)
kjappelbaum Mar 28, 2023
583615e
add benchmarks
Mar 28, 2023
6a0cff5
Merge branch 'Add-Drug-Target-Interaction-dataset' of https://github.…
Mar 28, 2023
56dbeb8
remove file
Mar 28, 2023
6a52858
fixed long lines
Mar 28, 2023
db871c6
initial commit for Drug_target_interaction dataset
Mar 11, 2023
aa82bb1
remove data
Mar 11, 2023
1e12aaf
deleted .DS_store
Mar 13, 2023
cc42f17
update metadata file type field to continuous
Mar 13, 2023
ea86935
delete bindingdb_kd and data_original
Mar 16, 2023
b44129b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 13, 2023
709b39b
delete bindingdb_kd and data_original
Mar 16, 2023
cd327b7
remove .DS_Store
Mar 16, 2023
b44af0f
changed type to continous
Mar 16, 2023
67538c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 16, 2023
88e53b3
add splits
Mar 19, 2023
70bd09a
add benchmarks
Mar 28, 2023
6a402ff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 19, 2023
0505126
remove file
Mar 28, 2023
7383290
fixed long lines
Mar 28, 2023
f9c2d4d
Merge branch 'Add-Drug-Target-Interaction-dataset' of github.com:stru…
MicPie Mar 28, 2023
f74f337
feat: cleanup transform.py and meta.yaml
MicPie Mar 28, 2023
759a7dd
feat: change dir name to lowercase
MicPie Mar 28, 2023
85f6512
feat: cleanup identifiers
MicPie Mar 28, 2023
723337f
feat: target type to continuous
MicPie Mar 28, 2023
3af8ae9
feat: target type to continuous also in meta.yaml
MicPie Mar 28, 2023
dcbfb53
feat: rerun pre-commit on meta.yaml
MicPie Mar 28, 2023
17d015c
feat: add name field for other targets
MicPie Mar 28, 2023
7 changes: 0 additions & 7 deletions .github/workflows/install.yaml
@@ -22,10 +22,3 @@ jobs:
run: |
conda activate chemnlp
python -m src.chemnlp.data_val.validate data
- name: Lint
shell: bash -l {0}
run: |
conda activate chemnlp
black --check .
isort --check-only .
flake8 .
14 changes: 14 additions & 0 deletions .gitignore
Expand Up @@ -129,4 +129,18 @@ dmypy.json
.pyre/

# wandb
**/wandb/
scripts/wandb

# scratch
scratch/


# let's not commit data unless we really want to
*.tab
*.csv


# vim
*~
*.swp
3 changes: 2 additions & 1 deletion .gitmodules
@@ -1,3 +1,4 @@
[submodule "gpt-neox"]
path = gpt-neox
url = git@github.com:EleutherAI/gpt-neox.git
url = git@github.com:OpenBioML/gpt-neox.git
branch = main
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -1,6 +1,8 @@
---
ci:
autoupdate_schedule: quarterly
autofix_prs: false
submodules: false

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
@@ -16,6 +18,7 @@ repos:
rev: 0.2.2
hooks:
- id: yamlfmt
exclude: ^experiments/configs

- repo: https://github.com/psf/black
rev: 22.12.0
104 changes: 89 additions & 15 deletions CONTRIBUTING.md
@@ -13,19 +13,7 @@ Please make a [GitHub account](https://github.com/) prior to implementing a data

For code and data contributions, we recommend you create a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). If you do not have conda already installed on your system, we recommend installing [miniconda](https://docs.conda.io/en/latest/miniconda.html):

```bash
conda env create -f conda.yaml # Creates a conda env
conda activate chemnlp # Activate your conda environment
```

Then, please run

```bash
pre-commit install
```

to install the [pre-commit hooks](https://pre-commit.com/). These will automatically format and lint your code upon every commit.
There might be some warnings, e.g., by `flake8`. If you struggle with them, do not hestiate to contact us.
To create your developer environment, please follow the guidelines in the `Installation and set-up` section of [README.md](README.md).

# Implementing a dataset

@@ -37,11 +25,17 @@ With "implementing" we mean the following:
- Make an issue in this repository that you want to add this dataset (we will label this issue and assign it to you)
- Make a PR that adds in a new folder in `data`
- `meta.yaml` describing the dataset in the form that `transform.py` produces. We will use this later to construct the prompts.
> If your dataset has multiple natural splits (i.e. train, test, validation) you can create a <split>\_meta.yaml for each.
- `transform.py` Python code that transforms the original dataset (linked in `meta.yaml`) into a form that can be consumed by the loader.
For tabular datasets this will mostly involve removing/merging duplicated entries, renaming columns, and dropping unused columns.
Try to keep the output your `transform.py` produces as lean as possible (i.e. no columns that will not be used).
In some cases, you might envision that extra columns could be useful. If this is the case, please add them (e.g., a column indicating some grouping).
Even though some examples create the `meta.yaml` in `transform.py`, there is no need to do so; you can also write it by hand. A minimal sketch of a `transform.py` follows this list.


In the `transform.py` please try to download the data from an official resource.
We encourage you to upload the raw data to HuggingFace Hub, Foundry or some other repository and then retrieve the data from there with your script, if the raw data license permits it.

- If you need additional dependencies, add them to `dev-requirements.txt` (those are needed for linting/testing/validation) or `requirements.txt` (those are the ones for running `transform.py`)
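
A minimal sketch of what such a `transform.py` could look like for a simple tabular dataset (the download URL, file names, and column names below are placeholders for illustration, not part of the repository):

```python
"""Hypothetical transform.py sketch for a simple tabular dataset."""
import pandas as pd

RAW_URL = "https://example.org/raw_solubility.csv"  # placeholder, replace with the official resource


def transform() -> None:
    df = pd.read_csv(RAW_URL)
    # keep only the columns that will actually be used and give them clean names
    df = df[["smiles", "logS"]].rename(columns={"smiles": "SMILES", "logS": "Solubility"})
    # drop duplicated and incomplete entries
    df = df.drop_duplicates(subset="SMILES").dropna()
    df.to_csv("data_clean.csv", index=False)


if __name__ == "__main__":
    transform()
```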


Expand All @@ -57,7 +51,7 @@ targets:
- id: Solubility # name of the column in a tabular dataset
description: Experimental aqueous solubility value (LogS) # description of what this column means
units: log(mol/L) # units of the values in this column (leave empty if unitless)
type: continuous # can be "categorical", "ordinal", "continuous"
type: continuous # can be "categorical", "ordinal", "continuous", "boolean"
names: # names for the property (to sample from for building the prompts)
- solubility
- water solubility
@@ -69,9 +63,13 @@ targets:
- solubility
- water solubility
- solubility in water
benchmarks: # lists all benchmarks this dataset has been part of. split_column is a column in this dataframe with the value "train", "valid", "test" - indicating to which fold a specific entry belongs to
- name: TDC
link: https://tdcommons.ai/
split_column: split
identifiers:
- id: InChI # column name
type: InChI # can be "SMILES", "SELFIES", "IUPAC", "OTHER"
type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChiKey", "RXNSMILES", "RXNSMILESWAdd" see IdentifierEnum
description: International Chemical Identifier # description (optional, except for "OTHER")
license: CC0 1.0 # license under which the original dataset was published
num_points: 10000 # number of datapoints in this dataset
@@ -94,6 +92,7 @@ bibtex: # citation(s) for this dataset in BibTeX format
journal = {Sci Data}
}"
```
Please do not simply copy/paste generic descriptions but try to give a concise and specific description for the dataset you are adding.

For the typical material-property datasets, we will later use the `identifier` and `property` columns to create and fill prompt templates.
In case your dataset isn't a simple tabular dataset with chemical compounds and properties, please also add the following additional fields for the templates:
@@ -141,6 +140,68 @@ In this case, we will sample from the identifier and targets columns. If you spe
Therefore, it is very important that the column names in the `meta.yaml` match the ones in the file that `transform.py` produces.
One example of a prompt we might construct is `"What is the <target_name> of <identifier>"`, where we sample `target_name` from the names of the targets listed in `meta.yaml` and `identifier` from the identifiers provided in `meta.yaml`.

#### Splits

If your dataset is part of a benchmark, please indicate which fold each entry belongs to using an additional `split_col` containing the values `train`, `valid`, or `test`.
Please record this in the `meta.yaml` under the field `split_col`.
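
As a rough illustration (the column name and fold assignments below are made up), the split column can be written out at the end of `transform.py`:

```python
import pandas as pd

# hypothetical example data; in practice the fold assignment comes from the benchmark itself (e.g. TDC)
df = pd.DataFrame(
    {
        "SMILES": ["CCO", "c1ccccc1", "CC(=O)O"],
        "Solubility": [0.12, -1.3, 0.8],
    }
)
# record the fold of every entry in a dedicated split column
df["split"] = ["train", "valid", "test"]
df.to_csv("data_clean.csv", index=False)
```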

#### Identifiers

We ask you to add `uris` and `pubchem_aids` in case you find suitable references. We distinguish certain types of identifiers, for which you have to specify the correct strings. The currently allowed types are in the `IdentifierEnum` in `src/chemnlp/data_val/model.py`:

- `SMILES`: Use the canonical form ([RDKit](https://www.rdkit.org/docs/GettingStartedInPython.html)); see the sketch after this list
- `SELFIES`: [Self-referencing embedded strings](https://github.com/aspuru-guzik-group/selfies)
- `IUPAC`: IUPAC name; do not use it for non-standard, common names
- `InChI`
- `InChIKey`: The key derived from the `InChI`
- `RXNSMILES`: The [reaction SMILES](https://www.daylight.com/meetings/summerschool98/course/dave/smiles-react.html) containing only educt and product
- `RXNSMILESWAdd`: The reaction SMILES also containing solvent and additives
- `Other`: For all other identifiers
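
For the `SMILES` identifier, for example, canonicalization with RDKit could look like the following sketch (not prescribed by this repository):

```python
from rdkit import Chem


def canonicalize_smiles(smiles: str) -> str:
    """Return the canonical RDKit form of a SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles}")
    return Chem.MolToSmiles(mol)


print(canonicalize_smiles("OCC"))  # -> "CCO"
```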

##### Uniform Resource Identifiers (URIs)

If you have a uniform resource identifier (URI) that links to a suitable name of a property, please list it in the `uris` list for a given `target`.
Please ensure that the link is specific. If you have a boolean target that measures inhibition of a protein, link to `inhibitor of XY` and _not_ to the protein.
If such a link does not exist, leave the field empty.

You might find suitable links using the following resources:

- https://bioportal.bioontology.org/search
- https://goldbook.iupac.org/


#### PubChem Assay IDs

For some targets, the activity was measured using assays. In this case, please list the assays with their _numeric_ PubChem assay ID in the field `pubchem_aids`.
Please ensure that the _first_ entry in this list is a primary scan which corresponds to the target property (and not to its inverse or a control).
Keep in mind that we plan to look up the name and the description of the assay to build prompts. That is, the name of the assay of the _first entry_ in this list should also work in a prompt such as `Is <identifier> active in <pubchem assay name>?`

#### Prompt examples

##### Boolean variables

- `Is <name> <identifier>?`
- ```
What molecules in the list are <name>?

- <identifier_1>
- <identifier_2>
- <identifier_3>
```


##### Continuous variables

- `What is <name> of <identifier>?`
- ```
What is the molecule with largest <name> in the following list?

- <identifier_1>
- <identifier_2>
- <identifier_3>
```



For datasets that are not in tabular form, we are still discussing the best process, but we also envision that we might perform some named-entity recognition to use some of the text datasets in a framework such as LIFT. Otherwise, we will simply use them in the typical GPT pretraining task.

@@ -162,3 +223,16 @@ Our first experiments will be based on [Pythia model](https://github.com/Eleuthe
If you are not familiar with LLM training, have a look at this very good guide: [Large-scale language modeling tutorials with PyTorch from TUNiB](https://nbviewer-org.translate.goog/github/tunib-ai/large-scale-lm-tutorials/blob/main/notebooks/01_introduction.ipynb?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=de&_x_tr_pto=wapp)

Please have a look for the details in the [corresponding section in our proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit#heading=h.aww08l8o9tti).

## Hugging Face Hub

We have a preference for using the Hugging Face Hub and processing datasets through the [`datasets`](https://github.com/huggingface/datasets) package when storing larger datasets on the [OpenBioML](https://huggingface.co/OpenBioML) hub, as it offers a lot of nice features, such as:

- Easy multiprocessing parallelism for data cleaning
- Version controlling of the datasets as well as our code
- Easy interface into tokenisation & other aspects for model training
- Reuse of utility functions once we have a consistent data structure.

However, don't feel pressured to use this if you're more comfortable contributing an external dataset in another format. We are primarily thinking of using this functionality for processed, combined datasets which are ready for training.
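
As a rough sketch of that workflow (the repository name below is a placeholder and pushing requires being logged in via `huggingface-cli login`):

```python
from datasets import load_dataset

# load the cleaned tabular data produced by transform.py
dataset = load_dataset("csv", data_files="data_clean.csv")

# simple multiprocessed cleaning step
dataset = dataset.map(lambda row: {"SMILES": row["SMILES"].strip()}, num_proc=4)

# push to a (hypothetical) dataset repository on the OpenBioML hub
dataset.push_to_hub("OpenBioML/my-dataset")
```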

Feel free to reach out to one of the team and read [this guide](https://huggingface.co/docs/datasets/upload_dataset#share-a-dataset-to-the-hub) for more information.
71 changes: 57 additions & 14 deletions README.md
@@ -1,59 +1,75 @@
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md)

# ChemNLP project 🧪🚀

The ChemNLP project aims to
1) create an extensive chemistry dataset and
1) use it to train large language models (LLMs) that can leverage the data for a wide range of chemistry applications.

For more details see our [information material section below](#information-material).
1. create an extensive chemistry dataset and
1. use it to train large language models (LLMs) that can leverage the data for a wide range of chemistry applications.

For more details see our [information material section below](#information-material).

# Information material
* [Introduction presentation](https://docs.google.com/presentation/d/1JkAKJveYsNGtAWoaksU8ykTdrC0aX3FshiFJ13SU6o8/edit?usp=sharing)
* [Project proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit?usp=sharing)
* [Task board](https://github.com/orgs/OpenBioML/projects/5/views/1)
* [awesome-chemistry-datasets repository](https://github.com/kjappelbaum/awesome-chemistry-datasets) to collect interesting chemistry datasets
* Weekly meetings are set up soon! Please join our [Discord community](#community) for more information.

- [Introduction presentation](https://docs.google.com/presentation/d/1JkAKJveYsNGtAWoaksU8ykTdrC0aX3FshiFJ13SU6o8/edit?usp=sharing)
- [Project proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit?usp=sharing)
- [Task board](https://github.com/orgs/OpenBioML/projects/5/views/1)
- [awesome-chemistry-datasets repository](https://github.com/kjappelbaum/awesome-chemistry-datasets) to collect interesting chemistry datasets
- Weekly meetings are set up soon! Please join our [Discord community](#community) for more information.

# Community
Feel free to join our `#chemnlp` channel on our [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt) to start the discussion in more detail.

Feel free to join our `#chemnlp` channel on our [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt) to start the discussion in more detail.

# Contributing

ChemNLP is an open-source project - your involvement is warmly welcome! If you're excited to join us, we recommend the following steps:
* Join our [Discord server](#community).
* Have a look at our [contributing guide](https://github.com/OpenBioML/chemnlp/blob/main/CONTRIBUTING.md).
* Looking for ideas? See our [task board](https://github.com/orgs/OpenBioML/projects/5/views/1) to see what we may need help with.
* Have an idea? Create an [issue](https://github.com/OpenBioML/chemnlp/issues)!

- Join our [Discord server](#community).
- Have a look at our [contributing guide](https://github.com/OpenBioML/chemnlp/blob/main/CONTRIBUTING.md).
- Looking for ideas? See our [task board](https://github.com/orgs/OpenBioML/projects/5/views/1) to see what we may need help with.
- Have an idea? Create an [issue](https://github.com/OpenBioML/chemnlp/issues)!

# Note on the "ChemNLP" name
Our OpenBioML ChemNLP project is not affiliated with the [ChemNLP library from NIST](https://arxiv.org/abs/2209.08203) and we use "ChemNLP" as a general term to highlight our project focus. The datasets and models we create through our project will have a unique and recognizable name when we release them.

Our OpenBioML ChemNLP project is not affiliated with the [ChemNLP library from NIST](https://arxiv.org/abs/2209.08203) and we use "ChemNLP" as a general term to highlight our project focus. The datasets and models we create through our project will have a unique and recognizable name when we release them.

# About OpenBioML.org

See https://openbioml.org, especially [our approach and partners](https://openbioml.org/approach-and-partners.html).

# Installation and set-up

Create a new conda environment with Python 3.8:

```
conda create -n chemnlp python=3.8
conda activate chemnlp
```

To install the `chemnlp` package (and required dependencies):

```
pip install chemnlp
```

If working on developing the python package:

```
pip install -e "chemnlp[dev]" # to install development dependencies
```

If extra dependencies are required (e.g. for dataset creation) but are not needed for the main package, please add them to the `dataset_creation` variable in `pyproject.toml` and ensure this is reflected in the `conda.yml` file.

Then, please run

```bash
pre-commit install
```

to install the [pre-commit hooks](https://pre-commit.com/). These will automatically format and lint your code upon every commit.
There might be some warnings, e.g., by `flake8`. If you struggle with them, do not hesitate to contact us.

**Note**

If working on model training, request access to the `wandb` project `chemnlp`
@@ -62,3 +78,30 @@ and log-in to `wandb` with your API key per [here](https://docs.wandb.ai/quickst
### Adding a new dataset (to the model training pipeline)

We specify datasets by creating a new function [here](src/chemnlp/data/hf_datasets.py) which is named per the dataset on Hugging Face. At present the function must accept a tokenizer and return the tokenized train and validation datasets.
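
A minimal sketch of such a function (the dataset name, text column, and sequence length are assumptions; the exact signature should mirror the existing functions in `hf_datasets.py`):

```python
from datasets import load_dataset


def my_dataset(tokenizer, max_length: int = 2048):
    """Hypothetical per-dataset function returning tokenized train/validation splits."""
    raw = load_dataset("OpenBioML/my-dataset")  # placeholder Hugging Face dataset name

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, max_length=max_length)

    tokenized = raw.map(tokenize, batched=True, remove_columns=raw["train"].column_names)
    return tokenized["train"], tokenized["validation"]
```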

### Installing submodules

In order to ensure you also clone and install the required submodules (i.e. gpt-neox) you will have to do one of the following:

- Recursively clone the submodule from GitHub

```
# using ssh (if you have your ssh key on GitHub)
git clone --recurse-submodules --remote-submodules git@github.com:OpenBioML/chemnlp.git

# using https (if you use personal access token)
git clone --recurse-submodules --remote-submodules https://github.com/OpenBioML/chemnlp.git
```

> This will automatically initialize and update each submodule in the repository, including nested submodules if any of the submodules in the repository have submodules themselves.

- Initialise and install the submodule after cloning

```
git submodule init # registers submodule
git submodule update # clones and updates submodule
```

### Experiments

Follow the guidelines [here](experiments/README.md) for more information about running experiments on the Stability AI cluster.
Loading