
Commit 02ae4aa

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent bf1d27d commit 02ae4aa

File tree

8 files changed: +159 -158 lines changed

data/tabular/ld50_catmos/meta.yaml

Lines changed: 135 additions & 136 deletions
Large diffs are not rendered by default.

data/tabular/mona/example_processing_and_templates.ipynb

Lines changed: 1 addition & 2 deletions
@@ -20,7 +20,6 @@
 "from tqdm import tqdm\n",
 "\n",
 "# import datasets\n",
-"import rdkit\n",
 "import rdkit.Chem as Chem\n",
 "import rdkit.RDLogger as RDLogger"
 ]
@@ -1444,7 +1443,7 @@
 " k = md[\"name\"]\n",
 " v = md.get(\"value\", np.nan)\n",
 " df_row[\"md_\" + transform_key(k)] = v\n",
-" if not (v is np.nan):\n",
+" if v is not np.nan:\n",
 " md_keys.append(k)\n",
 " md_key_counter.update(md_keys)\n",
 " compounds = entry.get(\"compound\", [])\n",

data/tabular/ocp/transform.py

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
     text = text.replace(chr(code), f"$_{code-8320}$")

     text = text.replace("\u0305", "$^-$")
-    text = text.replace("\u207A", "$^+$")
-    text = text.replace("\u207B", "$^-$")
+    text = text.replace("\u207a", "$^+$")
+    text = text.replace("\u207b", "$^-$")
     text = text.replace("\u2074", "$^4$")
     text = text.replace("\u2070", "$^0$")
     text = text.replace("\u2078", "$^1$")

data/tabular/orbnet_denali/develop_transform.ipynb

Lines changed: 0 additions & 5 deletions
@@ -25,11 +25,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
 "from rdkit import Chem\n",
-"import matplotlib.pyplot as plt\n",
-"import numpy as np\n",
-"import os\n",
 "import pandas as pd\n",
 "from glob import glob"
 ]
@@ -474,7 +470,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from rdkit.Chem import rdDetermineBonds\n",
 "from chemnlp.utils import xyz_to_mol"
 ]
 },
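Both hunks simply drop imports the notebook never uses. Hooks that do this (autoflake, for example) work from the parsed AST rather than from text matching; a stdlib-only sketch of the detection step, not the actual hook pre-commit.ci ran, might look like:

import ast

source = """\
from pathlib import Path
from rdkit import Chem
import pandas as pd

df = pd.DataFrame()
mol = Chem.MolFromSmiles("CCO")
"""

tree = ast.parse(source)

imported = {}  # local binding -> line where it was introduced
for node in ast.walk(tree):
    if isinstance(node, ast.Import):
        for alias in node.names:
            imported[(alias.asname or alias.name).split(".")[0]] = node.lineno
    elif isinstance(node, ast.ImportFrom):
        for alias in node.names:
            imported[alias.asname or alias.name] = node.lineno

# Every name that is read somewhere in the module body.
used = {
    node.id
    for node in ast.walk(tree)
    if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load)
}

for name, lineno in sorted(imported.items(), key=lambda kv: kv[1]):
    if name not in used:
        print(f"line {lineno}: '{name}' is imported but never used")

Run on this sample it flags only `Path`, mirroring how the hook left `Chem` and `pd` untouched in the cell above.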

docs/CONTRIBUTING.md

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to
 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there.
 2. Create an issue in this repository stating your intention to add the dataset.
 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files:
-
    - `meta.yaml`: Describes the dataset (see structure below).
    - `transform.py`: Python code to transform the original dataset into a usable form.


experiments/README.md

Lines changed: 0 additions & 6 deletions
@@ -11,7 +11,6 @@ running on the cluster.

 1. [Create Environment](scripts/env_creation_neox.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -28,7 +27,6 @@ running on the cluster.

 2. [Training Models](scripts/sbatch_train_neox.sh) -
    runs a GPT-NeoX training pipeline
-
    - creates a conda environment using the `env_creation_neox.sh` script.
    - runs the GPT-NeoX `train.py` script using the user configuration
      > as GPT-NeoX configurations can be combined, the PEFT configurations are held
@@ -48,7 +46,6 @@ running on the cluster.

 1. [Create Environment](scripts/env_creation_hf.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -65,7 +62,6 @@ running on the cluster.

 2. [Single Node Models](scripts/sbatch_train_hf.sh) -
    runs a Hugging Face training pipeline across devices
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration

@@ -81,7 +77,6 @@ running on the cluster.

 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) -
    runs a Hugging Face training pipeline across nodes
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration

@@ -97,7 +92,6 @@ running on the cluster.

 4. [Grid Search](scripts/run_grid_search.py) -
    runs a grid search across training pipeline configuration options
-
    - Update the upper-case parameters at the top of the script
    - The script runs an exhaustive set of experiments across all permutations


experiments/ablations/continued_pretrain.py

Lines changed: 20 additions & 5 deletions
@@ -57,7 +57,13 @@ def load_model(


 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@ def train(
             lr_scheduler_type="linear",
             seed=3407,
             output_dir=f"outputs_{run_name}",
-            eval_strategy = 'steps' if eval_dataset is not None else 'no',
-            eval_steps = 10_000 if eval_dataset is not None else None
+            eval_strategy="steps" if eval_dataset is not None else "no",
+            eval_steps=10_000 if eval_dataset is not None else None,
         ),
     )

@@ -138,9 +144,18 @@ def run(
     )

     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )

-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )


 if __name__ == "__main__":
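Everything in this file is characteristic black output: a signature or call that overruns the line-length limit (or carries a "magic" trailing comma) is exploded to one argument per line, spaces around `=` in keyword arguments are dropped, and single quotes become double quotes. Assuming black is the formatter behind this hook, the first hunk's rewrite can be reproduced with its public API:

import black

# A one-line signature past black's default 88-character limit...
src = (
    "def train(model, tokenizer, dataset, run_name: str, "
    "batch_size: int = 64, max_seq_length=2048, eval_dataset=None):\n"
    "    pass\n"
)

# ...comes back one parameter per line with a trailing comma,
# matching the shape of the diff above.
print(black.format_str(src, mode=black.Mode()))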
Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"
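The only change here is the spacing applied inside the YAML flow mapping (prettier's style, most likely); the parsed value of `dataset_args` is identical. For context, a config like this is presumably keyword-splatted into `datasets.load_dataset` by the tokenisation script; the snippet below illustrates that assumption and is not code from the repository:

from datasets import load_dataset

dataset_name = "EleutherAI/pile"
dataset_args = {"name": "pubmed", "split": "train"}

# Hypothetical consumption of the config values above.
ds = load_dataset(dataset_name, **dataset_args)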
