
Commit 02ae4aa

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent bf1d27d commit 02ae4aa

File tree

8 files changed: +159 -158 lines changed

data/tabular/ld50_catmos/meta.yaml

Lines changed: 135 additions & 136 deletions
Large diffs are not rendered by default.

data/tabular/mona/example_processing_and_templates.ipynb

Lines changed: 1 addition & 2 deletions
@@ -20,7 +20,6 @@
 "from tqdm import tqdm\n",
 "\n",
 "# import datasets\n",
-"import rdkit\n",
 "import rdkit.Chem as Chem\n",
 "import rdkit.RDLogger as RDLogger"
 ]
@@ -1444,7 +1443,7 @@
 " k = md[\"name\"]\n",
 " v = md.get(\"value\", np.nan)\n",
 " df_row[\"md_\" + transform_key(k)] = v\n",
-" if not (v is np.nan):\n",
+" if v is not np.nan:\n",
 " md_keys.append(k)\n",
 " md_key_counter.update(md_keys)\n",
 " compounds = entry.get(\"compound\", [])\n",

data/tabular/ocp/transform.py

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str:
     text = text.replace(chr(code), f"$_{code-8320}$")

     text = text.replace("\u0305", "$^-$")
-    text = text.replace("\u207A", "$^+$")
-    text = text.replace("\u207B", "$^-$")
+    text = text.replace("\u207a", "$^+$")
+    text = text.replace("\u207b", "$^-$")
     text = text.replace("\u2074", "$^4$")
     text = text.replace("\u2070", "$^0$")
     text = text.replace("\u2078", "$^1$")

data/tabular/orbnet_denali/develop_transform.ipynb

Lines changed: 0 additions & 5 deletions
@@ -25,11 +25,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
 "from rdkit import Chem\n",
-"import matplotlib.pyplot as plt\n",
-"import numpy as np\n",
-"import os\n",
 "import pandas as pd\n",
 "from glob import glob"
 ]
@@ -474,7 +470,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from rdkit.Chem import rdDetermineBonds\n",
 "from chemnlp.utils import xyz_to_mol"
 ]
 },
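Both hunks simply drop imports the notebook never uses. Hooks that do this (autoflake, for example) work from the parsed AST rather than from text matching; a stdlib-only sketch of the detection step, not the actual hook pre-commit.ci ran, might look like:

import ast

source = """\
from pathlib import Path
from rdkit import Chem
import pandas as pd

df = pd.DataFrame()
mol = Chem.MolFromSmiles("CCO")
"""

tree = ast.parse(source)

imported = {}  # local binding -> line where it was introduced
for node in ast.walk(tree):
    if isinstance(node, ast.Import):
        for alias in node.names:
            imported[(alias.asname or alias.name).split(".")[0]] = node.lineno
    elif isinstance(node, ast.ImportFrom):
        for alias in node.names:
            imported[alias.asname or alias.name] = node.lineno

# Every name that is read somewhere in the module body.
used = {
    node.id
    for node in ast.walk(tree)
    if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load)
}

for name, lineno in sorted(imported.items(), key=lambda kv: kv[1]):
    if name not in used:
        print(f"line {lineno}: '{name}' is imported but never used")

Run on this sample it flags only `Path`, mirroring how the hook left `Chem` and `pd` untouched in the cell above.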

docs/CONTRIBUTING.md

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to
 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there.
 2. Create an issue in this repository stating your intention to add the dataset.
 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files:
-
    - `meta.yaml`: Describes the dataset (see structure below).
    - `transform.py`: Python code to transform the original dataset into a usable form.


experiments/README.md

Lines changed: 0 additions & 6 deletions
@@ -11,7 +11,6 @@ running on the cluster.

 1. [Create Environment](scripts/env_creation_neox.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -28,7 +27,6 @@ running on the cluster.

 2. [Training Models](scripts/sbatch_train_neox.sh) -
    runs a GPT-NeoX training pipeline
-
    - creates a conda environment using the `env_creation_neox.sh` script.
    - runs the GPT-NeoX `train.py` script using the user configuration
      > as GPT-NeoX configurations can be combined, the PEFT configurations are held
@@ -48,7 +46,6 @@ running on the cluster.

 1. [Create Environment](scripts/env_creation_hf.sh) -
    creates a basic conda environment for experiments.
-
    - Creates a conda environment at the prefix `CONDA_ENV_PATH` path.
      > Using the positional argument passed into the script
    - Clones `chemnlp` into your personal cluster `USER` directory.
@@ -65,7 +62,6 @@ running on the cluster.

 2. [Single Node Models](scripts/sbatch_train_hf.sh) -
    runs a Hugging Face training pipeline across devices
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration

@@ -81,7 +77,6 @@ running on the cluster.

 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) -
    runs a Hugging Face training pipeline across nodes
-
    - creates a conda environment using the `env_creation_hf.sh` script.
    - runs the Hugging Face `run_tune.py` script with the user configuration

@@ -97,7 +92,6 @@ running on the cluster.

 4. [Grid Search](scripts/run_grid_search.py) -
    runs a grid search across training pipeline configuration options
-
    - Update the upper-case parameters at the top of the script
    - The script runs an exhaustive set of experiments across all permutations


experiments/ablations/continued_pretrain.py

Lines changed: 20 additions & 5 deletions
@@ -57,7 +57,13 @@ def load_model(


 def train(
-    model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None
+    model,
+    tokenizer,
+    dataset,
+    run_name: str,
+    batch_size: int = 64,
+    max_seq_length=2048,
+    eval_dataset=None,
 ):
     wandb.init(project="chemnlp-ablations", name=run_name)
     trainer = UnslothTrainer(
@@ -83,8 +89,8 @@ def train(
             lr_scheduler_type="linear",
             seed=3407,
             output_dir=f"outputs_{run_name}",
-            eval_strategy = 'steps' if eval_dataset is not None else 'no',
-            eval_steps = 10_000 if eval_dataset is not None else None
+            eval_strategy="steps" if eval_dataset is not None else "no",
+            eval_steps=10_000 if eval_dataset is not None else None,
         ),
     )

@@ -138,9 +144,18 @@ def run(
     )

     dataset = create_dataset(tokenizer, data_files)
-    eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    eval_dataset = (
+        create_dataset(tokenizer, eval_data_files) if eval_data_files else None
+    )

-    train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset)
+    train(
+        model,
+        tokenizer,
+        dataset,
+        run_name,
+        batch_size=batch_size,
+        eval_dataset=eval_dataset,
+    )


 if __name__ == "__main__":
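Everything in this file is characteristic black output: a signature or call that overruns the line-length limit (or carries a "magic" trailing comma) is exploded to one argument per line, spaces around `=` in keyword arguments are dropped, and single quotes become double quotes. Assuming black is the formatter behind this hook, the first hunk's rewrite can be reproduced with its public API:

import black

# A one-line signature past black's default 88-character limit...
src = (
    "def train(model, tokenizer, dataset, run_name: str, "
    "batch_size: int = 64, max_seq_length=2048, eval_dataset=None):\n"
    "    pass\n"
)

# ...comes back one parameter per line with a trailing comma,
# matching the shape of the diff above.
print(black.format_str(src, mode=black.Mode()))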
Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 model_name: "EleutherAI/pythia-1b"
 context_length: 2048
 dataset_name: "EleutherAI/pile"
-dataset_args: {"name": "pubmed", "split": "train"}
+dataset_args: { "name": "pubmed", "split": "train" }
 batch_size: 1
 string_key: "text"
 save_path: "/fsx/proj-chemnlp/data/example_tokenised"
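The only change here is the spacing applied inside the YAML flow mapping (prettier's style, most likely); the parsed value of `dataset_args` is identical. For context, a config like this is presumably keyword-splatted into `datasets.load_dataset` by the tokenisation script; the snippet below illustrates that assumption and is not code from the repository:

from datasets import load_dataset

dataset_name = "EleutherAI/pile"
dataset_args = {"name": "pubmed", "split": "train"}

# Hypothetical consumption of the config values above.
ds = load_dataset(dataset_name, **dataset_args)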
