diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a794fa2f9..b8c96f3bd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v6.0.0 hooks: - id: check-added-large-files - id: check-byte-order-marker @@ -21,20 +21,20 @@ repos: - id: trailing-whitespace exclude: .gitignore - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.4.2 + rev: 25.11.0 hooks: - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.1 + rev: v0.14.5 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - repo: https://github.com/rbubley/mirrors-prettier - rev: v3.3.2 + rev: v3.6.2 hooks: - id: prettier - repo: https://github.com/google/yamlfmt - rev: v0.13.0 + rev: v0.20.0 hooks: - id: yamlfmt - repo: https://github.com/Yelp/detect-secrets @@ -42,25 +42,25 @@ repos: hooks: - id: detect-secrets - repo: https://github.com/pappasam/toml-sort - rev: v0.23.1 + rev: v0.24.3 hooks: - id: toml-sort-fix exclude: poetry.lock - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.1 hooks: - id: codespell additional_dependencies: [".[toml]"] - repo: https://github.com/sqlfluff/sqlfluff - rev: 3.1.0 + rev: 4.0.0a2 hooks: - id: sqlfluff-fix - repo: https://github.com/hadolint/hadolint - rev: v2.13.0-beta + rev: v2.14.0 hooks: - id: hadolint-docker - repo: https://github.com/jsh9/markdown-toc-creator - rev: 0.0.6 + rev: 0.1.3 hooks: - id: markdown-toc-creator - repo: https://github.com/jumanjihouse/pre-commit-hooks @@ -68,22 +68,22 @@ repos: hooks: - id: check-mailmap - repo: https://github.com/python-poetry/poetry - rev: 1.8.0 + rev: 2.2.1 hooks: - id: poetry-check - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.1 + rev: v1.18.2 hooks: - id: mypy additional_dependencies: - types-requests - repo: https://github.com/srstevenson/nb-clean - rev: 3.3.0 + rev: 4.0.1 hooks: - id: nb-clean args: [--preserve-cell-outputs, --remove-empty-cells] - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.18 + rev: v0.24.1 hooks: - id: validate-pyproject additional_dependencies: diff --git a/code_of_conduct.md b/code_of_conduct.md index 8b4fcfd34..f6e93cd9e 100644 --- a/code_of_conduct.md +++ b/code_of_conduct.md @@ -1,5 +1,27 @@ # Contributor Covenant Code of Conduct + + +______________________________________________________________________ + +**Table of Contents** + +- [Our Pledge](#our-pledge) +- [Our Standards](#our-standards) +- [Enforcement Responsibilities](#enforcement-responsibilities) +- [Scope](#scope) +- [Enforcement](#enforcement) +- [Enforcement Guidelines](#enforcement-guidelines) + - [1. Correction](#1-correction) + - [2. Warning](#2-warning) + - [3. Temporary Ban](#3-temporary-ban) + - [4. Permanent Ban](#4-permanent-ban) +- [Attribution](#attribution) + +______________________________________________________________________ + + + ## Our Pledge We as members, contributors, and leaders pledge to make participation in our diff --git a/data/tabular/ld50_catmos/meta.yaml b/data/tabular/ld50_catmos/meta.yaml index 00b0d5373..e4a74438e 100644 --- a/data/tabular/ld50_catmos/meta.yaml +++ b/data/tabular/ld50_catmos/meta.yaml @@ -1,145 +1,144 @@ ---- name: ld50_catmos description: |- - Acute toxicity LD50 measures - the most conservative dose that can lead to lethal adverse effects. - The higher the dose, the more lethal of a drug. - We aggregated the data from multiple SMILES by computing the mean. + Acute toxicity LD50 measures + the most conservative dose that can lead to lethal adverse effects. + The higher the dose, the more lethal of a drug. + We aggregated the data from multiple SMILES by computing the mean. targets: - - id: CATMoS_LD50_mgkg - description: Acute Toxicity LD50. - units: mg/kg - type: continuous - names: - - noun: acute oral toxicity rat LD50 - - noun: acute oral toxicity (LD50 in rats) - uris: - - http://www.bioassayontology.org/bao#BAO_0002117 - significant_digits: 1 - - id: log10_LD50 - description: Acute Toxicity LD50. - units: log10(mg/kg) - type: continuous - names: - - noun: log10 acute oral toxicity rat LD50 - - noun: log10 acute oral toxicity (LD50 in rats) - - noun: log10 LD50 in rats (oral exposure) - - noun: log10 rat LD50 (oral exposure) - significant_digits: 2 - - id: num_ghose_violations - description: Ghose filter violations - type: ordinal - significant_digits: 0 - names: - - noun: Ghose filter violations - - noun: violations of the Ghose filter - - id: num_lead_likeness_violations - description: Lead likeness filter violations - type: ordinal - significant_digits: 0 - names: - - noun: lead likeness filter violations - - noun: violations of the lead likeness filter - - id: num_lipinski_violations - description: Lipinski filter violations - type: ordinal - significant_digits: 0 - names: - - noun: Lipinski rule violations - - noun: violations of the Lipinski rules - - id: molecular_mass - description: Molecular mass - type: continuous - units: g/mol - names: - - noun: molecular mass - - noun: molecular weight - - id: num_carbon_atoms - description: Number of carbon atoms - type: ordinal - significant_digits: 0 - names: - - noun: carbon atoms - - id: num_oxygen_atoms - description: Number of oxygen atoms - type: ordinal - significant_digits: 0 - names: - - noun: oxygen atoms + - id: CATMoS_LD50_mgkg + description: Acute Toxicity LD50. + units: mg/kg + type: continuous + names: + - noun: acute oral toxicity rat LD50 + - noun: acute oral toxicity (LD50 in rats) + uris: + - http://www.bioassayontology.org/bao#BAO_0002117 + significant_digits: 1 + - id: log10_LD50 + description: Acute Toxicity LD50. + units: log10(mg/kg) + type: continuous + names: + - noun: log10 acute oral toxicity rat LD50 + - noun: log10 acute oral toxicity (LD50 in rats) + - noun: log10 LD50 in rats (oral exposure) + - noun: log10 rat LD50 (oral exposure) + significant_digits: 2 + - id: num_ghose_violations + description: Ghose filter violations + type: ordinal + significant_digits: 0 + names: + - noun: Ghose filter violations + - noun: violations of the Ghose filter + - id: num_lead_likeness_violations + description: Lead likeness filter violations + type: ordinal + significant_digits: 0 + names: + - noun: lead likeness filter violations + - noun: violations of the lead likeness filter + - id: num_lipinski_violations + description: Lipinski filter violations + type: ordinal + significant_digits: 0 + names: + - noun: Lipinski rule violations + - noun: violations of the Lipinski rules + - id: molecular_mass + description: Molecular mass + type: continuous + units: g/mol + names: + - noun: molecular mass + - noun: molecular weight + - id: num_carbon_atoms + description: Number of carbon atoms + type: ordinal + significant_digits: 0 + names: + - noun: carbon atoms + - id: num_oxygen_atoms + description: Number of oxygen atoms + type: ordinal + significant_digits: 0 + names: + - noun: oxygen atoms identifiers: - - id: SMILES - type: SMILES - description: SMILES + - id: SMILES + type: SMILES + description: SMILES license: CC BY 4.0 links: - - url: https://ehp.niehs.nih.gov/doi/full/10.1289/EHP8495#supplementary-materials - description: corresponding publication + - url: https://ehp.niehs.nih.gov/doi/full/10.1289/EHP8495#supplementary-materials + description: corresponding publication num_points: 9032 bibtex: - - |- - @article{Mansouri_2021, title={CATMoS: Collaborative Acute Toxicity Modeling Suite}, - volume={129}, - ISSN={1552-9924}, - url={http://dx.doi.org/10.1289/EHP8495}, - DOI={10.1289/ehp8495}, - number={4}, - journal={Environmental Health Perspectives}, - publisher={Environmental Health Perspectives}, - author={Mansouri, Kamel and Karmaus, Agnes L. and Fitzpatrick, Jeremy - and Patlewicz, Grace and Pradeep, Prachi and Alberga, Domenico and - Alepee, Nathalie and Allen, Timothy E.H. and Allen, Dave and Alves, Vinicius M. - and Andrade, Carolina H. and Auernhammer, Tyler R. and Ballabio, Davide and - Bell, Shannon and Benfenati, Emilio and Bhattacharya, Sudin and - Bastos, Joyce V. and Boyd, Stephen and Brown, J.B. and Capuzzi, Stephen J. and - Chushak, Yaroslav and Ciallella, Heather and Clark, Alex M. and - Consonni, Viviana and Daga, Pankaj R. and Ekins, Sean and Farag, Sherif and - Fedorov, Maxim and Fourches, Denis and Gadaleta, Domenico and Gao, Feng and - Gearhart, Jeffery M. and Goh, Garett and Goodman, Jonathan M. and - Grisoni, Francesca and Grulke, Christopher M. and Hartung, Thomas and - Hirn, Matthew and Karpov, Pavel and Korotcov, Alexandru and - Lavado, Giovanna J. and Lawless, Michael and Li, Xinhao and - Luechtefeld, Thomas and Lunghini, Filippo and Mangiatordi, Giuseppe F. and - Marcou, Gilles and Marsh, Dan and Martin, Todd and Mauri, Andrea and - Muratov, Eugene N. and Myatt, Glenn J. and Nguyen, Dac-Trung and - Nicolotti, Orazio and Note, Reine and Pande, Paritosh and - Parks, Amanda K. and Peryea, Tyler and Polash, Ahsan H. and - Rallo, Robert and Roncaglioni, Alessandra and Rowlands, Craig and - Ruiz, Patricia and Russo, Daniel P. and Sayed, Ahmed and Sayre, Risa and - Sheils, Timothy and Siegel, Charles and Silva, Arthur C. and Simeonov, Anton and - Sosnin, Sergey and Southall, Noel and Strickland, Judy and Tang, Yun and - Teppen, Brian and Tetko, Igor V. and Thomas, Dennis and Tkachenko, Valery and - Todeschini, Roberto and Toma, Cosimo and Tripodi, Ignacio and - Trisciuzzi, Daniela and Tropsha, Alexander and Varnek, Alexandre and - Vukovic, Kristijan and Wang, Zhongyu and Wang, Liguo and - Waters, Katrina M. and Wedlake, Andrew J. and Wijeyesakere, Sanjeeva J. and - Wilson, Dan and Xiao, Zijun and Yang, Hongbin and Zahoranszky-Kohalmi, Gergely and - Zakharov, Alexey V. and Zhang, Fagen F. and Zhang, Zhen and Zhao, Tongan and - Zhu, Hao and Zorn, Kimberley M. and Casey, Warren and Kleinstreuer, Nicole C.}, - year={2021}, month=apr } + - |- + @article{Mansouri_2021, title={CATMoS: Collaborative Acute Toxicity Modeling Suite}, + volume={129}, + ISSN={1552-9924}, + url={http://dx.doi.org/10.1289/EHP8495}, + DOI={10.1289/ehp8495}, + number={4}, + journal={Environmental Health Perspectives}, + publisher={Environmental Health Perspectives}, + author={Mansouri, Kamel and Karmaus, Agnes L. and Fitzpatrick, Jeremy + and Patlewicz, Grace and Pradeep, Prachi and Alberga, Domenico and + Alepee, Nathalie and Allen, Timothy E.H. and Allen, Dave and Alves, Vinicius M. + and Andrade, Carolina H. and Auernhammer, Tyler R. and Ballabio, Davide and + Bell, Shannon and Benfenati, Emilio and Bhattacharya, Sudin and + Bastos, Joyce V. and Boyd, Stephen and Brown, J.B. and Capuzzi, Stephen J. and + Chushak, Yaroslav and Ciallella, Heather and Clark, Alex M. and + Consonni, Viviana and Daga, Pankaj R. and Ekins, Sean and Farag, Sherif and + Fedorov, Maxim and Fourches, Denis and Gadaleta, Domenico and Gao, Feng and + Gearhart, Jeffery M. and Goh, Garett and Goodman, Jonathan M. and + Grisoni, Francesca and Grulke, Christopher M. and Hartung, Thomas and + Hirn, Matthew and Karpov, Pavel and Korotcov, Alexandru and + Lavado, Giovanna J. and Lawless, Michael and Li, Xinhao and + Luechtefeld, Thomas and Lunghini, Filippo and Mangiatordi, Giuseppe F. and + Marcou, Gilles and Marsh, Dan and Martin, Todd and Mauri, Andrea and + Muratov, Eugene N. and Myatt, Glenn J. and Nguyen, Dac-Trung and + Nicolotti, Orazio and Note, Reine and Pande, Paritosh and + Parks, Amanda K. and Peryea, Tyler and Polash, Ahsan H. and + Rallo, Robert and Roncaglioni, Alessandra and Rowlands, Craig and + Ruiz, Patricia and Russo, Daniel P. and Sayed, Ahmed and Sayre, Risa and + Sheils, Timothy and Siegel, Charles and Silva, Arthur C. and Simeonov, Anton and + Sosnin, Sergey and Southall, Noel and Strickland, Judy and Tang, Yun and + Teppen, Brian and Tetko, Igor V. and Thomas, Dennis and Tkachenko, Valery and + Todeschini, Roberto and Toma, Cosimo and Tripodi, Ignacio and + Trisciuzzi, Daniela and Tropsha, Alexander and Varnek, Alexandre and + Vukovic, Kristijan and Wang, Zhongyu and Wang, Liguo and + Waters, Katrina M. and Wedlake, Andrew J. and Wijeyesakere, Sanjeeva J. and + Wilson, Dan and Xiao, Zijun and Yang, Hongbin and Zahoranszky-Kohalmi, Gergely and + Zakharov, Alexey V. and Zhang, Fagen F. and Zhang, Zhen and Zhao, Tongan and + Zhu, Hao and Zorn, Kimberley M. and Casey, Warren and Kleinstreuer, Nicole C.}, + year={2021}, month=apr } templates: - - The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}. - - The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. - - | - Task: Determine the acute oral toxicity and molecular properties of a {#molecule|chemical|compound!} given the {SMILES__description}. - Input: {SMILES#} - Desired Output: {CATMoS_LD50_mgkg__names__noun}, {log10_LD50__names__noun}, {num_ghose_violations__names__noun}, {num_lead_likeness_violations__names__noun}, {num_lipinski_violations__names__noun}, {molecular_mass__names__noun}, {num_carbon_atoms__names__noun}, {num_oxygen_atoms__names__noun} - Output: {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {log10_LD50#} {log10_LD50__units}, {num_ghose_violations#}, {num_lead_likeness_violations#}, {num_lipinski_violations#}, {molecular_mass#} {molecular_mass__units}, {num_carbon_atoms#}, {num_oxygen_atoms#} - - | - Context: You are {#an assistant|researcher|scientist!} in a pharmaceutical company. Your {#boss|superior|department head!} has asked you to {#design|create|synthesize!} a new drug. - User: The {#drug|compound|chemical!} should have a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {molecular_mass#} {molecular_mass__names__noun} {molecular_mass__units}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. - Assistant: {#Happy to help!|Sure!|Of course!} The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties. - - | - User: I need a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. - Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}? - User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. - Assistant: The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties. - - | - User: I need a {#drug|compound|chemical!} with a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}. - Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}? - User: The {#drug|compound|chemical!} should have a {num_carbon_atoms#} {num_carbon_atoms__names__noun}, {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}, and a {molecular_mass__names__noun} of {molecular_mass#} {molecular_mass__units}. Could you please only provide me with the {SMILES__description} and return no other information? - Assistant: {SMILES#} - - | - User: I am looking for a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. - Assistant: {#That's interesting!|Interesting!|I see!} Can you provide me with more {#constraints|details|information!}? - User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. Please return only the {SMILES__description} wrapped as follows [ANSWER][/ANSWER]. - Assistant: [ANSWER]{SMILES#}[/ANSWER] + - The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} an {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}. + - The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. + - | + Task: Determine the acute oral toxicity and molecular properties of a {#molecule|chemical|compound!} given the {SMILES__description}. + Input: {SMILES#} + Desired Output: {CATMoS_LD50_mgkg__names__noun}, {log10_LD50__names__noun}, {num_ghose_violations__names__noun}, {num_lead_likeness_violations__names__noun}, {num_lipinski_violations__names__noun}, {molecular_mass__names__noun}, {num_carbon_atoms__names__noun}, {num_oxygen_atoms__names__noun} + Output: {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {log10_LD50#} {log10_LD50__units}, {num_ghose_violations#}, {num_lead_likeness_violations#}, {num_lipinski_violations#}, {molecular_mass#} {molecular_mass__units}, {num_carbon_atoms#}, {num_oxygen_atoms#} + - | + Context: You are {#an assistant|researcher|scientist!} in a pharmaceutical company. Your {#boss|superior|department head!} has asked you to {#design|create|synthesize!} a new drug. + User: The {#drug|compound|chemical!} should have a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}, {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {molecular_mass#} {molecular_mass__names__noun} {molecular_mass__units}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. + Assistant: {#Happy to help!|Sure!|Of course!} The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties. + - | + User: I need a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. + Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}? + User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. + Assistant: The {#molecule|chemical|compound!} with the {SMILES__description} {#representation of |!}{SMILES#} {#shows|exhibits|displays!} the desired properties. + - | + User: I need a {#drug|compound|chemical!} with a {CATMoS_LD50_mgkg__names__noun} of {CATMoS_LD50_mgkg#} {CATMoS_LD50_mgkg__units}. + Assistant: {#Happy to help!|Sure!|Of course!} Can you provide me with more {#constraints|details|information!}? + User: The {#drug|compound|chemical!} should have a {num_carbon_atoms#} {num_carbon_atoms__names__noun}, {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}, and a {molecular_mass__names__noun} of {molecular_mass#} {molecular_mass__units}. Could you please only provide me with the {SMILES__description} and return no other information? + Assistant: {SMILES#} + - | + User: I am looking for a {#drug|compound|chemical!} with a {log10_LD50__names__noun} of {log10_LD50#} {log10_LD50__units}. + Assistant: {#That's interesting!|Interesting!|I see!} Can you provide me with more {#constraints|details|information!}? + User: The {#drug|compound|chemical!} should have {num_ghose_violations#} {num_ghose_violations__names__noun}, {num_lead_likeness_violations#} {num_lead_likeness_violations__names__noun}, {num_lipinski_violations#} {num_lipinski_violations__names__noun}, {num_carbon_atoms#} {num_carbon_atoms__names__noun}, and {num_oxygen_atoms#} {num_oxygen_atoms__names__noun}. Please return only the {SMILES__description} wrapped as follows [ANSWER][/ANSWER]. + Assistant: [ANSWER]{SMILES#}[/ANSWER] diff --git a/data/tabular/mona/example_processing_and_templates.ipynb b/data/tabular/mona/example_processing_and_templates.ipynb index 5f12a6f7f..786a90365 100644 --- a/data/tabular/mona/example_processing_and_templates.ipynb +++ b/data/tabular/mona/example_processing_and_templates.ipynb @@ -20,7 +20,6 @@ "from tqdm import tqdm\n", "\n", "# import datasets\n", - "import rdkit\n", "import rdkit.Chem as Chem\n", "import rdkit.RDLogger as RDLogger" ] @@ -1444,7 +1443,7 @@ " k = md[\"name\"]\n", " v = md.get(\"value\", np.nan)\n", " df_row[\"md_\" + transform_key(k)] = v\n", - " if not (v is np.nan):\n", + " if v is not np.nan:\n", " md_keys.append(k)\n", " md_key_counter.update(md_keys)\n", " compounds = entry.get(\"compound\", [])\n", diff --git a/data/tabular/ocp/transform.py b/data/tabular/ocp/transform.py index 91cd54553..8d0911c42 100644 --- a/data/tabular/ocp/transform.py +++ b/data/tabular/ocp/transform.py @@ -21,8 +21,8 @@ def uniCode2Latex(text: str) -> str: text = text.replace(chr(code), f"$_{code-8320}$") text = text.replace("\u0305", "$^-$") - text = text.replace("\u207A", "$^+$") - text = text.replace("\u207B", "$^-$") + text = text.replace("\u207a", "$^+$") + text = text.replace("\u207b", "$^-$") text = text.replace("\u2074", "$^4$") text = text.replace("\u2070", "$^0$") text = text.replace("\u2078", "$^1$") diff --git a/data/tabular/orbnet_denali/develop_transform.ipynb b/data/tabular/orbnet_denali/develop_transform.ipynb index 039c60f89..5e7f1dab6 100644 --- a/data/tabular/orbnet_denali/develop_transform.ipynb +++ b/data/tabular/orbnet_denali/develop_transform.ipynb @@ -25,11 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", "from rdkit import Chem\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import os\n", "import pandas as pd\n", "from glob import glob" ] @@ -474,7 +470,6 @@ "metadata": {}, "outputs": [], "source": [ - "from rdkit.Chem import rdDetermineBonds\n", "from chemnlp.utils import xyz_to_mol" ] }, diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 3d4fec76e..e15e807da 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -1,5 +1,23 @@ # Contributing to ChemNLP + + +______________________________________________________________________ + +**Table of Contents** + +- [Getting Started](#getting-started) +- [Implementing a Dataset](#implementing-a-dataset) + - [meta.yaml Structure](#metayaml-structure) + - [transform.py Guidelines](#transformpy-guidelines) +- [Text Templates](#text-templates) +- [Testing Your Contribution](#testing-your-contribution) +- [Submitting Your Contribution](#submitting-your-contribution) + +______________________________________________________________________ + + + Thank you for your interest in contributing to ChemNLP! There are many ways to contribute, including implementing datasets, improving code, and enhancing documentation. ## Getting Started @@ -17,7 +35,6 @@ One of the most valuable contributions is implementing a dataset. Here's how to 1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there. 2. Create an issue in this repository stating your intention to add the dataset. 3. Make a Pull Request (PR) that adds a new folder in `data` with the following files: - - `meta.yaml`: Describes the dataset (see structure below). - `transform.py`: Python code to transform the original dataset into a usable form. diff --git a/docs/api/meta_yaml_augmentor.md b/docs/api/meta_yaml_augmentor.md index 0e8328295..8e5470e12 100644 --- a/docs/api/meta_yaml_augmentor.md +++ b/docs/api/meta_yaml_augmentor.md @@ -1,5 +1,25 @@ # Meta YAML Augmenter + + +______________________________________________________________________ + +**Table of Contents** + +- [Overview](#overview) +- [generate_augmented_meta_yaml](#generate_augmented_meta_yaml) +- [CLI Interface](#cli-interface) + - [Usage](#usage) + - [Arguments](#arguments) + - [Example](#example) +- [Augmentation Process](#augmentation-process) +- [Notes](#notes) +- [Example Usage in Python](#example-usage-in-python) + +______________________________________________________________________ + + + ## Overview The Meta YAML Augmenter is a tool designed to enhance existing `meta.yaml` files for chemical datasets. It uses Large Language Models (LLMs) to generate additional templates and improve the metadata structure, particularly focusing on advanced sampling methods and template formats. diff --git a/docs/api/meta_yaml_generator.md b/docs/api/meta_yaml_generator.md index 09c3f651e..bcd24e1e8 100644 --- a/docs/api/meta_yaml_generator.md +++ b/docs/api/meta_yaml_generator.md @@ -1,5 +1,19 @@ # Meta YAML Generator + + +______________________________________________________________________ + +**Table of Contents** + +- [Overview](#overview) +- [`generate_meta_yaml`](#generate_meta_yaml) +- [Usage Example](#usage-example) + +______________________________________________________________________ + + + ## Overview The Meta YAML Generator is a tool designed to automatically create a `meta.yaml` file for chemical datasets using Large Language Models (LLMs). It analyzes the structure of a given DataFrame and generates a comprehensive metadata file, including advanced sampling methods and template formats. diff --git a/docs/api/sampler.md b/docs/api/sampler.md index 6e073916a..e6287413d 100644 --- a/docs/api/sampler.md +++ b/docs/api/sampler.md @@ -1,5 +1,28 @@ # Sampler Module + + +______________________________________________________________________ + +**Table of Contents** + +- [Overview](#overview) +- [TemplateSampler](#templatesampler) + - [Class: TemplateSampler](#class-templatesampler) + - [Initialization](#initialization) + - [Configuration Options](#configuration-options) + - [Main Methods](#main-methods) + - [`sample`](#sample) + - [`enable_class_balancing`](#enable_class_balancing) + - [`disable_class_balancing`](#disable_class_balancing) + - [Identifier Wrapping](#identifier-wrapping) + - [Usage Examples](#usage-examples) +- [Notes](#notes) + +______________________________________________________________________ + + + ## Overview The `sampler` module provides functionality for generating text samples based on templates and data. It is primarily used for creating datasets for natural language processing tasks in chemistry and related fields. The main class in this module is `TemplateSampler`, which allows for flexible text generation with support for multiple choice questions, class balancing, and identifier wrapping. diff --git a/docs/api/sampler_cli.md b/docs/api/sampler_cli.md index 0e165565d..e0bcb1cbe 100644 --- a/docs/api/sampler_cli.md +++ b/docs/api/sampler_cli.md @@ -1,5 +1,31 @@ # Sampler CLI + + +______________________________________________________________________ + +**Table of Contents** + +- [Overview](#overview) +- [Usage](#usage) + - [Arguments](#arguments) + - [Options](#options) +- [Detailed Option Descriptions](#detailed-option-descriptions) + - [`chunksize`](#chunksize) + - [`class_balanced`](#class_balanced) + - [`benchmarking`](#benchmarking) + - [`multiple_choice`](#multiple_choice) + - [`additional_templates`](#additional_templates) + - [`use_standard_templates`](#use_standard_templates) + - [`wrap_identifiers`](#wrap_identifiers) +- [Examples](#examples) +- [Notes](#notes) +- [Troubleshooting](#troubleshooting) + +______________________________________________________________________ + + + ## Overview The Sampler CLI is a command-line interface tool designed to process chemical datasets using the `TemplateSampler`. It allows for flexible text generation based on templates, with support for various sampling scenarios including class balancing, benchmarking, and multiple-choice questions. diff --git a/experiments/README.md b/experiments/README.md index 578f32415..2ef57ca4b 100644 --- a/experiments/README.md +++ b/experiments/README.md @@ -1,5 +1,18 @@ # Working with the Stability cluster + + +______________________________________________________________________ + +**Table of Contents** + +- [GPT-Neox](#gpt-neox) +- [Hugging Face](#hugging-face) + +______________________________________________________________________ + + + We currently run our large scale experiments on the Stability AI HPC cluster. This subdirectory features a few helpful scripts that can help you get up and running on the cluster. @@ -11,7 +24,6 @@ running on the cluster. 1. [Create Environment](scripts/env_creation_neox.sh) - creates a basic conda environment for experiments. - - Creates a conda environment at the prefix `CONDA_ENV_PATH` path. > Using the positional argument passed into the script - Clones `chemnlp` into your personal cluster `USER` directory. @@ -28,7 +40,6 @@ running on the cluster. 2. [Training Models](scripts/sbatch_train_neox.sh) - runs a GPT-NeoX training pipeline - - creates a conda environment using the `env_creation_neox.sh` script. - runs the GPT-NeoX `train.py` script using the user configuration > as GPT-NeoX configurations can be combined, the PEFT configurations are held @@ -48,7 +59,6 @@ running on the cluster. 1. [Create Environment](scripts/env_creation_hf.sh) - creates a basic conda environment for experiments. - - Creates a conda environment at the prefix `CONDA_ENV_PATH` path. > Using the positional argument passed into the script - Clones `chemnlp` into your personal cluster `USER` directory. @@ -65,7 +75,6 @@ running on the cluster. 2. [Single Node Models](scripts/sbatch_train_hf.sh) - runs a Hugging Face training pipeline across devices - - creates a conda environment using the `env_creation_hf.sh` script. - runs the Hugging Face `run_tune.py` script with the user configuration @@ -81,7 +90,6 @@ running on the cluster. 3. [Multi Node Models](scripts/sbatch_train_hf_multinode.sh) - runs a Hugging Face training pipeline across nodes - - creates a conda environment using the `env_creation_hf.sh` script. - runs the Hugging Face `run_tune.py` script with the user configuration @@ -97,7 +105,6 @@ running on the cluster. 4. [Grid Search](scripts/run_grid_search.py) - runs a grid search across training pipeline configuration options - - Update the upper-case parameters at the top of the script - The script runs an exhaustive set of experiments across all permutations diff --git a/experiments/ablations/continued_pretrain.py b/experiments/ablations/continued_pretrain.py index 730453b95..0d5fd0577 100644 --- a/experiments/ablations/continued_pretrain.py +++ b/experiments/ablations/continued_pretrain.py @@ -57,7 +57,13 @@ def load_model( def train( - model, tokenizer, dataset, run_name: str, batch_size: int = 64, max_seq_length=2048, eval_dataset=None + model, + tokenizer, + dataset, + run_name: str, + batch_size: int = 64, + max_seq_length=2048, + eval_dataset=None, ): wandb.init(project="chemnlp-ablations", name=run_name) trainer = UnslothTrainer( @@ -83,8 +89,8 @@ def train( lr_scheduler_type="linear", seed=3407, output_dir=f"outputs_{run_name}", - eval_strategy = 'steps' if eval_dataset is not None else 'no', - eval_steps = 10_000 if eval_dataset is not None else None + eval_strategy="steps" if eval_dataset is not None else "no", + eval_steps=10_000 if eval_dataset is not None else None, ), ) @@ -138,9 +144,18 @@ def run( ) dataset = create_dataset(tokenizer, data_files) - eval_dataset = create_dataset(tokenizer, eval_data_files) if eval_data_files else None + eval_dataset = ( + create_dataset(tokenizer, eval_data_files) if eval_data_files else None + ) - train(model, tokenizer, dataset, run_name, batch_size=batch_size, eval_dataset=eval_dataset) + train( + model, + tokenizer, + dataset, + run_name, + batch_size=batch_size, + eval_dataset=eval_dataset, + ) if __name__ == "__main__": diff --git a/experiments/configs/data_configs/hf_data.yml b/experiments/configs/data_configs/hf_data.yml index c3fec721e..64d71ab4e 100644 --- a/experiments/configs/data_configs/hf_data.yml +++ b/experiments/configs/data_configs/hf_data.yml @@ -1,7 +1,7 @@ model_name: "EleutherAI/pythia-1b" context_length: 2048 dataset_name: "EleutherAI/pile" -dataset_args: {"name": "pubmed", "split": "train"} +dataset_args: { "name": "pubmed", "split": "train" } batch_size: 1 string_key: "text" save_path: "/fsx/proj-chemnlp/data/example_tokenised"