From f04b359fb57679a01a2812255795ecc5065ed737 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 13 Oct 2023 11:45:56 +0200 Subject: [PATCH 1/2] add drugchat --- .../drugchat_liang_zhang_et_al/meta.yaml | 57 +++++++++++++++++++ .../drugchat_liang_zhang_et_al/transform.py | 54 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 data/tabular/drugchat_liang_zhang_et_al/meta.yaml create mode 100644 data/tabular/drugchat_liang_zhang_et_al/transform.py diff --git a/data/tabular/drugchat_liang_zhang_et_al/meta.yaml b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml new file mode 100644 index 000000000..590ecf76f --- /dev/null +++ b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml @@ -0,0 +1,57 @@ +--- +name: drugchat_liang_zhang_et_al +description: |- + Instruction tuning dataset used for the LLM component of DrugChat. + 10,834 compounds (3,892 from ChEMBL and 6,942 from PubChem) containing + descriptive drug information were collected. 143,517 questions were generated + using the molecules' classification, properties and descriptions from ChEBI, LOTUS & YMDB. 
+targets: + - id: answ + description: answer to the question about the SMILES + type: string +identifiers: + - id: SMILES + type: SMILES + description: SMILES + - id: quest + type: string + description: Question about SMILES +license: BSD-3-Clause +links: + - url: https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922 + description: corresponding publication + - url: https://github.com/UCSD-AI4H/drugchat + description: rep & data source +num_points: 143,517 +bibtex: + - |- + @article{Liang2023, + author = "Youwei Liang and Ruiyi Zhang and Li Zhang and Pengtao Xie", + title = "{DrugChat: Towards Enabling ChatGPT-Like Capabilities on Drug Molecule Graphs}", + year = "2023", + month = "5", + url = "https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922", + doi = "10.36227/techrxiv.22945922.v1"} +templates: + - |- + Task: Please answer the following question about the molecule with {SMILES__description} {SMILES#}. + {#Description|Question|Request!}: {quest#} + {#Result|Answer|Completion!}: {answ#} + - |- + {#Question|Q!}: {quest#} + {#Constraint:|Description:|!} The {#Molecule|Compound|Chemical!} {#has the|can be represented with the!} {SMILES__description} {SMILES#}. + {#Answer|Result|Answer!}: {answ#} + - |- + User: I have a question about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. + Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: {quest#} + Assistant: {#The answer is |!}{answ#} + - |- + User: I want to know more about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. 
+ Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: {quest#} + Assistant: {#The answer is |!}{answ#} + - |- + Task: Answer the following question about the molecule with {SMILES__description} {SMILES#}. + {#Description|Question|Request!}: {quest#} + {#Result|Answer|Completion!}: {answ#} diff --git a/data/tabular/drugchat_liang_zhang_et_al/transform.py b/data/tabular/drugchat_liang_zhang_et_al/transform.py new file mode 100644 index 000000000..f32f15aae --- /dev/null +++ b/data/tabular/drugchat_liang_zhang_et_al/transform.py @@ -0,0 +1,54 @@ +from datasets import concatenate_datasets, load_dataset + +PUBCHEM_DATASET = "alxfgh/PubChem_Drug_Instruction_Tuning" +CHEMBL_DATASET = "alxfgh/ChEMBL_Drug_Instruction_Tuning" + + +if __name__ == "__main__": + # Load the two datasets + dataset1 = load_dataset(PUBCHEM_DATASET) + dataset2 = load_dataset(CHEMBL_DATASET) + + # Verify that the datasets have the same schema (i.e., the same fields) + assert ( + dataset1["train"].features == dataset2["train"].features + ), "Datasets do not have the same schema" + + # Concatenate the 'train' split of dataset2 to the 'train' split of dataset1 + combined_dataset = concatenate_datasets([dataset1["train"], dataset2["train"]]) + + # Define the fractions for train/test/valid split + train_fraction = 0.8 + test_fraction = 0.1 + # The remaining part will be the validation fraction + + # Generate the train/test/valid splits + train_test_valid_datasets = combined_dataset.train_test_split( + test_size=test_fraction, shuffle=True + ) + train_valid_datasets = train_test_valid_datasets["train"].train_test_split( + test_size=(1 - train_fraction) / (1 - test_fraction), shuffle=True + ) + + final_datasets = { + "train": train_valid_datasets["train"], + "test": train_test_valid_datasets["test"], + "valid": train_valid_datasets["test"], + } + + # Add the 'split' column to each dataset + for split in final_datasets: + 
final_datasets[split] = final_datasets[split].add_column( + "split", [split] * len(final_datasets[split]) + ) + + # Concatenate all splits again + all_datasets = concatenate_datasets( + [final_datasets[split] for split in final_datasets] + ) + df = all_datasets.to_pandas() + + df.rename(columns={"Answer": "answ", "Question": "quest"}, inplace=True) + + # Save the combined dataset as a CSV file + df.to_csv("data_clean.csv", index=False) From ad96602cb3914c12b08f72a67d8122303853b225 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Fri, 13 Oct 2023 11:55:14 +0200 Subject: [PATCH 2/2] remove lint --- .../drugchat_liang_zhang_et_al/meta.yaml | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/data/tabular/drugchat_liang_zhang_et_al/meta.yaml b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml index 590ecf76f..da9520a12 100644 --- a/data/tabular/drugchat_liang_zhang_et_al/meta.yaml +++ b/data/tabular/drugchat_liang_zhang_et_al/meta.yaml @@ -32,26 +32,26 @@ bibtex: month = "5", url = "https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922", doi = "10.36227/techrxiv.22945922.v1"} -templates: - - |- - Task: Please answer the following question about the molecule with {SMILES__description} {SMILES#}. - {#Description|Question|Request!}: {quest#} - {#Result|Answer|Completion!}: {answ#} - - |- - {#Question|Q!}: {quest#} - {#Constraint:|Description:|!} The {#Molecule|Compound|Chemical!} {#has the|can be represented with the!} {SMILES__description} {SMILES#}. - {#Answer|Result|Answer!}: {answ#} - - |- - User: I have a question about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. 
- Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} - User: {quest#} - Assistant: {#The answer is |!}{answ#} - - |- - User: I want to know more about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. - Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} - User: {quest#} - Assistant: {#The answer is |!}{answ#} - - |- - Task: Answer the following question about the molecule with {SMILES__description} {SMILES#}. - {#Description|Question|Request!}: {quest#} - {#Result|Answer|Completion!}: {answ#} +templates: + - |- + Task: Please answer the following question about the molecule with {SMILES__description} {SMILES#}. + {#Description|Question|Request!}: {quest#} + {#Result|Answer|Completion!}: {answ#} + - |- + {#Question|Q!}: {quest#} + {#Constraint:|Description:|!} The {#Molecule|Compound|Chemical!} {#has the|can be represented with the!} {SMILES__description} {SMILES#}. + {#Answer|Result|Answer!}: {answ#} + - |- + User: I have a question about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. + Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: {quest#} + Assistant: {#The answer is |!}{answ#} + - |- + User: I want to know more about the {#molecule|chemical|compound!} with {SMILES__description} {SMILES#}. + Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} + User: {quest#} + Assistant: {#The answer is |!}{answ#} + - |- + Task: Answer the following question about the molecule with {SMILES__description} {SMILES#}. + {#Description|Question|Request!}: {quest#} + {#Result|Answer|Completion!}: {answ#}