From 7ee3b2431779f831eac28009d9ca221a8395aa2c Mon Sep 17 00:00:00 2001
From: Alexander Al-Feghali <alexander.al-feghali@mail.mcgill.ca>
Date: Sat, 24 Jun 2023 12:40:36 -0400
Subject: [PATCH 1/2] Added drugchat data

---
 data/drugchat_liang_zhang_et_al/meta.yml     | 34 +++++++++++++
 data/drugchat_liang_zhang_et_al/transform.py | 51 ++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 data/drugchat_liang_zhang_et_al/meta.yml
 create mode 100644 data/drugchat_liang_zhang_et_al/transform.py

diff --git a/data/drugchat_liang_zhang_et_al/meta.yml b/data/drugchat_liang_zhang_et_al/meta.yml
new file mode 100644
index 000000000..c979351dd
--- /dev/null
+++ b/data/drugchat_liang_zhang_et_al/meta.yml
@@ -0,0 +1,34 @@
+---
+name: drugchat_liang_zhang_et_al
+description: |-
+  Instruction tuning dataset used for the LLM component of DrugChat. 
+  10,834 compounds (3,8962  from ChEMBL and 6,942 from  PubChem) containing 
+  descriptive drug information were collected. 143,517 questions were generated
+  using the molecules' classification, properties and descriptions from ChEBI, LOTUS & YMDB.
+targets:
+  - id: Answer
+    description: answer to the question about the SMILES
+    type: string
+identifiers:
+  - id: SMILES
+    type: SMILES
+    description: SMILES
+  - id: Question
+    type: string
+    description: Question about SMILES
+license: CC BY 4.0
+links:
+  - url: https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922
+    description: corresponding publication
+  - url: https://github.com/UCSD-AI4H/drugchat
+    description: rep & data source
+num_points: 143,517
+bibtex:
+  - |-
+    @article{Liang2023,
+    author = "Youwei Liang and Ruiyi Zhang and Li Zhang and Pengtao Xie",
+    title = "{DrugChat: Towards Enabling ChatGPT-Like Capabilities on Drug Molecule Graphs}",
+    year = "2023",
+    month = "5",
+    url = "https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922",
+    doi = "10.36227/techrxiv.22945922.v1"}
diff --git a/data/drugchat_liang_zhang_et_al/transform.py b/data/drugchat_liang_zhang_et_al/transform.py
new file mode 100644
index 000000000..e15c992b7
--- /dev/null
+++ b/data/drugchat_liang_zhang_et_al/transform.py
@@ -0,0 +1,51 @@
+from datasets import load_dataset, concatenate_datasets
+
+PUBCHEM_DATASET = "alxfgh/PubChem_Drug_Instruction_Tuning"
+CHEMBL_DATASET = "alxfgh/ChEMBL_Drug_Instruction_Tuning"
+
+
+if __name__ == "__main__":
+    # Load the two source datasets
+    dataset1 = load_dataset(PUBCHEM_DATASET)
+    dataset2 = load_dataset(CHEMBL_DATASET)
+
+    # Both sources must share a schema; raise instead of assert (-O strips asserts)
+    if dataset1["train"].features != dataset2["train"].features:
+        raise ValueError("Datasets do not have the same schema")
+
+    # Concatenate the 'train' split of dataset2 to the 'train' split of dataset1
+    combined_dataset = concatenate_datasets([dataset1["train"], dataset2["train"]])
+
+    # Split fractions; validation takes whatever train and test leave over
+    train_fraction = 0.8
+    test_fraction = 0.1
+    valid_fraction = 1 - train_fraction - test_fraction
+
+    # Split off test first, then validation from the remainder (test_size is
+    # relative to that remainder, hence the rescaling); fixed seed = repeatable
+    train_test_valid_datasets = combined_dataset.train_test_split(
+        test_size=test_fraction, shuffle=True, seed=42
+    )
+    train_valid_datasets = train_test_valid_datasets["train"].train_test_split(
+        test_size=valid_fraction / (1 - test_fraction), shuffle=True, seed=42
+    )
+
+    final_datasets = {
+        "train": train_valid_datasets["train"],
+        "test": train_test_valid_datasets["test"],
+        "valid": train_valid_datasets["test"],
+    }
+
+    # Tag every row with the name of the split it was assigned to
+    for split in final_datasets:
+        final_datasets[split] = final_datasets[split].add_column(
+            "split", [split] * len(final_datasets[split])
+        )
+
+    # Concatenate all splits again
+    all_datasets = concatenate_datasets(
+        [final_datasets[split] for split in final_datasets]
+    )
+
+    # Save the combined dataset as a CSV file
+    all_datasets.to_csv("drugchat_intruct_tuning.csv")

From 96964ae9ece6312a9e1eb0f65a636167810b1691 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Mon, 24 Jul 2023 18:22:58 +0200
Subject: [PATCH 2/2] feat: rename meta.yaml filename and csv output filename

---
 data/drugchat_liang_zhang_et_al/meta.yaml    | 34 ++++++++++++++++++++
 data/drugchat_liang_zhang_et_al/meta.yml     | 34 --------------------
 data/drugchat_liang_zhang_et_al/transform.py |  4 +--
 3 files changed, 36 insertions(+), 36 deletions(-)
 create mode 100644 data/drugchat_liang_zhang_et_al/meta.yaml
 delete mode 100644 data/drugchat_liang_zhang_et_al/meta.yml

diff --git a/data/drugchat_liang_zhang_et_al/meta.yaml b/data/drugchat_liang_zhang_et_al/meta.yaml
new file mode 100644
index 000000000..194a50036
--- /dev/null
+++ b/data/drugchat_liang_zhang_et_al/meta.yaml
@@ -0,0 +1,34 @@
+---
+name: drugchat_liang_zhang_et_al
+description: |-
+    Instruction tuning dataset used for the LLM component of DrugChat.
+    10,834 compounds (3,892 from ChEMBL and 6,942 from PubChem) containing
+    descriptive drug information were collected. 143,517 questions were generated
+    using the molecules' classification, properties and descriptions from ChEBI, LOTUS & YMDB.
+targets:
+    - id: Answer
+      description: answer to the question about the SMILES
+      type: string
+identifiers:
+    - id: SMILES
+      type: SMILES
+      description: SMILES
+    - id: Question
+      type: string
+      description: Question about SMILES
+license: CC BY 4.0
+links:
+    - url: https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922
+      description: corresponding publication
+    - url: https://github.com/UCSD-AI4H/drugchat
+      description: repository & data source
+num_points: 143517
+bibtex:
+    - |-
+      @article{Liang2023,
+      author = "Youwei Liang and Ruiyi Zhang and Li Zhang and Pengtao Xie",
+      title = "{DrugChat: Towards Enabling ChatGPT-Like Capabilities on Drug Molecule Graphs}",
+      year = "2023",
+      month = "5",
+      url = "https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922",
+      doi = "10.36227/techrxiv.22945922.v1"}
diff --git a/data/drugchat_liang_zhang_et_al/meta.yml b/data/drugchat_liang_zhang_et_al/meta.yml
deleted file mode 100644
index c979351dd..000000000
--- a/data/drugchat_liang_zhang_et_al/meta.yml
+++ /dev/null
@@ -1,34 +0,0 @@
----
-name: drugchat_liang_zhang_et_al
-description: |-
-  Instruction tuning dataset used for the LLM component of DrugChat. 
-  10,834 compounds (3,8962  from ChEMBL and 6,942 from  PubChem) containing 
-  descriptive drug information were collected. 143,517 questions were generated
-  using the molecules' classification, properties and descriptions from ChEBI, LOTUS & YMDB.
-targets:
-  - id: Answer
-    description: answer to the question about the SMILES
-    type: string
-identifiers:
-  - id: SMILES
-    type: SMILES
-    description: SMILES
-  - id: Question
-    type: string
-    description: Question about SMILES
-license: CC BY 4.0
-links:
-  - url: https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922
-    description: corresponding publication
-  - url: https://github.com/UCSD-AI4H/drugchat
-    description: rep & data source
-num_points: 143,517
-bibtex:
-  - |-
-    @article{Liang2023,
-    author = "Youwei Liang and Ruiyi Zhang and Li Zhang and Pengtao Xie",
-    title = "{DrugChat: Towards Enabling ChatGPT-Like Capabilities on Drug Molecule Graphs}",
-    year = "2023",
-    month = "5",
-    url = "https://www.techrxiv.org/articles/preprint/DrugChat_Towards_Enabling_ChatGPT-Like_Capabilities_on_Drug_Molecule_Graphs/22945922",
-    doi = "10.36227/techrxiv.22945922.v1"}
diff --git a/data/drugchat_liang_zhang_et_al/transform.py b/data/drugchat_liang_zhang_et_al/transform.py
index e15c992b7..f17eecd02 100644
--- a/data/drugchat_liang_zhang_et_al/transform.py
+++ b/data/drugchat_liang_zhang_et_al/transform.py
@@ -1,4 +1,4 @@
-from datasets import load_dataset, concatenate_datasets
+from datasets import concatenate_datasets, load_dataset
 
 PUBCHEM_DATASET = "alxfgh/PubChem_Drug_Instruction_Tuning"
 CHEMBL_DATASET = "alxfgh/ChEMBL_Drug_Instruction_Tuning"
@@ -48,4 +48,4 @@
     )
 
     # Save the combined dataset as a CSV file
-    all_datasets.to_csv("drugchat_intruct_tuning.csv")
+    all_datasets.to_csv("data_clean.csv")