From 55b0e90f359354a2c19658c2be4e8ed271672781 Mon Sep 17 00:00:00 2001
From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com>
Date: Tue, 27 Jun 2023 16:13:01 -0400
Subject: [PATCH 1/3] Add papyrus protein targets

Add the Papyrus data file 05.4_combined_set_protein_targets.tsv.gz, which lists about 7k protein targets together with the information available for each target
---
 data/papyrus_protein_targets/meta.yaml    |  94 ++++++++++
 data/papyrus_protein_targets/transform.py | 211 ++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 data/papyrus_protein_targets/meta.yaml
 create mode 100644 data/papyrus_protein_targets/transform.py

diff --git a/data/papyrus_protein_targets/meta.yaml b/data/papyrus_protein_targets/meta.yaml
new file mode 100644
index 000000000..b7c3f1319
--- /dev/null
+++ b/data/papyrus_protein_targets/meta.yaml
@@ -0,0 +1,94 @@
+name: papyrus_protein_targets
+description: Papyrus is an aggregated dataset of small molecule bioactivities. File
+  contains data about proteins (e.g. sequence, organism,classification).
+targets:
+- id: Organism
+  description: Organism of the protein
+  units: ''
+  type: text
+  names:
+  - noun: The organism that the protein extracted from
+  - noun: For which organism protein related to
+  - noun: living that the protein extract from
+  uris:
+  - http://purl.bioontology.org/ontology/CCON
+- id: organism_common_name
+  description: common name of the organism that protein extract from.
+  units: ''
+  type: text
+  names:
+  - noun: common name of the organism that the protein extracted from
+  - noun: common name of the organism for which protein related to
+  - noun: common name of the living that the protein extracted from
+  uris:
+  - http://purl.bioontology.org/ontology/CCON
+- id: Classification
+  description: Protein classification as given by ChEMBL(version 29). Levels are separated
+    by '->'. Multiple classifications are separated by a semilcolon ';'
+  units: ''
+  type: text
+  names:
+  - noun: Protein classification
+  - noun: protein classification by levels
+  - noun: Levels for which protein classify
+- id: seq_length
+  description: Length of the protein sequence
+  units: ''
+  type: continuous
+  names:
+  - noun: Protein sequence length
+  - noun: Length for protein string
+- id: Sequence
+  description: Protein sequence including mutations
+  units: ''
+  type: string
+  names:
+  - noun: Protein sequence character
+  - noun: FASTQ of the protein
+  - noun: protein string
+  uris:
+  - http://purl.bioontology.org/ontology/MESH/D009154
+identifiers:
+- id: target_id
+  type: Other
+  names:
+  - noun: protein identifier wtih mutation
+  - noun: target id plus mutation
+  - noun: protein target combined with mutation
+  description: A unique Papyrus protein identifier. It results from the concatenation
+    of accessions and mutations(e.g. P47747_WT or P10721_V559D_T670I)
+- id: target_id_without_mutation
+  type: Other
+  names:
+  - noun: protein identifier
+  - noun: target id
+  - noun: protein target
+  description: A unique protein identifier
+- id: UniProtID
+  type: Other
+  names:
+  - noun: UniProt identifier
+  - noun: UniProtID
+  description: The UniProt identifier of the sequence
+license: CC BY-SA 4.0
+links:
+- url: https://doi.org/10.1186/s13321-022-00672-x
+  description: corresponding publication
+- url: https://doi.org/10.4121/16896406.v3
+  description: data source
+- url: https://data.4tu.nl/articles/_/16896406/3
+  description: data source
+num_points: 7058
+bibtex:
+- |-
+  @article{B_quignon_2023,
+            doi = {10.1186/s13321-022-00672-x},
+            url = {https://doi.org/10.1186%2Fs13321-022-00672-x},
+            year = {2023},
+            month = jan,
+            publisher = {Springer Science and Business Media LLC},
+            volume = {15},
+            number = {1},
+            author = {O. J. M. Bequignon and B. J. Bongers and W. Jespers and A. P. IJzerman and B. van der Water and G. J. P. van Westen},
+            title = {Papyrus: a large-scale curated dataset aimed at bioactivity predictions},
+            journal = {Journal of Cheminformatics}
diff --git a/data/papyrus_protein_targets/transform.py b/data/papyrus_protein_targets/transform.py
new file mode 100644
index 000000000..95ef90e58
--- /dev/null
+++ b/data/papyrus_protein_targets/transform.py
@@ -0,0 +1,211 @@
+import pandas as pd
+import yaml
+
+
+def get_and_transform_data():
+    target_folder = 'papyrus_protein_targets'
+    data_path = "https://data.4tu.nl/file/ca10bf7d-f508-4d54-9c9a-5a9e9c1adef9/e5863d58-c613-418b-8393-012eb6c9a04a"
+    fn_data_original = "data_original.csv"
+    df = pd.read_csv(data_path, compression='gzip', sep='\t')
+    df.to_csv(fn_data_original,index=None)
+    df = df.fillna('unkown')
+    df['organism_common_name'] = df['Organism'].apply(lambda s : s[s.index('(')+1:-1] if '(' in s else 'unknown')
+    df['target_id_without_mutation'] = df['target_id'].apply(lambda s : s.split('_')[0] if '_' in s else s)
+    df['UniProtID'] = df['UniProtID'].apply(lambda s : s.split('_')[0] if '_' in s else s)
+    df = df.drop_duplicates(subset='target_id')
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ['target_id',
+     'HGNC_symbol',
+     'UniProtID',
+     'Status',
+     'Organism',
+     'Classification',
+     'Length',
+     'Sequence',
+     'organism_common_name',
+     'target_id_without_mutation']
+
+    fields_clean = ['target_id',
+                    'target_id_without_mutation',
+                    'HGNC_symbol',
+                    'UniProtID',
+                    'Status',
+                    'Organism',
+                    'organism_common_name',
+                    'Classification',
+                    'Length',
+                    'Sequence'
+                     ]
+    df =df[fields_clean]
+    fields_clean = ['target_id',
+                    'target_id_without_mutation',
+                    'HGNC_symbol',
+                    'UniProtID',
+                    'Status',
+                    'Organism',
+                    'organism_common_name',
+                    'Classification',
+                    'seq_length',
+                    'Sequence'
+                     ]
+
+    assert fields_orig != fields_clean
+    assert not df.duplicated().sum()
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+        "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
+        "description": """Papyrus is an aggregated dataset of small molecule bioactivities. File contains data about proteins (e.g. sequence, organism,classification).""",
+        "targets": [
+            {
+                "id": "Organism",  # name of the column in a tabular dataset
+                "description": "Organism of the protein",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "text",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts).
+                {"noun": "The organism that the protein extracted from"},
+                {"noun": "For which organism protein related to"},
+                {"noun": "living that the protein extract from"},
+
+                ],
+                  "uris":[
+                "http://purl.bioontology.org/ontology/CCON", #organism
+        ],
+            },
+            {
+                "id": "organism_common_name",  # name of the column in a tabular dataset
+                "description": "common name of the organism that protein extract from.",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "text",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts).
+                {"noun": "common name of the organism that the protein extracted from"},
+                {"noun": "common name of the organism for which protein related to"},
+                {"noun": "common name of the living that the protein extracted from"},
+                ],
+                "uris":[
+                "http://purl.bioontology.org/ontology/CCON", #organism
+        		]
+            },
+           {
+                "id": "Classification",  # name of the column in a tabular dataset
+                "description": "Protein classification as given by ChEMBL(version 29). Levels are separated by '->'. Multiple classifications are separated by a semilcolon ';'",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "text",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts).
+                {"noun": "Protein classification"},
+                {"noun": "protein classification by levels"},
+                {"noun": "Levels for which protein classify"},
+                ],
+            },
+            {
+                "id": "seq_length",  # name of the column in a tabular dataset
+                "description": "Length of the protein sequence",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "continuous",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts).
+                {"noun": "Protein sequence length"},
+                {"noun": "Length for protein string"},
+                ],
+            },
+            {
+                "id": "Sequence",  # name of the column in a tabular dataset
+                "description": "Protein sequence including mutations",  # description of what this column means
+                "units": "",  # units of the values in this column (leave empty if unitless)
+                "type": "string",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts).
+                {"noun": "Protein sequence character"},
+                {"noun": "FASTQ of the protein"},
+                {"noun": "protein string"},
+                ],
+                  "uris":[
+                "http://purl.bioontology.org/ontology/MESH/D009154" #mutation
+        		],
+            },
+
+        ],
+        "identifiers": [
+            {
+                    "id": "target_id",  # column name
+                    "type": "Other",
+                    "names": [
+                        {"noun": "protein identifier wtih mutation"},
+                        {"noun": "target id plus mutation"},
+                        {"noun": "protein target combined with mutation"},
+                    ],
+                    "description": "A unique Papyrus protein identifier. It results from the concatenation of accessions and mutations(e.g. P47747_WT or P10721_V559D_T670I)",  # description (optional, except for "Other")
+                },
+            {
+                    "id": "target_id_without_mutation",  # column name
+                    "type": "Other",
+                    "names": [
+                        {"noun": "protein identifier"},
+                        {"noun": "target id"},
+                        {"noun": "protein target"},
+                    ],
+                    "description": "A unique protein identifier",  # description (optional, except for "Other")
+                },
+            {
+                    "id": "UniProtID",  # column name
+                    "type": "Other",
+                    "names": [
+                        {"noun": "UniProt identifier"},
+                        {"noun": "UniProtID"},
+                    ],
+                    "description": "The UniProt identifier of the sequence",  # description (optional, except for "Other")
+                },
+        ],
+        "license": "CC BY-SA 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)   
+            {
+                "url": "https://doi.org/10.1186/s13321-022-00672-x",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.4121/16896406.v3",
+                "description": "data source",
+            },
+            {
+                "url": "https://data.4tu.nl/articles/_/16896406/3",
+                "description": "data source",
+
+            }
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+
+        "bibtex": [
+            """@article{B_quignon_2023,
+          doi = {10.1186/s13321-022-00672-x},
+          url = {https://doi.org/10.1186%2Fs13321-022-00672-x},
+          year = {2023},
+          month = jan,
+          publisher = {Springer Science and Business Media LLC},
+          volume = {15},
+          number = {1},
+          author = {O. J. M. Bequignon and B. J. Bongers and W. Jespers and A. P. IJzerman and B. van der Water and G. J. P. van Westen},
+          title = {Papyrus: a large-scale curated dataset aimed at bioactivity predictions},
+          journal = {Journal of Cheminformatics}""",
+        ],
+    }
+    
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+if __name__ == "__main__":
+    get_and_transform_data()

From bd3c62a9a1b2d9c2736b88ab44f396e6b179ce52 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Wed, 26 Jul 2023 14:56:18 +0200
Subject: [PATCH 2/3] feat: run precommit hooks

---
 data/papyrus_protein_targets/transform.py | 217 ++++++++++++----------
 1 file changed, 115 insertions(+), 102 deletions(-)

diff --git a/data/papyrus_protein_targets/transform.py b/data/papyrus_protein_targets/transform.py
index 95ef90e58..503fb2cf2 100644
--- a/data/papyrus_protein_targets/transform.py
+++ b/data/papyrus_protein_targets/transform.py
@@ -3,51 +3,61 @@
 
 
 def get_and_transform_data():
-    target_folder = 'papyrus_protein_targets'
+    target_folder = "papyrus_protein_targets"
     data_path = "https://data.4tu.nl/file/ca10bf7d-f508-4d54-9c9a-5a9e9c1adef9/e5863d58-c613-418b-8393-012eb6c9a04a"
     fn_data_original = "data_original.csv"
-    df = pd.read_csv(data_path, compression='gzip', sep='\t')
-    df.to_csv(fn_data_original,index=None)
-    df = df.fillna('unkown')
-    df['organism_common_name'] = df['Organism'].apply(lambda s : s[s.index('(')+1:-1] if '(' in s else 'unknown')
-    df['target_id_without_mutation'] = df['target_id'].apply(lambda s : s.split('_')[0] if '_' in s else s)
-    df['UniProtID'] = df['UniProtID'].apply(lambda s : s.split('_')[0] if '_' in s else s)
-    df = df.drop_duplicates(subset='target_id')
+    df = pd.read_csv(data_path, compression="gzip", sep="\t")
+    df.to_csv(fn_data_original, index=None)
+    df = df.fillna("unkown")
+    df["organism_common_name"] = df["Organism"].apply(
+        lambda s: s[s.index("(") + 1 : -1] if "(" in s else "unknown"
+    )
+    df["target_id_without_mutation"] = df["target_id"].apply(
+        lambda s: s.split("_")[0] if "_" in s else s
+    )
+    df["UniProtID"] = df["UniProtID"].apply(
+        lambda s: s.split("_")[0] if "_" in s else s
+    )
+    df = df.drop_duplicates(subset="target_id")
     fields_orig = df.columns.tolist()
-    assert fields_orig == ['target_id',
-     'HGNC_symbol',
-     'UniProtID',
-     'Status',
-     'Organism',
-     'Classification',
-     'Length',
-     'Sequence',
-     'organism_common_name',
-     'target_id_without_mutation']
+    assert fields_orig == [
+        "target_id",
+        "HGNC_symbol",
+        "UniProtID",
+        "Status",
+        "Organism",
+        "Classification",
+        "Length",
+        "Sequence",
+        "organism_common_name",
+        "target_id_without_mutation",
+    ]
 
-    fields_clean = ['target_id',
-                    'target_id_without_mutation',
-                    'HGNC_symbol',
-                    'UniProtID',
-                    'Status',
-                    'Organism',
-                    'organism_common_name',
-                    'Classification',
-                    'Length',
-                    'Sequence'
-                     ]
-    df =df[fields_clean]
-    fields_clean = ['target_id',
-                    'target_id_without_mutation',
-                    'HGNC_symbol',
-                    'UniProtID',
-                    'Status',
-                    'Organism',
-                    'organism_common_name',
-                    'Classification',
-                    'seq_length',
-                    'Sequence'
-                     ]
+    fields_clean = [
+        "target_id",
+        "target_id_without_mutation",
+        "HGNC_symbol",
+        "UniProtID",
+        "Status",
+        "Organism",
+        "organism_common_name",
+        "Classification",
+        "Length",
+        "Sequence",
+    ]
+    df = df[fields_clean]
+    fields_clean = [
+        "target_id",
+        "target_id_without_mutation",
+        "HGNC_symbol",
+        "UniProtID",
+        "Status",
+        "Organism",
+        "organism_common_name",
+        "Classification",
+        "seq_length",
+        "Sequence",
+    ]
 
     assert fields_orig != fields_clean
     assert not df.duplicated().sum()
@@ -57,7 +67,7 @@ def get_and_transform_data():
     # create meta yaml
     meta = {
         "name": f"{target_folder}",  # unique identifier, we will also use this for directory names
-        "description": """Papyrus is an aggregated dataset of small molecule bioactivities. File contains data about proteins (e.g. sequence, organism,classification).""",
+        "description": """Papyrus is an aggregated dataset of small molecule bioactivities. File contains data about proteins (e.g. sequence, organism,classification).""",  # noqa: E501
         "targets": [
             {
                 "id": "Organism",  # name of the column in a tabular dataset
@@ -65,38 +75,43 @@ def get_and_transform_data():
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "text",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts).
-                {"noun": "The organism that the protein extracted from"},
-                {"noun": "For which organism protein related to"},
-                {"noun": "living that the protein extract from"},
-
+                    {"noun": "The organism that the protein extracted from"},
+                    {"noun": "For which organism protein related to"},
+                    {"noun": "living that the protein extract from"},
+                ],
+                "uris": [
+                    "http://purl.bioontology.org/ontology/CCON",  # organism
                 ],
-                  "uris":[
-                "http://purl.bioontology.org/ontology/CCON", #organism
-        ],
             },
             {
                 "id": "organism_common_name",  # name of the column in a tabular dataset
-                "description": "common name of the organism that protein extract from.",  # description of what this column means
+                "description": "common name of the organism that protein extract from.",
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "text",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts).
-                {"noun": "common name of the organism that the protein extracted from"},
-                {"noun": "common name of the organism for which protein related to"},
-                {"noun": "common name of the living that the protein extracted from"},
+                    {
+                        "noun": "common name of the organism that the protein extracted from"
+                    },
+                    {
+                        "noun": "common name of the organism for which protein related to"
+                    },
+                    {
+                        "noun": "common name of the living that the protein extracted from"
+                    },
+                ],
+                "uris": [
+                    "http://purl.bioontology.org/ontology/CCON",  # organism
                 ],
-                "uris":[
-                "http://purl.bioontology.org/ontology/CCON", #organism
-        		]
             },
-           {
+            {
                 "id": "Classification",  # name of the column in a tabular dataset
-                "description": "Protein classification as given by ChEMBL(version 29). Levels are separated by '->'. Multiple classifications are separated by a semilcolon ';'",  # description of what this column means
+                "description": "Protein classification as given by ChEMBL(version 29). Levels are separated by '->'. Multiple classifications are separated by a semilcolon ';'",  # noqa: E501
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "text",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts).
-                {"noun": "Protein classification"},
-                {"noun": "protein classification by levels"},
-                {"noun": "Levels for which protein classify"},
+                    {"noun": "Protein classification"},
+                    {"noun": "protein classification by levels"},
+                    {"noun": "Levels for which protein classify"},
                 ],
             },
             {
@@ -105,8 +120,8 @@ def get_and_transform_data():
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts).
-                {"noun": "Protein sequence length"},
-                {"noun": "Length for protein string"},
+                    {"noun": "Protein sequence length"},
+                    {"noun": "Length for protein string"},
                 ],
             },
             {
@@ -115,49 +130,48 @@ def get_and_transform_data():
                 "units": "",  # units of the values in this column (leave empty if unitless)
                 "type": "string",  # can be "categorical", "ordinal", "continuous"
                 "names": [  # names for the property (to sample from for building the prompts).
-                {"noun": "Protein sequence character"},
-                {"noun": "FASTQ of the protein"},
-                {"noun": "protein string"},
+                    {"noun": "Protein sequence character"},
+                    {"noun": "FASTQ of the protein"},
+                    {"noun": "protein string"},
+                ],
+                "uris": [
+                    "http://purl.bioontology.org/ontology/MESH/D009154"  # mutation
                 ],
-                  "uris":[
-                "http://purl.bioontology.org/ontology/MESH/D009154" #mutation
-        		],
             },
-
         ],
         "identifiers": [
             {
-                    "id": "target_id",  # column name
-                    "type": "Other",
-                    "names": [
-                        {"noun": "protein identifier wtih mutation"},
-                        {"noun": "target id plus mutation"},
-                        {"noun": "protein target combined with mutation"},
-                    ],
-                    "description": "A unique Papyrus protein identifier. It results from the concatenation of accessions and mutations(e.g. P47747_WT or P10721_V559D_T670I)",  # description (optional, except for "Other")
-                },
+                "id": "target_id",  # column name
+                "type": "Other",
+                "names": [
+                    {"noun": "protein identifier wtih mutation"},
+                    {"noun": "target id plus mutation"},
+                    {"noun": "protein target combined with mutation"},
+                ],
+                "description": "A unique Papyrus protein identifier. It results from the concatenation of accessions and mutations(e.g. P47747_WT or P10721_V559D_T670I)",  # noqa: E501
+            },
             {
-                    "id": "target_id_without_mutation",  # column name
-                    "type": "Other",
-                    "names": [
-                        {"noun": "protein identifier"},
-                        {"noun": "target id"},
-                        {"noun": "protein target"},
-                    ],
-                    "description": "A unique protein identifier",  # description (optional, except for "Other")
-                },
+                "id": "target_id_without_mutation",  # column name
+                "type": "Other",
+                "names": [
+                    {"noun": "protein identifier"},
+                    {"noun": "target id"},
+                    {"noun": "protein target"},
+                ],
+                "description": "A unique protein identifier",  # description (optional, except for "Other")
+            },
             {
-                    "id": "UniProtID",  # column name
-                    "type": "Other",
-                    "names": [
-                        {"noun": "UniProt identifier"},
-                        {"noun": "UniProtID"},
-                    ],
-                    "description": "The UniProt identifier of the sequence",  # description (optional, except for "Other")
-                },
+                "id": "UniProtID",  # column name
+                "type": "Other",
+                "names": [
+                    {"noun": "UniProt identifier"},
+                    {"noun": "UniProtID"},
+                ],
+                "description": "The UniProt identifier of the sequence",  # description (optional, except for "Other")
+            },
         ],
         "license": "CC BY-SA 4.0",  # license under which the original dataset was published
-        "links": [  # list of relevant links (original dataset, other uses, etc.)   
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
             {
                 "url": "https://doi.org/10.1186/s13321-022-00672-x",
                 "description": "corresponding publication",
@@ -169,11 +183,9 @@ def get_and_transform_data():
             {
                 "url": "https://data.4tu.nl/articles/_/16896406/3",
                 "description": "data source",
-
-            }
+            },
         ],
         "num_points": len(df),  # number of datapoints in this dataset
-
         "bibtex": [
             """@article{B_quignon_2023,
           doi = {10.1186/s13321-022-00672-x},
@@ -185,10 +197,10 @@ def get_and_transform_data():
           number = {1},
           author = {O. J. M. Bequignon and B. J. Bongers and W. Jespers and A. P. IJzerman and B. van der Water and G. J. P. van Westen},
           title = {Papyrus: a large-scale curated dataset aimed at bioactivity predictions},
-          journal = {Journal of Cheminformatics}""",
+          journal = {Journal of Cheminformatics}""",  # noqa: E501
         ],
     }
-    
+
     def str_presenter(dumper, data):
         """configures yaml for dumping multiline strings
         Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
@@ -207,5 +219,6 @@ def str_presenter(dumper, data):
 
     print(f"Finished processing {meta['name']} dataset!")
 
+
 if __name__ == "__main__":
     get_and_transform_data()

From 1b11da32e7b0188e9bc3455b269c8f10c15c2a21 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Wed, 26 Jul 2023 14:56:53 +0200
Subject: [PATCH 3/3] feat: run precommit hooks

---
 data/papyrus_protein_targets/meta.yaml | 173 ++++++++++++-------------
 1 file changed, 86 insertions(+), 87 deletions(-)

diff --git a/data/papyrus_protein_targets/meta.yaml b/data/papyrus_protein_targets/meta.yaml
index b7c3f1319..5fed65f67 100644
--- a/data/papyrus_protein_targets/meta.yaml
+++ b/data/papyrus_protein_targets/meta.yaml
@@ -1,94 +1,93 @@
+---
 name: papyrus_protein_targets
-description: Papyrus is an aggregated dataset of small molecule bioactivities. File
-  contains data about proteins (e.g. sequence, organism,classification).
+description: Papyrus is an aggregated dataset of small molecule bioactivities. File contains data about proteins (e.g. sequence, organism, classification).
 targets:
-- id: Organism
-  description: Organism of the protein
-  units: ''
-  type: text
-  names:
-  - noun: The organism that the protein extracted from
-  - noun: For which organism protein related to
-  - noun: living that the protein extract from
-  uris:
-  - http://purl.bioontology.org/ontology/CCON
-- id: organism_common_name
-  description: common name of the organism that protein extract from.
-  units: ''
-  type: text
-  names:
-  - noun: common name of the organism that the protein extracted from
-  - noun: common name of the organism for which protein related to
-  - noun: common name of the living that the protein extracted from
-  uris:
-  - http://purl.bioontology.org/ontology/CCON
-- id: Classification
-  description: Protein classification as given by ChEMBL(version 29). Levels are separated
-    by '->'. Multiple classifications are separated by a semilcolon ';'
-  units: ''
-  type: text
-  names:
-  - noun: Protein classification
-  - noun: protein classification by levels
-  - noun: Levels for which protein classify
-- id: seq_length
-  description: Length of the protein sequence
-  units: ''
-  type: continuous
-  names:
-  - noun: Protein sequence length
-  - noun: Length for protein string
-- id: Sequence
-  description: Protein sequence including mutations
-  units: ''
-  type: string
-  names:
-  - noun: Protein sequence character
-  - noun: FASTQ of the protein
-  - noun: protein string
-  uris:
-  - http://purl.bioontology.org/ontology/MESH/D009154
+    - id: Organism
+      description: Organism of the protein
+      units: ''
+      type: text
+      names:
+          - noun: The organism that the protein was extracted from
+          - noun: organism that the protein is related to
+          - noun: living organism that the protein was extracted from
+      uris:
+          - http://purl.bioontology.org/ontology/CCON
+    - id: organism_common_name
+      description: common name of the organism that the protein was extracted from.
+      units: ''
+      type: text
+      names:
+          - noun: common name of the organism that the protein was extracted from
+          - noun: common name of the organism that the protein is related to
+          - noun: common name of the living organism that the protein was extracted from
+      uris:
+          - http://purl.bioontology.org/ontology/CCON
+    - id: Classification
+      description: Protein classification as given by ChEMBL (version 29). Levels are separated by '->'. Multiple classifications are separated by a semicolon
+          ';'
+      units: ''
+      type: text
+      names:
+          - noun: Protein classification
+          - noun: protein classification by levels
+          - noun: levels by which the protein is classified
+    - id: seq_length
+      description: Length of the protein sequence
+      units: ''
+      type: continuous
+      names:
+          - noun: Protein sequence length
+          - noun: Length for protein string
+    - id: Sequence
+      description: Protein sequence including mutations
+      units: ''
+      type: string
+      names:
+          - noun: Protein sequence character
+          - noun: FASTA of the protein
+          - noun: protein string
+      uris:
+          - http://purl.bioontology.org/ontology/MESH/D009154
 identifiers:
-- id: target_id
-  type: Other
-  names:
-  - noun: protein identifier wtih mutation
-  - noun: target id plus mutation
-  - noun: protein target combined with mutation
-  description: A unique Papyrus protein identifier. It results from the concatenation
-    of accessions and mutations(e.g. P47747_WT or P10721_V559D_T670I)
-- id: target_id_without_mutation
-  type: Other
-  names:
-  - noun: protein identifier
-  - noun: target id
-  - noun: protein target
-  description: A unique protein identifier
-- id: UniProtID
-  type: Other
-  names:
-  - noun: UniProt identifier
-  - noun: UniProtID
-  description: The UniProt identifier of the sequence
+    - id: target_id
+      type: Other
+      names:
+          - noun: protein identifier with mutation
+          - noun: target id plus mutation
+          - noun: protein target combined with mutation
+      description: A unique Papyrus protein identifier. It results from the concatenation of accessions and mutations (e.g. P47747_WT or P10721_V559D_T670I)
+    - id: target_id_without_mutation
+      type: Other
+      names:
+          - noun: protein identifier
+          - noun: target id
+          - noun: protein target
+      description: A unique protein identifier
+    - id: UniProtID
+      type: Other
+      names:
+          - noun: UniProt identifier
+          - noun: UniProtID
+      description: The UniProt identifier of the sequence
 license: CC BY-SA 4.0
 links:
-- url: https://doi.org/10.1186/s13321-022-00672-x
-  description: corresponding publication
-- url: https://doi.org/10.4121/16896406.v3
-  description: data source
-- url: https://data.4tu.nl/articles/_/16896406/3
-  description: data source
+    - url: https://doi.org/10.1186/s13321-022-00672-x
+      description: corresponding publication
+    - url: https://doi.org/10.4121/16896406.v3
+      description: data source
+    - url: https://data.4tu.nl/articles/_/16896406/3
+      description: data source
 num_points: 7058
 bibtex:
-- |-
-  @article{B_quignon_2023,
-            doi = {10.1186/s13321-022-00672-x},
-            url = {https://doi.org/10.1186%2Fs13321-022-00672-x},
-            year = {2023},
-            month = jan,
-            publisher = {Springer Science and Business Media LLC},
-            volume = {15},
-            number = {1},
-            author = {O. J. M. Bequignon and B. J. Bongers and W. Jespers and A. P. IJzerman and B. van der Water and G. J. P. van Westen},
-            title = {Papyrus: a large-scale curated dataset aimed at bioactivity predictions},
-            journal = {Journal of Cheminformatics}
+    - |-
+      @article{B_quignon_2023,
+                doi = {10.1186/s13321-022-00672-x},
+                url = {https://doi.org/10.1186%2Fs13321-022-00672-x},
+                year = {2023},
+                month = jan,
+                publisher = {Springer Science and Business Media LLC},
+                volume = {15},
+                number = {1},
+                author = {O. J. M. Bequignon and B. J. Bongers and W. Jespers and A. P. IJzerman and B. van der Water and G. J. P. van Westen},
+                title = {Papyrus: a large-scale curated dataset aimed at bioactivity predictions},
+                journal = {Journal of Cheminformatics}