diff --git a/data/offsides/meta.yaml b/data/offsides/meta.yaml new file mode 100644 index 000000000..3f9270c58 --- /dev/null +++ b/data/offsides/meta.yaml @@ -0,0 +1,54 @@ +--- +name: offsides +description: |- + OffSIDES is a database of individual drug side effect + signals mined from the FDA's Adverse Event Reporting System. The + innovation of OffSIDES is that a propensity score matching (PSM) model + is used to identify control drugs and produce better PRR estimates. In + OffSIDES we focus on drug safety signals that are not already + established by being listed on the structured product label - hence + they are off-label drug side effects. +targets: + - id: PRR + description: proportional reporting ratio + type: continuous + names: + - proportional reporting ratio + - id: PRR_error + description: standard error of the PRR estimate + type: continuous + sample: false + names: + - standard error of the proportional reporting ratio error + - id: mean_reporting_frequency + description: mean reporting frequency for the drug + type: continuous + names: + - mean reporting frequency +identifier: + - id: drug_concept_name + description: RxNorm name string for the drug + type: categorical + - id: condition_concept_name + description: MedDRA name string for the side effect + type: categorical +license: CC BY 4.0 +links: + - url: https://tatonettilab.org/resources/nsides/ + description: data source + - url: https://nsides.io/ + description: database website +num_points: 2977338 +bibtex: |- + @article{Tatonetti2012, + author = {Tatonetti, Nicholas P. and Ye, Peter P. 
and Daneshjou, Roxana and Altman, Russ B.}, + title = {Data-driven prediction of drug effects and interactions}, + journal = {Sci Transl Med}, + volume = {4}, + number = {125}, + pages = {125ra31}, + year = {2012}, + doi = {10.1126/scitranslmed.3003377}, + pmid = {22422992}, + pmcid = {PMC3382018} + } diff --git a/data/offsides/offsides_data_prep.ipynb b/data/offsides/offsides_data_prep.ipynb new file mode 100644 index 000000000..e498392bf --- /dev/null +++ b/data/offsides/offsides_data_prep.ipynb @@ -0,0 +1,872 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ee354cad", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import yaml" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a5577953", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", + "\n", + "\n", + " df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n", + "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n" + ] + } + ], + "source": [ + "df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n", + " error_bad_lines=False)\n", + "#df.shape\n", + "#df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "da5b4b81", + "metadata": {}, + "outputs": [], + "source": [ + " # check if fields are the same\n", + "\n", + "\n", + "expected_columns = ['drug_rxnorn_id',\n", + " 'drug_concept_name',\n", + " 'condition_meddra_id',\n", + " 'condition_concept_name',\n", + " 'A',\n", + " 'B',\n", + " 'C',\n", + " 'D',\n", + " 'PRR',\n", + " 'PRR_error',\n", + " 'mean_reporting_frequency']\n", + "\n", + "assert df.columns.tolist() == expected_columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "8d016424", + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "Found duplicate rows in the dataframe", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df\u001b[38;5;241m.\u001b[39mduplicated()\u001b[38;5;241m.\u001b[39msum(), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound duplicate rows in the dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m df\u001b[38;5;241m.\u001b[39mdrop_duplicates(inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[0;31mAssertionError\u001b[0m: Found duplicate rows in the dataframe" + ] + } + ], + "source": [ + "assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": 34, + "id": "f523b30a", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop_duplicates(inplace=True)\n", + "# df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "43ed46ed", + "metadata": {}, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a6230d38", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(fn_data_csv, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8b1da608", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 apoorvasrinivasan staff 279M Mar 14 18:22 data_clean.csv\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d1509dca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "drug_rxnorn_id,drug_concept_name,condition_meddra_id,condition_concept_name,A,B,C,D,PRR,PRR_error,mean_reporting_frequency\r\n", + "4024,\"ergoloid mesylates, USP\",10002034,Anaemia,6,126,21,1299,2.85714,0.45382,0.0454545\r\n", + "4024,\"ergoloid mesylates, USP\",10002965,Aplasia pure red cell,1,131,1,1319,10.0,1.41126,0.00757576\r\n", + "4024,\"ergoloid mesylates, USP\",10013442,Disseminated intravascular coagulation,1,131,6,1314,1.66667,1.07626,0.00757576\r\n", + "4024,\"ergoloid mesylates, USP\",10023126,Jaundice,2,130,7,1313,2.85714,0.79657,0.0151515\r\n" + ] + } + ], + "source": [ + "!head -n 5 {fn_data_csv}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f33da1dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
drug_rxnorn_iddrug_concept_namecondition_meddra_idcondition_concept_nameABCDPRRPRR_errormean_reporting_frequency
04024ergoloid mesylates, USP10002034Anaemia61262112992.857140.453820.045455
14024ergoloid mesylates, USP10002965Aplasia pure red cell11311131910.01.411260.007576
24024ergoloid mesylates, USP10013442Disseminated intravascular coagulation1131613141.666671.076260.007576
34024ergoloid mesylates, USP10023126Jaundice2130713132.857140.796570.015152
44024ergoloid mesylates, USP10016288Febrile neutropenia1131513152.01.091630.007576
\n", + "
" + ], + "text/plain": [ + " drug_rxnorn_id drug_concept_name condition_meddra_id \\\n", + "0 4024 ergoloid mesylates, USP 10002034 \n", + "1 4024 ergoloid mesylates, USP 10002965 \n", + "2 4024 ergoloid mesylates, USP 10013442 \n", + "3 4024 ergoloid mesylates, USP 10023126 \n", + "4 4024 ergoloid mesylates, USP 10016288 \n", + "\n", + " condition_concept_name A B C D PRR \\\n", + "0 Anaemia 6 126 21 1299 2.85714 \n", + "1 Aplasia pure red cell 1 131 1 1319 10.0 \n", + "2 Disseminated intravascular coagulation 1 131 6 1314 1.66667 \n", + "3 Jaundice 2 130 7 1313 2.85714 \n", + "4 Febrile neutropenia 1 131 5 1315 2.0 \n", + "\n", + " PRR_error mean_reporting_frequency \n", + "0 0.45382 0.045455 \n", + "1 1.41126 0.007576 \n", + "2 1.07626 0.007576 \n", + "3 0.79657 0.015152 \n", + "4 1.09163 0.007576 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "17bcca0d", + "metadata": {}, + "source": [ + "## Load from csv" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4d12adad", + "metadata": {}, + "outputs": [], + "source": [ + "fn_data_csv = \"data_clean.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ae4cbf36", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/2664504625.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(fn_data_csv)\n" + ] + } + ], + "source": [ + "df = pd.read_csv(fn_data_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "214a2b81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
drug_rxnorn_iddrug_concept_namecondition_meddra_idcondition_concept_nameABCDPRRPRR_errormean_reporting_frequency
04024ergoloid mesylates, USP10002034Anaemia61262112992.857140.453820.045455
14024ergoloid mesylates, USP10002965Aplasia pure red cell11311131910.01.411260.007576
24024ergoloid mesylates, USP10013442Disseminated intravascular coagulation1131613141.666671.076260.007576
34024ergoloid mesylates, USP10023126Jaundice2130713132.857140.796570.015152
44024ergoloid mesylates, USP10016288Febrile neutropenia1131513152.01.091630.007576
\n", + "
" + ], + "text/plain": [ + " drug_rxnorn_id drug_concept_name condition_meddra_id \\\n", + "0 4024 ergoloid mesylates, USP 10002034 \n", + "1 4024 ergoloid mesylates, USP 10002965 \n", + "2 4024 ergoloid mesylates, USP 10013442 \n", + "3 4024 ergoloid mesylates, USP 10023126 \n", + "4 4024 ergoloid mesylates, USP 10016288 \n", + "\n", + " condition_concept_name A B C D PRR \\\n", + "0 Anaemia 6 126 21 1299 2.85714 \n", + "1 Aplasia pure red cell 1 131 1 1319 10.0 \n", + "2 Disseminated intravascular coagulation 1 131 6 1314 1.66667 \n", + "3 Jaundice 2 130 7 1313 2.85714 \n", + "4 Febrile neutropenia 1 131 5 1315 2.0 \n", + "\n", + " PRR_error mean_reporting_frequency \n", + "0 0.45382 0.045455 \n", + "1 1.41126 0.007576 \n", + "2 1.07626 0.007576 \n", + "3 0.79657 0.015152 \n", + "4 1.09163 0.007576 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "854b807c", + "metadata": {}, + "source": [ + "## Meta YAML" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2d24e114", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "meta = {\n", + " \"name\": \"offsides\",\n", + " \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. 
In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"PRR\",\n", + " \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"Proportional reporting ratio\"]\n", + " },\n", + " {\n", + " \"id\": \"PRR_error\",\n", + " \"description\": \"Standard error of the PRR estimate\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"Proportional reporting ratio error\"]\n", + " },\n", + " {\n", + " \"id\": \"mean_reporting_frequency\",\n", + " \"description\": \"Proportion of reports for the drug that report the side effect, A/(A+B)\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"mean reporting frequency\"]\n", + " }\n", + " ],\n", + " \"identifier\": [\n", + " {\n", + " \"id\": \"drug_concept_name\",\n", + " \"description\": \"RxNorm name string for the drug\",\n", + " \"type\": \"categorical\"\n", + " },\n", + " {\n", + " \"id\": \"condition_concept_name\",\n", + " \"description\": \"MedDRA identifier for the side effect\",\n", + " \"type\": \"categorical\"\n", + " }\n", + " ],\n", + " \"license\": \"CC BY 4.0\",\n", + " \"links\": [\n", + " {\n", + " \"url\": \"https://tatonettilab.org/resources/nsides/\",\n", + " \"description\": \"data source\"\n", + " },\n", + " {\n", + " \"url\": \"https://nsides.io/\",\n", + " \"description\": \"database website\"\n", + " }\n", + " ],\n", + " \"num_points\": len(df),\n", + " \"bibtex\": \"\"\"\n", + " @article{Tatonetti2012,\n", + " author = {Tatonetti, Nicholas P. and Ye, Peter P. 
and Daneshjou, Roxana and Altman, Russ B.},\n", + " title = {Data-driven prediction of drug effects and interactions},\n", + " journal = {Sci Transl Med},\n", + " volume = {4},\n", + " number = {125},\n", + " pages = {125ra31},\n", + " year = {2012},\n", + " doi = {10.1126/scitranslmed.3003377},\n", + " pmid = {22422992},\n", + " pmcid = {PMC3382018}\n", + " }\n", + " \"\"\"\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6e8aafee", + "metadata": {}, + "outputs": [], + "source": [ + "fn_meta = \"meta.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6ff83544", + "metadata": {}, + "outputs": [], + "source": [ + "with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d370342f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 apoorvasrinivasan staff 1.8K Mar 14 18:25 meta.yaml\r\n" + ] + } + ], + "source": [ + "!ls -lh {fn_meta}" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "40548210", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name: offsides\r\n", + "description: OffSIDES is a database of individual drug side effect signals mined from\r\n", + " the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity\r\n", + " score matching (PSM) model is used to identify control drugs and produce better\r\n", + " PRR estimates. 
In OffSIDES we focus on drug safety signals that are not already\r\n", + " established by being listed on the structured product label -- hence they are off-label\r\n", + " drug side effects.\r\n", + "targets:\r\n", + "- id: PRR\r\n", + " description: Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\r\n", + " type: continuous\r\n", + " names:\r\n", + " - Proportional reporting ratio\r\n", + "- id: PRR_error\r\n", + " description: Standard error of the PRR estimate\r\n", + " type: continuous\r\n", + " names:\r\n", + " - Proportional reporting ratio error\r\n", + "- id: mean_reporting_frequency\r\n", + " description: Proportion of reports for the drug that report the side effect, A/(A+B)\r\n", + " type: continuous\r\n", + " names:\r\n", + " - mean reporting frequency\r\n", + "identifier:\r\n", + "- id: drug_concept_name\r\n", + " description: RxNorm name string for the drug\r\n", + " type: categorical\r\n", + "- id: condition_concept_name\r\n", + " description: MedDRA identifier for the side effect\r\n", + " type: categorical\r\n", + "license: CC BY 4.0\r\n", + "links:\r\n", + "- url: https://tatonettilab.org/resources/nsides/\r\n", + " description: data source\r\n", + "- url: https://nsides.io/\r\n", + " description: database website\r\n", + "num_points: 3206558\r\n", + "bibtex: \"\\n @article{Tatonetti2012,\\n author = {Tatonetti, Nicholas\\\r\n", + " \\ P. and Ye, Peter P. 
and Daneshjou, Roxana and Altman, Russ B.},\\n title\\\r\n", + " \\ = {Data-driven prediction of drug effects and interactions},\\n journal\\\r\n", + " \\ = {Sci Transl Med},\\n volume = {4},\\n number = {125},\\n pages\\\r\n", + " \\ = {125ra31},\\n year = {2012},\\n doi = {10.1126/scitranslmed.3003377},\\n\\\r\n", + " \\ pmid = {22422992},\\n pmcid = {PMC3382018}\\n }\\n \"\r\n" + ] + } + ], + "source": [ + "!cat {fn_meta}" + ] + }, + { + "cell_type": "markdown", + "id": "0ff77293", + "metadata": {}, + "source": [ + "## Create transform.py" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "38e8a677", + "metadata": {}, + "outputs": [], + "source": [ + "path_file = \"transform.py\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e2f46f61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting transform.py\n" + ] + } + ], + "source": [ + "%%writefile $path_file\n", + "import pandas as pd\n", + "import requests\n", + "import yaml\n", + "\n", + "\n", + "def get_and_transform_data():\n", + " # load data\n", + " df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n", + " error_bad_lines=False)\n", + "\n", + " # check if fields are the same\n", + " expected_columns = ['drug_rxnorn_id',\n", + " 'drug_concept_name',\n", + " 'condition_meddra_id',\n", + " 'condition_concept_name',\n", + " 'A',\n", + " 'B',\n", + " 'C',\n", + " 'D',\n", + " 'PRR',\n", + " 'PRR_error',\n", + " 'mean_reporting_frequency']\n", + "\n", + " assert df.columns.tolist() == expected_columns\n", + " \n", + " # remove duplicates\n", + " df.drop_duplicates(inplace=True)\n", + " # check duplicates\n", + " assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n", + " \n", + " \n", + "\n", + " # save to csv\n", + " fn_data_csv = \"data_clean.csv\"\n", + " df.to_csv(fn_data_csv, index=False)\n", + "\n", + " # create meta yaml\n", + " meta = 
{\n", + " \"name\": \"offsides\",\n", + " \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n", + " \"targets\": [\n", + " {\n", + " \"id\": \"PRR\",\n", + " \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"Proportional reporting ratio\"]\n", + " },\n", + " {\n", + " \"id\": \"PRR_error\",\n", + " \"description\": \"Standard error of the PRR estimate\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"Proportional reporting ratio error\"]\n", + " },\n", + " {\n", + " \"id\": \"mean_reporting_frequency\",\n", + " \"description\": \"Proportion of reports for the drug that report the side effect, A/(A+B)\",\n", + " \"type\": \"continuous\",\n", + " \"names\": [\"mean reporting frequency\"]\n", + " }\n", + " ],\n", + " \"identifier\": [\n", + " {\n", + " \"id\": \"drug_concept_name\",\n", + " \"description\": \"RxNorm name string for the drug\",\n", + " \"type\": \"categorical\"\n", + " },\n", + " {\n", + " \"id\": \"condition_concept_name\",\n", + " \"description\": \"MedDRA identifier for the side effect\",\n", + " \"type\": \"categorical\"\n", + " }\n", + " ],\n", + " \"license\": \"CC BY 4.0\",\n", + " \"links\": [\n", + " {\n", + " \"url\": \"https://tatonettilab.org/resources/nsides/\",\n", + " \"description\": \"data source\"\n", + " },\n", + " {\n", + " \"url\": \"https://nsides.io/\",\n", + " \"description\": \"database website\"\n", + " }\n", + " ],\n", + " \"num_points\": len(df),\n", + " \"bibtex\": \"\"\"\n", + " @article{Tatonetti2012,\n", + " author = {Tatonetti, Nicholas 
P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n", + " title = {Data-driven prediction of drug effects and interactions},\n", + " journal = {Sci Transl Med},\n", + " volume = {4},\n", + " number = {125},\n", + " pages = {125ra31},\n", + " year = {2012},\n", + " doi = {10.1126/scitranslmed.3003377},\n", + " pmid = {22422992},\n", + " pmcid = {PMC3382018}\n", + " }\n", + " \"\"\"\n", + " }\n", + "\n", + " def str_presenter(dumper, data):\n", + " \"\"\"configures yaml for dumping multiline strings\n", + " Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n", + " \"\"\"\n", + " if data.count(\"\\n\") > 0: # check for multiline string\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n", + " return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n", + "\n", + " yaml.add_representer(str, str_presenter)\n", + " yaml.representer.SafeRepresenter.add_representer(\n", + " str, str_presenter\n", + " ) # to use with safe_dum\n", + " fn_meta = \"meta.yaml\"\n", + " with open(fn_meta, \"w\") as f:\n", + " yaml.dump(meta, f, sort_keys=False)\n", + "\n", + " print(f\"Finished processing {meta['name']} dataset!\")\n", + "\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " get_and_transform_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1ac51787", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "transform.py:8: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", + "\n", + "\n", + " df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n", + "transform.py:8: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. 
import pandas as pd
import yaml

# Location of the raw OffSIDES dump.
_DATA_URL = "https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz"

# Exact column layout this script expects in the download. "drug_rxnorn_id" is
# spelled this way in the file header itself, so it is kept verbatim.
_EXPECTED_COLUMNS = [
    "drug_rxnorn_id",
    "drug_concept_name",
    "condition_meddra_id",
    "condition_concept_name",
    "A",
    "B",
    "C",
    "D",
    "PRR",
    "PRR_error",
    "mean_reporting_frequency",
]

# Raw contingency-table counts; removed before the data is written out.
_COUNT_COLUMNS = ["A", "B", "C", "D"]

_DESCRIPTION = """OffSIDES is a database of individual drug side effect
signals mined from the FDA's Adverse Event Reporting System. The
innovation of OffSIDES is that a propensity score matching (PSM) model
is used to identify control drugs and produce better PRR estimates. In
OffSIDES we focus on drug safety signals that are not already
established by being listed on the structured product label - hence
they are off-label drug side effects."""

_BIBTEX = """@article{Tatonetti2012,
author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
title = {Data-driven prediction of drug effects and interactions},
journal = {Sci Transl Med},
volume = {4},
number = {125},
pages = {125ra31},
year = {2012},
doi = {10.1126/scitranslmed.3003377},
pmid = {22422992},
pmcid = {PMC3382018}
}"""


def _str_presenter(dumper, data):
    """Emit multi-line strings as YAML literal blocks (``|`` style).

    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if "\n" in data:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


class _MultilineDumper(yaml.Dumper):
    """Private dumper so the string presenter does not alter PyYAML globally."""


_MultilineDumper.add_representer(str, _str_presenter)


def _build_meta(num_points):
    """Return the metadata dictionary for a cleaned table of ``num_points`` rows."""
    return {
        "name": "offsides",
        "description": _DESCRIPTION,
        "targets": [
            {
                "id": "PRR",
                "description": "proportional reporting ratio",
                "type": "continuous",
                "names": ["proportional reporting ratio"],
            },
            {
                "id": "PRR_error",
                "description": "standard error of the PRR estimate",
                "type": "continuous",
                "sample": False,
                "names": ["standard error of the proportional reporting ratio error"],
            },
            {
                "id": "mean_reporting_frequency",
                "description": "mean reporting frequency for the drug",
                "type": "continuous",
                "names": ["mean reporting frequency"],
            },
        ],
        "identifier": [
            {
                "id": "drug_concept_name",
                "description": "RxNorm name string for the drug",
                "type": "categorical",
            },
            {
                "id": "condition_concept_name",
                # This column holds side-effect names (e.g. "Anaemia"); the
                # numeric MedDRA ids live in condition_meddra_id.
                "description": "MedDRA name string for the side effect",
                "type": "categorical",
            },
        ],
        "license": "CC BY 4.0",
        "links": [
            {
                "url": "https://tatonettilab.org/resources/nsides/",
                "description": "data source",
            },
            {"url": "https://nsides.io/", "description": "database website"},
        ],
        "num_points": num_points,
        "bibtex": _BIBTEX,
    }


def get_and_transform_data():
    """Download OffSIDES, clean it, and write the dataset files.

    Reads the gzipped CSV from the Tatonetti lab server, verifies its column
    layout, drops the count columns A-D and exact duplicate rows, then writes
    ``data_clean.csv`` and ``meta.yaml`` (relative paths, so they land in the
    current working directory).

    Raises:
        AssertionError: If the downloaded columns differ from the expected
            layout. Raised explicitly rather than via ``assert`` so the check
            is not stripped when Python runs with ``-O``.
    """
    # on_bad_lines="skip" drops rows the parser cannot tokenise instead of
    # aborting; low_memory=False parses each column in one pass.
    # NOTE(review): runs without low_memory=False reported a DtypeWarning
    # (mixed types) for the id, count and target columns, so some rows may
    # carry non-numeric values in the "continuous" targets - confirm against
    # the upstream file and coerce with pd.to_numeric(errors="coerce") if so.
    df = pd.read_csv(
        _DATA_URL,
        compression="gzip",
        on_bad_lines="skip",
        low_memory=False,
    )

    # check if fields are the same
    found_columns = df.columns.tolist()
    if found_columns != _EXPECTED_COLUMNS:
        raise AssertionError(
            f"Unexpected columns in OffSIDES download: {found_columns}"
        )

    # Drop the counts first: rows that differ only in A-D collapse into one
    # when duplicates are removed afterwards.
    df = df.drop(columns=_COUNT_COLUMNS).drop_duplicates()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = _build_meta(len(df))
    fn_meta = "meta.yaml"
    with open(fn_meta, "w", encoding="utf-8") as f:
        yaml.dump(meta, f, Dumper=_MultilineDumper, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()