From bc331883233077f8ceb074f663c7b4b62773cfd6 Mon Sep 17 00:00:00 2001
From: Apoorva Srinivasan <apoorvasrinivasan@apoorvas-mbp.lan>
Date: Wed, 15 Mar 2023 13:39:07 -0700
Subject: [PATCH 1/9] Apply pre-commit hook changes

---
 data/nsides/offsides/meta.yaml                |  39 +
 data/nsides/offsides/offsides_data_prep.ipynb | 872 ++++++++++++++++++
 data/nsides/offsides/transform.py             | 124 +++
 3 files changed, 1035 insertions(+)
 create mode 100644 data/nsides/offsides/meta.yaml
 create mode 100644 data/nsides/offsides/offsides_data_prep.ipynb
 create mode 100644 data/nsides/offsides/transform.py

diff --git a/data/nsides/offsides/meta.yaml b/data/nsides/offsides/meta.yaml
new file mode 100644
index 000000000..2849ce626
--- /dev/null
+++ b/data/nsides/offsides/meta.yaml
@@ -0,0 +1,39 @@
+---
+name: offsides
+description: OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES
+    is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety
+    signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.
+targets:
+    - id: PRR
+      description: Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))
+      type: continuous
+      names:
+          - Proportional reporting ratio
+    - id: PRR_error
+      description: Standard error of the PRR estimate
+      type: continuous
+      names:
+          - Proportional reporting ratio error
+    - id: mean_reporting_frequency
+      description: Proportion of reports for the drug that report the side effect,  A/(A+B)
+      type: continuous
+      names:
+          - mean reporting frequency
+identifier:
+    - id: drug_concept_name
+      description: RxNorm name string for the drug
+      type: categorical
+    - id: condition_concept_name
+      description: MedDRA name string for the side effect
+      type: categorical
+license: CC BY 4.0
+links:
+    - url: https://tatonettilab.org/resources/nsides/
+      description: data source
+    - url: https://nsides.io/
+      description: database website
+num_points: 3042873
+bibtex: "\n        @article{Tatonetti2012,\n        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n    \
+    \    title = {Data-driven prediction of drug effects and interactions},\n        journal = {Sci Transl Med},\n        volume = {4},\n        number\
+    \ = {125},\n        pages = {125ra31},\n        year = {2012},\n        doi = {10.1126/scitranslmed.3003377},\n        pmid = {22422992},\n        pmcid\
+    \ = {PMC3382018}\n        }\n        "
diff --git a/data/nsides/offsides/offsides_data_prep.ipynb b/data/nsides/offsides/offsides_data_prep.ipynb
new file mode 100644
index 000000000..e498392bf
--- /dev/null
+++ b/data/nsides/offsides/offsides_data_prep.ipynb
@@ -0,0 +1,872 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ee354cad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a5577953",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "\n",
+      "\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+    "                   error_bad_lines=False)\n",
+    "#df.shape\n",
+    "#df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "da5b4b81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check that the downloaded columns match the expected schema\n",
+    "\n",
+    "\n",
+    "expected_columns = ['drug_rxnorn_id',\n",
+    " 'drug_concept_name',\n",
+    " 'condition_meddra_id',\n",
+    " 'condition_concept_name',\n",
+    " 'A',\n",
+    " 'B',\n",
+    " 'C',\n",
+    " 'D',\n",
+    " 'PRR',\n",
+    " 'PRR_error',\n",
+    " 'mean_reporting_frequency']\n",
+    "\n",
+    "assert df.columns.tolist() == expected_columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "8d016424",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AssertionError",
+     "evalue": "Found duplicate rows in the dataframe",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df\u001b[38;5;241m.\u001b[39mduplicated()\u001b[38;5;241m.\u001b[39msum(), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound duplicate rows in the dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      2\u001b[0m df\u001b[38;5;241m.\u001b[39mdrop_duplicates(inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
+      "\u001b[0;31mAssertionError\u001b[0m: Found duplicate rows in the dataframe"
+     ]
+    }
+   ],
+   "source": [
+    "assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "f523b30a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop_duplicates(inplace=True)\n",
+    "# df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "43ed46ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_data_csv = \"data_clean.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "a6230d38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(fn_data_csv, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8b1da608",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-rw-r--r--  1 apoorvasrinivasan  staff   279M Mar 14 18:22 data_clean.csv\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {fn_data_csv}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "d1509dca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "drug_rxnorn_id,drug_concept_name,condition_meddra_id,condition_concept_name,A,B,C,D,PRR,PRR_error,mean_reporting_frequency\r\n",
+      "4024,\"ergoloid mesylates, USP\",10002034,Anaemia,6,126,21,1299,2.85714,0.45382,0.0454545\r\n",
+      "4024,\"ergoloid mesylates, USP\",10002965,Aplasia pure red cell,1,131,1,1319,10.0,1.41126,0.00757576\r\n",
+      "4024,\"ergoloid mesylates, USP\",10013442,Disseminated intravascular coagulation,1,131,6,1314,1.66667,1.07626,0.00757576\r\n",
+      "4024,\"ergoloid mesylates, USP\",10023126,Jaundice,2,130,7,1313,2.85714,0.79657,0.0151515\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!head -n 5 {fn_data_csv}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "f33da1dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>drug_rxnorn_id</th>\n",
+       "      <th>drug_concept_name</th>\n",
+       "      <th>condition_meddra_id</th>\n",
+       "      <th>condition_concept_name</th>\n",
+       "      <th>A</th>\n",
+       "      <th>B</th>\n",
+       "      <th>C</th>\n",
+       "      <th>D</th>\n",
+       "      <th>PRR</th>\n",
+       "      <th>PRR_error</th>\n",
+       "      <th>mean_reporting_frequency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002034</td>\n",
+       "      <td>Anaemia</td>\n",
+       "      <td>6</td>\n",
+       "      <td>126</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1299</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.45382</td>\n",
+       "      <td>0.045455</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002965</td>\n",
+       "      <td>Aplasia pure red cell</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1319</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>1.41126</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10013442</td>\n",
+       "      <td>Disseminated intravascular coagulation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1314</td>\n",
+       "      <td>1.66667</td>\n",
+       "      <td>1.07626</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10023126</td>\n",
+       "      <td>Jaundice</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1313</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.79657</td>\n",
+       "      <td>0.015152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10016288</td>\n",
+       "      <td>Febrile neutropenia</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1315</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.09163</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  drug_rxnorn_id        drug_concept_name condition_meddra_id  \\\n",
+       "0           4024  ergoloid mesylates, USP            10002034   \n",
+       "1           4024  ergoloid mesylates, USP            10002965   \n",
+       "2           4024  ergoloid mesylates, USP            10013442   \n",
+       "3           4024  ergoloid mesylates, USP            10023126   \n",
+       "4           4024  ergoloid mesylates, USP            10016288   \n",
+       "\n",
+       "                   condition_concept_name  A    B   C     D      PRR  \\\n",
+       "0                                 Anaemia  6  126  21  1299  2.85714   \n",
+       "1                   Aplasia pure red cell  1  131   1  1319     10.0   \n",
+       "2  Disseminated intravascular coagulation  1  131   6  1314  1.66667   \n",
+       "3                                Jaundice  2  130   7  1313  2.85714   \n",
+       "4                     Febrile neutropenia  1  131   5  1315      2.0   \n",
+       "\n",
+       "  PRR_error mean_reporting_frequency  \n",
+       "0   0.45382                 0.045455  \n",
+       "1   1.41126                 0.007576  \n",
+       "2   1.07626                 0.007576  \n",
+       "3   0.79657                 0.015152  \n",
+       "4   1.09163                 0.007576  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17bcca0d",
+   "metadata": {},
+   "source": [
+    "## Load from csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4d12adad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_data_csv = \"data_clean.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ae4cbf36",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/2664504625.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(fn_data_csv)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(fn_data_csv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "214a2b81",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>drug_rxnorn_id</th>\n",
+       "      <th>drug_concept_name</th>\n",
+       "      <th>condition_meddra_id</th>\n",
+       "      <th>condition_concept_name</th>\n",
+       "      <th>A</th>\n",
+       "      <th>B</th>\n",
+       "      <th>C</th>\n",
+       "      <th>D</th>\n",
+       "      <th>PRR</th>\n",
+       "      <th>PRR_error</th>\n",
+       "      <th>mean_reporting_frequency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002034</td>\n",
+       "      <td>Anaemia</td>\n",
+       "      <td>6</td>\n",
+       "      <td>126</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1299</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.45382</td>\n",
+       "      <td>0.045455</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002965</td>\n",
+       "      <td>Aplasia pure red cell</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1319</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>1.41126</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10013442</td>\n",
+       "      <td>Disseminated intravascular coagulation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1314</td>\n",
+       "      <td>1.66667</td>\n",
+       "      <td>1.07626</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10023126</td>\n",
+       "      <td>Jaundice</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1313</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.79657</td>\n",
+       "      <td>0.015152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10016288</td>\n",
+       "      <td>Febrile neutropenia</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1315</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.09163</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  drug_rxnorn_id        drug_concept_name condition_meddra_id  \\\n",
+       "0           4024  ergoloid mesylates, USP            10002034   \n",
+       "1           4024  ergoloid mesylates, USP            10002965   \n",
+       "2           4024  ergoloid mesylates, USP            10013442   \n",
+       "3           4024  ergoloid mesylates, USP            10023126   \n",
+       "4           4024  ergoloid mesylates, USP            10016288   \n",
+       "\n",
+       "                   condition_concept_name  A    B   C     D      PRR  \\\n",
+       "0                                 Anaemia  6  126  21  1299  2.85714   \n",
+       "1                   Aplasia pure red cell  1  131   1  1319     10.0   \n",
+       "2  Disseminated intravascular coagulation  1  131   6  1314  1.66667   \n",
+       "3                                Jaundice  2  130   7  1313  2.85714   \n",
+       "4                     Febrile neutropenia  1  131   5  1315      2.0   \n",
+       "\n",
+       "  PRR_error mean_reporting_frequency  \n",
+       "0   0.45382                 0.045455  \n",
+       "1   1.41126                 0.007576  \n",
+       "2   1.07626                 0.007576  \n",
+       "3   0.79657                 0.015152  \n",
+       "4   1.09163                 0.007576  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "854b807c",
+   "metadata": {},
+   "source": [
+    "## Meta YAML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "2d24e114",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "meta = {\n",
+    "    \"name\": \"offsides\",\n",
+    "    \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n",
+    "    \"targets\": [\n",
+    "        {\n",
+    "            \"id\": \"PRR\",\n",
+    "            \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"PRR_error\",\n",
+    "            \"description\": \"Standard error of the PRR estimate\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio error\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"mean_reporting_frequency\",\n",
+    "            \"description\": \"Proportion of reports for the drug that report the side effect,  A/(A+B)\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"mean reporting frequency\"]\n",
+    "        }\n",
+    "    ],\n",
+    "    \"identifier\": [\n",
+    "        {\n",
+    "            \"id\": \"drug_concept_name\",\n",
+    "            \"description\": \"RxNorm name string for the drug\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"condition_concept_name\",\n",
+    "            \"description\": \"MedDRA identifier for the side effect\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"license\": \"CC BY 4.0\",\n",
+    "    \"links\": [\n",
+    "        {\n",
+    "            \"url\": \"https://tatonettilab.org/resources/nsides/\",\n",
+    "            \"description\": \"data source\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"url\": \"https://nsides.io/\",\n",
+    "            \"description\": \"database website\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"num_points\": len(df),\n",
+    "    \"bibtex\": \"\"\"\n",
+    "        @article{Tatonetti2012,\n",
+    "        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n",
+    "        title = {Data-driven prediction of drug effects and interactions},\n",
+    "        journal = {Sci Transl Med},\n",
+    "        volume = {4},\n",
+    "        number = {125},\n",
+    "        pages = {125ra31},\n",
+    "        year = {2012},\n",
+    "        doi = {10.1126/scitranslmed.3003377},\n",
+    "        pmid = {22422992},\n",
+    "        pmcid = {PMC3382018}\n",
+    "        }\n",
+    "        \"\"\"\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6e8aafee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_meta = \"meta.yaml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "6ff83544",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(fn_meta, \"w\") as f:\n",
+    "    yaml.dump(meta, f, sort_keys=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "d370342f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-rw-r--r--  1 apoorvasrinivasan  staff   1.8K Mar 14 18:25 meta.yaml\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {fn_meta}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "40548210",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "name: offsides\r\n",
+      "description: OffSIDES is a database of individual drug side effect signals mined from\r\n",
+      "  the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity\r\n",
+      "  score matching (PSM) model is used to identify control drugs and produce better\r\n",
+      "  PRR estimates. In OffSIDES we focus on drug safety signals that are not already\r\n",
+      "  established by being listed on the structured product label -- hence they are off-label\r\n",
+      "  drug side effects.\r\n",
+      "targets:\r\n",
+      "- id: PRR\r\n",
+      "  description: Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - Proportional reporting ratio\r\n",
+      "- id: PRR_error\r\n",
+      "  description: Standard error of the PRR estimate\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - Proportional reporting ratio error\r\n",
+      "- id: mean_reporting_frequency\r\n",
+      "  description: Proportion of reports for the drug that report the side effect,  A/(A+B)\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - mean reporting frequency\r\n",
+      "identifier:\r\n",
+      "- id: drug_concept_name\r\n",
+      "  description: RxNorm name string for the drug\r\n",
+      "  type: categorical\r\n",
+      "- id: condition_concept_name\r\n",
+      "  description: MedDRA identifier for the side effect\r\n",
+      "  type: categorical\r\n",
+      "license: CC BY 4.0\r\n",
+      "links:\r\n",
+      "- url: https://tatonettilab.org/resources/nsides/\r\n",
+      "  description: data source\r\n",
+      "- url: https://nsides.io/\r\n",
+      "  description: database website\r\n",
+      "num_points: 3206558\r\n",
+      "bibtex: \"\\n        @article{Tatonetti2012,\\n        author = {Tatonetti, Nicholas\\\r\n",
+      "  \\ P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\\n        title\\\r\n",
+      "  \\ = {Data-driven prediction of drug effects and interactions},\\n        journal\\\r\n",
+      "  \\ = {Sci Transl Med},\\n        volume = {4},\\n        number = {125},\\n        pages\\\r\n",
+      "  \\ = {125ra31},\\n        year = {2012},\\n        doi = {10.1126/scitranslmed.3003377},\\n\\\r\n",
+      "  \\        pmid = {22422992},\\n        pmcid = {PMC3382018}\\n        }\\n        \"\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat {fn_meta}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ff77293",
+   "metadata": {},
+   "source": [
+    "## Create transform.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "38e8a677",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_file = \"transform.py\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e2f46f61",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting transform.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile $path_file\n",
+    "import pandas as pd\n",
+    "import requests\n",
+    "import yaml\n",
+    "\n",
+    "\n",
+    "def get_and_transform_data():\n",
+    "    # load data\n",
+    "    df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+    "                   error_bad_lines=False)\n",
+    "\n",
+    "    # check if fields are the same\n",
+    "    expected_columns = ['drug_rxnorn_id',\n",
+    "     'drug_concept_name',\n",
+    "     'condition_meddra_id',\n",
+    "     'condition_concept_name',\n",
+    "     'A',\n",
+    "     'B',\n",
+    "     'C',\n",
+    "     'D',\n",
+    "     'PRR',\n",
+    "     'PRR_error',\n",
+    "     'mean_reporting_frequency']\n",
+    "\n",
+    "    assert df.columns.tolist() == expected_columns\n",
+    "    \n",
+    "    # remove duplicates\n",
+    "    df.drop_duplicates(inplace=True)\n",
+    "    # check duplicates\n",
+    "    assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n",
+    "    \n",
+    "    \n",
+    "\n",
+    "    # save to csv\n",
+    "    fn_data_csv = \"data_clean.csv\"\n",
+    "    df.to_csv(fn_data_csv, index=False)\n",
+    "\n",
+    "    # create meta yaml\n",
+    "    meta = {\n",
+    "    \"name\": \"offsides\",\n",
+    "    \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n",
+    "    \"targets\": [\n",
+    "        {\n",
+    "            \"id\": \"PRR\",\n",
+    "            \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"PRR_error\",\n",
+    "            \"description\": \"Standard error of the PRR estimate\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio error\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"mean_reporting_frequency\",\n",
+    "            \"description\": \"Proportion of reports for the drug that report the side effect,  A/(A+B)\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"mean reporting frequency\"]\n",
+    "        }\n",
+    "    ],\n",
+    "    \"identifier\": [\n",
+    "        {\n",
+    "            \"id\": \"drug_concept_name\",\n",
+    "            \"description\": \"RxNorm name string for the drug\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"condition_concept_name\",\n",
+    "            \"description\": \"MedDRA identifier for the side effect\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"license\": \"CC BY 4.0\",\n",
+    "    \"links\": [\n",
+    "        {\n",
+    "            \"url\": \"https://tatonettilab.org/resources/nsides/\",\n",
+    "            \"description\": \"data source\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"url\": \"https://nsides.io/\",\n",
+    "            \"description\": \"database website\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"num_points\": len(df),\n",
+    "    \"bibtex\": \"\"\"\n",
+    "        @article{Tatonetti2012,\n",
+    "        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n",
+    "        title = {Data-driven prediction of drug effects and interactions},\n",
+    "        journal = {Sci Transl Med},\n",
+    "        volume = {4},\n",
+    "        number = {125},\n",
+    "        pages = {125ra31},\n",
+    "        year = {2012},\n",
+    "        doi = {10.1126/scitranslmed.3003377},\n",
+    "        pmid = {22422992},\n",
+    "        pmcid = {PMC3382018}\n",
+    "        }\n",
+    "        \"\"\"\n",
+    "    }\n",
+    "\n",
+    "    def str_presenter(dumper, data):\n",
+    "        \"\"\"configures yaml for dumping multiline strings\n",
+    "        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n",
+    "        \"\"\"\n",
+    "        if data.count(\"\\n\") > 0:  # check for multiline string\n",
+    "            return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n",
+    "        return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n",
+    "\n",
+    "    yaml.add_representer(str, str_presenter)\n",
+    "    yaml.representer.SafeRepresenter.add_representer(\n",
+    "        str, str_presenter\n",
+    "    )  # to use with safe_dump\n",
+    "    fn_meta = \"meta.yaml\"\n",
+    "    with open(fn_meta, \"w\") as f:\n",
+    "        yaml.dump(meta, f, sort_keys=False)\n",
+    "\n",
+    "    print(f\"Finished processing {meta['name']} dataset!\")\n",
+    "\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    get_and_transform_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1ac51787",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "transform.py:8: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "\n",
+      "\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "transform.py:8: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "Finished processing offsides dataset!\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python3 transform.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/nsides/offsides/transform.py b/data/nsides/offsides/transform.py
new file mode 100644
index 000000000..af5ee116f
--- /dev/null
+++ b/data/nsides/offsides/transform.py
@@ -0,0 +1,124 @@
+import pandas as pd
+import yaml
+
+
+def get_and_transform_data():
+    # load data
+    df = pd.read_csv(
+        "https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz",
+        compression="gzip",
+        error_bad_lines=False,
+    )
+
+    # check if fields are the same
+    expected_columns = [
+        "drug_rxnorn_id",
+        "drug_concept_name",
+        "condition_meddra_id",
+        "condition_concept_name",
+        "A",
+        "B",
+        "C",
+        "D",
+        "PRR",
+        "PRR_error",
+        "mean_reporting_frequency",
+    ]
+
+    assert df.columns.tolist() == expected_columns
+    # remove duplicates
+    df.drop_duplicates(inplace=True)
+    # check duplicates
+    assert not df.duplicated().sum(), "Found duplicate rows in the dataframe"
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+        "name": "offsides",
+        "description": (
+            "OffSIDES is a database of individual drug side effect signals mined from the FDA's "
+            "Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score "
+            "matching (PSM) model is used to identify control drugs and produce better PRR estimates. "
+            "In OffSIDES we focus on drug safety signals that are not already established by being "
+            "listed on the structured product label -- hence they are off-label drug side effects."
+        ),
+        "targets": [
+            {
+                "id": "PRR",
+                "description": "Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))",
+                "type": "continuous",
+                "names": ["Proportional reporting ratio"],
+            },
+            {
+                "id": "PRR_error",
+                "description": "Standard error of the PRR estimate",
+                "type": "continuous",
+                "names": ["Proportional reporting ratio error"],
+            },
+            {
+                "id": "mean_reporting_frequency",
+                "description": "Proportion of reports for the drug that report the side effect,  A/(A+B)",
+                "type": "continuous",
+                "names": ["mean reporting frequency"],
+            },
+        ],
+        "identifier": [
+            {
+                "id": "drug_concept_name",
+                "description": "RxNorm name string for the drug",
+                "type": "categorical",
+            },
+            {
+                "id": "condition_concept_name",
+                "description": "MedDRA identifier for the side effect",
+                "type": "categorical",
+            },
+        ],
+        "license": "CC BY 4.0",
+        "links": [
+            {
+                "url": "https://tatonettilab.org/resources/nsides/",
+                "description": "data source",
+            },
+            {"url": "https://nsides.io/", "description": "database website"},
+        ],
+        "num_points": len(df),
+        "bibtex": """
+        @article{Tatonetti2012,
+        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
+        title = {Data-driven prediction of drug effects and interactions},
+        journal = {Sci Transl Med},
+        volume = {4},
+        number = {125},
+        pages = {125ra31},
+        year = {2012},
+        doi = {10.1126/scitranslmed.3003377},
+        pmid = {22422992},
+        pmcid = {PMC3382018}
+        }
+        """,
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dump
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()

From dfcd06d3eff38dde7dff2be31266464012676e00 Mon Sep 17 00:00:00 2001
From: Apoorva Srinivasan <apoorvasrinivasan@apoorvas-mbp.lan>
Date: Wed, 29 Mar 2023 09:51:59 -0700
Subject: [PATCH 2/9] updating changes

---
 data/nsides/offsides/transform.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/data/nsides/offsides/transform.py b/data/nsides/offsides/transform.py
index af5ee116f..2b75cc44e 100644
--- a/data/nsides/offsides/transform.py
+++ b/data/nsides/offsides/transform.py
@@ -26,6 +26,10 @@ def get_and_transform_data():
     ]
 
     assert df.columns.tolist() == expected_columns
+
+    # drop columns
+    # drop A, B, C, D
+    df.drop(columns=["A", "B", "C", "D"], inplace=True)
     # remove duplicates
     df.drop_duplicates(inplace=True)
     # check duplicates
@@ -47,7 +51,7 @@ def get_and_transform_data():
         "targets": [
             {
                 "id": "PRR",
-                "description": "Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))",
+                "description": "Proportional reporting ratio",
                 "type": "continuous",
                 "names": ["Proportional reporting ratio"],
             },
@@ -59,7 +63,7 @@ def get_and_transform_data():
             },
             {
                 "id": "mean_reporting_frequency",
-                "description": "Proportion of reports for the drug that report the side effect,  A/(A+B)",
+                "description": "Proportion of reports for the drug that report the side effect",
                 "type": "continuous",
                 "names": ["mean reporting frequency"],
             },

From b3fbbcacc0b60ee5d9e79903d1c4847f7c62ce04 Mon Sep 17 00:00:00 2001
From: Apoorva Srinivasan <apoorvasrinivasan@apoorvas-mbp.lan>
Date: Wed, 29 Mar 2023 23:14:43 -0700
Subject: [PATCH 3/9] updating meta.yaml

---
 data/nsides/offsides/meta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/nsides/offsides/meta.yaml b/data/nsides/offsides/meta.yaml
index 2849ce626..9e3933093 100644
--- a/data/nsides/offsides/meta.yaml
+++ b/data/nsides/offsides/meta.yaml
@@ -5,7 +5,7 @@ description: OffSIDES is a database of individual drug side effect signals mined
     signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.
 targets:
     - id: PRR
-      description: Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))
+      description: Proportional reporting ratio)
       type: continuous
       names:
           - Proportional reporting ratio
@@ -15,7 +15,7 @@ targets:
       names:
           - Proportional reporting ratio error
     - id: mean_reporting_frequency
-      description: Proportion of reports for the drug that report the side effect,  A/(A+B)
+      description: Proportion of reports for the drug that report the side effect
       type: continuous
       names:
           - mean reporting frequency

From 102536be0af033b01d6d1ea55feb5746eaae7507 Mon Sep 17 00:00:00 2001
From: Apoorva Srinivasan
 <43023448+apoorvasrinivasan26@users.noreply.github.com>
Date: Mon, 3 Apr 2023 09:13:36 -0700
Subject: [PATCH 4/9] Update data/nsides/offsides/meta.yaml

Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
---
 data/nsides/offsides/meta.yaml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/data/nsides/offsides/meta.yaml b/data/nsides/offsides/meta.yaml
index 9e3933093..0559f5664 100644
--- a/data/nsides/offsides/meta.yaml
+++ b/data/nsides/offsides/meta.yaml
@@ -9,11 +9,6 @@ targets:
       type: continuous
       names:
           - Proportional reporting ratio
-    - id: PRR_error
-      description: Standard error of the PRR estimate
-      type: continuous
-      names:
-          - Proportional reporting ratio error
     - id: mean_reporting_frequency
       description: Proportion of reports for the drug that report the side effect
       type: continuous

From cb00a026c0b50703aa286bd83a5c06753d009a7d Mon Sep 17 00:00:00 2001
From: Apoorva Srinivasan
 <43023448+apoorvasrinivasan26@users.noreply.github.com>
Date: Mon, 3 Apr 2023 09:13:51 -0700
Subject: [PATCH 5/9] Update data/nsides/offsides/meta.yaml

Co-authored-by: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
---
 data/nsides/offsides/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/nsides/offsides/meta.yaml b/data/nsides/offsides/meta.yaml
index 0559f5664..4fb28b949 100644
--- a/data/nsides/offsides/meta.yaml
+++ b/data/nsides/offsides/meta.yaml
@@ -5,7 +5,7 @@ description: OffSIDES is a database of individual drug side effect signals mined
     signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.
 targets:
     - id: PRR
-      description: Proportional reporting ratio)
+      description: Proportional reporting ratio
       type: continuous
       names:
           - Proportional reporting ratio

From ae9e9287f84f0acd93a274145f17f0b67eb7e3b3 Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Wed, 3 May 2023 16:18:54 +0200
Subject: [PATCH 6/9] feat: fix small issues and move dir

---
 data/nsides/offsides/meta.yaml                | 34 ------------
 data/offsides/meta.yaml                       | 53 +++++++++++++++++++
 .../offsides/offsides_data_prep.ipynb         |  0
 data/{nsides => }/offsides/transform.py       | 43 ++++++++-------
 4 files changed, 74 insertions(+), 56 deletions(-)
 delete mode 100644 data/nsides/offsides/meta.yaml
 create mode 100644 data/offsides/meta.yaml
 rename data/{nsides => }/offsides/offsides_data_prep.ipynb (100%)
 rename data/{nsides => }/offsides/transform.py (75%)

diff --git a/data/nsides/offsides/meta.yaml b/data/nsides/offsides/meta.yaml
deleted file mode 100644
index 4fb28b949..000000000
--- a/data/nsides/offsides/meta.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
----
-name: offsides
-description: OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES
-    is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety
-    signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.
-targets:
-    - id: PRR
-      description: Proportional reporting ratio
-      type: continuous
-      names:
-          - Proportional reporting ratio
-    - id: mean_reporting_frequency
-      description: Proportion of reports for the drug that report the side effect
-      type: continuous
-      names:
-          - mean reporting frequency
-identifier:
-    - id: drug_concept_name
-      description: RxNorm name string for the drug
-      type: categorical
-    - id: condition_concept_name
-      description: MedDRA identifier for the side effect
-      type: categorical
-license: CC BY 4.0
-links:
-    - url: https://tatonettilab.org/resources/nsides/
-      description: data source
-    - url: https://nsides.io/
-      description: database website
-num_points: 3042873
-bibtex: "\n        @article{Tatonetti2012,\n        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n    \
-    \    title = {Data-driven prediction of drug effects and interactions},\n        journal = {Sci Transl Med},\n        volume = {4},\n        number\
-    \ = {125},\n        pages = {125ra31},\n        year = {2012},\n        doi = {10.1126/scitranslmed.3003377},\n        pmid = {22422992},\n        pmcid\
-    \ = {PMC3382018}\n        }\n        "
diff --git a/data/offsides/meta.yaml b/data/offsides/meta.yaml
new file mode 100644
index 000000000..ef1d9bb77
--- /dev/null
+++ b/data/offsides/meta.yaml
@@ -0,0 +1,53 @@
+---
+name: offsides
+description: |-
+    OffSIDES is a database of individual drug side effect
+    signals mined from the FDA's. Adverse Event Reporting System. The
+    innovation of OffSIDES is that a propensity score matching (PSM) model
+    is used to identify control drugs and produce better PRR estimates. In
+    OffSIDES we focus on drug safety signals that are not already
+    established by being listed on the structured product label - hence
+    they are off-label drug side effects.
+targets:
+    - id: PRR
+      description: Proportional reporting ratio
+      type: continuous
+      names:
+          - Proportional reporting ratio
+    - id: PRR_error
+      description: Standard error of the PRR estimate
+      type: continuous
+      names:
+          - Proportional reporting ratio error
+    - id: mean_reporting_frequency
+      description: Proportion of reports for the drug that report the side effect
+      type: continuous
+      names:
+          - mean reporting frequency
+identifier:
+    - id: drug_concept_name
+      description: RxNorm name string for the drug
+      type: categorical
+    - id: condition_concept_name
+      description: MedDRA identifier for the side effect
+      type: categorical
+license: CC BY 4.0
+links:
+    - url: https://tatonettilab.org/resources/nsides/
+      description: data source
+    - url: https://nsides.io/
+      description: database website
+num_points: 2977338
+bibtex: |-
+    @article{Tatonetti2012,
+    author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
+    title = {Data-driven prediction of drug effects and interactions},
+    journal = {Sci Transl Med},
+    volume = {4},
+    number = {125},
+    pages = {125ra31},
+    year = {2012},
+    doi = {10.1126/scitranslmed.3003377},
+    pmid = {22422992},
+    pmcid = {PMC3382018}
+    }
diff --git a/data/nsides/offsides/offsides_data_prep.ipynb b/data/offsides/offsides_data_prep.ipynb
similarity index 100%
rename from data/nsides/offsides/offsides_data_prep.ipynb
rename to data/offsides/offsides_data_prep.ipynb
diff --git a/data/nsides/offsides/transform.py b/data/offsides/transform.py
similarity index 75%
rename from data/nsides/offsides/transform.py
rename to data/offsides/transform.py
index 2b75cc44e..66220b631 100644
--- a/data/nsides/offsides/transform.py
+++ b/data/offsides/transform.py
@@ -7,7 +7,8 @@ def get_and_transform_data():
     df = pd.read_csv(
         "https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz",
         compression="gzip",
-        error_bad_lines=False,
+        on_bad_lines="skip",
+        low_memory=False,
     )
 
     # check if fields are the same
@@ -41,13 +42,13 @@ def get_and_transform_data():
     # create meta yaml
     meta = {
         "name": "offsides",
-        "description": (
-            "OffSIDES is a database of individual drug side effect signals mined from the FDA's "
-            "Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score "
-            "matching (PSM) model is used to identify control drugs and produce better PRR estimates. "
-            "In OffSIDES we focus on drug safety signals that are not already established by being "
-            "listed on the structured product label -- hence they are off-label drug side effects."
-        ),
+        "description": """OffSIDES is a database of individual drug side effect
+signals mined from the FDA's. Adverse Event Reporting System. The
+innovation of OffSIDES is that a propensity score matching (PSM) model
+is used to identify control drugs and produce better PRR estimates. In
+OffSIDES we focus on drug safety signals that are not already
+established by being listed on the structured product label - hence
+they are off-label drug side effects.""",
         "targets": [
             {
                 "id": "PRR",
@@ -89,20 +90,18 @@ def get_and_transform_data():
             {"url": "https://nsides.io/", "description": "database website"},
         ],
         "num_points": len(df),
-        "bibtex": """
-        @article{Tatonetti2012,
-        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
-        title = {Data-driven prediction of drug effects and interactions},
-        journal = {Sci Transl Med},
-        volume = {4},
-        number = {125},
-        pages = {125ra31},
-        year = {2012},
-        doi = {10.1126/scitranslmed.3003377},
-        pmid = {22422992},
-        pmcid = {PMC3382018}
-        }
-        """,
+        "bibtex": """@article{Tatonetti2012,
+author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
+title = {Data-driven prediction of drug effects and interactions},
+journal = {Sci Transl Med},
+volume = {4},
+number = {125},
+pages = {125ra31},
+year = {2012},
+doi = {10.1126/scitranslmed.3003377},
+pmid = {22422992},
+pmcid = {PMC3382018}
+}""",
     }
 
     def str_presenter(dumper, data):

From 2dfd8a6490cdfc04b4e27dc6509a2098ffa5a787 Mon Sep 17 00:00:00 2001
From: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
Date: Fri, 5 May 2023 13:19:58 +0200
Subject: [PATCH 7/9] Update data/offsides/meta.yaml

---
 data/offsides/meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/offsides/meta.yaml b/data/offsides/meta.yaml
index ef1d9bb77..f8ae8dbe4 100644
--- a/data/offsides/meta.yaml
+++ b/data/offsides/meta.yaml
@@ -17,6 +17,7 @@ targets:
     - id: PRR_error
       description: Standard error of the PRR estimate
       type: continuous
+      sample: false
       names:
           - Proportional reporting ratio error
     - id: mean_reporting_frequency

From 568e777f87da96a9642c20bde74773cba0a928e8 Mon Sep 17 00:00:00 2001
From: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com>
Date: Fri, 5 May 2023 13:20:39 +0200
Subject: [PATCH 8/9] Update data/offsides/transform.py

---
 data/offsides/transform.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/offsides/transform.py b/data/offsides/transform.py
index 66220b631..ea538e655 100644
--- a/data/offsides/transform.py
+++ b/data/offsides/transform.py
@@ -60,6 +60,7 @@ def get_and_transform_data():
                 "id": "PRR_error",
                 "description": "Standard error of the PRR estimate",
                 "type": "continuous",
+                "sample": False,
                 "names": ["Proportional reporting ratio error"],
             },
             {

From 2ffa75c6f07fad27d55f270f71b7458bd542708d Mon Sep 17 00:00:00 2001
From: Michael Pieler <Michael.Pieler@Gmail.com>
Date: Fri, 5 May 2023 14:17:26 +0200
Subject: [PATCH 9/9] feat: minor text changes

---
 data/offsides/meta.yaml    | 12 ++++++------
 data/offsides/transform.py | 15 +++++++--------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/data/offsides/meta.yaml b/data/offsides/meta.yaml
index f8ae8dbe4..3f9270c58 100644
--- a/data/offsides/meta.yaml
+++ b/data/offsides/meta.yaml
@@ -2,7 +2,7 @@
 name: offsides
 description: |-
     OffSIDES is a database of individual drug side effect
-    signals mined from the FDA's. Adverse Event Reporting System. The
+    signals mined from the FDA's Adverse Event Reporting System. The
     innovation of OffSIDES is that a propensity score matching (PSM) model
     is used to identify control drugs and produce better PRR estimates. In
     OffSIDES we focus on drug safety signals that are not already
@@ -10,18 +10,18 @@ description: |-
     they are off-label drug side effects.
 targets:
     - id: PRR
-      description: Proportional reporting ratio
+      description: proportional reporting ratio
       type: continuous
       names:
-          - Proportional reporting ratio
+          - proportional reporting ratio
     - id: PRR_error
-      description: Standard error of the PRR estimate
+      description: standard error of the PRR estimate
       type: continuous
       sample: false
       names:
-          - Proportional reporting ratio error
+          - standard error of the proportional reporting ratio error
     - id: mean_reporting_frequency
-      description: Proportion of reports for the drug that report the side effect
+      description: mean reporting frequency for the drug
       type: continuous
       names:
           - mean reporting frequency
diff --git a/data/offsides/transform.py b/data/offsides/transform.py
index ea538e655..4ea0ed0fb 100644
--- a/data/offsides/transform.py
+++ b/data/offsides/transform.py
@@ -28,8 +28,7 @@ def get_and_transform_data():
 
     assert df.columns.tolist() == expected_columns
 
-    # drop columns
-    # drop A, B, C, D
+    # drop columns A, B, C, D
     df.drop(columns=["A", "B", "C", "D"], inplace=True)
     # remove duplicates
     df.drop_duplicates(inplace=True)
@@ -43,7 +42,7 @@ def get_and_transform_data():
     meta = {
         "name": "offsides",
         "description": """OffSIDES is a database of individual drug side effect
-signals mined from the FDA's. Adverse Event Reporting System. The
+signals mined from the FDA's Adverse Event Reporting System. The
 innovation of OffSIDES is that a propensity score matching (PSM) model
 is used to identify control drugs and produce better PRR estimates. In
 OffSIDES we focus on drug safety signals that are not already
@@ -52,20 +51,20 @@ def get_and_transform_data():
         "targets": [
             {
                 "id": "PRR",
-                "description": "Proportional reporting ratio",
+                "description": "proportional reporting ratio",
                 "type": "continuous",
-                "names": ["Proportional reporting ratio"],
+                "names": ["proportional reporting ratio"],
             },
             {
                 "id": "PRR_error",
-                "description": "Standard error of the PRR estimate",
+                "description": "standard error of the PRR estimate",
                 "type": "continuous",
                 "sample": False,
-                "names": ["Proportional reporting ratio error"],
+                "names": ["standard error of the proportional reporting ratio error"],
             },
             {
                 "id": "mean_reporting_frequency",
-                "description": "Proportion of reports for the drug that report the side effect",
+                "description": "mean reporting frequency for the drug",
                 "type": "continuous",
                 "names": ["mean reporting frequency"],
             },