diff --git a/data/offsides/meta.yaml b/data/offsides/meta.yaml
new file mode 100644
index 000000000..3f9270c58
--- /dev/null
+++ b/data/offsides/meta.yaml
@@ -0,0 +1,54 @@
+---
+name: offsides
+description: |-
+    OffSIDES is a database of individual drug side effect
+    signals mined from the FDA's Adverse Event Reporting System. The
+    innovation of OffSIDES is that a propensity score matching (PSM) model
+    is used to identify control drugs and produce better PRR estimates. In
+    OffSIDES we focus on drug safety signals that are not already
+    established by being listed on the structured product label - hence
+    they are off-label drug side effects.
+targets:
+    - id: PRR
+      description: proportional reporting ratio
+      type: continuous
+      names:
+          - proportional reporting ratio
+    - id: PRR_error
+      description: standard error of the PRR estimate
+      type: continuous
+      sample: false
+      names:
+          - standard error of the proportional reporting ratio
+    - id: mean_reporting_frequency
+      description: proportion of reports for the drug that report the side effect
+      type: continuous
+      names:
+          - mean reporting frequency
+identifier:
+    - id: drug_concept_name
+      description: RxNorm name string for the drug
+      type: categorical
+    - id: condition_concept_name
+      description: MedDRA name string for the side effect
+      type: categorical
+license: CC BY 4.0
+links:
+    - url: https://tatonettilab.org/resources/nsides/
+      description: data source
+    - url: https://nsides.io/
+      description: database website
+num_points: 2977338
+bibtex: |-
+    @article{Tatonetti2012,
+    author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
+    title = {Data-driven prediction of drug effects and interactions},
+    journal = {Sci Transl Med},
+    volume = {4},
+    number = {125},
+    pages = {125ra31},
+    year = {2012},
+    doi = {10.1126/scitranslmed.3003377},
+    pmid = {22422992},
+    pmcid = {PMC3382018}
+    }
diff --git a/data/offsides/offsides_data_prep.ipynb b/data/offsides/offsides_data_prep.ipynb
new file mode 100644
index 000000000..e498392bf
--- /dev/null
+++ b/data/offsides/offsides_data_prep.ipynb
@@ -0,0 +1,872 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ee354cad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a5577953",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "\n",
+      "\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/4254589737.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+    "                   error_bad_lines=False)\n",
+    "#df.shape\n",
+    "#df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "da5b4b81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    " # check if fields are the same\n",
+    "\n",
+    "\n",
+    "expected_columns = ['drug_rxnorn_id',\n",
+    " 'drug_concept_name',\n",
+    " 'condition_meddra_id',\n",
+    " 'condition_concept_name',\n",
+    " 'A',\n",
+    " 'B',\n",
+    " 'C',\n",
+    " 'D',\n",
+    " 'PRR',\n",
+    " 'PRR_error',\n",
+    " 'mean_reporting_frequency']\n",
+    "\n",
+    "assert df.columns.tolist() == expected_columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "8d016424",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AssertionError",
+     "evalue": "Found duplicate rows in the dataframe",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[33], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m df\u001b[38;5;241m.\u001b[39mduplicated()\u001b[38;5;241m.\u001b[39msum(), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound duplicate rows in the dataframe\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      2\u001b[0m df\u001b[38;5;241m.\u001b[39mdrop_duplicates(inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
+      "\u001b[0;31mAssertionError\u001b[0m: Found duplicate rows in the dataframe"
+     ]
+    }
+   ],
+   "source": [
+    "assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "f523b30a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop_duplicates(inplace=True)\n",
+    "# df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "43ed46ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_data_csv = \"data_clean.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "a6230d38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(fn_data_csv, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8b1da608",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-rw-r--r--  1 apoorvasrinivasan  staff   279M Mar 14 18:22 data_clean.csv\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {fn_data_csv}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "d1509dca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "drug_rxnorn_id,drug_concept_name,condition_meddra_id,condition_concept_name,A,B,C,D,PRR,PRR_error,mean_reporting_frequency\r\n",
+      "4024,\"ergoloid mesylates, USP\",10002034,Anaemia,6,126,21,1299,2.85714,0.45382,0.0454545\r\n",
+      "4024,\"ergoloid mesylates, USP\",10002965,Aplasia pure red cell,1,131,1,1319,10.0,1.41126,0.00757576\r\n",
+      "4024,\"ergoloid mesylates, USP\",10013442,Disseminated intravascular coagulation,1,131,6,1314,1.66667,1.07626,0.00757576\r\n",
+      "4024,\"ergoloid mesylates, USP\",10023126,Jaundice,2,130,7,1313,2.85714,0.79657,0.0151515\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!head -n 5 {fn_data_csv}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "f33da1dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>drug_rxnorn_id</th>\n",
+       "      <th>drug_concept_name</th>\n",
+       "      <th>condition_meddra_id</th>\n",
+       "      <th>condition_concept_name</th>\n",
+       "      <th>A</th>\n",
+       "      <th>B</th>\n",
+       "      <th>C</th>\n",
+       "      <th>D</th>\n",
+       "      <th>PRR</th>\n",
+       "      <th>PRR_error</th>\n",
+       "      <th>mean_reporting_frequency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002034</td>\n",
+       "      <td>Anaemia</td>\n",
+       "      <td>6</td>\n",
+       "      <td>126</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1299</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.45382</td>\n",
+       "      <td>0.045455</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002965</td>\n",
+       "      <td>Aplasia pure red cell</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1319</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>1.41126</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10013442</td>\n",
+       "      <td>Disseminated intravascular coagulation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1314</td>\n",
+       "      <td>1.66667</td>\n",
+       "      <td>1.07626</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10023126</td>\n",
+       "      <td>Jaundice</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1313</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.79657</td>\n",
+       "      <td>0.015152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10016288</td>\n",
+       "      <td>Febrile neutropenia</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1315</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.09163</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  drug_rxnorn_id        drug_concept_name condition_meddra_id  \\\n",
+       "0           4024  ergoloid mesylates, USP            10002034   \n",
+       "1           4024  ergoloid mesylates, USP            10002965   \n",
+       "2           4024  ergoloid mesylates, USP            10013442   \n",
+       "3           4024  ergoloid mesylates, USP            10023126   \n",
+       "4           4024  ergoloid mesylates, USP            10016288   \n",
+       "\n",
+       "                   condition_concept_name  A    B   C     D      PRR  \\\n",
+       "0                                 Anaemia  6  126  21  1299  2.85714   \n",
+       "1                   Aplasia pure red cell  1  131   1  1319     10.0   \n",
+       "2  Disseminated intravascular coagulation  1  131   6  1314  1.66667   \n",
+       "3                                Jaundice  2  130   7  1313  2.85714   \n",
+       "4                     Febrile neutropenia  1  131   5  1315      2.0   \n",
+       "\n",
+       "  PRR_error mean_reporting_frequency  \n",
+       "0   0.45382                 0.045455  \n",
+       "1   1.41126                 0.007576  \n",
+       "2   1.07626                 0.007576  \n",
+       "3   0.79657                 0.015152  \n",
+       "4   1.09163                 0.007576  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17bcca0d",
+   "metadata": {},
+   "source": [
+    "## Load from csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4d12adad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_data_csv = \"data_clean.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "ae4cbf36",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3c/d8_kt2gd6n5857w_5x4gccrc0000gn/T/ipykernel_28019/2664504625.py:1: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(fn_data_csv)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(fn_data_csv)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "214a2b81",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>drug_rxnorn_id</th>\n",
+       "      <th>drug_concept_name</th>\n",
+       "      <th>condition_meddra_id</th>\n",
+       "      <th>condition_concept_name</th>\n",
+       "      <th>A</th>\n",
+       "      <th>B</th>\n",
+       "      <th>C</th>\n",
+       "      <th>D</th>\n",
+       "      <th>PRR</th>\n",
+       "      <th>PRR_error</th>\n",
+       "      <th>mean_reporting_frequency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002034</td>\n",
+       "      <td>Anaemia</td>\n",
+       "      <td>6</td>\n",
+       "      <td>126</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1299</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.45382</td>\n",
+       "      <td>0.045455</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10002965</td>\n",
+       "      <td>Aplasia pure red cell</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1319</td>\n",
+       "      <td>10.0</td>\n",
+       "      <td>1.41126</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10013442</td>\n",
+       "      <td>Disseminated intravascular coagulation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1314</td>\n",
+       "      <td>1.66667</td>\n",
+       "      <td>1.07626</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10023126</td>\n",
+       "      <td>Jaundice</td>\n",
+       "      <td>2</td>\n",
+       "      <td>130</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1313</td>\n",
+       "      <td>2.85714</td>\n",
+       "      <td>0.79657</td>\n",
+       "      <td>0.015152</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4024</td>\n",
+       "      <td>ergoloid mesylates, USP</td>\n",
+       "      <td>10016288</td>\n",
+       "      <td>Febrile neutropenia</td>\n",
+       "      <td>1</td>\n",
+       "      <td>131</td>\n",
+       "      <td>5</td>\n",
+       "      <td>1315</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.09163</td>\n",
+       "      <td>0.007576</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  drug_rxnorn_id        drug_concept_name condition_meddra_id  \\\n",
+       "0           4024  ergoloid mesylates, USP            10002034   \n",
+       "1           4024  ergoloid mesylates, USP            10002965   \n",
+       "2           4024  ergoloid mesylates, USP            10013442   \n",
+       "3           4024  ergoloid mesylates, USP            10023126   \n",
+       "4           4024  ergoloid mesylates, USP            10016288   \n",
+       "\n",
+       "                   condition_concept_name  A    B   C     D      PRR  \\\n",
+       "0                                 Anaemia  6  126  21  1299  2.85714   \n",
+       "1                   Aplasia pure red cell  1  131   1  1319     10.0   \n",
+       "2  Disseminated intravascular coagulation  1  131   6  1314  1.66667   \n",
+       "3                                Jaundice  2  130   7  1313  2.85714   \n",
+       "4                     Febrile neutropenia  1  131   5  1315      2.0   \n",
+       "\n",
+       "  PRR_error mean_reporting_frequency  \n",
+       "0   0.45382                 0.045455  \n",
+       "1   1.41126                 0.007576  \n",
+       "2   1.07626                 0.007576  \n",
+       "3   0.79657                 0.015152  \n",
+       "4   1.09163                 0.007576  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "854b807c",
+   "metadata": {},
+   "source": [
+    "## Meta YAML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "2d24e114",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "meta = {\n",
+    "    \"name\": \"offsides\",\n",
+    "    \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n",
+    "    \"targets\": [\n",
+    "        {\n",
+    "            \"id\": \"PRR\",\n",
+    "            \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"PRR_error\",\n",
+    "            \"description\": \"Standard error of the PRR estimate\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio error\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"mean_reporting_frequency\",\n",
+    "            \"description\": \"Proportion of reports for the drug that report the side effect,  A/(A+B)\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"mean reporting frequency\"]\n",
+    "        }\n",
+    "    ],\n",
+    "    \"identifier\": [\n",
+    "        {\n",
+    "            \"id\": \"drug_concept_name\",\n",
+    "            \"description\": \"RxNorm name string for the drug\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"condition_concept_name\",\n",
+    "            \"description\": \"MedDRA identifier for the side effect\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"license\": \"CC BY 4.0\",\n",
+    "    \"links\": [\n",
+    "        {\n",
+    "            \"url\": \"https://tatonettilab.org/resources/nsides/\",\n",
+    "            \"description\": \"data source\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"url\": \"https://nsides.io/\",\n",
+    "            \"description\": \"database website\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"num_points\": len(df),\n",
+    "    \"bibtex\": \"\"\"\n",
+    "        @article{Tatonetti2012,\n",
+    "        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n",
+    "        title = {Data-driven prediction of drug effects and interactions},\n",
+    "        journal = {Sci Transl Med},\n",
+    "        volume = {4},\n",
+    "        number = {125},\n",
+    "        pages = {125ra31},\n",
+    "        year = {2012},\n",
+    "        doi = {10.1126/scitranslmed.3003377},\n",
+    "        pmid = {22422992},\n",
+    "        pmcid = {PMC3382018}\n",
+    "        }\n",
+    "        \"\"\"\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6e8aafee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fn_meta = \"meta.yaml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "6ff83544",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(fn_meta, \"w\") as f:\n",
+    "    yaml.dump(meta, f, sort_keys=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "d370342f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-rw-r--r--  1 apoorvasrinivasan  staff   1.8K Mar 14 18:25 meta.yaml\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {fn_meta}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "40548210",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "name: offsides\r\n",
+      "description: OffSIDES is a database of individual drug side effect signals mined from\r\n",
+      "  the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity\r\n",
+      "  score matching (PSM) model is used to identify control drugs and produce better\r\n",
+      "  PRR estimates. In OffSIDES we focus on drug safety signals that are not already\r\n",
+      "  established by being listed on the structured product label -- hence they are off-label\r\n",
+      "  drug side effects.\r\n",
+      "targets:\r\n",
+      "- id: PRR\r\n",
+      "  description: Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - Proportional reporting ratio\r\n",
+      "- id: PRR_error\r\n",
+      "  description: Standard error of the PRR estimate\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - Proportional reporting ratio error\r\n",
+      "- id: mean_reporting_frequency\r\n",
+      "  description: Proportion of reports for the drug that report the side effect,  A/(A+B)\r\n",
+      "  type: continuous\r\n",
+      "  names:\r\n",
+      "  - mean reporting frequency\r\n",
+      "identifier:\r\n",
+      "- id: drug_concept_name\r\n",
+      "  description: RxNorm name string for the drug\r\n",
+      "  type: categorical\r\n",
+      "- id: condition_concept_name\r\n",
+      "  description: MedDRA identifier for the side effect\r\n",
+      "  type: categorical\r\n",
+      "license: CC BY 4.0\r\n",
+      "links:\r\n",
+      "- url: https://tatonettilab.org/resources/nsides/\r\n",
+      "  description: data source\r\n",
+      "- url: https://nsides.io/\r\n",
+      "  description: database website\r\n",
+      "num_points: 3206558\r\n",
+      "bibtex: \"\\n        @article{Tatonetti2012,\\n        author = {Tatonetti, Nicholas\\\r\n",
+      "  \\ P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\\n        title\\\r\n",
+      "  \\ = {Data-driven prediction of drug effects and interactions},\\n        journal\\\r\n",
+      "  \\ = {Sci Transl Med},\\n        volume = {4},\\n        number = {125},\\n        pages\\\r\n",
+      "  \\ = {125ra31},\\n        year = {2012},\\n        doi = {10.1126/scitranslmed.3003377},\\n\\\r\n",
+      "  \\        pmid = {22422992},\\n        pmcid = {PMC3382018}\\n        }\\n        \"\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat {fn_meta}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ff77293",
+   "metadata": {},
+   "source": [
+    "## Create transform.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "38e8a677",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_file = \"transform.py\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e2f46f61",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting transform.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile $path_file\n",
+    "import pandas as pd\n",
+    "import requests\n",
+    "import yaml\n",
+    "\n",
+    "\n",
+    "def get_and_transform_data():\n",
+    "    # load data\n",
+    "    df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+    "                   error_bad_lines=False)\n",
+    "\n",
+    "    # check if fields are the same\n",
+    "    expected_columns = ['drug_rxnorn_id',\n",
+    "     'drug_concept_name',\n",
+    "     'condition_meddra_id',\n",
+    "     'condition_concept_name',\n",
+    "     'A',\n",
+    "     'B',\n",
+    "     'C',\n",
+    "     'D',\n",
+    "     'PRR',\n",
+    "     'PRR_error',\n",
+    "     'mean_reporting_frequency']\n",
+    "\n",
+    "    assert df.columns.tolist() == expected_columns\n",
+    "    \n",
+    "    # remove duplicates\n",
+    "    df.drop_duplicates(inplace=True)\n",
+    "    # check duplicates\n",
+    "    assert not df.duplicated().sum(), \"Found duplicate rows in the dataframe\"\n",
+    "    \n",
+    "    \n",
+    "\n",
+    "    # save to csv\n",
+    "    fn_data_csv = \"data_clean.csv\"\n",
+    "    df.to_csv(fn_data_csv, index=False)\n",
+    "\n",
+    "    # create meta yaml\n",
+    "    meta = {\n",
+    "    \"name\": \"offsides\",\n",
+    "    \"description\": \"OffSIDES is a database of individual drug side effect signals mined from the FDA's Adverse Event Reporting System. The innovation of OffSIDES is that a propensity score matching (PSM) model is used to identify control drugs and produce better PRR estimates. In OffSIDES we focus on drug safety signals that are not already established by being listed on the structured product label -- hence they are off-label drug side effects.\",\n",
+    "    \"targets\": [\n",
+    "        {\n",
+    "            \"id\": \"PRR\",\n",
+    "            \"description\": \"Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"PRR_error\",\n",
+    "            \"description\": \"Standard error of the PRR estimate\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"Proportional reporting ratio error\"]\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"mean_reporting_frequency\",\n",
+    "            \"description\": \"Proportion of reports for the drug that report the side effect,  A/(A+B)\",\n",
+    "            \"type\": \"continuous\",\n",
+    "            \"names\": [\"mean reporting frequency\"]\n",
+    "        }\n",
+    "    ],\n",
+    "    \"identifier\": [\n",
+    "        {\n",
+    "            \"id\": \"drug_concept_name\",\n",
+    "            \"description\": \"RxNorm name string for the drug\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"id\": \"condition_concept_name\",\n",
+    "            \"description\": \"MedDRA identifier for the side effect\",\n",
+    "            \"type\": \"categorical\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"license\": \"CC BY 4.0\",\n",
+    "    \"links\": [\n",
+    "        {\n",
+    "            \"url\": \"https://tatonettilab.org/resources/nsides/\",\n",
+    "            \"description\": \"data source\"\n",
+    "        },\n",
+    "        {\n",
+    "            \"url\": \"https://nsides.io/\",\n",
+    "            \"description\": \"database website\"\n",
+    "        }\n",
+    "    ],\n",
+    "    \"num_points\": len(df),\n",
+    "    \"bibtex\": \"\"\"\n",
+    "        @article{Tatonetti2012,\n",
+    "        author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},\n",
+    "        title = {Data-driven prediction of drug effects and interactions},\n",
+    "        journal = {Sci Transl Med},\n",
+    "        volume = {4},\n",
+    "        number = {125},\n",
+    "        pages = {125ra31},\n",
+    "        year = {2012},\n",
+    "        doi = {10.1126/scitranslmed.3003377},\n",
+    "        pmid = {22422992},\n",
+    "        pmcid = {PMC3382018}\n",
+    "        }\n",
+    "        \"\"\"\n",
+    "    }\n",
+    "\n",
+    "    def str_presenter(dumper, data):\n",
+    "        \"\"\"configures yaml for dumping multiline strings\n",
+    "        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data\n",
+    "        \"\"\"\n",
+    "        if data.count(\"\\n\") > 0:  # check for multiline string\n",
+    "            return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data, style=\"|\")\n",
+    "        return dumper.represent_scalar(\"tag:yaml.org,2002:str\", data)\n",
+    "\n",
+    "    yaml.add_representer(str, str_presenter)\n",
+    "    yaml.representer.SafeRepresenter.add_representer(\n",
+    "        str, str_presenter\n",
+    "    )  # to use with safe_dump\n",
+    "    fn_meta = \"meta.yaml\"\n",
+    "    with open(fn_meta, \"w\") as f:\n",
+    "        yaml.dump(meta, f, sort_keys=False)\n",
+    "\n",
+    "    print(f\"Finished processing {meta['name']} dataset!\")\n",
+    "\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    get_and_transform_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1ac51787",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "transform.py:8: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
+      "\n",
+      "\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "transform.py:8: DtypeWarning: Columns (0,2,4,5,6,7,8,9,10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv('https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz', compression='gzip',\n",
+      "Finished processing offsides dataset!\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python3 transform.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data/offsides/transform.py b/data/offsides/transform.py
new file mode 100644
index 000000000..4ea0ed0fb
--- /dev/null
+++ b/data/offsides/transform.py
@@ -0,0 +1,127 @@
+import pandas as pd
+import yaml
+
+
+def get_and_transform_data():
+    # load data
+    df = pd.read_csv(
+        "https://tatonettilab.org/resources/nsides/OFFSIDES.csv.gz",
+        compression="gzip",
+        on_bad_lines="skip",
+        low_memory=False,
+    )
+
+    # check if fields are the same
+    expected_columns = [
+        "drug_rxnorn_id",
+        "drug_concept_name",
+        "condition_meddra_id",
+        "condition_concept_name",
+        "A",
+        "B",
+        "C",
+        "D",
+        "PRR",
+        "PRR_error",
+        "mean_reporting_frequency",
+    ]
+
+    assert df.columns.tolist() == expected_columns
+
+    # drop columns A, B, C, D
+    df.drop(columns=["A", "B", "C", "D"], inplace=True)
+    # remove duplicates
+    df.drop_duplicates(inplace=True)
+    # check duplicates
+    assert not df.duplicated().sum(), "Found duplicate rows in the dataframe"
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+        "name": "offsides",
+        "description": """OffSIDES is a database of individual drug side effect
+signals mined from the FDA's Adverse Event Reporting System. The
+innovation of OffSIDES is that a propensity score matching (PSM) model
+is used to identify control drugs and produce better PRR estimates. In
+OffSIDES we focus on drug safety signals that are not already
+established by being listed on the structured product label - hence
+they are off-label drug side effects.""",
+        "targets": [
+            {
+                "id": "PRR",
+                "description": "proportional reporting ratio",
+                "type": "continuous",
+                "names": ["proportional reporting ratio"],
+            },
+            {
+                "id": "PRR_error",
+                "description": "standard error of the PRR estimate",
+                "type": "continuous",
+                "sample": False,
+                "names": ["standard error of the proportional reporting ratio error"],
+            },
+            {
+                "id": "mean_reporting_frequency",
+                "description": "mean reporting frequency for the drug",
+                "type": "continuous",
+                "names": ["mean reporting frequency"],
+            },
+        ],
+        "identifier": [
+            {
+                "id": "drug_concept_name",
+                "description": "RxNorm name string for the drug",
+                "type": "categorical",
+            },
+            {
+                "id": "condition_concept_name",
+                "description": "MedDRA identifier for the side effect",
+                "type": "categorical",
+            },
+        ],
+        "license": "CC BY 4.0",
+        "links": [
+            {
+                "url": "https://tatonettilab.org/resources/nsides/",
+                "description": "data source",
+            },
+            {"url": "https://nsides.io/", "description": "database website"},
+        ],
+        "num_points": len(df),
+        "bibtex": """@article{Tatonetti2012,
+author = {Tatonetti, Nicholas P. and Ye, Peter P. and Daneshjou, Roxana and Altman, Russ B.},
+title = {Data-driven prediction of drug effects and interactions},
+journal = {Sci Transl Med},
+volume = {4},
+number = {125},
+pages = {125ra31},
+year = {2012},
+doi = {10.1126/scitranslmed.3003377},
+pmid = {22422992},
+pmcid = {PMC3382018}
+}""",
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":  # only run the pipeline when executed as a script
+    get_and_transform_data()