theislab · ethanweinberger · Apr 11, 2022
diff --git a/datasets/Srivatsan_2019_sciplex3_curation.ipynb b/datasets/Srivatsan_2019_sciplex3_curation.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "498416da",
+   "metadata": {},
+   "source": [
+    "Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM4150378"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4abe7004",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "from anndata import AnnData\n",
+    "\n",
+    "from utils import download_binary_file\n",
+    "from scipy.sparse import csr_matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5747389",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_srivatsan_2019_sciplex3(output_path: str) -> None:\n",
+    "    \"\"\"\n",
+    "    Download Srivatsan et al. 2019 sciplex3 data from the hosting URLs.\n",
+    "\n",
+    "    Three .gz files are fetched from GEO accession GSM4150378: the UMI count\n",
+    "    matrix, the cell metadata (pData) and the gene annotations. Each file is\n",
+    "    stored in output_path under the file name given in its URL.\n",
+    "\n",
+    "    Args:\n",
+    "    ----\n",
+    "        output_path: Output path to store the downloaded files. This function\n",
+    "        does not create the directory itself.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "        None. Files are downloaded to output_path.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    base_url = (\n",
+    "        \"https://www.ncbi.nlm.nih.gov/geo/download/\"\n",
+    "        \"?acc=GSM4150378&format=file&file=\"\n",
+    "    )\n",
+    "    filenames = (\n",
+    "        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz\",\n",
+    "        \"GSM4150378_sciPlex3_pData.txt.gz\",\n",
+    "        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz\",\n",
+    "    )\n",
+    "\n",
+    "    # A single loop replaces three copy-pasted stanzas, so a download can no\n",
+    "    # longer end up in a variable named after a different file.\n",
+    "    for filename in filenames:\n",
+    "        download_binary_file(\n",
+    "            base_url + filename, os.path.join(output_path, filename)\n",
+    "        )\n",
+    "\n",
+    "\n",
+    "def read_srivatsan_2019_sciplex3(file_directory: str) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Load the Srivatsan et al. 2019 sciplex3 UMI count matrix from disk.\n",
+    "\n",
+    "    Args:\n",
+    "    ----\n",
+    "        file_directory: Directory holding the downloaded Srivatsan et al. 2019\n",
+    "        files.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "        A data frame with one row per line of the count matrix file, which is\n",
+    "        read as tab-separated text without a header. The three columns are\n",
+    "        named (i, j, x) and hold (row, column, count) respectively.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    count_matrix_path = os.path.join(\n",
+    "        file_directory,\n",
+    "        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz\",\n",
+    "    )\n",
+    "    with gzip.open(count_matrix_path, \"rb\") as count_file:\n",
+    "        triplets = pd.read_csv(\n",
+    "            count_file, sep=\"\\t\", header=None, names=[\"i\", \"j\", \"x\"]\n",
+    "        )\n",
+    "\n",
+    "    return triplets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d19b7de5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "download_path = \"./srivatsan_2019_sciplex3\"\n",
+    "\n",
+    "os.makedirs(download_path, exist_ok=True)\n",
+    "download_srivatsan_2019_sciplex3(download_path)\n",
+    "df = read_srivatsan_2019_sciplex3(download_path)\n",
+    "\n",
+    "# The Srivatsan count data is in a sparse triplet format represented\n",
+    "# by three columns 'i', 'j', and 'x'. 'i' refers to a row number, 'j' refers to\n",
+    "# a column number, and 'x' refers to a count value.\n",
+    "counts = df[\"x\"]\n",
+    "# Indices are 1-based in the file --> switch to 0-based indexing\n",
+    "rows = df[\"i\"] - 1\n",
+    "cols = df[\"j\"] - 1\n",
+    "\n",
+    "# This dataset is large enough that we need to store it as a scipy sparse matrix\n",
+    "# for preprocessing (>600 GB in RAM as a dense matrix). Series.max() is used\n",
+    "# instead of the builtin max(), which would loop over every entry in Python.\n",
+    "count_matrix = csr_matrix(\n",
+    "    (counts.values, (rows.values, cols.values)),\n",
+    "    shape=(rows.max() + 1, cols.max() + 1),\n",
+    ")\n",
+    "\n",
+    "# Switch matrix from gene rows and cell columns to cell rows and gene columns\n",
+    "count_matrix = count_matrix.T\n",
+    "\n",
+    "cell_metadata = pd.read_csv(\n",
+    "    os.path.join(\n",
+    "        download_path,\n",
+    "        \"GSM4150378_sciPlex3_pData.txt.gz\",\n",
+    "    ),\n",
+    "    sep=\" \",\n",
+    ")\n",
+    "\n",
+    "gene_metadata = pd.read_csv(\n",
+    "    os.path.join(\n",
+    "        download_path,\n",
+    "        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz\",\n",
+    "    ),\n",
+    "    sep=\" \",\n",
+    "    index_col=0,\n",
+    ")\n",
+    "\n",
+    "# The gene list contains both mouse and human genes due to quirks\n",
+    "# in how the authors saved their data. We only care about human genes\n",
+    "# (since the cell lines we're using are from humans), so we discard\n",
+    "# the mouse genes. The human genes come before the mouse genes in the\n",
+    "# data, so we can just compute the number of human genes (x) and then subset\n",
+    "# the data to the first x genes.\n",
+    "is_human_gene = [\"ENSG\" in x for x in gene_metadata.index.values]\n",
+    "num_human_genes = sum(is_human_gene)\n",
+    "# The prefix subsetting below is only valid if every human gene precedes\n",
+    "# every mouse gene, so fail loudly instead of silently mixing species.\n",
+    "if not all(is_human_gene[:num_human_genes]):\n",
+    "    raise ValueError(\n",
+    "        \"Human (ENSG) genes are not a contiguous prefix of the gene list.\"\n",
+    "    )\n",
+    "count_matrix = count_matrix[:, :num_human_genes]\n",
+    "gene_metadata = gene_metadata.head(num_human_genes)\n",
+    "\n",
+    "adata = AnnData(X=count_matrix, obs=cell_metadata, var=gene_metadata)\n",
+    "\n",
+    "# Filter out cells for which we don't have data (missing cell type or missing\n",
+    "# product name). The subset is copied so that `adata` is an actual AnnData\n",
+    "# object rather than a view; the assignment to `.obs` below would otherwise\n",
+    "# be made on a view.\n",
+    "has_annotations = (\n",
+    "    adata.obs[\"cell_type\"].notna() & adata.obs[\"product_name\"].notna()\n",
+    ")\n",
+    "adata = adata[has_annotations].copy()\n",
+    "\n",
+    "# For readability, keep only the part of each product name before the first space\n",
+    "adata.obs[\"product_name\"] = [x.split(\" \")[0] for x in adata.obs[\"product_name\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13f04aba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adata.write_h5ad(\"Srivatsan_2019_sciplex3_raw.h5ad\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f87d3ed4-e87d-4608-bec9-a9e745d483c1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}