"""Curate the Srivatsan et al. 2019 sci-Plex 3 screen into an AnnData file.

Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM4150378
"""

# %%
import gzip
import os
from typing import TYPE_CHECKING, Iterable

import pandas as pd
from scipy.sparse import csr_matrix

if TYPE_CHECKING:
    # Only needed for annotations; the runtime import happens inside
    # `curate_srivatsan_2019_sciplex3`.
    from anndata import AnnData

# %%
# Every file of this accession is served from the same GEO endpoint; only the
# trailing `file=` query parameter differs.
GEO_DOWNLOAD_URL = (
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4150378&format=file&file="
)
COUNT_MATRIX_FILENAME = "GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz"
CELL_METADATA_FILENAME = "GSM4150378_sciPlex3_pData.txt.gz"
GENE_METADATA_FILENAME = (
    "GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz"
)


def download_srivatsan_2019_sciplex3(output_path: str) -> None:
    """
    Download Srivatsan et al. 2019 sciplex3 data from the hosting URLs.

    Args:
    ----
        output_path: Existing directory in which the count matrix, the cell
            metadata and the gene annotations are stored (still gzipped).

    Returns
    -------
        None. Files are downloaded to output_path.
    """
    # Imported here so that the pure helpers of this module remain importable
    # without the repository-local `utils` module.
    from utils import download_binary_file

    for filename in (
        COUNT_MATRIX_FILENAME,
        CELL_METADATA_FILENAME,
        GENE_METADATA_FILENAME,
    ):
        download_binary_file(
            GEO_DOWNLOAD_URL + filename, os.path.join(output_path, filename)
        )


def read_srivatsan_2019_sciplex3(file_directory: str) -> pd.DataFrame:
    """
    Read the sciplex3 expression data from Srivatsan et al. 2019 in the given directory.

    Args:
    ----
        file_directory: Directory containing Srivatsan et al. 2019 data.

    Returns
    -------
        A data frame containing single-cell gene expression counts. The count
        matrix is stored in triplet format. I.e., each row of the data frame
        has the format (row, column, count) stored in columns (i, j, x)
        respectively.
    """
    matrix_path = os.path.join(file_directory, COUNT_MATRIX_FILENAME)
    with gzip.open(matrix_path, "rb") as f:
        return pd.read_csv(f, sep="\t", header=None, names=["i", "j", "x"])


def triplets_to_sparse_matrix(triplets: pd.DataFrame) -> csr_matrix:
    """
    Convert 1-based (i, j, x) triplets into a scipy CSR matrix.

    Args:
    ----
        triplets: Data frame with columns 'i' (1-based row number), 'j'
            (1-based column number) and 'x' (count value).

    Returns
    -------
        A CSR matrix whose shape is given by the largest row and column index
        present in `triplets`.

    Raises
    ------
        ValueError: If `triplets` has no rows, since no shape can be inferred.
    """
    if triplets.empty:
        raise ValueError("Cannot build a count matrix from an empty triplet table.")

    # Indices were originally 1-base-indexed --> switch to 0-base-indexing
    rows = triplets["i"].to_numpy() - 1
    cols = triplets["j"].to_numpy() - 1
    counts = triplets["x"].to_numpy()

    # Vectorized max: the builtin `max` would walk every entry of these very
    # large arrays in the Python interpreter.
    shape = (int(rows.max()) + 1, int(cols.max()) + 1)
    return csr_matrix((counts, (rows, cols)), shape=shape)


def count_leading_human_genes(gene_ids: Iterable[str]) -> int:
    """
    Count the human genes and check that they precede all other genes.

    A gene is considered human when its identifier contains 'ENSG'.

    Args:
    ----
        gene_ids: Gene identifiers in the order used by the count matrix.

    Returns
    -------
        The number of human genes, which is also the length of the leading
        human-only prefix of `gene_ids`.

    Raises
    ------
        ValueError: If a non-human gene appears before the last human gene, in
            which case taking the first genes would select the wrong ones.
    """
    is_human = ["ENSG" in gene_id for gene_id in gene_ids]
    num_human_genes = sum(is_human)
    if not all(is_human[:num_human_genes]):
        raise ValueError(
            "Human genes are not a contiguous prefix of the gene list; "
            "cannot subset by position."
        )
    return num_human_genes


def shorten_product_names(product_names: pd.Series) -> pd.Series:
    """
    Keep only the text before the first space of every product name.

    Args:
    ----
        product_names: Series of product name strings without missing values.

    Returns
    -------
        A series with the same index holding the shortened names.
    """
    return product_names.str.split(" ", n=1).str[0]


def curate_srivatsan_2019_sciplex3(download_path: str) -> "AnnData":
    """
    Download the sciplex3 data and assemble it into an AnnData object.

    Args:
    ----
        download_path: Directory used to store the downloaded files. It is
            created if it does not exist.

    Returns
    -------
        An AnnData object with cells as rows and human genes as columns,
        restricted to cells that have both a cell type and a product name.
    """
    from anndata import AnnData

    os.makedirs(download_path, exist_ok=True)
    download_srivatsan_2019_sciplex3(download_path)

    # The Srivatsan count data is in a sparse triplet format represented
    # by three columns 'i', 'j', and 'x'. 'i' refers to a row number, 'j' refers
    # to a column number, and 'x' refers to a count value.
    triplets = read_srivatsan_2019_sciplex3(download_path)

    # This dataset is large enough that we need to store it as a scipy sparse
    # matrix for preprocessing (>600 GB in RAM as a dense matrix).
    # Switch matrix from gene rows and cell columns to cell rows and gene columns.
    count_matrix = triplets_to_sparse_matrix(triplets).T

    cell_metadata = pd.read_csv(
        os.path.join(download_path, CELL_METADATA_FILENAME),
        sep=" ",
    )
    gene_metadata = pd.read_csv(
        os.path.join(download_path, GENE_METADATA_FILENAME),
        sep=" ",
        index_col=0,
    )

    # The gene list contains both mouse and human genes due to quirks
    # in how the authors saved their data. We only care about human genes
    # (since the cell lines we're using are from humans), so we discard
    # the mouse genes. The human genes come before the mouse genes in the
    # data, so we can just compute the number of human genes (x) and then
    # subset the data to the first x genes.
    num_human_genes = count_leading_human_genes(gene_metadata.index)
    count_matrix = count_matrix[:, :num_human_genes]
    gene_metadata = gene_metadata.head(num_human_genes)

    adata = AnnData(X=count_matrix, obs=cell_metadata, var=gene_metadata)

    # Filter out cells for which we don't have data. Subsetting an AnnData
    # yields a view, so take a real copy before modifying `obs` below.
    has_annotations = (
        adata.obs["cell_type"].notna() & adata.obs["product_name"].notna()
    )
    adata = adata[has_annotations].copy()

    # For readability
    adata.obs["product_name"] = shorten_product_names(adata.obs["product_name"])
    return adata


# %%
if __name__ == "__main__":
    download_path = "./srivatsan_2019_sciplex3"
    adata = curate_srivatsan_2019_sciplex3(download_path)

# %%
if __name__ == "__main__":
    adata.write_h5ad("Srivatsan_2019_sciplex3_raw.h5ad")