Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions datasets/Srivatsan_2019_sciplex3_curation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "498416da",
"metadata": {},
"source": [
"Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM4150378"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4abe7004",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import os\n",
"\n",
"import pandas as pd\n",
"from anndata import AnnData\n",
"\n",
"from utils import download_binary_file\n",
"from scipy.sparse import csr_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5747389",
"metadata": {},
"outputs": [],
"source": [
"def download_srivatsan_2019_sciplex3(output_path: str) -> None:\n",
"    \"\"\"\n",
"    Download Srivatsan et al. 2019 sciplex3 data from the hosting URLs.\n",
"\n",
"    Three .gz files are fetched from GEO accession GSM4150378: the UMI count\n",
"    matrix, the cell metadata (pData), and the gene annotations. Each file is\n",
"    saved in output_path under the name it has in its download URL.\n",
"\n",
"    Args:\n",
"    ----\n",
"        output_path: Directory in which to store the downloaded files.\n",
"\n",
"    Returns\n",
"    -------\n",
"        None. Files are downloaded to output_path.\n",
"    \"\"\"\n",
"    base_url = (\n",
"        \"https://www.ncbi.nlm.nih.gov/geo/download/\"\n",
"        \"?acc=GSM4150378&format=file&file=\"\n",
"    )\n",
"    filenames = (\n",
"        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz\",\n",
"        \"GSM4150378_sciPlex3_pData.txt.gz\",\n",
"        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz\",\n",
"    )\n",
"    # One loop instead of three copy-pasted stanzas, so every file gets its own\n",
"    # correctly named destination path.\n",
"    for filename in filenames:\n",
"        download_binary_file(base_url + filename, os.path.join(output_path, filename))\n",
"\n",
"\n",
"def read_srivatsan_2019_sciplex3(file_directory: str) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Read the sciplex3 expression data from Srivatsan et al. 2019 in the given directory.\n",
"\n",
"    Args:\n",
"    ----\n",
"        file_directory: Directory containing Srivatsan et al. 2019 data.\n",
"\n",
"    Returns\n",
"    -------\n",
"        A data frame containing single-cell gene expression counts. The count\n",
"        matrix is stored in triplet format. I.e., each row of the data frame\n",
"        has the format (row, column, count) stored in columns (i, j, x) respectively.\n",
"    \"\"\"\n",
"    matrix_path = os.path.join(\n",
"        file_directory,\n",
"        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz\",\n",
"    )\n",
"    # The file is tab-separated and has no header row, so the triplet column\n",
"    # names are supplied explicitly.\n",
"    with gzip.open(matrix_path, \"rb\") as f:\n",
"        df = pd.read_csv(f, sep=\"\\t\", header=None, names=[\"i\", \"j\", \"x\"])\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d19b7de5",
"metadata": {},
"outputs": [],
"source": [
"download_path = \"./srivatsan_2019_sciplex3\"\n",
"\n",
"os.makedirs(download_path, exist_ok=True)\n",
"download_srivatsan_2019_sciplex3(download_path)\n",
"df = read_srivatsan_2019_sciplex3(download_path)\n",
"\n",
"# The Srivatsan count data is in a sparse triplet format represented\n",
"# by three columns 'i', 'j', and 'x'. 'i' refers to a row number, 'j' refers to\n",
"# a column number, and 'x' refers to a count value.\n",
"counts = df[\"x\"]\n",
"# Indices were originally 1-base-indexed --> switch to 0-base-indexing\n",
"rows = df[\"i\"] - 1\n",
"cols = df[\"j\"] - 1\n",
"\n",
"# This dataset is large enough that we need to store it as a scipy sparse matrix\n",
"# for preprocessing (>600 GB in RAM as a dense matrix).\n",
"# Series.max() is vectorized; the builtin max() would walk every stored entry\n",
"# in a Python-level loop.\n",
"count_matrix = csr_matrix(\n",
"    (counts.values, (rows.values, cols.values)),\n",
"    shape=(int(rows.max()) + 1, int(cols.max()) + 1),\n",
")\n",
"\n",
"# Switch matrix from gene rows and cell columns to cell rows and gene columns\n",
"count_matrix = count_matrix.T\n",
"\n",
"cell_metadata = pd.read_csv(\n",
"    os.path.join(\n",
"        download_path,\n",
"        \"GSM4150378_sciPlex3_pData.txt.gz\",\n",
"    ),\n",
"    sep=\" \",\n",
")\n",
"\n",
"gene_metadata = pd.read_csv(\n",
"    os.path.join(\n",
"        download_path,\n",
"        \"GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz\",\n",
"    ),\n",
"    sep=\" \",\n",
"    index_col=0,\n",
")\n",
"\n",
"# The gene list contains both mouse and human genes due to quirks\n",
"# in how the authors saved their data. We only care about human genes\n",
"# (since the cell lines we're using are from humans), so we discard\n",
"# the mouse genes. The human genes come before the mouse genes in the\n",
"# data, so we can just compute the number of human genes (x) and then subset\n",
"# the data to the first x genes.\n",
"num_human_genes = sum(\"ENSG\" in x for x in gene_metadata.index.values)\n",
"count_matrix = count_matrix[:, :num_human_genes]\n",
"gene_metadata = gene_metadata.head(num_human_genes)\n",
"\n",
"adata = AnnData(X=count_matrix, obs=cell_metadata, var=gene_metadata)\n",
"\n",
"# Filter out cells for which we don't have data. The explicit copy turns the\n",
"# filtered view into a standalone object; assigning to `.obs` of a view below\n",
"# would otherwise make anndata copy implicitly and emit a warning.\n",
"has_annotations = (\n",
"    adata.obs[\"cell_type\"].notna() & adata.obs[\"product_name\"].notna()\n",
")\n",
"adata = adata[has_annotations].copy()\n",
"\n",
"# For readability, keep only the first space-separated token of each product name\n",
"adata.obs[\"product_name\"] = [x.split(\" \")[0] for x in adata.obs[\"product_name\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "13f04aba",
"metadata": {},
"outputs": [],
"source": [
"adata.write_h5ad(\"Srivatsan_2019_sciplex3_raw.h5ad\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f87d3ed4-e87d-4608-bec9-a9e745d483c1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}