From 9ffa1209c161ca5c0009e1be958433289b20433f Mon Sep 17 00:00:00 2001 From: jbloom Date: Wed, 4 Dec 2024 07:40:56 +0900 Subject: [PATCH 1/9] add `PolyclonalCollection.mut_escape_df_replicates_by_region` --- polyclonal/polyclonal_collection.py | 35 +++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index 748bc17..8524a99 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -583,6 +583,41 @@ def mut_escape_df_replicates(self): ignore_index=True, ) + def mut_escape_df_replicates_by_region(self, desc_name, model_sites): + """Get mutation escape by model only keeping specific sites per model. + + Parameters + ---------- + desc_name : str + Descriptor in :attr:`PolyclonalCollection.model_descriptors` used to + identify models. + model_sites : dict + Keyed by the descriptor identifying each model, with values + being list or set of the sites to keep for that model. All sites + must be in :attr:`PolyclonalCollection.sites`. + + Returns + ------- + pandas.DataFrame + Mutation escape by model, only keeping indicated sites for each model. 
+ + """ + if desc_name not in self.model_descriptors: + raise ValueError(f"{desc_name=} not in {self.model_descriptors.keys()=}") + descs = self.model_descriptors[desc_name] + if set(model_sites) != set(descs): + raise ValueError(f"{model_sites.keys()=} differs from {descs=}") + dfs = [] + for model, desc, descriptors in zip(self.models, descs, self.model_descriptors): + sites_to_keep = set(self.model_sites[desc]) + invalid_sites = sorted(sites_to_keep - set(self.sites)) + if invalid_sites: + raise ValueError(f"invalid sites in `model_sites`:\n{invalid_sites}") + dfs.append( + model.mut_escape_df.query("site in @sites_to_keep").assign(**descriptors) + ) + return pd.concat(dfs, ignore_index=True) + def mut_icXX_df_replicates(self, **kwargs): """Get data frame of ICXX and log fold change for each mutation by model. From e9a9764d01a9220245c3a9ce1b2af14dd5937b49 Mon Sep 17 00:00:00 2001 From: jbloom Date: Thu, 5 Dec 2024 08:11:17 +0900 Subject: [PATCH 2/9] fix to plotting to work w numpy 2 --- CHANGELOG.rst | 4 ++++ polyclonal/plot.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fae328c..6dda33d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,10 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog `_. +6.13 +---- +- Fixed ``altair`` plots to work with ``numpy`` version 2, which caused problems in some cases apparently due to a data type conversion issue. + 6.12 ---- - Switch to using ``scipy.sparse`` arrays rather than matrices to keep up with `this `_ `change `_ to ``binarymap`` (now require ``binarymap`` >= 0.7). 
diff --git a/polyclonal/plot.py b/polyclonal/plot.py index 0a1d735..4d4f829 100644 --- a/polyclonal/plot.py +++ b/polyclonal/plot.py @@ -945,7 +945,7 @@ def replace_std(col): base_chart = base_chart.transform_calculate( _stat_not_hidden=alt.expr.if_( alt.datum["_stat_hide"], - data_df[stat_col].min(), + float(data_df[stat_col].min()), alt.datum["_stat"], ), ).transform_joinaggregate( From 81fd83a6587abe366d9d36cad2c24d4d8ed355cd Mon Sep 17 00:00:00 2001 From: jbloom Date: Thu, 5 Dec 2024 13:54:35 -0800 Subject: [PATCH 3/9] start adding regions as param to `PolyclonalCollection` --- notebooks/RBD_average.ipynb | 602 +++++++++++++++------------- polyclonal/polyclonal_collection.py | 63 ++- 2 files changed, 375 insertions(+), 290 deletions(-) diff --git a/notebooks/RBD_average.ipynb b/notebooks/RBD_average.ipynb index 684e938..11e3dd3 100644 --- a/notebooks/RBD_average.ipynb +++ b/notebooks/RBD_average.ipynb @@ -29,13 +29,6 @@ "execution_count": 1, "id": "b242df9c", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:36:10.732205Z", - "iopub.status.busy": "2023-10-17T03:36:10.731345Z", - "iopub.status.idle": "2023-10-17T03:36:19.105369Z", - "shell.execute_reply": "2023-10-17T03:36:19.103991Z", - "shell.execute_reply.started": "2023-10-17T03:36:10.732176Z" - }, "tags": [] }, "outputs": [], @@ -80,13 +73,6 @@ "execution_count": 2, "id": "1f86179e", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:36:19.113583Z", - "iopub.status.busy": "2023-10-17T03:36:19.113323Z", - "iopub.status.idle": "2023-10-17T03:37:53.053430Z", - "shell.execute_reply": "2023-10-17T03:37:53.052114Z", - "shell.execute_reply.started": "2023-10-17T03:36:19.113556Z" - }, "tags": [] }, "outputs": [], @@ -124,13 +110,6 @@ "execution_count": 3, "id": "d6e2060b", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:53.064005Z", - "iopub.status.busy": "2023-10-17T03:37:53.063722Z", - "iopub.status.idle": "2023-10-17T03:37:53.084236Z", - 
"shell.execute_reply": "2023-10-17T03:37:53.083326Z", - "shell.execute_reply.started": "2023-10-17T03:37:53.063976Z" - }, "tags": [] }, "outputs": [ @@ -224,15 +203,7 @@ "cell_type": "code", "execution_count": 4, "id": "8bb91e55", - "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:53.099272Z", - "iopub.status.busy": "2023-10-17T03:37:53.098847Z", - "iopub.status.idle": "2023-10-17T03:37:54.336999Z", - "shell.execute_reply": "2023-10-17T03:37:54.336120Z", - "shell.execute_reply.started": "2023-10-17T03:37:53.099239Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "model_avg = polyclonal.PolyclonalAverage(models_df)" @@ -250,43 +221,36 @@ "cell_type": "code", "execution_count": 5, "id": "7608fa8b", - "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.344087Z", - "iopub.status.busy": "2023-10-17T03:37:54.343758Z", - "iopub.status.idle": "2023-10-17T03:37:54.636492Z", - "shell.execute_reply": "2023-10-17T03:37:54.635913Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.344065Z" - } - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -360,13 +325,6 @@ "execution_count": 6, "id": "3f07294f", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.642919Z", - "iopub.status.busy": "2023-10-17T03:37:54.642732Z", - "iopub.status.idle": "2023-10-17T03:37:54.774113Z", - "shell.execute_reply": "2023-10-17T03:37:54.773524Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.642905Z" - }, "tags": [] }, "outputs": [ @@ -375,28 +333,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -469,13 +429,6 @@ "execution_count": 7, "id": "acf519c5", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.782955Z", - "iopub.status.busy": "2023-10-17T03:37:54.782650Z", - "iopub.status.idle": "2023-10-17T03:37:55.229815Z", - "shell.execute_reply": "2023-10-17T03:37:55.229150Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.782932Z" - }, "tags": [] }, "outputs": [ @@ -522,10 +475,10 @@ " N\n", " A\n", " N331A\n", - " -0.066612\n", - " -0.031718\n", - " 0.026679\n", - " 0.324873\n", + " -0.067564\n", + " -0.030975\n", + " 0.021971\n", + " 0.328544\n", " 4\n", " 17.75\n", " 1.0\n", @@ -537,10 +490,10 @@ " N\n", " D\n", " N331D\n", - " -0.098052\n", - " -0.100283\n", - " -0.071111\n", - " 0.022076\n", + " -0.090326\n", + " -0.083941\n", + " -0.074902\n", + " 0.020104\n", " 4\n", " 11.25\n", " 1.0\n", @@ -552,10 +505,10 @@ " N\n", " E\n", " N331E\n", - " -0.061359\n", - " -0.008731\n", - " -0.001655\n", - " 0.112576\n", + " -0.066548\n", + " -0.009506\n", + " -0.001983\n", + " 0.119161\n", " 4\n", " 10.25\n", " 1.0\n", @@ -567,10 +520,10 @@ " N\n", " F\n", " N331F\n", - " 0.270243\n", - " 0.170869\n", - " 0.014552\n", - " 0.330315\n", + " 0.283373\n", + " 0.192162\n", + " 0.018695\n", + " 0.333196\n", " 4\n", " 10.00\n", " 1.0\n", @@ -582,10 +535,10 @@ " N\n", " G\n", " N331G\n", - " 0.186271\n", - " 0.164941\n", - " 0.031369\n", - " 0.232272\n", + " 0.193667\n", + " 0.177908\n", + " -0.026998\n", + " 0.221850\n", " 4\n", " 25.00\n", " 1.0\n", @@ -612,10 +565,10 @@ " T\n", " R\n", " T531R\n", - " -0.176471\n", - " -0.112467\n", - " -0.033899\n", - " 0.187640\n", + " -0.174502\n", + " -0.097747\n", + " -0.026822\n", + " 0.210255\n", " 4\n", " 27.00\n", " 1.0\n", @@ -627,10 +580,10 @@ " T\n", " S\n", " T531S\n", - " -0.075136\n", - " -0.011741\n", - " 0.000405\n", - " 0.155037\n", + " -0.082028\n", + " -0.014567\n", + " 0.005165\n", + " 0.148563\n", " 4\n", " 31.75\n", " 1.0\n", @@ -642,10 +595,10 @@ " 
T\n", " V\n", " T531V\n", - " -0.041719\n", - " -0.005589\n", - " 0.012383\n", - " 0.094055\n", + " -0.042999\n", + " -0.007213\n", + " 0.018650\n", + " 0.103766\n", " 4\n", " 19.50\n", " 1.0\n", @@ -657,10 +610,10 @@ " T\n", " W\n", " T531W\n", - " 0.169718\n", - " 0.100366\n", - " -0.004133\n", - " 0.266617\n", + " 0.163447\n", + " 0.088392\n", + " -0.005205\n", + " 0.276730\n", " 4\n", " 5.25\n", " 1.0\n", @@ -672,10 +625,10 @@ " T\n", " Y\n", " T531Y\n", - " -0.051762\n", - " -0.050223\n", - " -0.032461\n", - " 0.020582\n", + " -0.052903\n", + " -0.055342\n", + " -0.032529\n", + " 0.017376\n", " 4\n", " 11.75\n", " 1.0\n", @@ -687,30 +640,30 @@ ], "text/plain": [ " epitope site wildtype mutant mutation escape_mean escape_median \\\n", - "0 1 331 N A N331A -0.066612 -0.031718 \n", - "1 1 331 N D N331D -0.098052 -0.100283 \n", - "2 1 331 N E N331E -0.061359 -0.008731 \n", - "3 1 331 N F N331F 0.270243 0.170869 \n", - "4 1 331 N G N331G 0.186271 0.164941 \n", + "0 1 331 N A N331A -0.067564 -0.030975 \n", + "1 1 331 N D N331D -0.090326 -0.083941 \n", + "2 1 331 N E N331E -0.066548 -0.009506 \n", + "3 1 331 N F N331F 0.283373 0.192162 \n", + "4 1 331 N G N331G 0.193667 0.177908 \n", "... ... ... ... ... ... ... ... 
\n", - "3859 2 531 T R T531R -0.176471 -0.112467 \n", - "3860 2 531 T S T531S -0.075136 -0.011741 \n", - "3861 2 531 T V T531V -0.041719 -0.005589 \n", - "3862 2 531 T W T531W 0.169718 0.100366 \n", - "3863 2 531 T Y T531Y -0.051762 -0.050223 \n", + "3859 2 531 T R T531R -0.174502 -0.097747 \n", + "3860 2 531 T S T531S -0.082028 -0.014567 \n", + "3861 2 531 T V T531V -0.042999 -0.007213 \n", + "3862 2 531 T W T531W 0.163447 0.088392 \n", + "3863 2 531 T Y T531Y -0.052903 -0.055342 \n", "\n", " escape_min_magnitude escape_std n_models times_seen frac_models \n", - "0 0.026679 0.324873 4 17.75 1.0 \n", - "1 -0.071111 0.022076 4 11.25 1.0 \n", - "2 -0.001655 0.112576 4 10.25 1.0 \n", - "3 0.014552 0.330315 4 10.00 1.0 \n", - "4 0.031369 0.232272 4 25.00 1.0 \n", + "0 0.021971 0.328544 4 17.75 1.0 \n", + "1 -0.074902 0.020104 4 11.25 1.0 \n", + "2 -0.001983 0.119161 4 10.25 1.0 \n", + "3 0.018695 0.333196 4 10.00 1.0 \n", + "4 -0.026998 0.221850 4 25.00 1.0 \n", "... ... ... ... ... ... \n", - "3859 -0.033899 0.187640 4 27.00 1.0 \n", - "3860 0.000405 0.155037 4 31.75 1.0 \n", - "3861 0.012383 0.094055 4 19.50 1.0 \n", - "3862 -0.004133 0.266617 4 5.25 1.0 \n", - "3863 -0.032461 0.020582 4 11.75 1.0 \n", + "3859 -0.026822 0.210255 4 27.00 1.0 \n", + "3860 0.005165 0.148563 4 31.75 1.0 \n", + "3861 0.018650 0.103766 4 19.50 1.0 \n", + "3862 -0.005205 0.276730 4 5.25 1.0 \n", + "3863 -0.032529 0.017376 4 11.75 1.0 \n", "\n", "[3864 rows x 12 columns]" ] @@ -738,15 +691,7 @@ "cell_type": "code", "execution_count": 8, "id": "2c642238-9278-4a81-a668-5df15a336f5c", - "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:55.236631Z", - "iopub.status.busy": "2023-10-17T03:37:55.236390Z", - "iopub.status.idle": "2023-10-17T03:37:58.562096Z", - "shell.execute_reply": "2023-10-17T03:37:58.561273Z", - "shell.execute_reply.started": "2023-10-17T03:37:55.236612Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -787,10 +732,10 @@ " 331\n", " N\n", " A\n", - 
" -0.159587\n", - " -0.206684\n", - " -0.135967\n", - " 0.257279\n", + " -0.164860\n", + " -0.194622\n", + " -0.119023\n", + " 0.278079\n", " 4\n", " 17.75\n", " 1.0\n", @@ -800,10 +745,10 @@ " 331\n", " N\n", " D\n", - " -0.086087\n", - " -0.054268\n", - " -0.002643\n", - " 0.101813\n", + " -0.094874\n", + " -0.051912\n", + " -0.011410\n", + " 0.115779\n", " 4\n", " 11.25\n", " 1.0\n", @@ -813,10 +758,10 @@ " 331\n", " N\n", " E\n", - " 0.034466\n", - " 0.016051\n", - " 0.015768\n", - " 0.067786\n", + " 0.032633\n", + " 0.015820\n", + " 0.011409\n", + " 0.059330\n", " 4\n", " 10.25\n", " 1.0\n", @@ -826,10 +771,10 @@ " 331\n", " N\n", " F\n", - " 0.106040\n", - " 0.142322\n", - " 0.141842\n", - " 0.287263\n", + " 0.098268\n", + " 0.139888\n", + " 0.115545\n", + " 0.286302\n", " 4\n", " 10.00\n", " 1.0\n", @@ -839,10 +784,10 @@ " 331\n", " N\n", " G\n", - " 0.283214\n", - " 0.282422\n", - " 0.088191\n", - " 0.163483\n", + " 0.294765\n", + " 0.283074\n", + " 0.122967\n", + " 0.156693\n", " 4\n", " 25.00\n", " 1.0\n", @@ -865,10 +810,10 @@ " 531\n", " T\n", " S\n", - " -0.068568\n", - " -0.016541\n", - " -0.004995\n", - " 0.144948\n", + " -0.069089\n", + " -0.020452\n", + " -0.012045\n", + " 0.138868\n", " 4\n", " 31.75\n", " 1.0\n", @@ -891,10 +836,10 @@ " 531\n", " T\n", " V\n", - " 0.013326\n", - " -0.002602\n", - " 0.039361\n", - " 0.236514\n", + " 0.008317\n", + " -0.010543\n", + " 0.031919\n", + " 0.243751\n", " 4\n", " 19.50\n", " 1.0\n", @@ -904,10 +849,10 @@ " 531\n", " T\n", " W\n", - " 0.199249\n", - " 0.119866\n", - " -0.009133\n", - " 0.287019\n", + " 0.193380\n", + " 0.109259\n", + " -0.008928\n", + " 0.292706\n", " 4\n", " 5.25\n", " 1.0\n", @@ -917,10 +862,10 @@ " 531\n", " T\n", " Y\n", - " -0.021049\n", - " -0.048014\n", - " -0.044700\n", - " 0.124192\n", + " -0.017727\n", + " -0.049556\n", + " -0.045506\n", + " 0.123266\n", " 4\n", " 11.75\n", " 1.0\n", @@ -932,43 +877,43 @@ ], "text/plain": [ " site wildtype mutant log2 fold change IC90 mean 
\\\n", - "0 331 N A -0.159587 \n", - "1 331 N D -0.086087 \n", - "2 331 N E 0.034466 \n", - "3 331 N F 0.106040 \n", - "4 331 N G 0.283214 \n", + "0 331 N A -0.164860 \n", + "1 331 N D -0.094874 \n", + "2 331 N E 0.032633 \n", + "3 331 N F 0.098268 \n", + "4 331 N G 0.294765 \n", "... ... ... ... ... \n", - "2100 531 T S -0.068568 \n", + "2100 531 T S -0.069089 \n", "2101 531 T T 0.000000 \n", - "2102 531 T V 0.013326 \n", - "2103 531 T W 0.199249 \n", - "2104 531 T Y -0.021049 \n", + "2102 531 T V 0.008317 \n", + "2103 531 T W 0.193380 \n", + "2104 531 T Y -0.017727 \n", "\n", " log2 fold change IC90 median log2 fold change IC90 min_magnitude \\\n", - "0 -0.206684 -0.135967 \n", - "1 -0.054268 -0.002643 \n", - "2 0.016051 0.015768 \n", - "3 0.142322 0.141842 \n", - "4 0.282422 0.088191 \n", + "0 -0.194622 -0.119023 \n", + "1 -0.051912 -0.011410 \n", + "2 0.015820 0.011409 \n", + "3 0.139888 0.115545 \n", + "4 0.283074 0.122967 \n", "... ... ... \n", - "2100 -0.016541 -0.004995 \n", + "2100 -0.020452 -0.012045 \n", "2101 0.000000 0.000000 \n", - "2102 -0.002602 0.039361 \n", - "2103 0.119866 -0.009133 \n", - "2104 -0.048014 -0.044700 \n", + "2102 -0.010543 0.031919 \n", + "2103 0.109259 -0.008928 \n", + "2104 -0.049556 -0.045506 \n", "\n", " log2 fold change IC90 std n_models times_seen frac_models \n", - "0 0.257279 4 17.75 1.0 \n", - "1 0.101813 4 11.25 1.0 \n", - "2 0.067786 4 10.25 1.0 \n", - "3 0.287263 4 10.00 1.0 \n", - "4 0.163483 4 25.00 1.0 \n", + "0 0.278079 4 17.75 1.0 \n", + "1 0.115779 4 11.25 1.0 \n", + "2 0.059330 4 10.25 1.0 \n", + "3 0.286302 4 10.00 1.0 \n", + "4 0.156693 4 25.00 1.0 \n", "... ... ... ... ... 
\n", - "2100 0.144948 4 31.75 1.0 \n", + "2100 0.138868 4 31.75 1.0 \n", "2101 0.000000 4 NaN 1.0 \n", - "2102 0.236514 4 19.50 1.0 \n", - "2103 0.287019 4 5.25 1.0 \n", - "2104 0.124192 4 11.75 1.0 \n", + "2102 0.243751 4 19.50 1.0 \n", + "2103 0.292706 4 5.25 1.0 \n", + "2104 0.123266 4 11.75 1.0 \n", "\n", "[2105 rows x 10 columns]" ] @@ -1000,15 +945,7 @@ "cell_type": "code", "execution_count": 9, "id": "b882943f", - "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:58.569000Z", - "iopub.status.busy": "2023-10-17T03:37:58.568790Z", - "iopub.status.idle": "2023-10-17T03:37:58.646779Z", - "shell.execute_reply": "2023-10-17T03:37:58.646098Z", - "shell.execute_reply.started": "2023-10-17T03:37:58.568984Z" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -1050,7 +987,7 @@ " N\n", " A\n", " N331A\n", - " -0.090115\n", + " -0.083922\n", " 19\n", " avg2muts\n", " 1\n", @@ -1062,7 +999,7 @@ " N\n", " D\n", " N331D\n", - " -0.110795\n", + " -0.076938\n", " 10\n", " avg2muts\n", " 1\n", @@ -1074,7 +1011,7 @@ " N\n", " E\n", " N331E\n", - " -0.001655\n", + " -0.007855\n", " 11\n", " avg2muts\n", " 1\n", @@ -1086,7 +1023,7 @@ " N\n", " F\n", " N331F\n", - " 0.037305\n", + " 0.039737\n", " 10\n", " avg2muts\n", " 1\n", @@ -1098,7 +1035,7 @@ " N\n", " G\n", " N331G\n", - " 0.031369\n", + " 0.045873\n", " 18\n", " avg2muts\n", " 1\n", @@ -1122,7 +1059,7 @@ " T\n", " R\n", " T531R\n", - " -0.033899\n", + " -0.033279\n", " 24\n", " avg3muts\n", " 2\n", @@ -1134,7 +1071,7 @@ " T\n", " S\n", " T531S\n", - " 0.000405\n", + " -0.005251\n", " 42\n", " avg3muts\n", " 2\n", @@ -1146,7 +1083,7 @@ " T\n", " V\n", " T531V\n", - " -0.023562\n", + " -0.033077\n", " 24\n", " avg3muts\n", " 2\n", @@ -1158,7 +1095,7 @@ " T\n", " W\n", " T531W\n", - " 0.532476\n", + " 0.546467\n", " 6\n", " avg3muts\n", " 2\n", @@ -1170,7 +1107,7 @@ " T\n", " Y\n", " T531Y\n", - " -0.074142\n", + " -0.068398\n", " 9\n", " avg3muts\n", " 2\n", @@ -1182,17 +1119,17 @@ ], 
"text/plain": [ " epitope site wildtype mutant mutation escape times_seen library \\\n", - "0 1 331 N A N331A -0.090115 19 avg2muts \n", - "1 1 331 N D N331D -0.110795 10 avg2muts \n", - "2 1 331 N E N331E -0.001655 11 avg2muts \n", - "3 1 331 N F N331F 0.037305 10 avg2muts \n", - "4 1 331 N G N331G 0.031369 18 avg2muts \n", + "0 1 331 N A N331A -0.083922 19 avg2muts \n", + "1 1 331 N D N331D -0.076938 10 avg2muts \n", + "2 1 331 N E N331E -0.007855 11 avg2muts \n", + "3 1 331 N F N331F 0.039737 10 avg2muts \n", + "4 1 331 N G N331G 0.045873 18 avg2muts \n", "... ... ... ... ... ... ... ... ... \n", - "15429 2 531 T R T531R -0.033899 24 avg3muts \n", - "15430 2 531 T S T531S 0.000405 42 avg3muts \n", - "15431 2 531 T V T531V -0.023562 24 avg3muts \n", - "15432 2 531 T W T531W 0.532476 6 avg3muts \n", - "15433 2 531 T Y T531Y -0.074142 9 avg3muts \n", + "15429 2 531 T R T531R -0.033279 24 avg3muts \n", + "15430 2 531 T S T531S -0.005251 42 avg3muts \n", + "15431 2 531 T V T531V -0.033077 24 avg3muts \n", + "15432 2 531 T W T531W 0.546467 6 avg3muts \n", + "15433 2 531 T Y T531Y -0.068398 9 avg3muts \n", "\n", " replicate \n", "0 1 \n", @@ -1235,13 +1172,6 @@ "execution_count": 10, "id": "b085a23f", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:37:58.653230Z", - "iopub.status.busy": "2023-10-17T03:37:58.653032Z", - "iopub.status.idle": "2023-10-17T03:38:01.042319Z", - "shell.execute_reply": "2023-10-17T03:38:01.041346Z", - "shell.execute_reply.started": "2023-10-17T03:37:58.653215Z" - }, "tags": [] }, "outputs": [ @@ -1250,28 +1180,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1343,13 +1275,6 @@ "execution_count": 11, "id": "6afa045f-4a62-4b4d-8285-4b550823a022", "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:38:01.045838Z", - "iopub.status.busy": "2023-10-17T03:38:01.045550Z", - "iopub.status.idle": "2023-10-17T03:38:03.419332Z", - "shell.execute_reply": "2023-10-17T03:38:03.418503Z", - "shell.execute_reply.started": "2023-10-17T03:38:01.045822Z" - }, "tags": [] }, "outputs": [ @@ -1358,28 +1283,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1452,27 +1379,87 @@ "cell_type": "code", "execution_count": 12, "id": "41957bd8-070a-41d7-b2f9-647eb7908a87", - "metadata": { - "execution": { - "iopub.execute_input": "2023-10-17T03:38:03.420623Z", - "iopub.status.busy": "2023-10-17T03:38:03.420289Z", - "iopub.status.idle": "2023-10-17T03:38:11.140080Z", - "shell.execute_reply": "2023-10-17T03:38:11.139368Z", - "shell.execute_reply.started": "2023-10-17T03:38:03.420605Z" - } - }, + "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "'addtl_slider_stats_as_max'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# NBVAL_IGNORE_OUTPUT\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mmodel_avg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmut_icXX_plot\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43maddtl_slider_stats\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtimes_seen\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mavg_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedian\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/polyclonal/polyclonal/polyclonal_collection.py:1174\u001b[0m, in \u001b[0;36mPolyclonalCollection.mut_icXX_plot\u001b[0;34m(self, x, icXX_col, log_fold_change_icXX_col, min_c, max_c, logbase, check_wt_icXX, biochem_order_aas, df_to_merge, positive_color, negative_color, 
avg_type, init_n_models, per_model_tooltip, scale_stat_col, **kwargs)\u001b[0m\n\u001b[1;32m 1172\u001b[0m max_escape_std \u001b[38;5;241m=\u001b[39m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_df\u001b[39m\u001b[38;5;124m\"\u001b[39m][std_col]\u001b[38;5;241m.\u001b[39mmax()\n\u001b[1;32m 1173\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maddtl_slider_stats\u001b[39m\u001b[38;5;124m\"\u001b[39m][std_col] \u001b[38;5;241m=\u001b[39m max_escape_std\n\u001b[0;32m-> 1174\u001b[0m \u001b[43mkwargs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maddtl_slider_stats_as_max\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mappend(std_col)\n\u001b[1;32m 1176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheatmap_min_at_least\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m kwargs:\n\u001b[1;32m 1177\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheatmap_min_at_least\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m\n", - "\u001b[0;31mKeyError\u001b[0m: 'addtl_slider_stats_as_max'" - ] + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1484,10 +1471,47 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "d88f1d70-b6cd-4732-804e-54e380544b02", + "metadata": {}, + "source": [ + "## Escape values by region\n", + "In some cases, you may want to only get the escape values for a specific region of the protein for each model being averaged.\n", + "For instance, this may be the case if you covered half the protein in one library and the other half in the other library.\n", + "\n", + "There are functions to get the values by region.\n", + "Specifically:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c9b166d0-714d-4e74-b1ff-520f958c952c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_avg.models" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "73a195d1-4f21-4c25-a147-fdb51118e2c8", + "id": "096412a3-c925-453d-95c2-3d405c9e5c1d", "metadata": {}, "outputs": [], "source": [] @@ -1509,7 +1533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.7" } }, "nbformat": 4, diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index 8524a99..b1e3d70 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -268,6 +268,12 @@ class PolyclonalCollection: for each row must be unique. default_avg_to_plot : {"mean", "median"} By default when plotting, plot either "mean" or "median". 
+ region_col : None or str + Use this option if you want to only include sites in a specific region of the + protein for specific models (this is useful for instance if you split the + protein into halves in two different libraries). In this case, `region_col` + should be a column in `models_df` with the values being the list of sites + to use for that specific model. Attributes ---------- @@ -299,6 +305,16 @@ class PolyclonalCollection: All sites for which the model is defined. default_avg_to_plot : {"mean", "median"} By default when plotting, plot either "mean" or "median". + regions : list + List of same length as :attr:`PolyclonalCollection.models` with each entry being + the set of sites that are being used in returned results for that model. If + `region_col` is `None`, this is all sites (:attr:`PolyclonalCollection.sites`), + but if `region_col` is used to define regions for different models then + the different sets of sites may differ for models. + n_models_by_site : dict + Keyed by each site in :attr:`PolyclonalCollection.sites`, with the value being + the number of models for which that site is in region for that model (this will + just be the number of models when not using `region_col`). Example ------- @@ -327,10 +343,35 @@ class PolyclonalCollection: ... ) >>> model_collection.sites (3, 5, 6) + >>> model_collection.regions == [{3, 5, 6}, {3, 5, 6}] + True + >>> model_collection.n_models_by_site + {3: 2, 5: 2, 6: 2} + + Now create a toy example with different regions for each model: + + >>> models_df = pd.DataFrame( + ... { + ... "model": [ + ... polyclonal.Polyclonal(data_to_fit=data_to_fit, n_epitopes=1), + ... polyclonal.Polyclonal(data_to_fit=data_to_fit2, n_epitopes=1), + ... ], + ... "description": ["model_1", "model_2"], + ... "region": [[3, 5], [3, 5, 6]], + ... } + ... ) + >>> model_region = polyclonal.PolyclonalCollection( + ... models_df, default_avg_to_plot="mean", region_col="region", + ... 
) + >>> model_region.sites + (3, 5, 6) + >>> assert model_region.regions == [{3, 5}, {3, 5, 6}], model_region.regions + >>> model_region.n_models_by_site + {3: 2, 5: 2, 6: 1} """ - def __init__(self, models_df, *, default_avg_to_plot): + def __init__(self, models_df, *, default_avg_to_plot, region_col=None): """See main class docstring for details.""" if default_avg_to_plot not in {"mean", "median"}: raise ValueError(f"invalid {default_avg_to_plot=}") @@ -341,6 +382,13 @@ def __init__(self, models_df, *, default_avg_to_plot): raise ValueError(f"No models:\n{models_df=}") descriptors_df = models_df.drop(columns="model").reset_index(drop=True) + + if region_col is not None: + if region_col not in models_df.columns: + raise ValueError(f"{region_col=} not in {models_df.columns=}") + self.regions = [set(region) for region in models_df["region"]] + descriptors_df = descriptors_df.drop(columns=region_col) + if not len(descriptors_df.columns): raise ValueError("not descriptor columns in `models_df`") self.descriptor_names = descriptors_df.columns.tolist() @@ -349,6 +397,8 @@ def __init__(self, models_df, *, default_avg_to_plot): for name in self.descriptor_names if descriptors_df[name].nunique(dropna=False) > 1 ] + if not len(self.unique_descriptor_names): + raise ValueError("no `unique_descriptor_names`") if len(descriptors_df.drop_duplicates()) != len(self.models): raise ValueError("some models have the same descriptors") self.model_descriptors = list(descriptors_df.to_dict(orient="index").values()) @@ -372,6 +422,17 @@ def __init__(self, models_df, *, default_avg_to_plot): assert all(isinstance(r, int) for r in sites), sites self.sites = tuple(sites) + if region_col is None: + self.regions = [set(self.sites) for _ in self.models] + for i, region in enumerate(self.regions): + if not region.issubset(self.sites): + raise ValueError(f"for model {i + 1}, {region - set(self.sites)=}") + + self.n_models_by_site = { + site: sum(site in region for region in self.regions) + for 
site in self.sites + } + @property def activity_wt_df_replicates(self): """pandas.DataFrame: Epitope activities for all models.""" From 6296e34b7e53f0005837a22413c01df7e8f0277b Mon Sep 17 00:00:00 2001 From: jbloom Date: Fri, 13 Dec 2024 15:04:31 -0800 Subject: [PATCH 4/9] finish adding regions as param to `PolyclonalCollection` --- polyclonal/polyclonal_collection.py | 102 ++++++++++++++-------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index b1e3d70..7d24174 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -386,8 +386,11 @@ def __init__(self, models_df, *, default_avg_to_plot, region_col=None): if region_col is not None: if region_col not in models_df.columns: raise ValueError(f"{region_col=} not in {models_df.columns=}") - self.regions = [set(region) for region in models_df["region"]] + self.regions = [set(region) for region in models_df[region_col]] descriptors_df = descriptors_df.drop(columns=region_col) + self._has_regions = True + else: + self._has_regions = False if not len(descriptors_df.columns): raise ValueError("not descriptor columns in `models_df`") @@ -429,8 +432,7 @@ def __init__(self, models_df, *, default_avg_to_plot, region_col=None): raise ValueError(f"for model {i + 1}, {region - set(self.sites)=}") self.n_models_by_site = { - site: sum(site in region for region in self.regions) - for site in self.sites + site: sum(site in region for region in self.regions) for site in self.sites } @property @@ -638,47 +640,14 @@ def mut_escape_df_replicates(self): """pandas.DataFrame: Mutation escape by model.""" return pd.concat( [ - m.mut_escape_df.assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_escape_df.query("site in @sites").assign(**desc) + for m, desc, sites, in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) - def 
mut_escape_df_replicates_by_region(self, desc_name, model_sites): - """Get mutation escape by model only keeping specific sites per model. - - Parameters - ---------- - desc_name : str - Descriptor in :attr:`PolyclonalCollection.model_descriptors` used to - identify models. - model_sites : dict - Keyed by the descriptor identifying each model, with values - being list or set of the sites to keep for that model. All sites - must be in :attr:`PolyclonalCollection.sites`. - - Returns - ------- - pandas.DataFrame - Mutation escape by model, only keeping indicated sites for each model. - - """ - if desc_name not in self.model_descriptors: - raise ValueError(f"{desc_name=} not in {self.model_descriptors.keys()=}") - descs = self.model_descriptors[desc_name] - if set(model_sites) != set(descs): - raise ValueError(f"{model_sites.keys()=} differs from {descs=}") - dfs = [] - for model, desc, descriptors in zip(self.models, descs, self.model_descriptors): - sites_to_keep = set(self.model_sites[desc]) - invalid_sites = sorted(sites_to_keep - set(self.sites)) - if invalid_sites: - raise ValueError(f"invalid sites in `model_sites`:\n{invalid_sites}") - dfs.append( - model.mut_escape_df.query("site in @sites_to_keep").assign(**descriptors) - ) - return pd.concat(dfs, ignore_index=True) - def mut_icXX_df_replicates(self, **kwargs): """Get data frame of ICXX and log fold change for each mutation by model. 
@@ -695,8 +664,10 @@ def mut_icXX_df_replicates(self, **kwargs): """ return pd.concat( [ - m.mut_icXX_df(**kwargs).assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_icXX_df(**kwargs).query("site in @sites").assign(**desc) + for m, desc, sites in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) @@ -722,7 +693,9 @@ def mut_escape_df(self): ) .aggregate(**aggs) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), # make categorical to sort, then return to original type epitope=lambda x: pd.Categorical( x["epitope"], @@ -781,7 +754,9 @@ def mut_icXX_df(self, **kwargs): df.groupby(["site", "wildtype", "mutant"], as_index=False) .aggregate(**aggs) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), # make categorical to sort, then return to original type site=lambda x: pd.Categorical( x["site"], @@ -1108,7 +1083,7 @@ def mut_escape_plot( ) if init_n_models is None: - init_n_models = int(math.ceil(len(self.models) / 2)) + init_n_models = int(math.ceil(min(self.n_models_by_site.values()) / 2)) if "n_models" not in kwargs["addtl_slider_stats"]: kwargs["addtl_slider_stats"]["n_models"] = init_n_models @@ -1247,7 +1222,7 @@ def mut_icXX_plot( kwargs["addtl_slider_stats_as_max"] = [] if init_n_models is None: - init_n_models = int(math.ceil(len(self.models) / 2)) + init_n_models = int(math.ceil(min(self.n_models_by_site.values()) / 2)) kwargs["addtl_slider_stats"]["n_models"] = init_n_models kwargs["data_df"] = polyclonal.Polyclonal._merge_df_to_merge( @@ -1305,8 +1280,12 @@ def mut_escape_site_summary_df_replicates(self, **kwargs): """ return pd.concat( [ - m.mut_escape_site_summary_df(**kwargs).assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_escape_site_summary_df(**kwargs) + 
.query("site in @sites") + .assign(**desc) + for m, desc, site in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) @@ -1348,7 +1327,9 @@ def mut_escape_site_summary_df(self, **kwargs): n_models=pd.NamedAgg("escape", "count"), ) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), ) .merge( df.groupby(["epitope", "site"]).aggregate({"n mutations": "mean"}), @@ -1396,6 +1377,8 @@ def icXX_replicates(self, variants_df, **kwargs): model are missing in that row. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") return pd.concat( [ m.icXX(m.filter_variants_by_seen_muts(variants_df), **kwargs).assign( @@ -1425,6 +1408,8 @@ def icXX(self, variants_df, **kwargs): icXX and summary stats for each variant across all models. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") if "col" in kwargs: col = kwargs["col"] else: @@ -1474,6 +1459,8 @@ def prob_escape_replicates(self, variants_df, **kwargs): missing in that row. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") return pd.concat( [ m.prob_escape( @@ -1507,6 +1494,8 @@ def prob_escape(self, variants_df, **kwargs): for each variant at each concentration across models. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") variants_df = variants_df.drop_duplicates() return ( self.prob_escape_replicates(variants_df=variants_df, **kwargs) @@ -1535,6 +1524,8 @@ class PolyclonalAverage(PolyclonalCollection): Same meaning as for :class:`PolyclonalCollection`. However, the resulting collection of models will have **copies** of these models rather than the actual objects in `models_df`. + region_col : str or None + Same meaning as for :class:`PolyclonalCollection`. 
harmonize_to : :class:`PolyclonalCollection` or None When harmonizing the epitopes, harmonize to this model. If `None`, just harmonize to the first model in `models_df`. @@ -1548,7 +1539,14 @@ class PolyclonalAverage(PolyclonalCollection): """ - def __init__(self, models_df, *, harmonize_to=None, default_avg_to_plot="median"): + def __init__( + self, + models_df, + *, + region_col=None, + harmonize_to=None, + default_avg_to_plot="median", + ): """See main class docstring.""" if not len(models_df): raise ValueError("no models in `model_df`") @@ -1559,7 +1557,9 @@ def __init__(self, models_df, *, harmonize_to=None, default_avg_to_plot="median" m.epitope_harmonized_model(harmonize_to)[0] for m in models_df["model"] ] - super().__init__(models_df, default_avg_to_plot=default_avg_to_plot) + super().__init__( + models_df, region_col=region_col, default_avg_to_plot=default_avg_to_plot + ) class PolyclonalBootstrap(PolyclonalCollection): From 19bd38485c69345779a6656760f0b50f08a6d3af Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 14 Dec 2024 06:01:21 -0800 Subject: [PATCH 5/9] add region averaging to `RBD_average` notebook --- notebooks/RBD_average.ipynb | 1372 ++++++++++++++++++++++++++++++----- 1 file changed, 1173 insertions(+), 199 deletions(-) diff --git a/notebooks/RBD_average.ipynb b/notebooks/RBD_average.ipynb index 11e3dd3..5d647e8 100644 --- a/notebooks/RBD_average.ipynb +++ b/notebooks/RBD_average.ipynb @@ -29,6 +29,13 @@ "execution_count": 1, "id": "b242df9c", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:53:56.905113Z", + "iopub.status.busy": "2024-12-14T13:53:56.904578Z", + "iopub.status.idle": "2024-12-14T13:54:12.514164Z", + "shell.execute_reply": "2024-12-14T13:54:12.513193Z", + "shell.execute_reply.started": "2024-12-14T13:53:56.905057Z" + }, "tags": [] }, "outputs": [], @@ -73,6 +80,13 @@ "execution_count": 2, "id": "1f86179e", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:54:12.515977Z", + 
"iopub.status.busy": "2024-12-14T13:54:12.515521Z", + "iopub.status.idle": "2024-12-14T13:56:53.731871Z", + "shell.execute_reply": "2024-12-14T13:56:53.729741Z", + "shell.execute_reply.started": "2024-12-14T13:54:12.515943Z" + }, "tags": [] }, "outputs": [], @@ -110,6 +124,13 @@ "execution_count": 3, "id": "d6e2060b", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:53.734807Z", + "iopub.status.busy": "2024-12-14T13:56:53.734217Z", + "iopub.status.idle": "2024-12-14T13:56:53.779530Z", + "shell.execute_reply": "2024-12-14T13:56:53.778507Z", + "shell.execute_reply.started": "2024-12-14T13:56:53.734748Z" + }, "tags": [] }, "outputs": [ @@ -203,7 +224,15 @@ "cell_type": "code", "execution_count": 4, "id": "8bb91e55", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:53.783657Z", + "iopub.status.busy": "2024-12-14T13:56:53.783059Z", + "iopub.status.idle": "2024-12-14T13:56:56.394188Z", + "shell.execute_reply": "2024-12-14T13:56:56.393245Z", + "shell.execute_reply.started": "2024-12-14T13:56:53.783613Z" + } + }, "outputs": [], "source": [ "model_avg = polyclonal.PolyclonalAverage(models_df)" @@ -221,30 +250,38 @@ "cell_type": "code", "execution_count": 5, "id": "7608fa8b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:56.399407Z", + "iopub.status.busy": "2024-12-14T13:56:56.398904Z", + "iopub.status.idle": "2024-12-14T13:56:57.024531Z", + "shell.execute_reply": "2024-12-14T13:56:57.023823Z", + "shell.execute_reply.started": "2024-12-14T13:56:56.399359Z" + } + }, "outputs": [ { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -325,6 +362,13 @@ "execution_count": 6, "id": "3f07294f", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:57.025954Z", + "iopub.status.busy": "2024-12-14T13:56:57.025708Z", + "iopub.status.idle": "2024-12-14T13:56:57.217625Z", + "shell.execute_reply": "2024-12-14T13:56:57.216542Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.025929Z" + }, "tags": [] }, "outputs": [ @@ -333,23 +377,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -429,6 +473,13 @@ "execution_count": 7, "id": "acf519c5", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:57.219226Z", + "iopub.status.busy": "2024-12-14T13:56:57.218771Z", + "iopub.status.idle": "2024-12-14T13:56:57.840139Z", + "shell.execute_reply": "2024-12-14T13:56:57.839423Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.219189Z" + }, "tags": [] }, "outputs": [ @@ -475,10 +526,10 @@ " N\n", " A\n", " N331A\n", - " -0.067564\n", - " -0.030975\n", - " 0.021971\n", - " 0.328544\n", + " -0.070506\n", + " -0.029126\n", + " 0.025048\n", + " 0.330401\n", " 4\n", " 17.75\n", " 1.0\n", @@ -490,10 +541,10 @@ " N\n", " D\n", " N331D\n", - " -0.090326\n", - " -0.083941\n", - " -0.074902\n", - " 0.020104\n", + " -0.106079\n", + " -0.106459\n", + " -0.074615\n", + " 0.025421\n", " 4\n", " 11.25\n", " 1.0\n", @@ -505,10 +556,10 @@ " N\n", " E\n", " N331E\n", - " -0.066548\n", - " -0.009506\n", - " -0.001983\n", - " 0.119161\n", + " -0.075338\n", + " -0.008291\n", + " 0.002352\n", + " 0.141307\n", " 4\n", " 10.25\n", " 1.0\n", @@ -520,10 +571,10 @@ " N\n", " F\n", " N331F\n", - " 0.283373\n", - " 0.192162\n", - " 0.018695\n", - " 0.333196\n", + " 0.278933\n", + " 0.173117\n", + " 0.030457\n", + " 0.334194\n", " 4\n", " 10.00\n", " 1.0\n", @@ -535,10 +586,10 @@ " N\n", " G\n", " N331G\n", - " 0.193667\n", - " 0.177908\n", - " -0.026998\n", - " 0.221850\n", + " 0.187362\n", + " 0.161299\n", + " -0.039176\n", + " 0.228411\n", " 4\n", " 25.00\n", " 1.0\n", @@ -565,10 +616,10 @@ " T\n", " R\n", " T531R\n", - " -0.174502\n", - " -0.097747\n", - " -0.026822\n", - " 0.210255\n", + " -0.180296\n", + " -0.114042\n", + " -0.041709\n", + " 0.187469\n", " 4\n", " 27.00\n", " 1.0\n", @@ -580,10 +631,10 @@ " T\n", " S\n", " T531S\n", - " -0.082028\n", - " -0.014567\n", - " 0.005165\n", - " 0.148563\n", + " -0.077768\n", + " -0.020680\n", + " 0.012301\n", + " 0.137118\n", " 4\n", " 31.75\n", " 1.0\n", @@ -595,10 +646,10 @@ " 
T\n", " V\n", " T531V\n", - " -0.042999\n", - " -0.007213\n", - " 0.018650\n", - " 0.103766\n", + " -0.037153\n", + " -0.009067\n", + " 0.011169\n", + " 0.095150\n", " 4\n", " 19.50\n", " 1.0\n", @@ -610,10 +661,10 @@ " T\n", " W\n", " T531W\n", - " 0.163447\n", - " 0.088392\n", - " -0.005205\n", - " 0.276730\n", + " 0.171903\n", + " 0.101605\n", + " -0.004711\n", + " 0.276231\n", " 4\n", " 5.25\n", " 1.0\n", @@ -625,10 +676,10 @@ " T\n", " Y\n", " T531Y\n", - " -0.052903\n", - " -0.055342\n", - " -0.032529\n", - " 0.017376\n", + " -0.058269\n", + " -0.055309\n", + " -0.038848\n", + " 0.021822\n", " 4\n", " 11.75\n", " 1.0\n", @@ -640,30 +691,30 @@ ], "text/plain": [ " epitope site wildtype mutant mutation escape_mean escape_median \\\n", - "0 1 331 N A N331A -0.067564 -0.030975 \n", - "1 1 331 N D N331D -0.090326 -0.083941 \n", - "2 1 331 N E N331E -0.066548 -0.009506 \n", - "3 1 331 N F N331F 0.283373 0.192162 \n", - "4 1 331 N G N331G 0.193667 0.177908 \n", + "0 1 331 N A N331A -0.070506 -0.029126 \n", + "1 1 331 N D N331D -0.106079 -0.106459 \n", + "2 1 331 N E N331E -0.075338 -0.008291 \n", + "3 1 331 N F N331F 0.278933 0.173117 \n", + "4 1 331 N G N331G 0.187362 0.161299 \n", "... ... ... ... ... ... ... ... 
\n", - "3859 2 531 T R T531R -0.174502 -0.097747 \n", - "3860 2 531 T S T531S -0.082028 -0.014567 \n", - "3861 2 531 T V T531V -0.042999 -0.007213 \n", - "3862 2 531 T W T531W 0.163447 0.088392 \n", - "3863 2 531 T Y T531Y -0.052903 -0.055342 \n", + "3859 2 531 T R T531R -0.180296 -0.114042 \n", + "3860 2 531 T S T531S -0.077768 -0.020680 \n", + "3861 2 531 T V T531V -0.037153 -0.009067 \n", + "3862 2 531 T W T531W 0.171903 0.101605 \n", + "3863 2 531 T Y T531Y -0.058269 -0.055309 \n", "\n", " escape_min_magnitude escape_std n_models times_seen frac_models \n", - "0 0.021971 0.328544 4 17.75 1.0 \n", - "1 -0.074902 0.020104 4 11.25 1.0 \n", - "2 -0.001983 0.119161 4 10.25 1.0 \n", - "3 0.018695 0.333196 4 10.00 1.0 \n", - "4 -0.026998 0.221850 4 25.00 1.0 \n", + "0 0.025048 0.330401 4 17.75 1.0 \n", + "1 -0.074615 0.025421 4 11.25 1.0 \n", + "2 0.002352 0.141307 4 10.25 1.0 \n", + "3 0.030457 0.334194 4 10.00 1.0 \n", + "4 -0.039176 0.228411 4 25.00 1.0 \n", "... ... ... ... ... ... \n", - "3859 -0.026822 0.210255 4 27.00 1.0 \n", - "3860 0.005165 0.148563 4 31.75 1.0 \n", - "3861 0.018650 0.103766 4 19.50 1.0 \n", - "3862 -0.005205 0.276730 4 5.25 1.0 \n", - "3863 -0.032529 0.017376 4 11.75 1.0 \n", + "3859 -0.041709 0.187469 4 27.00 1.0 \n", + "3860 0.012301 0.137118 4 31.75 1.0 \n", + "3861 0.011169 0.095150 4 19.50 1.0 \n", + "3862 -0.004711 0.276231 4 5.25 1.0 \n", + "3863 -0.038848 0.021822 4 11.75 1.0 \n", "\n", "[3864 rows x 12 columns]" ] @@ -691,7 +742,15 @@ "cell_type": "code", "execution_count": 8, "id": "2c642238-9278-4a81-a668-5df15a336f5c", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:56:57.841977Z", + "iopub.status.busy": "2024-12-14T13:56:57.841561Z", + "iopub.status.idle": "2024-12-14T13:57:04.364109Z", + "shell.execute_reply": "2024-12-14T13:57:04.362814Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.841947Z" + } + }, "outputs": [ { "data": { @@ -732,10 +791,10 @@ " 331\n", " N\n", " A\n", - 
" -0.164860\n", - " -0.194622\n", - " -0.119023\n", - " 0.278079\n", + " -0.158794\n", + " -0.192897\n", + " -0.115146\n", + " 0.270364\n", " 4\n", " 17.75\n", " 1.0\n", @@ -745,10 +804,10 @@ " 331\n", " N\n", " D\n", - " -0.094874\n", - " -0.051912\n", - " -0.011410\n", - " 0.115779\n", + " -0.096712\n", + " -0.058956\n", + " -0.016521\n", + " 0.106236\n", " 4\n", " 11.25\n", " 1.0\n", @@ -758,10 +817,10 @@ " 331\n", " N\n", " E\n", - " 0.032633\n", - " 0.015820\n", - " 0.011409\n", - " 0.059330\n", + " 0.036998\n", + " 0.018060\n", + " 0.017496\n", + " 0.073099\n", " 4\n", " 10.25\n", " 1.0\n", @@ -771,10 +830,10 @@ " 331\n", " N\n", " F\n", - " 0.098268\n", - " 0.139888\n", - " 0.115545\n", - " 0.286302\n", + " 0.104497\n", + " 0.142641\n", + " 0.135784\n", + " 0.288862\n", " 4\n", " 10.00\n", " 1.0\n", @@ -784,10 +843,10 @@ " 331\n", " N\n", " G\n", - " 0.294765\n", - " 0.283074\n", - " 0.122967\n", - " 0.156693\n", + " 0.286305\n", + " 0.279946\n", + " 0.103283\n", + " 0.160085\n", " 4\n", " 25.00\n", " 1.0\n", @@ -810,10 +869,10 @@ " 531\n", " T\n", " S\n", - " -0.069089\n", - " -0.020452\n", - " -0.012045\n", - " 0.138868\n", + " -0.072452\n", + " -0.022248\n", + " -0.012782\n", + " 0.131153\n", " 4\n", " 31.75\n", " 1.0\n", @@ -836,10 +895,10 @@ " 531\n", " T\n", " V\n", - " 0.008317\n", - " -0.010543\n", - " 0.031919\n", - " 0.243751\n", + " 0.011190\n", + " -0.008491\n", + " 0.035284\n", + " 0.230686\n", " 4\n", " 19.50\n", " 1.0\n", @@ -849,10 +908,10 @@ " 531\n", " T\n", " W\n", - " 0.193380\n", - " 0.109259\n", - " -0.008928\n", - " 0.292706\n", + " 0.198453\n", + " 0.121785\n", + " -0.009031\n", + " 0.290432\n", " 4\n", " 5.25\n", " 1.0\n", @@ -862,10 +921,10 @@ " 531\n", " T\n", " Y\n", - " -0.017727\n", - " -0.049556\n", - " -0.045506\n", - " 0.123266\n", + " -0.022972\n", + " -0.054006\n", + " -0.049928\n", + " 0.135754\n", " 4\n", " 11.75\n", " 1.0\n", @@ -877,43 +936,43 @@ ], "text/plain": [ " site wildtype mutant log2 fold change IC90 mean 
\\\n", - "0 331 N A -0.164860 \n", - "1 331 N D -0.094874 \n", - "2 331 N E 0.032633 \n", - "3 331 N F 0.098268 \n", - "4 331 N G 0.294765 \n", + "0 331 N A -0.158794 \n", + "1 331 N D -0.096712 \n", + "2 331 N E 0.036998 \n", + "3 331 N F 0.104497 \n", + "4 331 N G 0.286305 \n", "... ... ... ... ... \n", - "2100 531 T S -0.069089 \n", + "2100 531 T S -0.072452 \n", "2101 531 T T 0.000000 \n", - "2102 531 T V 0.008317 \n", - "2103 531 T W 0.193380 \n", - "2104 531 T Y -0.017727 \n", + "2102 531 T V 0.011190 \n", + "2103 531 T W 0.198453 \n", + "2104 531 T Y -0.022972 \n", "\n", " log2 fold change IC90 median log2 fold change IC90 min_magnitude \\\n", - "0 -0.194622 -0.119023 \n", - "1 -0.051912 -0.011410 \n", - "2 0.015820 0.011409 \n", - "3 0.139888 0.115545 \n", - "4 0.283074 0.122967 \n", + "0 -0.192897 -0.115146 \n", + "1 -0.058956 -0.016521 \n", + "2 0.018060 0.017496 \n", + "3 0.142641 0.135784 \n", + "4 0.279946 0.103283 \n", "... ... ... \n", - "2100 -0.020452 -0.012045 \n", + "2100 -0.022248 -0.012782 \n", "2101 0.000000 0.000000 \n", - "2102 -0.010543 0.031919 \n", - "2103 0.109259 -0.008928 \n", - "2104 -0.049556 -0.045506 \n", + "2102 -0.008491 0.035284 \n", + "2103 0.121785 -0.009031 \n", + "2104 -0.054006 -0.049928 \n", "\n", " log2 fold change IC90 std n_models times_seen frac_models \n", - "0 0.278079 4 17.75 1.0 \n", - "1 0.115779 4 11.25 1.0 \n", - "2 0.059330 4 10.25 1.0 \n", - "3 0.286302 4 10.00 1.0 \n", - "4 0.156693 4 25.00 1.0 \n", + "0 0.270364 4 17.75 1.0 \n", + "1 0.106236 4 11.25 1.0 \n", + "2 0.073099 4 10.25 1.0 \n", + "3 0.288862 4 10.00 1.0 \n", + "4 0.160085 4 25.00 1.0 \n", "... ... ... ... ... 
\n", - "2100 0.138868 4 31.75 1.0 \n", + "2100 0.131153 4 31.75 1.0 \n", "2101 0.000000 4 NaN 1.0 \n", - "2102 0.243751 4 19.50 1.0 \n", - "2103 0.292706 4 5.25 1.0 \n", - "2104 0.123266 4 11.75 1.0 \n", + "2102 0.230686 4 19.50 1.0 \n", + "2103 0.290432 4 5.25 1.0 \n", + "2104 0.135754 4 11.75 1.0 \n", "\n", "[2105 rows x 10 columns]" ] @@ -945,7 +1004,15 @@ "cell_type": "code", "execution_count": 9, "id": "b882943f", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:57:04.366300Z", + "iopub.status.busy": "2024-12-14T13:57:04.365736Z", + "iopub.status.idle": "2024-12-14T13:57:04.520174Z", + "shell.execute_reply": "2024-12-14T13:57:04.519573Z", + "shell.execute_reply.started": "2024-12-14T13:57:04.366249Z" + } + }, "outputs": [ { "data": { @@ -987,7 +1054,7 @@ " N\n", " A\n", " N331A\n", - " -0.083922\n", + " -0.083300\n", " 19\n", " avg2muts\n", " 1\n", @@ -999,7 +1066,7 @@ " N\n", " D\n", " N331D\n", - " -0.076938\n", + " -0.104757\n", " 10\n", " avg2muts\n", " 1\n", @@ -1011,7 +1078,7 @@ " N\n", " E\n", " N331E\n", - " -0.007855\n", + " -0.004766\n", " 11\n", " avg2muts\n", " 1\n", @@ -1023,7 +1090,7 @@ " N\n", " F\n", " N331F\n", - " 0.039737\n", + " 0.033077\n", " 10\n", " avg2muts\n", " 1\n", @@ -1035,7 +1102,7 @@ " N\n", " G\n", " N331G\n", - " 0.045873\n", + " 0.046883\n", " 18\n", " avg2muts\n", " 1\n", @@ -1059,7 +1126,7 @@ " T\n", " R\n", " T531R\n", - " -0.033279\n", + " -0.041709\n", " 24\n", " avg3muts\n", " 2\n", @@ -1071,7 +1138,7 @@ " T\n", " S\n", " T531S\n", - " -0.005251\n", + " -0.015296\n", " 42\n", " avg3muts\n", " 2\n", @@ -1083,7 +1150,7 @@ " T\n", " V\n", " T531V\n", - " -0.033077\n", + " -0.029302\n", " 24\n", " avg3muts\n", " 2\n", @@ -1095,7 +1162,7 @@ " T\n", " W\n", " T531W\n", - " 0.546467\n", + " 0.547600\n", " 6\n", " avg3muts\n", " 2\n", @@ -1107,7 +1174,7 @@ " T\n", " Y\n", " T531Y\n", - " -0.068398\n", + " -0.083607\n", " 9\n", " avg3muts\n", " 2\n", @@ -1119,17 +1186,17 @@ ], 
"text/plain": [ " epitope site wildtype mutant mutation escape times_seen library \\\n", - "0 1 331 N A N331A -0.083922 19 avg2muts \n", - "1 1 331 N D N331D -0.076938 10 avg2muts \n", - "2 1 331 N E N331E -0.007855 11 avg2muts \n", - "3 1 331 N F N331F 0.039737 10 avg2muts \n", - "4 1 331 N G N331G 0.045873 18 avg2muts \n", + "0 1 331 N A N331A -0.083300 19 avg2muts \n", + "1 1 331 N D N331D -0.104757 10 avg2muts \n", + "2 1 331 N E N331E -0.004766 11 avg2muts \n", + "3 1 331 N F N331F 0.033077 10 avg2muts \n", + "4 1 331 N G N331G 0.046883 18 avg2muts \n", "... ... ... ... ... ... ... ... ... \n", - "15429 2 531 T R T531R -0.033279 24 avg3muts \n", - "15430 2 531 T S T531S -0.005251 42 avg3muts \n", - "15431 2 531 T V T531V -0.033077 24 avg3muts \n", - "15432 2 531 T W T531W 0.546467 6 avg3muts \n", - "15433 2 531 T Y T531Y -0.068398 9 avg3muts \n", + "15429 2 531 T R T531R -0.041709 24 avg3muts \n", + "15430 2 531 T S T531S -0.015296 42 avg3muts \n", + "15431 2 531 T V T531V -0.029302 24 avg3muts \n", + "15432 2 531 T W T531W 0.547600 6 avg3muts \n", + "15433 2 531 T Y T531Y -0.083607 9 avg3muts \n", "\n", " replicate \n", "0 1 \n", @@ -1172,6 +1239,13 @@ "execution_count": 10, "id": "b085a23f", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:57:04.521387Z", + "iopub.status.busy": "2024-12-14T13:57:04.520989Z", + "iopub.status.idle": "2024-12-14T13:57:08.444409Z", + "shell.execute_reply": "2024-12-14T13:57:08.443286Z", + "shell.execute_reply.started": "2024-12-14T13:57:04.521358Z" + }, "tags": [] }, "outputs": [ @@ -1180,23 +1254,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1275,6 +1349,13 @@ "execution_count": 11, "id": "6afa045f-4a62-4b4d-8285-4b550823a022", "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:57:08.445881Z", + "iopub.status.busy": "2024-12-14T13:57:08.445500Z", + "iopub.status.idle": "2024-12-14T13:57:12.311988Z", + "shell.execute_reply": "2024-12-14T13:57:12.311460Z", + "shell.execute_reply.started": "2024-12-14T13:57:08.445854Z" + }, "tags": [] }, "outputs": [ @@ -1283,23 +1364,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1379,30 +1460,38 @@ "cell_type": "code", "execution_count": 12, "id": "41957bd8-070a-41d7-b2f9-647eb7908a87", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:57:12.315322Z", + "iopub.status.busy": "2024-12-14T13:57:12.314996Z", + "iopub.status.idle": "2024-12-14T13:57:25.931459Z", + "shell.execute_reply": "2024-12-14T13:57:25.930700Z", + "shell.execute_reply.started": "2024-12-14T13:57:12.315299Z" + } + }, "outputs": [ { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1480,41 +1569,926 @@ "In some cases, you may want to only get the escape values for a specific region of the protein for each model being averaged.\n", "For instance, this may be the case if you covered half the protein in one library and the other half in the other library.\n", "\n", - "There are functions to get the values by region.\n", - "Specfically:" + "In this case, you should initialize with a column in the models data with a `region_col` that specifies the sites for each model:" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "c9b166d0-714d-4e74-b1ff-520f958c952c", - "metadata": {}, + "execution_count": 14, + "id": "432f2375-6ec6-4d62-8a8a-d768b618454b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:05.887056Z", + "iopub.status.busy": "2024-12-14T13:58:05.885968Z", + "iopub.status.idle": "2024-12-14T13:58:08.404744Z", + "shell.execute_reply": "2024-12-14T13:58:08.403665Z", + "shell.execute_reply.started": "2024-12-14T13:58:05.886989Z" + } + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is the input dataframe specifying sites to keep for each model:\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
libraryreplicatemodelsites_to_keep
0avg2muts1<polyclonal.polyclonal.Polyclonal object at 0x...(331, 332, 333, 334, 335, 336, 337, 338, 339, ...
1avg2muts2<polyclonal.polyclonal.Polyclonal object at 0x...(331, 332, 333, 334, 335, 336, 337, 338, 339, ...
2avg3muts1<polyclonal.polyclonal.Polyclonal object at 0x...[331, 332, 333, 334, 335, 336, 337, 338, 339, ...
3avg3muts2<polyclonal.polyclonal.Polyclonal object at 0x...[450, 451, 452, 453, 455, 456, 458, 459, 460, ...
\n", + "
" + ], "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" + " library replicate model \\\n", + "0 avg2muts 1 = 450]),\n", + " ],\n", + " columns=[\"library\", \"replicate\", \"sites_to_keep\"],\n", + ")\n", + "\n", + "models_region_df = models_df.merge(regions_df)\n", + "\n", + "print(\"Here is the input dataframe specifying sites to keep for each model:\")\n", + "display(models_region_df)\n", + "\n", + "model_region_avg = polyclonal.PolyclonalAverage(\n", + " models_region_df, region_col=\"sites_to_keep\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7417aeda-d6ab-4e30-a233-dc3d60d688c1", + "metadata": {}, + "source": [ + "We can see the number of sites in each region:" ] }, { "cell_type": "code", - "execution_count": null, - "id": "096412a3-c925-453d-95c2-3d405c9e5c1d", + "execution_count": 15, + "id": "c9b166d0-714d-4e74-b1ff-520f958c952c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:08.615016Z", + "iopub.status.busy": "2024-12-14T13:58:08.614534Z", + "iopub.status.idle": "2024-12-14T13:58:08.620439Z", + "shell.execute_reply": "2024-12-14T13:58:08.619707Z", + "shell.execute_reply.started": "2024-12-14T13:58:08.614987Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "desc={'library': 'avg2muts', 'replicate': 1}, len(sites)=173, min(sites)=331, max(sites)=531\n", + "desc={'library': 'avg2muts', 'replicate': 2}, len(sites)=173, min(sites)=331, max(sites)=531\n", + "desc={'library': 'avg3muts', 'replicate': 1}, len(sites)=112, min(sites)=331, max(sites)=460\n", + "desc={'library': 'avg3muts', 'replicate': 2}, len(sites)=70, min(sites)=450, max(sites)=531\n" + ] + } + ], + "source": [ + "for desc, sites in zip(model_region_avg.model_descriptors, model_region_avg.regions):\n", + " print(f\"{desc=}, {len(sites)=}, {min(sites)=}, {max(sites)=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ebce9460-faeb-4580-b4ba-03f222d41526", "metadata": {}, + "source": [ + "We can also get 
the number of models per site.\n", + "Based on how we initialized, this is 3 for all sites except those between 450 and 460 where it is 4:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cb0401b7-d454-4c52-9ef2-2791fe32528f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:10.327513Z", + "iopub.status.busy": "2024-12-14T13:58:10.326870Z", + "iopub.status.idle": "2024-12-14T13:58:10.333416Z", + "shell.execute_reply": "2024-12-14T13:58:10.332243Z", + "shell.execute_reply.started": "2024-12-14T13:58:10.327475Z" + } + }, "outputs": [], - "source": [] + "source": [ + "assert model_region_avg.sites == model_avg.sites\n", + "\n", + "assert (\n", + " model_region_avg.n_models_by_site[r] == 3 + (450 <= r <= 460)\n", + " for r in model_region_avg.sites\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "60cf491d-20ca-47ee-a2a6-1b20f04c568c", + "metadata": {}, + "source": [ + "Now look at the mutation-escape data frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23a4e53f-01fb-418d-8dfd-467a99e61a35", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:12.061035Z", + "iopub.status.busy": "2024-12-14T13:58:12.060339Z", + "iopub.status.idle": "2024-12-14T13:58:12.688630Z", + "shell.execute_reply": "2024-12-14T13:58:12.687911Z", + "shell.execute_reply.started": "2024-12-14T13:58:12.060983Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitopesitewildtypemutantmutationescape_meanescape_medianescape_min_magnitudeescape_stdn_modelstimes_seenfrac_models
01331NAN331A-0.188953-0.0833000.0250480.282079316.6666671.0
11331NDN331D-0.095844-0.104757-0.0746150.018464310.6666671.0
21331NEN331E-0.004743-0.0047660.0023520.00708439.3333331.0
31331NFN331F0.3617580.3131570.0330770.35548339.0000001.0
41331NGN331G0.0944740.046883-0.0391760.162750325.6666671.0
.......................................
38592531TRT531R-0.089931-0.069566-0.0417090.061009325.6666671.0
38602531TST531S-0.009686-0.0152960.0123010.019788329.0000001.0
38612531TVT531V-0.063681-0.0293020.0111690.096735319.3333331.0
38622531TWT531W0.2502700.207922-0.0047110.27858034.3333331.0
38632531TYT531Y-0.054589-0.041310-0.0388480.025161311.3333331.0
\n", + "

3864 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " epitope site wildtype mutant mutation escape_mean escape_median \\\n", + "0 1 331 N A N331A -0.188953 -0.083300 \n", + "1 1 331 N D N331D -0.095844 -0.104757 \n", + "2 1 331 N E N331E -0.004743 -0.004766 \n", + "3 1 331 N F N331F 0.361758 0.313157 \n", + "4 1 331 N G N331G 0.094474 0.046883 \n", + "... ... ... ... ... ... ... ... \n", + "3859 2 531 T R T531R -0.089931 -0.069566 \n", + "3860 2 531 T S T531S -0.009686 -0.015296 \n", + "3861 2 531 T V T531V -0.063681 -0.029302 \n", + "3862 2 531 T W T531W 0.250270 0.207922 \n", + "3863 2 531 T Y T531Y -0.054589 -0.041310 \n", + "\n", + " escape_min_magnitude escape_std n_models times_seen frac_models \n", + "0 0.025048 0.282079 3 16.666667 1.0 \n", + "1 -0.074615 0.018464 3 10.666667 1.0 \n", + "2 0.002352 0.007084 3 9.333333 1.0 \n", + "3 0.033077 0.355483 3 9.000000 1.0 \n", + "4 -0.039176 0.162750 3 25.666667 1.0 \n", + "... ... ... ... ... ... \n", + "3859 -0.041709 0.061009 3 25.666667 1.0 \n", + "3860 0.012301 0.019788 3 29.000000 1.0 \n", + "3861 0.011169 0.096735 3 19.333333 1.0 \n", + "3862 -0.004711 0.278580 3 4.333333 1.0 \n", + "3863 -0.038848 0.025161 3 11.333333 1.0 \n", + "\n", + "[3864 rows x 12 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "model_region_avg.mut_escape_df" + ] + }, + { + "cell_type": "markdown", + "id": "7d508c99-bbe3-4b7e-aad6-051bbe6e3652", + "metadata": {}, + "source": [ + "For the sites where all four models are active, this will be the same as the model without regions:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "24873342-2d66-4161-bdfa-b521520d46aa", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:17.022473Z", + "iopub.status.busy": "2024-12-14T13:58:17.021756Z", + "iopub.status.idle": "2024-12-14T13:58:18.175505Z", + "shell.execute_reply": "2024-12-14T13:58:18.174246Z", + 
"shell.execute_reply.started": "2024-12-14T13:58:17.022415Z" + } + }, + "outputs": [], + "source": [ + "assert (\n", + " model_avg.mut_escape_df.query(\"(site >= 450) and (site <= 460)\").equals(\n", + " model_region_avg.mut_escape_df.query(\"(site >= 450) and (site <= 460)\")\n", + " )\n", + " is True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6f9d763f-595f-46f4-98eb-e6d041ca3406", + "metadata": {}, + "source": [ + "But they differ at other sites:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bacc14c-d91c-4622-8fb8-0e2870a85631", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:18.926225Z", + "iopub.status.busy": "2024-12-14T13:58:18.925480Z", + "iopub.status.idle": "2024-12-14T13:58:20.099447Z", + "shell.execute_reply": "2024-12-14T13:58:20.098626Z", + "shell.execute_reply.started": "2024-12-14T13:58:18.926167Z" + } + }, + "outputs": [], + "source": [ + "assert model_avg.mut_escape_df.equals(model_region_avg.mut_escape_df) is False" + ] + }, + { + "cell_type": "markdown", + "id": "33a6c146-155e-4417-a6d5-6a9fc383a9bf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-13T22:22:32.690820Z", + "iopub.status.busy": "2024-12-13T22:22:32.690497Z", + "iopub.status.idle": "2024-12-13T22:22:33.327740Z", + "shell.execute_reply": "2024-12-13T22:22:33.327038Z", + "shell.execute_reply.started": "2024-12-13T22:22:32.690797Z" + } + }, + "source": [ + "Same for the mutation IC50 values:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d881a40f-2f3e-4fe1-aa88-ddd82e3d68f6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:21.139211Z", + "iopub.status.busy": "2024-12-14T13:58:21.138424Z", + "iopub.status.idle": "2024-12-14T13:58:27.843031Z", + "shell.execute_reply": "2024-12-14T13:58:27.842292Z", + "shell.execute_reply.started": "2024-12-14T13:58:21.139149Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sitewildtypemutantlog_fold_change_IC50 meanlog_fold_change_IC50 medianlog_fold_change_IC50 min_magnitudelog_fold_change_IC50 stdn_modelstimes_seenfrac_models
0331NA-0.278667-0.192977-0.1424770.193805316.6666671.0
1331ND-0.110483-0.0663450.0059800.143709310.6666671.0
2331NE0.0035400.0196660.0196660.03234739.3333331.0
3331NF0.1918090.1686990.0762060.12872439.0000001.0
4331NG0.3332710.3938550.1099610.200021325.6666671.0
.................................
2100531TS-0.0095800.0029360.0029360.022277329.0000001.0
2101531TT0.0000000.0000000.0000000.0000003NaN1.0
2102531TV-0.090553-0.0483690.0251480.141588319.3333331.0
2103531TW0.3095710.280769-0.0078630.33277034.3333331.0
2104531TY-0.084084-0.059005-0.0531400.048605311.3333331.0
\n", + "

2105 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " site wildtype mutant log_fold_change_IC50 mean \\\n", + "0 331 N A -0.278667 \n", + "1 331 N D -0.110483 \n", + "2 331 N E 0.003540 \n", + "3 331 N F 0.191809 \n", + "4 331 N G 0.333271 \n", + "... ... ... ... ... \n", + "2100 531 T S -0.009580 \n", + "2101 531 T T 0.000000 \n", + "2102 531 T V -0.090553 \n", + "2103 531 T W 0.309571 \n", + "2104 531 T Y -0.084084 \n", + "\n", + " log_fold_change_IC50 median log_fold_change_IC50 min_magnitude \\\n", + "0 -0.192977 -0.142477 \n", + "1 -0.066345 0.005980 \n", + "2 0.019666 0.019666 \n", + "3 0.168699 0.076206 \n", + "4 0.393855 0.109961 \n", + "... ... ... \n", + "2100 0.002936 0.002936 \n", + "2101 0.000000 0.000000 \n", + "2102 -0.048369 0.025148 \n", + "2103 0.280769 -0.007863 \n", + "2104 -0.059005 -0.053140 \n", + "\n", + " log_fold_change_IC50 std n_models times_seen frac_models \n", + "0 0.193805 3 16.666667 1.0 \n", + "1 0.143709 3 10.666667 1.0 \n", + "2 0.032347 3 9.333333 1.0 \n", + "3 0.128724 3 9.000000 1.0 \n", + "4 0.200021 3 25.666667 1.0 \n", + "... ... ... ... ... 
\n", + "2100 0.022277 3 29.000000 1.0 \n", + "2101 0.000000 3 NaN 1.0 \n", + "2102 0.141588 3 19.333333 1.0 \n", + "2103 0.332770 3 4.333333 1.0 \n", + "2104 0.048605 3 11.333333 1.0 \n", + "\n", + "[2105 rows x 10 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "region_ic50_df = model_region_avg.mut_icXX_df(\n", + " x=0.5, icXX_col=\"IC50\", log_fold_change_icXX_col=\"log_fold_change_IC50\"\n", + ")\n", + "\n", + "region_ic50_df" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a5057dd3-1b05-4dd6-8f15-46905ba0b8eb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:27.845077Z", + "iopub.status.busy": "2024-12-14T13:58:27.844710Z", + "iopub.status.idle": "2024-12-14T13:58:34.479506Z", + "shell.execute_reply": "2024-12-14T13:58:34.478531Z", + "shell.execute_reply.started": "2024-12-14T13:58:27.845052Z" + } + }, + "outputs": [], + "source": [ + "ic50_df = model_avg.mut_icXX_df(\n", + " x=0.5, icXX_col=\"IC50\", log_fold_change_icXX_col=\"log_fold_change_IC50\"\n", + ")\n", + "\n", + "assert (\n", + " region_ic50_df.query(\"(site >= 450) and (site <= 460)\").equals(\n", + " ic50_df.query(\"(site >= 450) and (site <= 460)\")\n", + " )\n", + " is True\n", + ")\n", + "\n", + "assert region_ic50_df.equals(ic50_df) is False" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cd4013b1-18ce-4a96-8d76-264057de2d66", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:59:04.711649Z", + "iopub.status.busy": "2024-12-14T13:59:04.710925Z", + "iopub.status.idle": "2024-12-14T13:59:08.741647Z", + "shell.execute_reply": "2024-12-14T13:59:08.740860Z", + "shell.execute_reply.started": "2024-12-14T13:59:04.711591Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "model_region_avg.mut_escape_plot(addtl_slider_stats={\"times_seen\": 2})" + ] } ], "metadata": { @@ -1533,7 +2507,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.8" } }, "nbformat": 4, From 859f03e660ee2be99c646d61097854762259e6d0 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 14 Dec 2024 06:27:37 -0800 Subject: [PATCH 6/9] fix bug in `PolyclonalCollection` code (typo in variable name) --- polyclonal/polyclonal_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index 7d24174..6b4b0d0 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -1283,7 +1283,7 @@ def mut_escape_site_summary_df_replicates(self, **kwargs): m.mut_escape_site_summary_df(**kwargs) .query("site in @sites") .assign(**desc) - for m, desc, site in zip( + for m, desc, sites in zip( self.models, self.model_descriptors, self.regions ) ], From 996d4e5c3c537ff6837a35c9646de626b38590a5 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 14 Dec 2024 06:31:03 -0800 Subject: [PATCH 7/9] update CHANGELOG --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6dda33d..f5a42c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ The format is based on `Keep a Changelog `_. 6.13 ---- - Fixed ``altair`` plots to work with ``numpy`` version 2, which caused problems in some cases apparently due to a data type conversion issue. +- Add ability to specify specific regions for each model in ``PolyclonalCollection``. 
This adds the ``region_col`` parameter and the ``PolyclonalCollection.regions`` and ``PolyclonalCollection.n_models_by_site`` attributes. Related to adding functionality to address `this issue `_. 6.12 ---- From d93ad0048320911121b5acdcaf22ae0d7e25e982 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 14 Dec 2024 06:31:26 -0800 Subject: [PATCH 8/9] increment version to 6.13 --- polyclonal/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polyclonal/__init__.py b/polyclonal/__init__.py index ed9a156..734610c 100644 --- a/polyclonal/__init__.py +++ b/polyclonal/__init__.py @@ -31,7 +31,7 @@ __author__ = "`the Bloom lab `_" __email__ = "jbloom@fredhutch.org" -__version__ = "6.12" +__version__ = "6.13" __url__ = "https://github.com/jbloomlab/polyclonal" from polyclonal.alphabets import AAS From b639fcaf47886a83411019a83770bdd726ca84c1 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 14 Dec 2024 06:44:59 -0800 Subject: [PATCH 9/9] catch edge case of `PolyclonalCollection` of just one model --- polyclonal/polyclonal_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index 6b4b0d0..c711648 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -398,7 +398,7 @@ def __init__(self, models_df, *, default_avg_to_plot, region_col=None): self.unique_descriptor_names = [ name for name in self.descriptor_names - if descriptors_df[name].nunique(dropna=False) > 1 + if descriptors_df[name].nunique(dropna=False) > 1 or len(self.models) == 1 ] if not len(self.unique_descriptor_names): raise ValueError("no `unique_descriptor_names`")