diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fae328c..f5a42c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,11 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog `_. +6.13 +---- +- Fix ``altair`` plots to work with ``numpy`` version 2, which caused problems in some cases, apparently due to a data type conversion issue. +- Add ability to specify regions for each model in ``PolyclonalCollection``. This adds the ``region_col`` parameter and the ``PolyclonalCollection.regions`` and ``PolyclonalCollection.n_models_by_site`` attributes. Related to adding functionality to address `this issue `_. + 6.12 ---- - Switch to using ``scipy.sparse`` arrays rather than matrices to keep up with `this `_ `change `_ to ``binarymap`` (now require ``binarymap`` >= 0.7). diff --git a/notebooks/RBD_average.ipynb b/notebooks/RBD_average.ipynb index 684e938..5d647e8 100644 --- a/notebooks/RBD_average.ipynb +++ b/notebooks/RBD_average.ipynb @@ -30,11 +30,11 @@ "id": "b242df9c", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:36:10.732205Z", - "iopub.status.busy": "2023-10-17T03:36:10.731345Z", - "iopub.status.idle": "2023-10-17T03:36:19.105369Z", - "shell.execute_reply": "2023-10-17T03:36:19.103991Z", - "shell.execute_reply.started": "2023-10-17T03:36:10.732176Z" + "iopub.execute_input": "2024-12-14T13:53:56.905113Z", + "iopub.status.busy": "2024-12-14T13:53:56.904578Z", + "iopub.status.idle": "2024-12-14T13:54:12.514164Z", + "shell.execute_reply": "2024-12-14T13:54:12.513193Z", + "shell.execute_reply.started": "2024-12-14T13:53:56.905057Z" }, "tags": [] }, @@ -81,11 +81,11 @@ "id": "1f86179e", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:36:19.113583Z", - "iopub.status.busy": "2023-10-17T03:36:19.113323Z", - "iopub.status.idle": "2023-10-17T03:37:53.053430Z", - "shell.execute_reply": "2023-10-17T03:37:53.052114Z", - "shell.execute_reply.started": 
"2023-10-17T03:36:19.113556Z" + "iopub.execute_input": "2024-12-14T13:54:12.515977Z", + "iopub.status.busy": "2024-12-14T13:54:12.515521Z", + "iopub.status.idle": "2024-12-14T13:56:53.731871Z", + "shell.execute_reply": "2024-12-14T13:56:53.729741Z", + "shell.execute_reply.started": "2024-12-14T13:54:12.515943Z" }, "tags": [] }, @@ -125,11 +125,11 @@ "id": "d6e2060b", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:53.064005Z", - "iopub.status.busy": "2023-10-17T03:37:53.063722Z", - "iopub.status.idle": "2023-10-17T03:37:53.084236Z", - "shell.execute_reply": "2023-10-17T03:37:53.083326Z", - "shell.execute_reply.started": "2023-10-17T03:37:53.063976Z" + "iopub.execute_input": "2024-12-14T13:56:53.734807Z", + "iopub.status.busy": "2024-12-14T13:56:53.734217Z", + "iopub.status.idle": "2024-12-14T13:56:53.779530Z", + "shell.execute_reply": "2024-12-14T13:56:53.778507Z", + "shell.execute_reply.started": "2024-12-14T13:56:53.734748Z" }, "tags": [] }, @@ -226,11 +226,11 @@ "id": "8bb91e55", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:53.099272Z", - "iopub.status.busy": "2023-10-17T03:37:53.098847Z", - "iopub.status.idle": "2023-10-17T03:37:54.336999Z", - "shell.execute_reply": "2023-10-17T03:37:54.336120Z", - "shell.execute_reply.started": "2023-10-17T03:37:53.099239Z" + "iopub.execute_input": "2024-12-14T13:56:53.783657Z", + "iopub.status.busy": "2024-12-14T13:56:53.783059Z", + "iopub.status.idle": "2024-12-14T13:56:56.394188Z", + "shell.execute_reply": "2024-12-14T13:56:56.393245Z", + "shell.execute_reply.started": "2024-12-14T13:56:53.783613Z" } }, "outputs": [], @@ -252,11 +252,11 @@ "id": "7608fa8b", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.344087Z", - "iopub.status.busy": "2023-10-17T03:37:54.343758Z", - "iopub.status.idle": "2023-10-17T03:37:54.636492Z", - "shell.execute_reply": "2023-10-17T03:37:54.635913Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.344065Z" + 
"iopub.execute_input": "2024-12-14T13:56:56.399407Z", + "iopub.status.busy": "2024-12-14T13:56:56.398904Z", + "iopub.status.idle": "2024-12-14T13:56:57.024531Z", + "shell.execute_reply": "2024-12-14T13:56:57.023823Z", + "shell.execute_reply.started": "2024-12-14T13:56:56.399359Z" } }, "outputs": [ @@ -265,28 +265,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -361,11 +363,11 @@ "id": "3f07294f", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.642919Z", - "iopub.status.busy": "2023-10-17T03:37:54.642732Z", - "iopub.status.idle": "2023-10-17T03:37:54.774113Z", - "shell.execute_reply": "2023-10-17T03:37:54.773524Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.642905Z" + "iopub.execute_input": "2024-12-14T13:56:57.025954Z", + "iopub.status.busy": "2024-12-14T13:56:57.025708Z", + "iopub.status.idle": "2024-12-14T13:56:57.217625Z", + "shell.execute_reply": "2024-12-14T13:56:57.216542Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.025929Z" }, "tags": [] }, @@ -375,28 +377,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -470,11 +474,11 @@ "id": "acf519c5", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:54.782955Z", - "iopub.status.busy": "2023-10-17T03:37:54.782650Z", - "iopub.status.idle": "2023-10-17T03:37:55.229815Z", - "shell.execute_reply": "2023-10-17T03:37:55.229150Z", - "shell.execute_reply.started": "2023-10-17T03:37:54.782932Z" + "iopub.execute_input": "2024-12-14T13:56:57.219226Z", + "iopub.status.busy": "2024-12-14T13:56:57.218771Z", + "iopub.status.idle": "2024-12-14T13:56:57.840139Z", + "shell.execute_reply": "2024-12-14T13:56:57.839423Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.219189Z" }, "tags": [] }, @@ -522,10 +526,10 @@ " N\n", " A\n", " N331A\n", - " -0.066612\n", - " -0.031718\n", - " 0.026679\n", - " 0.324873\n", + " -0.070506\n", + " -0.029126\n", + " 0.025048\n", + " 0.330401\n", " 4\n", " 17.75\n", " 1.0\n", @@ -537,10 +541,10 @@ " N\n", " D\n", " N331D\n", - " -0.098052\n", - " -0.100283\n", - " -0.071111\n", - " 0.022076\n", + " -0.106079\n", + " -0.106459\n", + " -0.074615\n", + " 0.025421\n", " 4\n", " 11.25\n", " 1.0\n", @@ -552,10 +556,10 @@ " N\n", " E\n", " N331E\n", - " -0.061359\n", - " -0.008731\n", - " -0.001655\n", - " 0.112576\n", + " -0.075338\n", + " -0.008291\n", + " 0.002352\n", + " 0.141307\n", " 4\n", " 10.25\n", " 1.0\n", @@ -567,10 +571,10 @@ " N\n", " F\n", " N331F\n", - " 0.270243\n", - " 0.170869\n", - " 0.014552\n", - " 0.330315\n", + " 0.278933\n", + " 0.173117\n", + " 0.030457\n", + " 0.334194\n", " 4\n", " 10.00\n", " 1.0\n", @@ -582,10 +586,10 @@ " N\n", " G\n", " N331G\n", - " 0.186271\n", - " 0.164941\n", - " 0.031369\n", - " 0.232272\n", + " 0.187362\n", + " 0.161299\n", + " -0.039176\n", + " 0.228411\n", " 4\n", " 25.00\n", " 1.0\n", @@ -612,10 +616,10 @@ " T\n", " R\n", " T531R\n", - " -0.176471\n", - " -0.112467\n", - " -0.033899\n", - " 0.187640\n", + " -0.180296\n", + " -0.114042\n", + " -0.041709\n", + " 0.187469\n", " 4\n", " 27.00\n", " 1.0\n", @@ 
-627,10 +631,10 @@ " T\n", " S\n", " T531S\n", - " -0.075136\n", - " -0.011741\n", - " 0.000405\n", - " 0.155037\n", + " -0.077768\n", + " -0.020680\n", + " 0.012301\n", + " 0.137118\n", " 4\n", " 31.75\n", " 1.0\n", @@ -642,10 +646,10 @@ " T\n", " V\n", " T531V\n", - " -0.041719\n", - " -0.005589\n", - " 0.012383\n", - " 0.094055\n", + " -0.037153\n", + " -0.009067\n", + " 0.011169\n", + " 0.095150\n", " 4\n", " 19.50\n", " 1.0\n", @@ -657,10 +661,10 @@ " T\n", " W\n", " T531W\n", - " 0.169718\n", - " 0.100366\n", - " -0.004133\n", - " 0.266617\n", + " 0.171903\n", + " 0.101605\n", + " -0.004711\n", + " 0.276231\n", " 4\n", " 5.25\n", " 1.0\n", @@ -672,10 +676,10 @@ " T\n", " Y\n", " T531Y\n", - " -0.051762\n", - " -0.050223\n", - " -0.032461\n", - " 0.020582\n", + " -0.058269\n", + " -0.055309\n", + " -0.038848\n", + " 0.021822\n", " 4\n", " 11.75\n", " 1.0\n", @@ -687,30 +691,30 @@ ], "text/plain": [ " epitope site wildtype mutant mutation escape_mean escape_median \\\n", - "0 1 331 N A N331A -0.066612 -0.031718 \n", - "1 1 331 N D N331D -0.098052 -0.100283 \n", - "2 1 331 N E N331E -0.061359 -0.008731 \n", - "3 1 331 N F N331F 0.270243 0.170869 \n", - "4 1 331 N G N331G 0.186271 0.164941 \n", + "0 1 331 N A N331A -0.070506 -0.029126 \n", + "1 1 331 N D N331D -0.106079 -0.106459 \n", + "2 1 331 N E N331E -0.075338 -0.008291 \n", + "3 1 331 N F N331F 0.278933 0.173117 \n", + "4 1 331 N G N331G 0.187362 0.161299 \n", "... ... ... ... ... ... ... ... 
\n", - "3859 2 531 T R T531R -0.176471 -0.112467 \n", - "3860 2 531 T S T531S -0.075136 -0.011741 \n", - "3861 2 531 T V T531V -0.041719 -0.005589 \n", - "3862 2 531 T W T531W 0.169718 0.100366 \n", - "3863 2 531 T Y T531Y -0.051762 -0.050223 \n", + "3859 2 531 T R T531R -0.180296 -0.114042 \n", + "3860 2 531 T S T531S -0.077768 -0.020680 \n", + "3861 2 531 T V T531V -0.037153 -0.009067 \n", + "3862 2 531 T W T531W 0.171903 0.101605 \n", + "3863 2 531 T Y T531Y -0.058269 -0.055309 \n", "\n", " escape_min_magnitude escape_std n_models times_seen frac_models \n", - "0 0.026679 0.324873 4 17.75 1.0 \n", - "1 -0.071111 0.022076 4 11.25 1.0 \n", - "2 -0.001655 0.112576 4 10.25 1.0 \n", - "3 0.014552 0.330315 4 10.00 1.0 \n", - "4 0.031369 0.232272 4 25.00 1.0 \n", + "0 0.025048 0.330401 4 17.75 1.0 \n", + "1 -0.074615 0.025421 4 11.25 1.0 \n", + "2 0.002352 0.141307 4 10.25 1.0 \n", + "3 0.030457 0.334194 4 10.00 1.0 \n", + "4 -0.039176 0.228411 4 25.00 1.0 \n", "... ... ... ... ... ... \n", - "3859 -0.033899 0.187640 4 27.00 1.0 \n", - "3860 0.000405 0.155037 4 31.75 1.0 \n", - "3861 0.012383 0.094055 4 19.50 1.0 \n", - "3862 -0.004133 0.266617 4 5.25 1.0 \n", - "3863 -0.032461 0.020582 4 11.75 1.0 \n", + "3859 -0.041709 0.187469 4 27.00 1.0 \n", + "3860 0.012301 0.137118 4 31.75 1.0 \n", + "3861 0.011169 0.095150 4 19.50 1.0 \n", + "3862 -0.004711 0.276231 4 5.25 1.0 \n", + "3863 -0.038848 0.021822 4 11.75 1.0 \n", "\n", "[3864 rows x 12 columns]" ] @@ -740,11 +744,11 @@ "id": "2c642238-9278-4a81-a668-5df15a336f5c", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:55.236631Z", - "iopub.status.busy": "2023-10-17T03:37:55.236390Z", - "iopub.status.idle": "2023-10-17T03:37:58.562096Z", - "shell.execute_reply": "2023-10-17T03:37:58.561273Z", - "shell.execute_reply.started": "2023-10-17T03:37:55.236612Z" + "iopub.execute_input": "2024-12-14T13:56:57.841977Z", + "iopub.status.busy": "2024-12-14T13:56:57.841561Z", + "iopub.status.idle": 
"2024-12-14T13:57:04.364109Z", + "shell.execute_reply": "2024-12-14T13:57:04.362814Z", + "shell.execute_reply.started": "2024-12-14T13:56:57.841947Z" } }, "outputs": [ @@ -787,10 +791,10 @@ " 331\n", " N\n", " A\n", - " -0.159587\n", - " -0.206684\n", - " -0.135967\n", - " 0.257279\n", + " -0.158794\n", + " -0.192897\n", + " -0.115146\n", + " 0.270364\n", " 4\n", " 17.75\n", " 1.0\n", @@ -800,10 +804,10 @@ " 331\n", " N\n", " D\n", - " -0.086087\n", - " -0.054268\n", - " -0.002643\n", - " 0.101813\n", + " -0.096712\n", + " -0.058956\n", + " -0.016521\n", + " 0.106236\n", " 4\n", " 11.25\n", " 1.0\n", @@ -813,10 +817,10 @@ " 331\n", " N\n", " E\n", - " 0.034466\n", - " 0.016051\n", - " 0.015768\n", - " 0.067786\n", + " 0.036998\n", + " 0.018060\n", + " 0.017496\n", + " 0.073099\n", " 4\n", " 10.25\n", " 1.0\n", @@ -826,10 +830,10 @@ " 331\n", " N\n", " F\n", - " 0.106040\n", - " 0.142322\n", - " 0.141842\n", - " 0.287263\n", + " 0.104497\n", + " 0.142641\n", + " 0.135784\n", + " 0.288862\n", " 4\n", " 10.00\n", " 1.0\n", @@ -839,10 +843,10 @@ " 331\n", " N\n", " G\n", - " 0.283214\n", - " 0.282422\n", - " 0.088191\n", - " 0.163483\n", + " 0.286305\n", + " 0.279946\n", + " 0.103283\n", + " 0.160085\n", " 4\n", " 25.00\n", " 1.0\n", @@ -865,10 +869,10 @@ " 531\n", " T\n", " S\n", - " -0.068568\n", - " -0.016541\n", - " -0.004995\n", - " 0.144948\n", + " -0.072452\n", + " -0.022248\n", + " -0.012782\n", + " 0.131153\n", " 4\n", " 31.75\n", " 1.0\n", @@ -891,10 +895,10 @@ " 531\n", " T\n", " V\n", - " 0.013326\n", - " -0.002602\n", - " 0.039361\n", - " 0.236514\n", + " 0.011190\n", + " -0.008491\n", + " 0.035284\n", + " 0.230686\n", " 4\n", " 19.50\n", " 1.0\n", @@ -904,10 +908,10 @@ " 531\n", " T\n", " W\n", - " 0.199249\n", - " 0.119866\n", - " -0.009133\n", - " 0.287019\n", + " 0.198453\n", + " 0.121785\n", + " -0.009031\n", + " 0.290432\n", " 4\n", " 5.25\n", " 1.0\n", @@ -917,10 +921,10 @@ " 531\n", " T\n", " Y\n", - " -0.021049\n", - " -0.048014\n", - " 
-0.044700\n", - " 0.124192\n", + " -0.022972\n", + " -0.054006\n", + " -0.049928\n", + " 0.135754\n", " 4\n", " 11.75\n", " 1.0\n", @@ -932,43 +936,43 @@ ], "text/plain": [ " site wildtype mutant log2 fold change IC90 mean \\\n", - "0 331 N A -0.159587 \n", - "1 331 N D -0.086087 \n", - "2 331 N E 0.034466 \n", - "3 331 N F 0.106040 \n", - "4 331 N G 0.283214 \n", + "0 331 N A -0.158794 \n", + "1 331 N D -0.096712 \n", + "2 331 N E 0.036998 \n", + "3 331 N F 0.104497 \n", + "4 331 N G 0.286305 \n", "... ... ... ... ... \n", - "2100 531 T S -0.068568 \n", + "2100 531 T S -0.072452 \n", "2101 531 T T 0.000000 \n", - "2102 531 T V 0.013326 \n", - "2103 531 T W 0.199249 \n", - "2104 531 T Y -0.021049 \n", + "2102 531 T V 0.011190 \n", + "2103 531 T W 0.198453 \n", + "2104 531 T Y -0.022972 \n", "\n", " log2 fold change IC90 median log2 fold change IC90 min_magnitude \\\n", - "0 -0.206684 -0.135967 \n", - "1 -0.054268 -0.002643 \n", - "2 0.016051 0.015768 \n", - "3 0.142322 0.141842 \n", - "4 0.282422 0.088191 \n", + "0 -0.192897 -0.115146 \n", + "1 -0.058956 -0.016521 \n", + "2 0.018060 0.017496 \n", + "3 0.142641 0.135784 \n", + "4 0.279946 0.103283 \n", "... ... ... \n", - "2100 -0.016541 -0.004995 \n", + "2100 -0.022248 -0.012782 \n", "2101 0.000000 0.000000 \n", - "2102 -0.002602 0.039361 \n", - "2103 0.119866 -0.009133 \n", - "2104 -0.048014 -0.044700 \n", + "2102 -0.008491 0.035284 \n", + "2103 0.121785 -0.009031 \n", + "2104 -0.054006 -0.049928 \n", "\n", " log2 fold change IC90 std n_models times_seen frac_models \n", - "0 0.257279 4 17.75 1.0 \n", - "1 0.101813 4 11.25 1.0 \n", - "2 0.067786 4 10.25 1.0 \n", - "3 0.287263 4 10.00 1.0 \n", - "4 0.163483 4 25.00 1.0 \n", + "0 0.270364 4 17.75 1.0 \n", + "1 0.106236 4 11.25 1.0 \n", + "2 0.073099 4 10.25 1.0 \n", + "3 0.288862 4 10.00 1.0 \n", + "4 0.160085 4 25.00 1.0 \n", "... ... ... ... ... 
\n", - "2100 0.144948 4 31.75 1.0 \n", + "2100 0.131153 4 31.75 1.0 \n", "2101 0.000000 4 NaN 1.0 \n", - "2102 0.236514 4 19.50 1.0 \n", - "2103 0.287019 4 5.25 1.0 \n", - "2104 0.124192 4 11.75 1.0 \n", + "2102 0.230686 4 19.50 1.0 \n", + "2103 0.290432 4 5.25 1.0 \n", + "2104 0.135754 4 11.75 1.0 \n", "\n", "[2105 rows x 10 columns]" ] @@ -1002,11 +1006,11 @@ "id": "b882943f", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:58.569000Z", - "iopub.status.busy": "2023-10-17T03:37:58.568790Z", - "iopub.status.idle": "2023-10-17T03:37:58.646779Z", - "shell.execute_reply": "2023-10-17T03:37:58.646098Z", - "shell.execute_reply.started": "2023-10-17T03:37:58.568984Z" + "iopub.execute_input": "2024-12-14T13:57:04.366300Z", + "iopub.status.busy": "2024-12-14T13:57:04.365736Z", + "iopub.status.idle": "2024-12-14T13:57:04.520174Z", + "shell.execute_reply": "2024-12-14T13:57:04.519573Z", + "shell.execute_reply.started": "2024-12-14T13:57:04.366249Z" } }, "outputs": [ @@ -1050,7 +1054,7 @@ " N\n", " A\n", " N331A\n", - " -0.090115\n", + " -0.083300\n", " 19\n", " avg2muts\n", " 1\n", @@ -1062,7 +1066,7 @@ " N\n", " D\n", " N331D\n", - " -0.110795\n", + " -0.104757\n", " 10\n", " avg2muts\n", " 1\n", @@ -1074,7 +1078,7 @@ " N\n", " E\n", " N331E\n", - " -0.001655\n", + " -0.004766\n", " 11\n", " avg2muts\n", " 1\n", @@ -1086,7 +1090,7 @@ " N\n", " F\n", " N331F\n", - " 0.037305\n", + " 0.033077\n", " 10\n", " avg2muts\n", " 1\n", @@ -1098,7 +1102,7 @@ " N\n", " G\n", " N331G\n", - " 0.031369\n", + " 0.046883\n", " 18\n", " avg2muts\n", " 1\n", @@ -1122,7 +1126,7 @@ " T\n", " R\n", " T531R\n", - " -0.033899\n", + " -0.041709\n", " 24\n", " avg3muts\n", " 2\n", @@ -1134,7 +1138,7 @@ " T\n", " S\n", " T531S\n", - " 0.000405\n", + " -0.015296\n", " 42\n", " avg3muts\n", " 2\n", @@ -1146,7 +1150,7 @@ " T\n", " V\n", " T531V\n", - " -0.023562\n", + " -0.029302\n", " 24\n", " avg3muts\n", " 2\n", @@ -1158,7 +1162,7 @@ " T\n", " W\n", " T531W\n", - " 
0.532476\n", + " 0.547600\n", " 6\n", " avg3muts\n", " 2\n", @@ -1170,7 +1174,7 @@ " T\n", " Y\n", " T531Y\n", - " -0.074142\n", + " -0.083607\n", " 9\n", " avg3muts\n", " 2\n", @@ -1182,17 +1186,17 @@ ], "text/plain": [ " epitope site wildtype mutant mutation escape times_seen library \\\n", - "0 1 331 N A N331A -0.090115 19 avg2muts \n", - "1 1 331 N D N331D -0.110795 10 avg2muts \n", - "2 1 331 N E N331E -0.001655 11 avg2muts \n", - "3 1 331 N F N331F 0.037305 10 avg2muts \n", - "4 1 331 N G N331G 0.031369 18 avg2muts \n", + "0 1 331 N A N331A -0.083300 19 avg2muts \n", + "1 1 331 N D N331D -0.104757 10 avg2muts \n", + "2 1 331 N E N331E -0.004766 11 avg2muts \n", + "3 1 331 N F N331F 0.033077 10 avg2muts \n", + "4 1 331 N G N331G 0.046883 18 avg2muts \n", "... ... ... ... ... ... ... ... ... \n", - "15429 2 531 T R T531R -0.033899 24 avg3muts \n", - "15430 2 531 T S T531S 0.000405 42 avg3muts \n", - "15431 2 531 T V T531V -0.023562 24 avg3muts \n", - "15432 2 531 T W T531W 0.532476 6 avg3muts \n", - "15433 2 531 T Y T531Y -0.074142 9 avg3muts \n", + "15429 2 531 T R T531R -0.041709 24 avg3muts \n", + "15430 2 531 T S T531S -0.015296 42 avg3muts \n", + "15431 2 531 T V T531V -0.029302 24 avg3muts \n", + "15432 2 531 T W T531W 0.547600 6 avg3muts \n", + "15433 2 531 T Y T531Y -0.083607 9 avg3muts \n", "\n", " replicate \n", "0 1 \n", @@ -1236,11 +1240,11 @@ "id": "b085a23f", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:37:58.653230Z", - "iopub.status.busy": "2023-10-17T03:37:58.653032Z", - "iopub.status.idle": "2023-10-17T03:38:01.042319Z", - "shell.execute_reply": "2023-10-17T03:38:01.041346Z", - "shell.execute_reply.started": "2023-10-17T03:37:58.653215Z" + "iopub.execute_input": "2024-12-14T13:57:04.521387Z", + "iopub.status.busy": "2024-12-14T13:57:04.520989Z", + "iopub.status.idle": "2024-12-14T13:57:08.444409Z", + "shell.execute_reply": "2024-12-14T13:57:08.443286Z", + "shell.execute_reply.started": "2024-12-14T13:57:04.521358Z" }, 
"tags": [] }, @@ -1250,28 +1254,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1344,11 +1350,11 @@ "id": "6afa045f-4a62-4b4d-8285-4b550823a022", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:38:01.045838Z", - "iopub.status.busy": "2023-10-17T03:38:01.045550Z", - "iopub.status.idle": "2023-10-17T03:38:03.419332Z", - "shell.execute_reply": "2023-10-17T03:38:03.418503Z", - "shell.execute_reply.started": "2023-10-17T03:38:01.045822Z" + "iopub.execute_input": "2024-12-14T13:57:08.445881Z", + "iopub.status.busy": "2024-12-14T13:57:08.445500Z", + "iopub.status.idle": "2024-12-14T13:57:12.311988Z", + "shell.execute_reply": "2024-12-14T13:57:12.311460Z", + "shell.execute_reply.started": "2024-12-14T13:57:08.445854Z" }, "tags": [] }, @@ -1358,28 +1364,29 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1454,25 +1462,93 @@ "id": "41957bd8-070a-41d7-b2f9-647eb7908a87", "metadata": { "execution": { - "iopub.execute_input": "2023-10-17T03:38:03.420623Z", - "iopub.status.busy": "2023-10-17T03:38:03.420289Z", - "iopub.status.idle": "2023-10-17T03:38:11.140080Z", - "shell.execute_reply": "2023-10-17T03:38:11.139368Z", - "shell.execute_reply.started": "2023-10-17T03:38:03.420605Z" + "iopub.execute_input": "2024-12-14T13:57:12.315322Z", + "iopub.status.busy": "2024-12-14T13:57:12.314996Z", + "iopub.status.idle": "2024-12-14T13:57:25.931459Z", + "shell.execute_reply": "2024-12-14T13:57:25.930700Z", + "shell.execute_reply.started": "2024-12-14T13:57:12.315299Z" } }, "outputs": [ { - "ename": "KeyError", - "evalue": "'addtl_slider_stats_as_max'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# NBVAL_IGNORE_OUTPUT\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mmodel_avg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmut_icXX_plot\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43maddtl_slider_stats\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtimes_seen\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mavg_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmedian\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/polyclonal/polyclonal/polyclonal_collection.py:1174\u001b[0m, in 
\u001b[0;36mPolyclonalCollection.mut_icXX_plot\u001b[0;34m(self, x, icXX_col, log_fold_change_icXX_col, min_c, max_c, logbase, check_wt_icXX, biochem_order_aas, df_to_merge, positive_color, negative_color, avg_type, init_n_models, per_model_tooltip, scale_stat_col, **kwargs)\u001b[0m\n\u001b[1;32m 1172\u001b[0m max_escape_std \u001b[38;5;241m=\u001b[39m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_df\u001b[39m\u001b[38;5;124m\"\u001b[39m][std_col]\u001b[38;5;241m.\u001b[39mmax()\n\u001b[1;32m 1173\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maddtl_slider_stats\u001b[39m\u001b[38;5;124m\"\u001b[39m][std_col] \u001b[38;5;241m=\u001b[39m max_escape_std\n\u001b[0;32m-> 1174\u001b[0m \u001b[43mkwargs\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maddtl_slider_stats_as_max\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mappend(std_col)\n\u001b[1;32m 1176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheatmap_min_at_least\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m kwargs:\n\u001b[1;32m 1177\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mheatmap_min_at_least\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m2\u001b[39m\n", - "\u001b[0;31mKeyError\u001b[0m: 'addtl_slider_stats_as_max'" - ] + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1484,13 +1560,935 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "d88f1d70-b6cd-4732-804e-54e380544b02", + "metadata": {}, + "source": [ + "## Escape values by region\n", + "In some cases, you may want to get the escape values only for a specific region of the protein for each model being averaged.\n", + "For instance, this may be the case if you covered half the protein in one library and the other half in another library.\n", + "\n", + "In this case, include a column in the models data frame that specifies the sites to use for each model, and pass the name of that column as `region_col` when initializing:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "432f2375-6ec6-4d62-8a8a-d768b618454b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:05.887056Z", + "iopub.status.busy": "2024-12-14T13:58:05.885968Z", + "iopub.status.idle": "2024-12-14T13:58:08.404744Z", + "shell.execute_reply": "2024-12-14T13:58:08.403665Z", + "shell.execute_reply.started": "2024-12-14T13:58:05.886989Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is the input dataframe specifying sites to keep for each model:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
libraryreplicatemodelsites_to_keep
0avg2muts1<polyclonal.polyclonal.Polyclonal object at 0x...(331, 332, 333, 334, 335, 336, 337, 338, 339, ...
1avg2muts2<polyclonal.polyclonal.Polyclonal object at 0x...(331, 332, 333, 334, 335, 336, 337, 338, 339, ...
2avg3muts1<polyclonal.polyclonal.Polyclonal object at 0x...[331, 332, 333, 334, 335, 336, 337, 338, 339, ...
3avg3muts2<polyclonal.polyclonal.Polyclonal object at 0x...[450, 451, 452, 453, 455, 456, 458, 459, 460, ...
\n", + "
" + ], + "text/plain": [ + " library replicate model \\\n", + "0 avg2muts 1 = 450]),\n", + " ],\n", + " columns=[\"library\", \"replicate\", \"sites_to_keep\"],\n", + ")\n", + "\n", + "models_region_df = models_df.merge(regions_df)\n", + "\n", + "print(\"Here is the input dataframe specifying sites to keep for each model:\")\n", + "display(models_region_df)\n", + "\n", + "model_region_avg = polyclonal.PolyclonalAverage(\n", + " models_region_df, region_col=\"sites_to_keep\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7417aeda-d6ab-4e30-a233-dc3d60d688c1", + "metadata": {}, + "source": [ + "We can see the number of sites in each region:" + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "73a195d1-4f21-4c25-a147-fdb51118e2c8", + "execution_count": 15, + "id": "c9b166d0-714d-4e74-b1ff-520f958c952c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:08.615016Z", + "iopub.status.busy": "2024-12-14T13:58:08.614534Z", + "iopub.status.idle": "2024-12-14T13:58:08.620439Z", + "shell.execute_reply": "2024-12-14T13:58:08.619707Z", + "shell.execute_reply.started": "2024-12-14T13:58:08.614987Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "desc={'library': 'avg2muts', 'replicate': 1}, len(sites)=173, min(sites)=331, max(sites)=531\n", + "desc={'library': 'avg2muts', 'replicate': 2}, len(sites)=173, min(sites)=331, max(sites)=531\n", + "desc={'library': 'avg3muts', 'replicate': 1}, len(sites)=112, min(sites)=331, max(sites)=460\n", + "desc={'library': 'avg3muts', 'replicate': 2}, len(sites)=70, min(sites)=450, max(sites)=531\n" + ] + } + ], + "source": [ + "for desc, sites in zip(model_region_avg.model_descriptors, model_region_avg.regions):\n", + " print(f\"{desc=}, {len(sites)=}, {min(sites)=}, {max(sites)=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ebce9460-faeb-4580-b4ba-03f222d41526", "metadata": {}, + "source": [ + "We can also get the number of models per 
site.\n", + "Based on how we initialized, this is 3 for all sites except those between 450 and 460 where it is 4:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cb0401b7-d454-4c52-9ef2-2791fe32528f", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:10.327513Z", + "iopub.status.busy": "2024-12-14T13:58:10.326870Z", + "iopub.status.idle": "2024-12-14T13:58:10.333416Z", + "shell.execute_reply": "2024-12-14T13:58:10.332243Z", + "shell.execute_reply.started": "2024-12-14T13:58:10.327475Z" + } + }, "outputs": [], - "source": [] + "source": [ + "assert model_region_avg.sites == model_avg.sites\n", + "\n", + "assert all(\n", + " model_region_avg.n_models_by_site[r] == 3 + (450 <= r <= 460)\n", + " for r in model_region_avg.sites\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "60cf491d-20ca-47ee-a2a6-1b20f04c568c", + "metadata": {}, + "source": [ + "Now look at the mutation-escape data frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23a4e53f-01fb-418d-8dfd-467a99e61a35", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:12.061035Z", + "iopub.status.busy": "2024-12-14T13:58:12.060339Z", + "iopub.status.idle": "2024-12-14T13:58:12.688630Z", + "shell.execute_reply": "2024-12-14T13:58:12.687911Z", + "shell.execute_reply.started": "2024-12-14T13:58:12.060983Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epitopesitewildtypemutantmutationescape_meanescape_medianescape_min_magnitudeescape_stdn_modelstimes_seenfrac_models
01331NAN331A-0.188953-0.0833000.0250480.282079316.6666671.0
11331NDN331D-0.095844-0.104757-0.0746150.018464310.6666671.0
21331NEN331E-0.004743-0.0047660.0023520.00708439.3333331.0
31331NFN331F0.3617580.3131570.0330770.35548339.0000001.0
41331NGN331G0.0944740.046883-0.0391760.162750325.6666671.0
.......................................
38592531TRT531R-0.089931-0.069566-0.0417090.061009325.6666671.0
38602531TST531S-0.009686-0.0152960.0123010.019788329.0000001.0
38612531TVT531V-0.063681-0.0293020.0111690.096735319.3333331.0
38622531TWT531W0.2502700.207922-0.0047110.27858034.3333331.0
38632531TYT531Y-0.054589-0.041310-0.0388480.025161311.3333331.0
\n", + "

3864 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " epitope site wildtype mutant mutation escape_mean escape_median \\\n", + "0 1 331 N A N331A -0.188953 -0.083300 \n", + "1 1 331 N D N331D -0.095844 -0.104757 \n", + "2 1 331 N E N331E -0.004743 -0.004766 \n", + "3 1 331 N F N331F 0.361758 0.313157 \n", + "4 1 331 N G N331G 0.094474 0.046883 \n", + "... ... ... ... ... ... ... ... \n", + "3859 2 531 T R T531R -0.089931 -0.069566 \n", + "3860 2 531 T S T531S -0.009686 -0.015296 \n", + "3861 2 531 T V T531V -0.063681 -0.029302 \n", + "3862 2 531 T W T531W 0.250270 0.207922 \n", + "3863 2 531 T Y T531Y -0.054589 -0.041310 \n", + "\n", + " escape_min_magnitude escape_std n_models times_seen frac_models \n", + "0 0.025048 0.282079 3 16.666667 1.0 \n", + "1 -0.074615 0.018464 3 10.666667 1.0 \n", + "2 0.002352 0.007084 3 9.333333 1.0 \n", + "3 0.033077 0.355483 3 9.000000 1.0 \n", + "4 -0.039176 0.162750 3 25.666667 1.0 \n", + "... ... ... ... ... ... \n", + "3859 -0.041709 0.061009 3 25.666667 1.0 \n", + "3860 0.012301 0.019788 3 29.000000 1.0 \n", + "3861 0.011169 0.096735 3 19.333333 1.0 \n", + "3862 -0.004711 0.278580 3 4.333333 1.0 \n", + "3863 -0.038848 0.025161 3 11.333333 1.0 \n", + "\n", + "[3864 rows x 12 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "model_region_avg.mut_escape_df" + ] + }, + { + "cell_type": "markdown", + "id": "7d508c99-bbe3-4b7e-aad6-051bbe6e3652", + "metadata": {}, + "source": [ + "For the sites where all four models are active, this will be the same as the model without regions:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "24873342-2d66-4161-bdfa-b521520d46aa", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:17.022473Z", + "iopub.status.busy": "2024-12-14T13:58:17.021756Z", + "iopub.status.idle": "2024-12-14T13:58:18.175505Z", + "shell.execute_reply": "2024-12-14T13:58:18.174246Z", + 
"shell.execute_reply.started": "2024-12-14T13:58:17.022415Z" + } + }, + "outputs": [], + "source": [ + "assert (\n", + " model_avg.mut_escape_df.query(\"(site >= 450) and (site <= 460)\").equals(\n", + " model_region_avg.mut_escape_df.query(\"(site >= 450) and (site <= 460)\")\n", + " )\n", + " is True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6f9d763f-595f-46f4-98eb-e6d041ca3406", + "metadata": {}, + "source": [ + "But they differ at other sites:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4bacc14c-d91c-4622-8fb8-0e2870a85631", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:18.926225Z", + "iopub.status.busy": "2024-12-14T13:58:18.925480Z", + "iopub.status.idle": "2024-12-14T13:58:20.099447Z", + "shell.execute_reply": "2024-12-14T13:58:20.098626Z", + "shell.execute_reply.started": "2024-12-14T13:58:18.926167Z" + } + }, + "outputs": [], + "source": [ + "assert model_avg.mut_escape_df.equals(model_region_avg.mut_escape_df) is False" + ] + }, + { + "cell_type": "markdown", + "id": "33a6c146-155e-4417-a6d5-6a9fc383a9bf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-13T22:22:32.690820Z", + "iopub.status.busy": "2024-12-13T22:22:32.690497Z", + "iopub.status.idle": "2024-12-13T22:22:33.327740Z", + "shell.execute_reply": "2024-12-13T22:22:33.327038Z", + "shell.execute_reply.started": "2024-12-13T22:22:32.690797Z" + } + }, + "source": [ + "Same for the mutation IC50 values:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d881a40f-2f3e-4fe1-aa88-ddd82e3d68f6", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:21.139211Z", + "iopub.status.busy": "2024-12-14T13:58:21.138424Z", + "iopub.status.idle": "2024-12-14T13:58:27.843031Z", + "shell.execute_reply": "2024-12-14T13:58:27.842292Z", + "shell.execute_reply.started": "2024-12-14T13:58:21.139149Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sitewildtypemutantlog_fold_change_IC50 meanlog_fold_change_IC50 medianlog_fold_change_IC50 min_magnitudelog_fold_change_IC50 stdn_modelstimes_seenfrac_models
0331NA-0.278667-0.192977-0.1424770.193805316.6666671.0
1331ND-0.110483-0.0663450.0059800.143709310.6666671.0
2331NE0.0035400.0196660.0196660.03234739.3333331.0
3331NF0.1918090.1686990.0762060.12872439.0000001.0
4331NG0.3332710.3938550.1099610.200021325.6666671.0
.................................
2100531TS-0.0095800.0029360.0029360.022277329.0000001.0
2101531TT0.0000000.0000000.0000000.0000003NaN1.0
2102531TV-0.090553-0.0483690.0251480.141588319.3333331.0
2103531TW0.3095710.280769-0.0078630.33277034.3333331.0
2104531TY-0.084084-0.059005-0.0531400.048605311.3333331.0
\n", + "

2105 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " site wildtype mutant log_fold_change_IC50 mean \\\n", + "0 331 N A -0.278667 \n", + "1 331 N D -0.110483 \n", + "2 331 N E 0.003540 \n", + "3 331 N F 0.191809 \n", + "4 331 N G 0.333271 \n", + "... ... ... ... ... \n", + "2100 531 T S -0.009580 \n", + "2101 531 T T 0.000000 \n", + "2102 531 T V -0.090553 \n", + "2103 531 T W 0.309571 \n", + "2104 531 T Y -0.084084 \n", + "\n", + " log_fold_change_IC50 median log_fold_change_IC50 min_magnitude \\\n", + "0 -0.192977 -0.142477 \n", + "1 -0.066345 0.005980 \n", + "2 0.019666 0.019666 \n", + "3 0.168699 0.076206 \n", + "4 0.393855 0.109961 \n", + "... ... ... \n", + "2100 0.002936 0.002936 \n", + "2101 0.000000 0.000000 \n", + "2102 -0.048369 0.025148 \n", + "2103 0.280769 -0.007863 \n", + "2104 -0.059005 -0.053140 \n", + "\n", + " log_fold_change_IC50 std n_models times_seen frac_models \n", + "0 0.193805 3 16.666667 1.0 \n", + "1 0.143709 3 10.666667 1.0 \n", + "2 0.032347 3 9.333333 1.0 \n", + "3 0.128724 3 9.000000 1.0 \n", + "4 0.200021 3 25.666667 1.0 \n", + "... ... ... ... ... 
\n", + "2100 0.022277 3 29.000000 1.0 \n", + "2101 0.000000 3 NaN 1.0 \n", + "2102 0.141588 3 19.333333 1.0 \n", + "2103 0.332770 3 4.333333 1.0 \n", + "2104 0.048605 3 11.333333 1.0 \n", + "\n", + "[2105 rows x 10 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "region_ic50_df = model_region_avg.mut_icXX_df(\n", + " x=0.5, icXX_col=\"IC50\", log_fold_change_icXX_col=\"log_fold_change_IC50\"\n", + ")\n", + "\n", + "region_ic50_df" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a5057dd3-1b05-4dd6-8f15-46905ba0b8eb", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:58:27.845077Z", + "iopub.status.busy": "2024-12-14T13:58:27.844710Z", + "iopub.status.idle": "2024-12-14T13:58:34.479506Z", + "shell.execute_reply": "2024-12-14T13:58:34.478531Z", + "shell.execute_reply.started": "2024-12-14T13:58:27.845052Z" + } + }, + "outputs": [], + "source": [ + "ic50_df = model_avg.mut_icXX_df(\n", + " x=0.5, icXX_col=\"IC50\", log_fold_change_icXX_col=\"log_fold_change_IC50\"\n", + ")\n", + "\n", + "assert (\n", + " region_ic50_df.query(\"(site >= 450) and (site <= 460)\").equals(\n", + " ic50_df.query(\"(site >= 450) and (site <= 460)\")\n", + " )\n", + " is True\n", + ")\n", + "\n", + "assert region_ic50_df.equals(ic50_df) is False" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cd4013b1-18ce-4a96-8d76-264057de2d66", + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-14T13:59:04.711649Z", + "iopub.status.busy": "2024-12-14T13:59:04.710925Z", + "iopub.status.idle": "2024-12-14T13:59:08.741647Z", + "shell.execute_reply": "2024-12-14T13:59:08.740860Z", + "shell.execute_reply.started": "2024-12-14T13:59:04.711591Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "\n", + "model_region_avg.mut_escape_plot(addtl_slider_stats={\"times_seen\": 2})" + ] } ], "metadata": { @@ -1509,7 +2507,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/polyclonal/__init__.py b/polyclonal/__init__.py index ed9a156..734610c 100644 --- a/polyclonal/__init__.py +++ b/polyclonal/__init__.py @@ -31,7 +31,7 @@ __author__ = "`the Bloom lab `_" __email__ = "jbloom@fredhutch.org" -__version__ = "6.12" +__version__ = "6.13" __url__ = "https://github.com/jbloomlab/polyclonal" from polyclonal.alphabets import AAS diff --git a/polyclonal/plot.py b/polyclonal/plot.py index 0a1d735..4d4f829 100644 --- a/polyclonal/plot.py +++ b/polyclonal/plot.py @@ -945,7 +945,7 @@ def replace_std(col): base_chart = base_chart.transform_calculate( _stat_not_hidden=alt.expr.if_( alt.datum["_stat_hide"], - data_df[stat_col].min(), + float(data_df[stat_col].min()), alt.datum["_stat"], ), ).transform_joinaggregate( diff --git a/polyclonal/polyclonal_collection.py b/polyclonal/polyclonal_collection.py index 748bc17..c711648 100644 --- a/polyclonal/polyclonal_collection.py +++ b/polyclonal/polyclonal_collection.py @@ -268,6 +268,12 @@ class PolyclonalCollection: for each row must be unique. default_avg_to_plot : {"mean", "median"} By default when plotting, plot either "mean" or "median". + region_col : None or str + Use this option if you want to only include sites in a specific region of the + protein for specific models (this is useful for instance if you split the + protein into halves in two different libraries). In this case, `region_col` + should be a column in `models_df` with the values being the list of sites + to use for that specific model. 
Attributes ---------- @@ -299,6 +305,16 @@ class PolyclonalCollection: All sites for which the model is defined. default_avg_to_plot : {"mean", "median"} By default when plotting, plot either "mean" or "median". + regions : list + List of same length as :attr:`PolyclonalCollection.models` with each entry being + the set of sites that are being used in returned results for that model. If + `region_col` is `None`, this is all sites (:attr:`PolyclonalCollection.sites`), + but if `region_col` is used to define regions for different models then + the sets of sites may differ between models. + n_models_by_site : dict + Keyed by each site in :attr:`PolyclonalCollection.sites`, with the value being + the number of models whose region includes that site (this will + just be the number of models when not using `region_col`). Example ------- @@ -327,10 +343,35 @@ class PolyclonalCollection: ... ) >>> model_collection.sites (3, 5, 6) + >>> model_collection.regions == [{3, 5, 6}, {3, 5, 6}] + True + >>> model_collection.n_models_by_site + {3: 2, 5: 2, 6: 2} + + Now create a toy example with different regions for each model: + + >>> models_df = pd.DataFrame( + ... { + ... "model": [ + ... polyclonal.Polyclonal(data_to_fit=data_to_fit, n_epitopes=1), + ... polyclonal.Polyclonal(data_to_fit=data_to_fit2, n_epitopes=1), + ... ], + ... "description": ["model_1", "model_2"], + ... "region": [[3, 5], [3, 5, 6]], + ... } + ... ) + >>> model_region = polyclonal.PolyclonalCollection( + ... models_df, default_avg_to_plot="mean", region_col="region", + ... 
) + >>> model_region.sites + (3, 5, 6) + >>> assert model_region.regions == [{3, 5}, {3, 5, 6}], model_region.regions + >>> model_region.n_models_by_site + {3: 2, 5: 2, 6: 1} """ - def __init__(self, models_df, *, default_avg_to_plot): + def __init__(self, models_df, *, default_avg_to_plot, region_col=None): """See main class docstring for details.""" if default_avg_to_plot not in {"mean", "median"}: raise ValueError(f"invalid {default_avg_to_plot=}") @@ -341,14 +382,26 @@ def __init__(self, models_df, *, default_avg_to_plot): raise ValueError(f"No models:\n{models_df=}") descriptors_df = models_df.drop(columns="model").reset_index(drop=True) + + if region_col is not None: + if region_col not in models_df.columns: + raise ValueError(f"{region_col=} not in {models_df.columns=}") + self.regions = [set(region) for region in models_df[region_col]] + descriptors_df = descriptors_df.drop(columns=region_col) + self._has_regions = True + else: + self._has_regions = False + if not len(descriptors_df.columns): raise ValueError("not descriptor columns in `models_df`") self.descriptor_names = descriptors_df.columns.tolist() self.unique_descriptor_names = [ name for name in self.descriptor_names - if descriptors_df[name].nunique(dropna=False) > 1 + if descriptors_df[name].nunique(dropna=False) > 1 or len(self.models) == 1 ] + if not len(self.unique_descriptor_names): + raise ValueError("no `unique_descriptor_names`") if len(descriptors_df.drop_duplicates()) != len(self.models): raise ValueError("some models have the same descriptors") self.model_descriptors = list(descriptors_df.to_dict(orient="index").values()) @@ -372,6 +425,16 @@ def __init__(self, models_df, *, default_avg_to_plot): assert all(isinstance(r, int) for r in sites), sites self.sites = tuple(sites) + if region_col is None: + self.regions = [set(self.sites) for _ in self.models] + for i, region in enumerate(self.regions): + if not region.issubset(self.sites): + raise ValueError(f"for model {i + 1}, {region - 
set(self.sites)=}") + + self.n_models_by_site = { + site: sum(site in region for region in self.regions) for site in self.sites + } + @property def activity_wt_df_replicates(self): """pandas.DataFrame: Epitope activities for all models.""" @@ -577,8 +640,10 @@ def mut_escape_df_replicates(self): """pandas.DataFrame: Mutation escape by model.""" return pd.concat( [ - m.mut_escape_df.assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_escape_df.query("site in @sites").assign(**desc) + for m, desc, sites, in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) @@ -599,8 +664,10 @@ def mut_icXX_df_replicates(self, **kwargs): """ return pd.concat( [ - m.mut_icXX_df(**kwargs).assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_icXX_df(**kwargs).query("site in @sites").assign(**desc) + for m, desc, sites in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) @@ -626,7 +693,9 @@ def mut_escape_df(self): ) .aggregate(**aggs) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), # make categorical to sort, then return to original type epitope=lambda x: pd.Categorical( x["epitope"], @@ -685,7 +754,9 @@ def mut_icXX_df(self, **kwargs): df.groupby(["site", "wildtype", "mutant"], as_index=False) .aggregate(**aggs) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), # make categorical to sort, then return to original type site=lambda x: pd.Categorical( x["site"], @@ -1012,7 +1083,7 @@ def mut_escape_plot( ) if init_n_models is None: - init_n_models = int(math.ceil(len(self.models) / 2)) + init_n_models = int(math.ceil(min(self.n_models_by_site.values()) / 2)) if "n_models" not in kwargs["addtl_slider_stats"]: kwargs["addtl_slider_stats"]["n_models"] = 
init_n_models @@ -1151,7 +1222,7 @@ def mut_icXX_plot( kwargs["addtl_slider_stats_as_max"] = [] if init_n_models is None: - init_n_models = int(math.ceil(len(self.models) / 2)) + init_n_models = int(math.ceil(min(self.n_models_by_site.values()) / 2)) kwargs["addtl_slider_stats"]["n_models"] = init_n_models kwargs["data_df"] = polyclonal.Polyclonal._merge_df_to_merge( @@ -1209,8 +1280,12 @@ def mut_escape_site_summary_df_replicates(self, **kwargs): """ return pd.concat( [ - m.mut_escape_site_summary_df(**kwargs).assign(**desc) - for m, desc in zip(self.models, self.model_descriptors) + m.mut_escape_site_summary_df(**kwargs) + .query("site in @sites") + .assign(**desc) + for m, desc, sites in zip( + self.models, self.model_descriptors, self.regions + ) ], ignore_index=True, ) @@ -1252,7 +1327,9 @@ def mut_escape_site_summary_df(self, **kwargs): n_models=pd.NamedAgg("escape", "count"), ) .assign( - frac_models=lambda x: x["n_models"] / len(self.models), + frac_models=lambda x: ( + x["n_models"] / x["site"].map(self.n_models_by_site) + ), ) .merge( df.groupby(["epitope", "site"]).aggregate({"n mutations": "mean"}), @@ -1300,6 +1377,8 @@ def icXX_replicates(self, variants_df, **kwargs): model are missing in that row. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") return pd.concat( [ m.icXX(m.filter_variants_by_seen_muts(variants_df), **kwargs).assign( @@ -1329,6 +1408,8 @@ def icXX(self, variants_df, **kwargs): icXX and summary stats for each variant across all models. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") if "col" in kwargs: col = kwargs["col"] else: @@ -1378,6 +1459,8 @@ def prob_escape_replicates(self, variants_df, **kwargs): missing in that row. 
""" + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") return pd.concat( [ m.prob_escape( @@ -1411,6 +1494,8 @@ def prob_escape(self, variants_df, **kwargs): for each variant at each concentration across models. """ + if self._has_regions: + raise ValueError("Cannot use this method when defining per-model regions") variants_df = variants_df.drop_duplicates() return ( self.prob_escape_replicates(variants_df=variants_df, **kwargs) @@ -1439,6 +1524,8 @@ class PolyclonalAverage(PolyclonalCollection): Same meaning as for :class:`PolyclonalCollection`. However, the resulting collection of models will have **copies** of these models rather than the actual objects in `models_df`. + region_col : str or None + Same meaning as for :class:`PolyclonalCollection`. harmonize_to : :class:`PolyclonalCollection` or None When harmonizing the epitopes, harmonize to this model. If `None`, just harmonize to the first model in `models_df`. @@ -1452,7 +1539,14 @@ class PolyclonalAverage(PolyclonalCollection): """ - def __init__(self, models_df, *, harmonize_to=None, default_avg_to_plot="median"): + def __init__( + self, + models_df, + *, + region_col=None, + harmonize_to=None, + default_avg_to_plot="median", + ): """See main class docstring.""" if not len(models_df): raise ValueError("no models in `model_df`") @@ -1463,7 +1557,9 @@ def __init__(self, models_df, *, harmonize_to=None, default_avg_to_plot="median" m.epitope_harmonized_model(harmonize_to)[0] for m in models_df["model"] ] - super().__init__(models_df, default_avg_to_plot=default_avg_to_plot) + super().__init__( + models_df, region_col=region_col, default_avg_to_plot=default_avg_to_plot + ) class PolyclonalBootstrap(PolyclonalCollection):