From 5a630dc3f4c77285ec17179d88eabd88587435f6 Mon Sep 17 00:00:00 2001 From: hugues_esc Date: Fri, 21 Nov 2025 14:14:07 +0100 Subject: [PATCH 1/5] feat: improve docs fix: cuda problem --- .DS_Store | Bin 0 -> 6148 bytes README.md | 120 ++++++++++++++++++++++++++++++++++++----- inference/act_model.py | 4 +- models/.DS_Store | Bin 0 -> 6148 bytes training/.DS_Store | Bin 0 -> 6148 bytes 5 files changed, 108 insertions(+), 16 deletions(-) create mode 100644 .DS_Store create mode 100644 models/.DS_Store create mode 100644 training/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e5ea10e1eb51c4bcfed6a5e715a0dfdedd98ad82 GIT binary patch literal 6148 zcmeHLO-lkn7=9;3tqytYxECV`0wJj5Cfg;t*~cX)ztX^UB^5gEsUINpCpz^n`X523 zE?tAbo_A){S$ExINX!c}&+fd>%sxKj&Nvd0sEnFrq9PF$P#Cits1kznTw<}NTeg9M z$LO}p)kfg=V@%s%9k33pq67TyW@(?=bWcO`{mmV$J()Q_@Vt6AsH3g0vvqZ}*LgdR z=9{VcerLWmIzg{g+5{k)gA4Q=D8X}g{nIV zefSXi$wHq{gnT;2cO;#JtI*cg0qa22fsDE>^8SCa{QMv7Wbdp4)`5TJfXI4PuY!-H z_tw~l -## Create the CataPro environment -To run CataPro, you should create a conda environment that includes the following packages: +--- - pytorch >= 1.13.0 - transformers - numpy - pandas - RDKit +## Installation -In addition, CataPro also relies on additional pre-trained models, including [prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) and [molt5-base-smiles2caption](https://huggingface.co/laituan245/molt5-base-smiles2caption). These two models are used for extracting features from enzymes and substrates, respectively. You need to place the weights for these two pre-trained models in the `models` directory. +## Setup a Python environment + +To ensure a clean and isolated setup, we recommend to use [uv](https://docs.astral.sh/uv/), a lightweight tool that simplifies Python environment and package management. 
If you don’t have it yet: + +```bash +# macOS / Linux +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +```powershell +# Windows +powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" +$env:Path += ";$env:USERPROFILE\.local\bin" +``` + +Create and activate a virtual environment with uv: + +```bash +# macOS / Linux +uv venv +source .venv/bin/activate +``` + +```powershell +# Windows +uv venv +.venv\Scripts\activate +``` + +## Install dependencies + +```bash +uv pip install torch transformers numpy pandas RDKit sentencepiece +``` + +### 2. Clone the CataPro repository + +```bash +git clone https://github.com/zchwang/CataPro +``` + +### 3. Set up Git LFS + +CataPro uses [Git Large File Storage (LFS)](https://git-lfs.github.com/) to handle large model files. +If Git LFS is not installed yet, install it first (see the link above), then enable it for your user account with the following command: + +```bash +git lfs install +``` + +### 4. Download the models + +In addition, CataPro also relies on additional pre-trained models, including [prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) and [molt5-base-smiles2caption](https://huggingface.co/laituan245/molt5-base-smiles2caption). These two models are used for extracting features from enzymes and substrates, respectively. + +!!! warning + + The models prot_t5_xl_uniref50 and molt5-base-smiles2caption required for CataPro are 64 and 1.9 GB, respectively. + +```bash +# macOS / Linux +cd CataPro/models/ + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Rostlab/prot_t5_xl_uniref50 +cd prot_t5_xl_uniref50 +git lfs pull + +cd .. + +GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/laituan245/molt5-base-smiles2caption +cd molt5-base-smiles2caption +git lfs pull + +cd ../.. +``` + +```powershell +# Windows (run from inside the CataPro\models directory) +git -c filter.lfs.smudge= -c filter.lfs.required=false clone https://huggingface.co/Rostlab/prot_t5_xl_uniref50 +cd prot_t5_xl_uniref50 +git lfs pull + +cd .. 
+ +git -c filter.lfs.smudge= -c filter.lfs.required=false clone https://huggingface.co/laituan245/molt5-base-smiles2caption +cd molt5-base-smiles2caption +git lfs pull + +cd ../.. +``` + +--- ## Contact Zechen Wang, PhD, Shandong University, wangzch97@gmail.com

+--- + ## Usage ### 1. Prepare the input files for inference Enzyme and substrate information should be organized in a DataFrame created with pandas (in CSV format). Each enzyme-substrate pair must include the Enzyme_id, type (wild-type or mutant), the enzyme sequence, and the substrate's SMILES. The format is as follows: @@ -33,14 +120,19 @@ You can also refer to a sample file samples/sample_inp.csv ### 2. Next, you can use the following command to run CataPro to infer the kinetic parameters of the enzymatic reaction: - python predict.py \ - -inp_fpath samples/sample_inp.csv \ - -model_dpath models \ - -batch_size 64 \ - -device cuda:0 \ - -out_fpath catapro_prediction.csv +```bash +# In CataPro folder +python inference/predict.py \ + -inp_fpath samples/sample_inp.csv \ + -model_dpath models \ + -batch_size 64 \ + -device cuda:0 \ + -out_fpath catapro_prediction.csv +``` Finally, the prediction results from CataPro are stored in the "catapro_prediction.csv" file. You can also run "bash run_catapro.sh" directly in the inference directory to achieve the above process. +--- + ## Question and Answer To be updated ... 
diff --git a/inference/act_model.py b/inference/act_model.py index e0e2442..7e8c883 100755 --- a/inference/act_model.py +++ b/inference/act_model.py @@ -50,8 +50,8 @@ def __init__(self, rate=0.0, alpha=0.4, device="cuda:0"): super(ActivityModel, self).__init__() self.alpha = alpha - self.kcat_model = KcatModel().to(device) - self.Km_model = KmModel().to(device) + self.kcat_model = KcatModel(device=device).to(device) + self.Km_model = KmModel(device=device).to(device) self.prot_norm = nn.BatchNorm1d(1024).to(device) self.molt5_norm = nn.BatchNorm1d(768).to(device) diff --git a/models/.DS_Store b/models/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0b8386f1d6aabab3b9ccc1f6e1e9c33d7495768f GIT binary patch literal 6148 zcmeHKu};G<5WQ4kR;7538U)xkk zS|)_xPO_h4zq4OnqBtfZp1!0Lq7f1GQ3N{!R1G2HQfkrBo^v3(IdUo}r4?oLRAXDm zZ*)MO-5JekNzZggmFJf|Op;}hZ$xzWV=OO7BQG!Fc##zgjQ2m^qVRhCe$&oPzs=pO ztLvATl7{pMo%3T#Ih+Z#g;~}5x=OZMTiJWHr`c7#9Eu1U&#z%>i@39N0M^--iH2Fj6c%x=#lxcLV_T(Cq?k z`BjvhKrvD*Jz@mGJ{9OwS&ta()6pL&FH$T$`gCGFe6X&}dO~5hI_3`?ojCGnwK-r8 z96HeTmm}H#Z`;rRhe>v24wwUf$^q`hNj$|Z>E2qqIoWFilzS8j;gueDDQLK@7+l$k cuTWf|Kad76QY<~92Vp+~&IYT@fgg3?3kaQh>Hq)$ literal 0 HcmV?d00001 diff --git a/training/.DS_Store b/training/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9fd0380676ec0be02b10c9977bd96bd96aa121c6 GIT binary patch literal 6148 zcmeHKze~eV5WZJ~MC{PfF}rl=CN9PjCl{fhquN#sn2;jsCgi{K&*?wl;CJt?Y0{*{ zMG?7!d*93b$h&+gxgHU@@#}mF4#yar$xI^v`$t7is;safMXsug;}gD(He!b@$pe4&PCux7X`QmlaRyTc5q@>o|uw zGBE~>0b}4N7(mZv2@eHrH3p0UW1wPyzYh_VF;*-E{ig$q9|3>?%wDkPS%PyS#aOWv zga_gz6)34rM+_(FutypfE0%(iPELmprzbm|P@F#<_m6xyxmeIvW55__GO#C?1MdG9 z+vopgl3f`C#=yT~zy--98RL<(x3(URdu;^0gR*d3DOjao<6ALexfLHly}%yv1{f=r Sg0MjBkASDa7GvO78TbO+<7QF- literal 0 HcmV?d00001 From 3879d15c785a40c1f8e53e9d7c9b8a44f432e61c Mon Sep 17 00:00:00 2001 From: Hugues Esc_ <85628846+h-escoffier@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:15:11 +0100 Subject: [PATCH 2/5] 
Delete .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index e5ea10e1eb51c4bcfed6a5e715a0dfdedd98ad82..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHLO-lkn7=9;3tqytYxECV`0wJj5Cfg;t*~cX)ztX^UB^5gEsUINpCpz^n`X523 zE?tAbo_A){S$ExINX!c}&+fd>%sxKj&Nvd0sEnFrq9PF$P#Cits1kznTw<}NTeg9M z$LO}p)kfg=V@%s%9k33pq67TyW@(?=bWcO`{mmV$J()Q_@Vt6AsH3g0vvqZ}*LgdR z=9{VcerLWmIzg{g+5{k)gA4Q=D8X}g{nIV zefSXi$wHq{gnT;2cO;#JtI*cg0qa22fsDE>^8SCa{QMv7Wbdp4)`5TJfXI4PuY!-H z_tw~l Date: Fri, 21 Nov 2025 14:15:26 +0100 Subject: [PATCH 3/5] Delete models/.DS_Store --- models/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 models/.DS_Store diff --git a/models/.DS_Store b/models/.DS_Store deleted file mode 100644 index 0b8386f1d6aabab3b9ccc1f6e1e9c33d7495768f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKu};G<5WQ4kR;7538U)xkk zS|)_xPO_h4zq4OnqBtfZp1!0Lq7f1GQ3N{!R1G2HQfkrBo^v3(IdUo}r4?oLRAXDm zZ*)MO-5JekNzZggmFJf|Op;}hZ$xzWV=OO7BQG!Fc##zgjQ2m^qVRhCe$&oPzs=pO ztLvATl7{pMo%3T#Ih+Z#g;~}5x=OZMTiJWHr`c7#9Eu1U&#z%>i@39N0M^--iH2Fj6c%x=#lxcLV_T(Cq?k z`BjvhKrvD*Jz@mGJ{9OwS&ta()6pL&FH$T$`gCGFe6X&}dO~5hI_3`?ojCGnwK-r8 z96HeTmm}H#Z`;rRhe>v24wwUf$^q`hNj$|Z>E2qqIoWFilzS8j;gueDDQLK@7+l$k cuTWf|Kad76QY<~92Vp+~&IYT@fgg3?3kaQh>Hq)$ From 81920176b13a2ca3877b90811b09a446ce0ca904 Mon Sep 17 00:00:00 2001 From: Hugues Esc_ <85628846+h-escoffier@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:15:40 +0100 Subject: [PATCH 4/5] Delete training/.DS_Store --- training/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 training/.DS_Store diff --git a/training/.DS_Store b/training/.DS_Store deleted file mode 100644 index 
9fd0380676ec0be02b10c9977bd96bd96aa121c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKze~eV5WZJ~MC{PfF}rl=CN9PjCl{fhquN#sn2;jsCgi{K&*?wl;CJt?Y0{*{ zMG?7!d*93b$h&+gxgHU@@#}mF4#yar$xI^v`$t7is;safMXsug;}gD(He!b@$pe4&PCux7X`QmlaRyTc5q@>o|uw zGBE~>0b}4N7(mZv2@eHrH3p0UW1wPyzYh_VF;*-E{ig$q9|3>?%wDkPS%PyS#aOWv zga_gz6)34rM+_(FutypfE0%(iPELmprzbm|P@F#<_m6xyxmeIvW55__GO#C?1MdG9 z+vopgl3f`C#=yT~zy--98RL<(x3(URdu;^0gR*d3DOjao<6ALexfLHly}%yv1{f=r Sg0MjBkASDa7GvO78TbO+<7QF- From 6fc96eeb2faac068c252a53e6286c522cf6873ff Mon Sep 17 00:00:00 2001 From: hugues_esc Date: Fri, 21 Nov 2025 14:18:58 +0100 Subject: [PATCH 5/5] feat: add warning message in the readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6bd4792..3d94fa0 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,9 @@ git lfs install In addition, CataPro also relies on additional pre-trained models, including [prot_t5_xl_uniref50](https://huggingface.co/Rostlab/prot_t5_xl_uniref50) and [molt5-base-smiles2caption](https://huggingface.co/laituan245/molt5-base-smiles2caption). These two models are used for extracting features from enzymes and substrates, respectively. -!!! warning - - The models prot_t5_xl_uniref50 and molt5-base-smiles2caption required for CataPro are 64 and 1.9 GB, respectively. +> [!WARNING] +> The models prot_t5_xl_uniref50 and molt5-base-smiles2caption required for CataPro are 64 and 1.9 GB, +> respectively. ```bash # macOS / Linux