diff --git a/.github/workflows/release-to-pypi.yml b/.github/workflows/release-to-pypi.yml index 19a23ca..ddb4340 100644 --- a/.github/workflows/release-to-pypi.yml +++ b/.github/workflows/release-to-pypi.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - test-pypi paths-ignore: - "README.md" @@ -32,9 +33,33 @@ jobs: name: python-package-distributions path: dist/ + publish-to-testpypi: + name: Publish Python 🐍 distribution 📦 to TestPyPI + if: github.ref == 'refs/heads/test-pypi' + needs: + - build + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/plcg + permissions: + id-token: write + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + publish-to-pypi: name: >- Publish Python 🐍 distribution 📦 to PyPI + if: github.ref == 'refs/heads/main' needs: - build runs-on: ubuntu-latest @@ -57,6 +82,7 @@ jobs: name: >- Sign the Python 🐍 distribution 📦 with Sigstore and upload them to GitHub Release + if: github.ref == 'refs/heads/main' needs: - publish-to-pypi runs-on: ubuntu-latest diff --git a/README.md b/README.md index 6b855f5..3e0d428 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,27 @@ A package created by the Pardee Lab Computation Group containing a variety of py [Local Colabfold](https://github.com/YoshitakaMo/localcolabfold) -Must be in path such that the user can call colabfold_batch from inside terminal in which you are running the project +(Required to use the [Colabfold Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_prediction/colabfold/)). -[pyrosetta](https://www.pyrosetta.org/) +Must be in path such that the user can call colabfold_batch from inside terminal in which you are running the project. 
-Should be installed into it's own environment as it is not yet supported in python 3.11, and thus we take the conda path as a function argument +[s4pred](https://github.com/psipred/s4pred) +(Required to use the [s4pred Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_prediction/s4pred/)). + +Should be installed such that the consumer can pass the path into prediction functions. + +[Pyrosetta](https://www.pyrosetta.org/) + +(Required to use the [Pyrosetta Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_scoring/rosetta/)). + +Should be installed into its own environment as it is not yet supported in python 3.11, and thus we take the conda path as a function argument. + +[SpServer](https://github.com/structuralbioinformatics/SPServer) + +(Required to use the [SpServer Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_scoring/spserver/)). + +Should be cloned from our [SpServer Python 3 Migration](https://github.com/Pardee-Lab-Computation-Group/SPServer-py3) so the consumer can pass the parent directory path. ## Development model: @@ -20,7 +35,7 @@ Workflows auto release new versions (Which are manually updated in [setup.py](ht - Pushes to [main](https://github.com/Pardee-Lab-Computation-Group/plcg/tree/main) release to pypi. - Pushes to [test-pypi](https://github.com/Pardee-Lab-Computation-Group/plcg/tree/test-pypi) release to test-pypi. -General practice is that we make pr (or push directly) to test-pypi first, pull it into a real project to test, and then pr to main. +General practice is that we make pr (or push directly) to test-pypi first, pull it into a real project to test, and then pr to main. - If you don't want to release, simply don't update version and workflows will fail harmlessly. - In the future we might update to only release when it finds the a change in setup.py version. 
def set_seed(seed: int, cudnn_deterministic: bool = True) -> None:
    """Seed every random number generator this module knows about.

    The same value is handed to PyTorch (CPU generator and all CUDA devices),
    Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Value passed to each generator.
        cudnn_deterministic: When true, additionally force cuDNN onto
            deterministic algorithms and switch its autotuner off. When
            false, the cuDNN flags are left exactly as they were.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    # Per the PyTorch docs this call is silently ignored when CUDA is
    # unavailable, so it needs no guard.
    torch.cuda.manual_seed_all(seed)

    if cudnn_deterministic:
        torch.backends.cudnn.deterministic = True
        # Benchmark mode lets cuDNN time several kernels and keep the
        # fastest one; that choice can differ between runs, so it has to be
        # off (not on) for reproducible results.
        torch.backends.cudnn.benchmark = False
@overload
def calc_secondary_structure(
    s4pred_path: str, *, fasta_path: str, threshold: int | float = ...
) -> list[SecondaryStructure]: ...
def calc_secondary_structure(
    s4pred_path: str, seq=None, fasta_path=None, threshold: int | float = 5
) -> list[SecondaryStructure]:
    """Predict secondary structure with s4pred and summarise each record.

    Exactly one of ``seq`` / ``fasta_path`` is expected. If both are given,
    ``seq`` wins and ``fasta_path`` is ignored.

    Args:
        s4pred_path: Directory that contains s4pred's ``run_model.py``.
        seq: One sequence or a list of sequences. They are written to a
            temporary FASTA file that is deleted again before returning.
        fasta_path: Path of an existing FASTA file; it is never deleted.
        threshold: Minimum per-residue confidence digit for a residue to be
            kept in ``filtered_pred``.

    Returns:
        One ``SecondaryStructure`` per record in the s4pred output.

    Raises:
        TypeError: If neither ``seq`` nor ``fasta_path`` is supplied.
        subprocess.CalledProcessError: If s4pred exits with a non-zero code.
    """
    if seq is None and fasta_path is None:
        raise TypeError(
            "calc_secondary_structure() requires either 'seq' or 'fasta_path'"
        )

    if seq is None:
        output = _run_s4pred(s4pred_path, cast(str, fasta_path))
    else:
        tmp_fasta = _create_temp_fasta(seq)
        try:
            output = _run_s4pred(s4pred_path, tmp_fasta)
        finally:
            # Remove the temporary file even when s4pred fails.
            os.remove(tmp_fasta)

    return _parse_s4pred_output(output, threshold)


def _create_temp_fasta(seq: str | list[str]) -> str:
    """Write ``seq`` to a uniquely named temporary FASTA file; return its path.

    The caller owns the file and is responsible for deleting it.
    """
    import tempfile

    if isinstance(seq, str):
        seq = [seq]

    # A unique name in the system temp directory avoids clobbering a user's
    # ./tmp.fasta and avoids collisions between concurrent calls.
    fd, fasta_path = tempfile.mkstemp(prefix="plcg_s4pred_", suffix=".fasta")
    os.close(fd)
    try:
        # NOTE(review): assumes save_fasta_file overwrites an existing
        # (empty) file at the given path -- confirm against its definition.
        save_fasta_file(seq, fasta_path)
    except BaseException:
        os.remove(fasta_path)
        raise
    return fasta_path


def _run_s4pred(s4pred_path: str, fasta_path: str) -> str:
    """Run ``run_model.py -t horiz`` on ``fasta_path``; return decoded stdout.

    Uses whichever ``python`` is first on PATH. ``check=True`` turns a
    non-zero exit status into ``subprocess.CalledProcessError``.
    """
    command = [
        "python",
        os.path.join(s4pred_path, "run_model.py"),
        "-t",
        "horiz",
        fasta_path,
    ]
    result = subprocess.run(command, capture_output=True, check=True)
    return result.stdout.decode()


def _parse_s4pred_output(
    output: str, threshold: int | float
) -> list[SecondaryStructure]:
    """Split raw s4pred output on ``#`` and summarise every record.

    The text before the first ``#`` is discarded.
    """
    records = output.split("#")[1:]
    return [_process_secondary_structure(record, threshold) for record in records]


def _process_secondary_structure(
    ss_str: str, threshold: int | float
) -> SecondaryStructure:
    """Build a ``SecondaryStructure`` from one record of horiz output.

    Every ``Conf:`` and ``Pred:`` line in the record is collected and
    concatenated. NOTE(review): s4pred's horiz writer appears to wrap long
    sequences into several fixed-width Conf/Pred/AA groups -- confirm
    against the installed s4pred version. For a record with a single group
    this yields the same strings as reading only the first pair.

    Raises:
        ValueError: If the record has no ``Conf:`` or no ``Pred:`` line, or
            if the two strings differ in length.
    """
    conf_chunks: list[str] = []
    pred_chunks: list[str] = []
    for line in ss_str.splitlines():
        stripped = line.strip()
        if stripped.startswith("Conf:"):
            conf_chunks.append(stripped[len("Conf:"):].strip())
        elif stripped.startswith("Pred:"):
            pred_chunks.append(stripped[len("Pred:"):].strip())

    if not conf_chunks or not pred_chunks:
        raise ValueError("s4pred record contains no 'Conf:'/'Pred:' lines")

    conf = "".join(conf_chunks)
    pred = "".join(pred_chunks)

    filtered_pred = _filter_prediction_by_confidence(conf, pred, threshold)
    n_h, n_e, n_c, f_h, f_e, f_c = _count_secondary_structure(filtered_pred)

    return SecondaryStructure(
        pred=pred,
        filtered_pred=filtered_pred,
        threshold=threshold,
        n_helix=n_h,
        n_sheet=n_e,
        n_coil=n_c,
        f_helix=f_h,
        f_sheet=f_e,
        f_coil=f_c,
    )


def _filter_prediction_by_confidence(
    conf: str, pred: str, threshold: int | float
) -> str:
    """Keep the characters of ``pred`` whose confidence digit is >= threshold.

    ``conf`` and ``pred`` are aligned position by position; ``strict=True``
    raises ``ValueError`` if their lengths differ instead of misaligning.
    """
    return "".join(
        state
        for digit, state in zip(conf, pred, strict=True)
        if int(digit) >= threshold
    )


def _count_secondary_structure(
    filtered_pred: str,
) -> tuple[int, int, int, float, float, float]:
    """Count H / E / C characters and their fractions of ``filtered_pred``.

    Returns:
        ``(n_helix, n_sheet, n_coil, f_helix, f_sheet, f_coil)``. All three
        fractions are ``0.0`` when ``filtered_pred`` is empty.
    """
    n_h = filtered_pred.count("H")
    n_e = filtered_pred.count("E")
    n_c = filtered_pred.count("C")
    total = len(filtered_pred)
    if total == 0:
        # Float zeros keep the fraction fields float-typed for empty input.
        return n_h, n_e, n_c, 0.0, 0.0, 0.0
    return n_h, n_e, n_c, n_h / total, n_e / total, n_c / total