26 changes: 26 additions & 0 deletions .github/workflows/release-to-pypi.yml
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - main
      - test-pypi
    paths-ignore:
      - "README.md"

@@ -32,9 +33,33 @@ jobs:
          name: python-package-distributions
          path: dist/

  publish-to-testpypi:
    name: Publish Python 🐍 distribution 📦 to TestPyPI
    if: github.ref == 'refs/heads/test-pypi'
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: testpypi
      url: https://test.pypi.org/p/plcg
    permissions:
      id-token: write

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/

  publish-to-pypi:
    name: >-
      Publish Python 🐍 distribution 📦 to PyPI
    if: github.ref == 'refs/heads/main'
    needs:
      - build
    runs-on: ubuntu-latest
@@ -57,6 +82,7 @@ jobs:
    name: >-
      Sign the Python 🐍 distribution 📦 with Sigstore
      and upload them to GitHub Release
    if: github.ref == 'refs/heads/main'
    needs:
      - publish-to-pypi
    runs-on: ubuntu-latest
23 changes: 19 additions & 4 deletions README.md
@@ -6,12 +6,27 @@ A package created by the Pardee Lab Computation Group containing a variety of py

[Local Colabfold](https://github.com/YoshitakaMo/localcolabfold)

Must be on the PATH so that the user can call colabfold_batch from the terminal in which the project is run
(Required to use the [Colabfold Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_prediction/colabfold/)).

[pyrosetta](https://www.pyrosetta.org/)
Must be on the PATH so that the user can call colabfold_batch from the terminal in which the project is run.

Should be installed into its own environment, as it is not yet supported in Python 3.11; thus we take the conda path as a function argument
[s4pred](https://github.com/psipred/s4pred)

(Required to use [s4pred Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_prediction/s4pred/)).

Should be installed such that the consumer can pass the path into prediction functions.

[Pyrosetta](https://www.pyrosetta.org/)

(Required to use the [Pyrosetta Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_scoring/rosetta/)).

Should be installed into its own environment, as it is not yet supported in Python 3.11; thus we take the conda path as a function argument.

[SpServer](https://github.com/structuralbioinformatics/SPServer)

(Required to use the [SpServer Package](https://github.com/Pardee-Lab-Computation-Group/plcg/blob/main/src/plcg/structure_scoring/spserver/)).

Should be cloned from our [SpServer Python 3 Migration](https://github.com/Pardee-Lab-Computation-Group/SPServer-py3) so the consumer can pass the parent directory path.

## Development model:

@@ -20,7 +35,7 @@ Workflows auto release new versions (Which are manually updated in [setup.py](ht
- Pushes to [main](https://github.com/Pardee-Lab-Computation-Group/plcg/tree/main) release to PyPI.
- Pushes to [test-pypi](https://github.com/Pardee-Lab-Computation-Group/plcg/tree/test-pypi) release to TestPyPI.

General practice is to make a PR (or push directly) to test-pypi first, pull the release into a real project to test it, and then PR to main.

- If you don't want to release, simply don't update the version; the workflows will fail harmlessly.
- In the future we might update the workflows to release only when they detect a change in the setup.py version.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
pandas==2.2.0
numpy==1.26.3
python-Levenshtein==0.23.0
torch==2.2.0
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name="plcg",
version="1.0.3",
version="1.1.0",
author="Spencer Perkins",
author_email="spencer.perkins44sp@gmail.com",
description="Computational biology and machine learning utilities for the Pardee Lab Computation Group",
15 changes: 15 additions & 0 deletions src/plcg/determinism/seed.py
@@ -0,0 +1,15 @@
import random

import torch
import numpy as np


def set_seed(seed: int, cudnn_deterministic: bool = True) -> None:
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    if cudnn_deterministic:
        torch.backends.cudnn.deterministic = True
        # benchmark must be off for deterministic kernel selection
        torch.backends.cudnn.benchmark = False
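A stdlib-only sketch of the reproducibility guarantee `set_seed` provides: reseeding a generator makes subsequent draws repeat exactly. The same idea extends to the `torch` and `numpy` generators seeded above (`set_seed_stdlib` is a hypothetical name used only for this illustration).

```python
import random


def set_seed_stdlib(seed: int) -> None:
    # Stdlib-only analogue of set_seed: reseeding the global
    # generator makes subsequent draws reproducible.
    random.seed(seed)


set_seed_stdlib(42)
first = [random.random() for _ in range(3)]
set_seed_stdlib(42)
second = [random.random() for _ in range(3)]
assert first == second  # identical seeds give identical draws
```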
109 changes: 109 additions & 0 deletions src/plcg/structure_prediction/s4pred/s4pred.py
@@ -0,0 +1,109 @@
import subprocess
import os
from typing import NamedTuple, cast, overload

from plcg.structure_prediction.fasta.fasta import save_fasta_file


class SecondaryStructure(NamedTuple):
    pred: str
    filtered_pred: str
    threshold: int | float
    n_helix: int
    n_sheet: int
    n_coil: int
    f_helix: float
    f_sheet: float
    f_coil: float


@overload
def calc_secondary_structure(
    s4pred_path: str, seq: str | list[str], *, threshold: int | float = ...
) -> list[SecondaryStructure]: ...
@overload
def calc_secondary_structure(
    s4pred_path: str, *, fasta_path: str, threshold: int | float = ...
) -> list[SecondaryStructure]: ...
def calc_secondary_structure(
    s4pred_path: str, seq=None, fasta_path=None, threshold: int | float = 5
) -> list[SecondaryStructure]:
    if seq is not None:
        fasta_path = _create_temp_fasta(seq)
    fasta_path = cast(str, fasta_path)

    output = _run_s4pred(s4pred_path, fasta_path)

    if seq is not None:
        os.remove(fasta_path)

    secondary_structures = _parse_s4pred_output(output, threshold)
    return secondary_structures


def _create_temp_fasta(seq: str | list[str]) -> str:
    fasta_path = "./tmp.fasta"
    if isinstance(seq, str):
        seq = [seq]
    save_fasta_file(seq, fasta_path)
    return fasta_path


def _run_s4pred(s4pred_path: str, fasta_path: str) -> str:
    result = subprocess.run(
        [
            "python",
            f"{s4pred_path}/run_model.py",
            "-t",
            "horiz",
            fasta_path,
        ],
        capture_output=True,
        check=True,
    )
    return result.stdout.decode()


def _parse_s4pred_output(
    output: str, threshold: int | float
) -> list[SecondaryStructure]:
    ss_data = output.split("#")[1:]
    return list(
        map(lambda ss_str: _process_secondary_structure(ss_str, threshold), ss_data)
    )


def _process_secondary_structure(ss_str: str, threshold: float) -> SecondaryStructure:
    res = ss_str.split("\n")
    conf = res[2][res[2].rfind(" ") + 1 :]
    pred = res[3][res[3].rfind(" ") + 1 :]

    filtered_pred = _filter_prediction_by_confidence(conf, pred, threshold)
    n_h, n_e, n_c, f_h, f_e, f_c = _count_secondary_structure(filtered_pred)

    return SecondaryStructure(
        pred=pred,
        filtered_pred=filtered_pred,
        threshold=threshold,
        n_helix=n_h,
        n_sheet=n_e,
        n_coil=n_c,
        f_helix=f_h,
        f_sheet=f_e,
        f_coil=f_c,
    )


def _filter_prediction_by_confidence(conf: str, pred: str, threshold: float) -> str:
    return "".join(pred[i] for i, c in enumerate(conf) if int(c) >= threshold)


def _count_secondary_structure(filtered_pred: str):
    n_h = filtered_pred.count("H")
    n_e = filtered_pred.count("E")
    n_c = filtered_pred.count("C")
    n = len(filtered_pred)
    f_h = n_h / n if n > 0 else 0
    f_e = n_e / n if n > 0 else 0
    f_c = n_c / n if n > 0 else 0
    return n_h, n_e, n_c, f_h, f_e, f_c
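To make the confidence-filtering step concrete, here is a worked sketch on a hypothetical confidence/prediction pair (the strings are made up, not real s4pred output; the expressions mirror the logic of `_filter_prediction_by_confidence` and `_count_secondary_structure`):

```python
# Hypothetical horiz-style confidence digits (0-9) and prediction string
conf = "9731886"
pred = "HHHCCEE"
threshold = 5

# Keep only residues predicted with confidence >= threshold:
# digits 9,7,8,8,6 pass, so positions 0,1,4,5,6 survive
filtered = "".join(pred[i] for i, c in enumerate(conf) if int(c) >= threshold)
# filtered == "HHCEE"

# Fractions are computed over the filtered residues only
n = len(filtered)
f_helix = filtered.count("H") / n if n > 0 else 0  # 2/5 = 0.4
f_sheet = filtered.count("E") / n if n > 0 else 0  # 2/5 = 0.4
f_coil = filtered.count("C") / n if n > 0 else 0   # 1/5 = 0.2
```

Note that the fractions are taken over the confidently predicted residues, not the full sequence, so low-confidence stretches do not dilute them.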