Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Python 3.8
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Setup Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.10"

- name: Run black
uses: psf/black@stable
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x

Expand All @@ -25,4 +25,4 @@ jobs:
fc-match Montserrat

- run: pip install ".[docs]"
- run: mkdocs gh-deploy --force
- run: mkdocs gh-deploy --force
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install dependencies
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ jobs:
os: [ubuntu-latest, windows-latest, macos-latest]

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.10"

- name: Install dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags
rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
1 change: 1 addition & 0 deletions gopher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""See the README for detailed documentation and examples."""

try:
from importlib.metadata import PackageNotFoundError, version

Expand Down
1 change: 1 addition & 0 deletions gopher/annotations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Get GO annotations."""

import uuid
from pathlib import Path

Expand Down
1 change: 1 addition & 0 deletions gopher/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This module contains the configuration details for ppx"""

import logging
import os
from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions gopher/enrichment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Calculate the enrichments for a collection of experiments."""

import logging

import numpy as np
Expand Down
1 change: 1 addition & 0 deletions gopher/gopher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""The command line entry point for gopher-enrich"""

import logging
from argparse import ArgumentParser

Expand Down
1 change: 1 addition & 0 deletions gopher/ontologies.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Download the GO ontologies"""

from collections import defaultdict

from . import config, utils
Expand Down
1 change: 1 addition & 0 deletions gopher/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""The parsers"""

from .tabular import read_encyclopedia, read_metamorpheus, read_diann
73 changes: 43 additions & 30 deletions gopher/parsers/tabular.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""Parse tabular result files from common tools"""

import os
import io
import pandas as pd
import numpy as np
from cloudpathlib import AnyPath


def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
Expand Down Expand Up @@ -54,10 +58,19 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
)
return proteins

def read_diann(proteins_tsv: str) -> pd.DataFrame:

def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]:
with open(AnyPath(file)) as f:
firstcol = f.readline()

return firstcol.strip().split("\t")


def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame:
"""
Reads a DIANN-generated TSV file containing protein information, processes
it, and returns a cleaned Pandas DataFrame with relevant data.
Reads a DIANN-generated TSV file (pg_matrix) containing protein information.

Also processes it, and returns a cleaned Pandas DataFrame with relevant data.

The function:
- Extracts the first protein accession from the "Protein.Ids" column to use
Expand All @@ -73,40 +86,40 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
'Protein.Names',
'Genes',
'First.Protein.Description',
<several MSR columns>
<several Intensity columns>


Returns:
pd.DataFrame: A DataFrame with the processed protein data, indexed by
the first protein accession.
The returned DataFrame has the "Protein.Ids" column as the
index and all columns are the MSR columns.
The returned DataFrame has the "Protein.Ids" column as the
index and all columns are the MSR columns.
"""
proteins = pd.read_table(proteins_tsv)
accessions = proteins["Protein.Ids"].str.split(";").str[0]

proteins = proteins.set_index(accessions)
proteins = proteins.rename_axis("Protein", axis="index")
proteins = proteins.drop(
columns=[
"Protein.Group",
"Protein.Ids",
"Protein.Names",
"Genes",
"First.Protein.Description",
]
columns = _read_colnames(proteins_tsv)

expect = [
"Protein.Group",
"Protein.Ids",
"Protein.Names",
"Genes",
"First.Protein.Description",
]

if not all(c in columns for c in expect):
Comment thread
ltatka marked this conversation as resolved.
msg = f"Expected columns {expect}, got {columns}, make sure you are"
msg += " using the 'diann_report.pg_matrix.tsv' output."
raise ValueError(msg)

schema: dict[str, type] = {k: float for k in columns if k not in expect}
schema["Protein.Ids"] = str

proteins = pd.read_table(
AnyPath(proteins_tsv), dtype=schema, usecols=list(schema)
)
proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0]
Copy link
Copy Markdown
Member

@ltatka ltatka Mar 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current version of gopher requires that the index column be the accessions and that all columns are intensity columns. If you don't drop "Genes", "Protein.Group" etc, downstream functions such as test_enrichment will not work (unless you are also planning on changing those).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are not being read, so they don't have to be dropped.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the returned data frame will have them, right? And then test_enrichment will throw an error, unless you've also modified that.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not; the test checks (and proves) that this is not the case.


proteins = proteins.set_index("Protein.Ids", drop=True)
proteins = proteins.rename_axis("Protein", axis="index")

# Check data types
# (if loading from S3, default types are 'O'
if proteins.index.dtype not in ["O", "category", "str"]:
raise ValueError(
f"Protein index is incorrect type: {proteins.index.dtype}"
)
if not all(
np.issubdtype(dtype, np.floating) or dtype == "O"
for dtype in proteins.dtypes
):
raise ValueError("Non-numeric columns present")

return proteins
1 change: 1 addition & 0 deletions gopher/stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Numba Mann-Whitney U test"""

import numba as nb
import numpy as np
from scipy import stats
Expand Down
1 change: 1 addition & 0 deletions gopher/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility functions"""

import socket
from pathlib import Path

Expand Down
28 changes: 25 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,27 @@ classifiers = [
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
requires-python = ">=3.6"

requires-python = ">=3.10"
dependencies = [
"numpy > 2.0, < 3.0",
"pandas > 2.0, < 3.0",
"scipy",
"tqdm",
"statsmodels",
"biopython", # ... we can implement a fasta parser ...
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment: Indeed we could implement a FASTA parser, it just hasn't seemed worth it with how little use Gopher has gotten in recent times. Maybe worth it now though.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fair enough ... although IDK ... biopython is a really heavy dependency and a fasta parser is like 20 lines :P so I usually go with the re-implementing if I am not already using pyteomics. I guess that since this project already depends on scipy+numpy+pandas+matplotlib+statsmodels ... bio is not a big deal

"loguru",
"numba",
"requests",
"seaborn",
"matplotlib",
"cloudpathlib",
]
dynamic = ["version"]

[project.scripts]
gopher = "gopher.gopher:main"


[project.readme]
file = "README.md"
content-type = "text/markdown"
Expand All @@ -40,6 +57,11 @@ docs = [
dev = [
"pre-commit>=2.7.1",
"black>=19.10b0",
"pytest",
]
s3 = [
"cloudpathlib[s3]",
"boto3",
]

[tool.setuptools]
Expand All @@ -52,7 +74,7 @@ find = {namespaces = false}

[tool.black]
line-length = 79
target-version = ['py37']
target-version = ['py310']
include = '\.pyi?$'
exclude = '''

Expand Down
49 changes: 0 additions & 49 deletions setup.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions setup.py

This file was deleted.

1 change: 1 addition & 0 deletions tests/unit_tests/annotations_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test that the annotations functions are working correctly"""

import re

import pandas as pd
Expand Down
1 change: 1 addition & 0 deletions tests/unit_tests/enrichment_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test that the enrichment functions are working correctly"""

import random

import numpy as np
Expand Down
6 changes: 4 additions & 2 deletions tests/unit_tests/normalize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@

from gopher import normalize

CURRPATH = Path(__file__).parent


@pytest.fixture
def real_data(tmp_path):
"""Test using small files."""
fasta_df = Path("../data/small-yeast.fasta")
quant = pd.read_csv("../data/yeast_small.csv")
fasta_df = CURRPATH / "../data/small-yeast.fasta"
quant = pd.read_csv(CURRPATH / "../data/yeast_small.csv")
quant = quant.set_index("Protein")

return quant, fasta_df
Expand Down
Loading