Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Python 3.8
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Setup Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.10"

- name: Run black
uses: psf/black@stable
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x

Expand All @@ -25,4 +25,4 @@ jobs:
fc-match Montserrat

- run: pip install ".[docs]"
- run: mkdocs gh-deploy --force
- run: mkdocs gh-deploy --force
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install dependencies
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ jobs:
os: [ubuntu-latest, windows-latest, macos-latest]

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.10"

- name: Install dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags
rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
1 change: 1 addition & 0 deletions gopher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""See the README for detailed documentation and examples."""

try:
from importlib.metadata import PackageNotFoundError, version

Expand Down
1 change: 1 addition & 0 deletions gopher/annotations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Get GO annotations."""

import uuid
from pathlib import Path

Expand Down
1 change: 1 addition & 0 deletions gopher/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This module contains the configuration details for ppx"""

import logging
import os
from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions gopher/enrichment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Calculate the enrichments for a collection of experiments."""

import logging

import numpy as np
Expand Down
1 change: 1 addition & 0 deletions gopher/gopher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""The command line entry point for gopher-enrich"""

import logging
from argparse import ArgumentParser

Expand Down
1 change: 1 addition & 0 deletions gopher/ontologies.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Download the GO ontologies"""

from collections import defaultdict

from . import config, utils
Expand Down
1 change: 1 addition & 0 deletions gopher/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""The parsers"""

from .tabular import read_encyclopedia, read_metamorpheus, read_diann
73 changes: 43 additions & 30 deletions gopher/parsers/tabular.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""Parse tabular result files from common tools"""

import os
import io
import pandas as pd
import numpy as np
from cloudpathlib import AnyPath


def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
Expand Down Expand Up @@ -54,10 +58,19 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
)
return proteins

def read_diann(proteins_tsv: str) -> pd.DataFrame:

def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]:
with open(AnyPath(file)) as f:
firstcol = f.readline()

return firstcol.strip().split("\t")


def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame:
"""
Reads a DIANN-generated TSV file containing protein information, processes
it, and returns a cleaned Pandas DataFrame with relevant data.
Reads a DIANN-generated TSV file (pg_matrix) containing protein information.

Also processes it, and returns a cleaned Pandas DataFrame with relevant data.

The function:
- Extracts the first protein accession from the "Protein.Ids" column to use
Expand All @@ -73,40 +86,40 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
'Protein.Names',
'Genes',
'First.Protein.Description',
<several MSR columns>
<several Intensity columns>


Returns:
pd.DataFrame: A DataFrame with the processed protein data, indexed by
the first protein accession.
The returned DataFrame has the "Protein.Ids" column as the
index and all columns are the MSR columns.
The returned DataFrame has the "Protein.Ids" column as the
index and all columns are the MSR columns.
"""
proteins = pd.read_table(proteins_tsv)
accessions = proteins["Protein.Ids"].str.split(";").str[0]

proteins = proteins.set_index(accessions)
proteins = proteins.rename_axis("Protein", axis="index")
proteins = proteins.drop(
columns=[
"Protein.Group",
"Protein.Ids",
"Protein.Names",
"Genes",
"First.Protein.Description",
]
columns = _read_colnames(proteins_tsv)

expect = [
"Protein.Group",
"Protein.Ids",
"Protein.Names",
"Genes",
"First.Protein.Description",
]

if not all(c in columns for c in expect):
Comment thread
ltatka marked this conversation as resolved.
msg = f"Expected columns {expect}, got {columns}, make sure you are"
msg += " using the 'diann_report.pg_matrix.tsv' output."
raise ValueError(msg)

schema: dict[str, type] = {k: float for k in columns if k not in expect}
schema["Protein.Ids"] = str

proteins = pd.read_table(
AnyPath(proteins_tsv), dtype=schema, usecols=list(schema)
)
proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0]
Copy link
Copy Markdown
Member

@ltatka ltatka Mar 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current version of gopher requires that the index column be the accessions and that all columns are intensity columns. If you don't drop "Genes", "Protein.Group" etc, downstream functions such as test_enrichment will not work (unless you are also planning on changing those).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are not being read, so they don't have to be dropped.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the returned data frame will have them, right? And then test_enrichment will throw an error, unless you've also modified that.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not; the test checks (and proves) that this is not the case.


proteins = proteins.set_index("Protein.Ids", drop=True)
proteins = proteins.rename_axis("Protein", axis="index")

# Check data types
# (if loading from S3, default types are 'O'
if proteins.index.dtype not in ["O", "category", "str"]:
raise ValueError(
f"Protein index is incorrect type: {proteins.index.dtype}"
)
if not all(
np.issubdtype(dtype, np.floating) or dtype == "O"
for dtype in proteins.dtypes
):
raise ValueError("Non-numeric columns present")

return proteins
1 change: 1 addition & 0 deletions gopher/stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Numba Mann-Whitney U test"""

import numba as nb
import numpy as np
from scipy import stats
Expand Down
1 change: 1 addition & 0 deletions gopher/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility functions"""

import socket
from pathlib import Path

Expand Down
28 changes: 25 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,27 @@ classifiers = [
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
requires-python = ">=3.6"

requires-python = ">=3.10"
dependencies = [
"numpy > 2.0, < 3.0",
"pandas > 2.0, < 3.0",
"scipy",
"tqdm",
"statsmodels",
"biopython", # ... we can implement a fasta parser ...
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment: Indeed we could implement a FASTA parser, it just hasn't seemed worth it with how little use Gopher has gotten in recent times. Maybe worth it now though.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fair enough ... although IDK ... biopython is a really heavy dependency and a fasta parser is like 20 lines :P so I usually go with the re-implementing if I am not already using pyteomics. I guess that since this project already depends on scipy+numpy+pandas+matplotlib+statsmodels ... bio is not a big deal

"loguru",
"numba",
"requests",
"seaborn",
"matplotlib",
"cloudpathlib",
]
dynamic = ["version"]

[project.scripts]
gopher = "gopher.gopher:main"


[project.readme]
file = "README.md"
content-type = "text/markdown"
Expand All @@ -40,6 +57,11 @@ docs = [
dev = [
"pre-commit>=2.7.1",
"black>=19.10b0",
"pytest",
]
s3 = [
"cloudpathlib[s3]",
"boto3",
]

[tool.setuptools]
Expand All @@ -52,7 +74,7 @@ find = {namespaces = false}

[tool.black]
line-length = 79
target-version = ['py37']
target-version = ['py310']
include = '\.pyi?$'
exclude = '''

Expand Down
49 changes: 0 additions & 49 deletions setup.cfg

This file was deleted.

4 changes: 0 additions & 4 deletions setup.py

This file was deleted.

1 change: 1 addition & 0 deletions tests/unit_tests/annotations_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test that the annotations functions are working correctly"""

import re

import pandas as pd
Expand Down
1 change: 1 addition & 0 deletions tests/unit_tests/enrichment_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test that the enrichment functions are working correctly"""

import random

import numpy as np
Expand Down
6 changes: 4 additions & 2 deletions tests/unit_tests/normalize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@

from gopher import normalize

CURRPATH = Path(__file__).parent


@pytest.fixture
def real_data(tmp_path):
"""Test using small files."""
fasta_df = Path("../data/small-yeast.fasta")
quant = pd.read_csv("../data/yeast_small.csv")
fasta_df = CURRPATH / "../data/small-yeast.fasta"
quant = pd.read_csv(CURRPATH / "../data/yeast_small.csv")
quant = quant.set_index("Protein")

return quant, fasta_df
Expand Down
Loading