From 8c173727366e2afc2a7f85ace2f14c181dcc3125 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 09:51:16 -0700 Subject: [PATCH 1/6] feat: efficient data read and pyproject cleanup --- gopher/parsers/tabular.py | 73 ++++++++++++++++++------------ pyproject.toml | 18 +++++++- tests/unit_tests/normalize_test.py | 6 ++- tests/unit_tests/tabular_test.py | 26 +++++++++-- 4 files changed, 85 insertions(+), 38 deletions(-) diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index 7aae459..c66f231 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -1,4 +1,6 @@ """Parse tabular result files from common tools""" +import os +import io import pandas as pd import numpy as np @@ -54,10 +56,23 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame: ) return proteins -def read_diann(proteins_tsv: str) -> pd.DataFrame: + +def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]: + if isinstance(file, io.TextIOBase): + firstcol = file.readline() + file.seek(0) + else: + with open(file) as f: + firstcol = f.readline() + + return firstcol.strip().split("\t") + + +def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame: """ - Reads a DIANN-generated TSV file containing protein information, processes - it, and returns a cleaned Pandas DataFrame with relevant data. + Reads a DIANN-generated TSV file (pg_matrix) containing protein information. + + Also processes it, and returns a cleaned Pandas DataFrame with relevant data. The function: - Extracts the first protein accession from the "Protein.Ids" column to use @@ -73,40 +88,38 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: 'Protein.Names', 'Genes', 'First.Protein.Description', - + Returns: pd.DataFrame: A DataFrame with the processed protein data, indexed by the first protein accession. - The returned DataFrame has the "Protein.Ids" column as the - index and all columns are the MSR columns. + The returned DataFrame has the "Protein.Ids" column as the + index and all columns are the MSR columns. """ - proteins = pd.read_table(proteins_tsv) - accessions = proteins["Protein.Ids"].str.split(";").str[0] - proteins = proteins.set_index(accessions) + columns = _read_colnames(proteins_tsv) + + expect = [ + "Protein.Group", + "Protein.Ids", + "Protein.Names", + "Genes", + "First.Protein.Description", + ] + + if not all(c in columns for c in expect): + msg = f"Expected columns {expect}, got {columns}, make sure you are" + msg += " using the 'diann_report.pg_matrix.tsv' output." + raise ValueError(msg) + + schema: dict[str, type] = {k: float for k in columns if k not in expect} + schema["Protein.Ids"] = str + + proteins = pd.read_table(proteins_tsv, dtype=schema, usecols=list(schema)) + proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0] + + proteins = proteins.set_index("Protein.Ids", drop=True) proteins = proteins.rename_axis("Protein", axis="index") - proteins = proteins.drop( - columns=[ - "Protein.Group", - "Protein.Ids", - "Protein.Names", - "Genes", - "First.Protein.Description", - ] - ) - # Check data types - # (if loading from S3, default types are 'O' - if proteins.index.dtype not in ["O", "category", "str"]: - raise ValueError( - f"Protein index is incorrect type: {proteins.index.dtype}" - ) - if not all( - np.issubdtype(dtype, np.floating) or dtype == "O" - for dtype in proteins.dtypes - ): - raise ValueError("Non-numeric columns present") - return proteins diff --git a/pyproject.toml b/pyproject.toml index d304b40..38eb9eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,20 @@ classifiers = [ "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Bio-Informatics", ] -requires-python = ">=3.6" +requires-python = ">=3.10" +dependencies = [ + "numpy > 2.0, < 3.0", + "pandas > 2.0, < 3.0", + "scipy", + "tqdm", + "statsmodels", + "biopython", # ... we can implement a fasta parser ... + "loguru", + "numba", + "requests", + "seaborn", + "matplotlib", +] dynamic = ["version"] @@ -40,6 +53,7 @@ docs = [ dev = [ "pre-commit>=2.7.1", "black>=19.10b0", + "pytest", ] [tool.setuptools] @@ -52,7 +66,7 @@ find = {namespaces = false} [tool.black] line-length = 79 -target-version = ['py37'] +target-version = ['py310'] include = '\.pyi?$' exclude = ''' diff --git a/tests/unit_tests/normalize_test.py b/tests/unit_tests/normalize_test.py index 88e5e97..e896a09 100644 --- a/tests/unit_tests/normalize_test.py +++ b/tests/unit_tests/normalize_test.py @@ -6,12 +6,14 @@ from gopher import normalize +CURRPATH = Path(__file__).parent + @pytest.fixture def real_data(tmp_path): """Test using small files.""" - fasta_df = Path("../data/small-yeast.fasta") - quant = pd.read_csv("../data/yeast_small.csv") + fasta_df = CURRPATH / "../data/small-yeast.fasta" + quant = pd.read_csv(CURRPATH / "../data/yeast_small.csv") quant = quant.set_index("Protein") return quant, fasta_df diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py index 954b5db..e084495 100644 --- a/tests/unit_tests/tabular_test.py +++ b/tests/unit_tests/tabular_test.py @@ -1,9 +1,11 @@ import pandas as pd +import pytest from io import StringIO from pandas.testing import assert_frame_equal from gopher.parsers.tabular import read_diann + def test_read_diann_removes_metadata_and_sets_index(): # Simulated DIANN output mock_data = StringIO( @@ -16,12 +18,28 @@ def test_read_diann_removes_metadata_and_sets_index(): # Expected DataFrame expected = pd.DataFrame( { - "Intensity.Sample1": [1000, 1500], - "Intensity.Sample2": [2000, 2500], - }, - index=["P12345", "P23456"] + # The real diann data has float values in the intensities. + "Intensity.Sample1": [1000.0, 1500.0], + "Intensity.Sample2": [2000.0, 2500.0], + }, + index=["P12345", "P23456"], ) expected.index.name = "Protein" result = read_diann(mock_data) assert_frame_equal(result, expected) + + +def test_read_diann_faile_with_gg(): + # Simulated DIANN output + mock_data = StringIO( + """Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2 +GENE1\tDescription A\t1000\t2000 +GENE2\tDescription B\t1500\t2500 +""" + ) + + with pytest.raises(ValueError) as e: + result = read_diann(mock_data) + + assert "Expected columns" in str(e.value.args[0]) From 3c4a0dc6ab174b33ee705b123b3d2b8a9b278364 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 09:58:57 -0700 Subject: [PATCH 2/6] chore: deleted redundant config --- pyproject.toml | 7 ++++++- setup.cfg | 49 ------------------------------------------------- setup.py | 4 ---- 3 files changed, 6 insertions(+), 54 deletions(-) delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml index 38eb9eb..27d95e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,9 +27,12 @@ dependencies = [ "seaborn", "matplotlib", ] - dynamic = ["version"] +[project.scripts] +gopher = "gopher.gopher:main" + + [project.readme] file = "README.md" content-type = "text/markdown" @@ -56,6 +59,8 @@ dev = [ "pytest", ] + + [tool.setuptools] include-package-data = false diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 95d723c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,49 +0,0 @@ -[metadata] -name = gopher-enrich -author = William E Fondrie -author_email = fondriew@gmail.com -description = Gene ontology enrichment analysis using protein expression. -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/TalusBio/gopher -project_urls = - Documentation = https://TalusBio.github.io/gopher - Bug Tracker = https://github.com/TalusBio/gopher/issues - Discussion Board = https://github.com/TalusBio/gopher/discussions -license = Apache 2.0 -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved - Operating System :: OS Independent - Topic :: Scientific/Engineering :: Bio-Informatics - -[options] -packages = find: -python_requires = >=3.6 -install_requires = - numpy - pandas - scipy>=1.7.1 - statsmodels - requests - numba - seaborn - biopython - tqdm - loguru - -[options.extras_require] -docs = - numpydoc>=1.0.0 - sphinx-argparse>=0.2.5 - pydata-sphinx-theme>=0.4.3 - nbsphinx>=0.7.1 - ipykernel>=5.3.0 - recommonmark>=0.5.0 -dev = - pre-commit>=2.7.1 - black>=19.10b0 - -[options.entry_points] -console_scripts = - gopher = gopher.gopher:main diff --git a/setup.py b/setup.py deleted file mode 100644 index 10d9469..0000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Setup ppx""" -import setuptools - -setuptools.setup() From b1c455f409e76c351e07ab915879a28c96b5e41d Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 10:22:44 -0700 Subject: [PATCH 3/6] chore: updated pre-commit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d602a2..4923566 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags + rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags hooks: - id: black language_version: python3 # Should be a command that runs python3.6+ From c31ee0b378d27f6504b1ac189eea60dd7c1bc4fb Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 10:27:26 -0700 Subject: [PATCH 4/6] chore: updated gh actions --- .github/workflows/black.yml | 8 ++++---- .github/workflows/docs.yml | 6 +++--- .github/workflows/publish.yml | 4 ++-- .github/workflows/tests.yml | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 96272c8..9ad20f3 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,11 +6,11 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Setup Python 3.8 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Setup Python 3.10 + uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" - name: Run black uses: psf/black@stable diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 665e35a..c9976cb 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -10,8 +10,8 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: 3.x @@ -25,4 +25,4 @@ jobs: fc-match Montserrat - run: pip install ".[docs]" - - run: mkdocs gh-deploy --force \ No newline at end of file + - run: mkdocs gh-deploy --force diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index dea50c1..29fb969 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,9 +12,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f1a6cf8..ab11422 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,11 +19,11 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" - name: Install dependencies run: | From 822c4321813d5a3379602633b57829f9b887087c Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 10:32:03 -0700 Subject: [PATCH 5/6] chore: black --- gopher/__init__.py | 1 + gopher/annotations.py | 1 + gopher/config.py | 1 + gopher/enrichment.py | 1 + gopher/gopher.py | 1 + gopher/ontologies.py | 1 + gopher/parsers/__init__.py | 1 + gopher/parsers/tabular.py | 1 + gopher/stats.py | 1 + gopher/utils.py | 1 + tests/unit_tests/annotations_test.py | 1 + tests/unit_tests/enrichment_test.py | 1 + tests/unit_tests/test_version.py | 1 + 13 files changed, 13 insertions(+) diff --git a/gopher/__init__.py b/gopher/__init__.py index 40f0410..cd134cb 100644 --- a/gopher/__init__.py +++ b/gopher/__init__.py @@ -1,4 +1,5 @@ """See the README for detailed documentation and examples.""" + try: from importlib.metadata import PackageNotFoundError, version diff --git a/gopher/annotations.py b/gopher/annotations.py index 53a7d2e..d490cbb 100644 --- a/gopher/annotations.py +++ b/gopher/annotations.py @@ -1,4 +1,5 @@ """Get GO annotations.""" + import uuid from pathlib import Path diff --git a/gopher/config.py b/gopher/config.py index d011d11..1e97c9b 100644 --- a/gopher/config.py +++ b/gopher/config.py @@ -1,4 +1,5 @@ """This module contains the configuration details for ppx""" + import logging import os from pathlib import Path diff --git a/gopher/enrichment.py b/gopher/enrichment.py index 07358af..4d50f80 100644 --- a/gopher/enrichment.py +++ b/gopher/enrichment.py @@ -1,4 +1,5 @@ """Calculate the enrichments for a collection of experiments.""" + import logging import numpy as np diff --git a/gopher/gopher.py b/gopher/gopher.py index 0102c47..2f3cba0 100644 --- a/gopher/gopher.py +++ b/gopher/gopher.py @@ -1,4 +1,5 @@ """The command line entry point for gopher-enrich""" + import logging from argparse import ArgumentParser diff --git a/gopher/ontologies.py b/gopher/ontologies.py index 546c7c6..726ae1b 100644 --- a/gopher/ontologies.py +++ b/gopher/ontologies.py @@ -1,4 +1,5 @@ """Download the GO ontologies""" + from collections import defaultdict from . import config, utils diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py index bef63f5..bb1a80e 100644 --- a/gopher/parsers/__init__.py +++ b/gopher/parsers/__init__.py @@ -1,2 +1,3 @@ """The parsers""" + from .tabular import read_encyclopedia, read_metamorpheus, read_diann diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index c66f231..650b382 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -1,4 +1,5 @@ """Parse tabular result files from common tools""" + import os import io import pandas as pd diff --git a/gopher/stats.py b/gopher/stats.py index b760ac2..156c6b0 100644 --- a/gopher/stats.py +++ b/gopher/stats.py @@ -1,4 +1,5 @@ """Numba Mann-Whitney U test""" + import numba as nb import numpy as np from scipy import stats diff --git a/gopher/utils.py b/gopher/utils.py index 2f8126f..d8ec817 100644 --- a/gopher/utils.py +++ b/gopher/utils.py @@ -1,4 +1,5 @@ """Utility functions""" + import socket from pathlib import Path diff --git a/tests/unit_tests/annotations_test.py b/tests/unit_tests/annotations_test.py index 9e22a36..908af22 100644 --- a/tests/unit_tests/annotations_test.py +++ b/tests/unit_tests/annotations_test.py @@ -1,4 +1,5 @@ """Test that the annotations functions are working correctly""" + import re import pandas as pd diff --git a/tests/unit_tests/enrichment_test.py b/tests/unit_tests/enrichment_test.py index dacd3d5..a6dda4b 100644 --- a/tests/unit_tests/enrichment_test.py +++ b/tests/unit_tests/enrichment_test.py @@ -1,4 +1,5 @@ """Test that the enrichment functions are working correctly""" + import random import numpy as np diff --git a/tests/unit_tests/test_version.py b/tests/unit_tests/test_version.py index a6045b7..784c95e 100644 --- a/tests/unit_tests/test_version.py +++ b/tests/unit_tests/test_version.py @@ -1,4 +1,5 @@ """Test that setuptools-scm is working correctly""" + import gopher From 3cf875a50fdb1b8ce90452b00e0053b36598dd35 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Thu, 27 Mar 2025 12:06:09 -0700 Subject: [PATCH 6/6] feat: added explicit tests for s3 on diann data --- gopher/parsers/tabular.py | 13 +++--- pyproject.toml | 7 ++- tests/unit_tests/tabular_test.py | 80 +++++++++++++++++++++++++------- 3 files changed, 74 insertions(+), 26 deletions(-) diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index 650b382..ce99cff 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -4,6 +4,7 @@ import io import pandas as pd import numpy as np +from cloudpathlib import AnyPath def read_encyclopedia(proteins_txt: str) -> pd.DataFrame: @@ -59,12 +60,8 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame: def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]: - if isinstance(file, io.TextIOBase): - firstcol = file.readline() - file.seek(0) - else: - with open(file) as f: - firstcol = f.readline() + with open(AnyPath(file)) as f: + firstcol = f.readline() return firstcol.strip().split("\t") @@ -117,7 +114,9 @@ def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame: schema: dict[str, type] = {k: float for k in columns if k not in expect} schema["Protein.Ids"] = str - proteins = pd.read_table(proteins_tsv, dtype=schema, usecols=list(schema)) + proteins = pd.read_table( + AnyPath(proteins_tsv), dtype=schema, usecols=list(schema) + ) proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0] proteins = proteins.set_index("Protein.Ids", drop=True) diff --git a/pyproject.toml b/pyproject.toml index 27d95e0..d26f6b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "requests", "seaborn", "matplotlib", + "cloudpathlib", ] dynamic = ["version"] @@ -58,8 +59,10 @@ dev = [ "black>=19.10b0", "pytest", ] - - +s3 = [ + "cloudpathlib[s3]", + "boto3", +] [tool.setuptools] include-package-data = false diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py index e084495..c31cab2 100644 --- a/tests/unit_tests/tabular_test.py +++ b/tests/unit_tests/tabular_test.py @@ -1,19 +1,51 @@ +from pathlib import Path import pandas as pd import pytest -from io import StringIO from pandas.testing import assert_frame_equal +from cloudpathlib import CloudPath, implementation_registry +from cloudpathlib.local import ( + LocalS3Client, + LocalS3Path, + local_s3_implementation, +) from gopher.parsers.tabular import read_diann -def test_read_diann_removes_metadata_and_sets_index(): +@pytest.fixture +def cloud_asset_file(monkeypatch): + """Fixture that patches CloudPath dispatch and also sets up test assets in LocalS3Client's + local storage directory.""" + + monkeypatch.setitem(implementation_registry, "s3", local_s3_implementation) + + # Option 1: Use LocalS3Path to set up test assets directly + local_cloud_path = LocalS3Path( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" + ) + # Simulated DIANN output + mock_data = ( + "Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2", + "PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000", + "PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500", + ) + local_cloud_path.write_text("\n".join(mock_data)) + + local_cloud_path_genes = LocalS3Path( + "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv" + ) # Simulated DIANN output - mock_data = StringIO( - """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2 -PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000 -PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500 -""" + mock_data = ( + "Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2", + "GENE1\tDescription A\t1000\t2000", + "GENE2\tDescription B\t1500\t2500", + ) + local_cloud_path_genes.write_text("\n".join(mock_data)) + + cloud_path_1 = CloudPath( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" ) + assert cloud_path_1.exists() # Expected DataFrame expected = pd.DataFrame( @@ -26,20 +58,34 @@ def test_read_diann_removes_metadata_and_sets_index(): ) expected.index.name = "Protein" - result = read_diann(mock_data) - assert_frame_equal(result, expected) + yield {"cloud_path": cloud_path_1, "expected": expected} + LocalS3Client.reset_default_storage_dir() # clean up temp directory and replace with new one -def test_read_diann_faile_with_gg(): - # Simulated DIANN output - mock_data = StringIO( - """Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2 -GENE1\tDescription A\t1000\t2000 -GENE2\tDescription B\t1500\t2500 -""" + +def test_read_diann_removes_metadata_and_sets_index_cloud(cloud_asset_file): + result = read_diann( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" ) + assert_frame_equal(result, cloud_asset_file["expected"]) + + +def test_read_diann_removes_metadata_and_sets_index_local( + cloud_asset_file, tmpdir +): + local_path = Path(tmpdir) / "diann_report.pg_mat.tsv" + with open(local_path, "w") as f: + f.write(cloud_asset_file["cloud_path"].read_text()) + + result = read_diann(local_path) + assert_frame_equal(result, cloud_asset_file["expected"]) + + +def test_read_diann_faile_with_gg(cloud_asset_file): with pytest.raises(ValueError) as e: - result = read_diann(mock_data) + result = read_diann( + "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv" + ) assert "Expected columns" in str(e.value.args[0])