diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index 96272c8..9ad20f3 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -6,11 +6,11 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Setup Python 3.8 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Setup Python 3.10 + uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" - name: Run black uses: psf/black@stable diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 665e35a..c9976cb 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -10,8 +10,8 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: 3.x @@ -25,4 +25,4 @@ jobs: fc-match Montserrat - run: pip install ".[docs]" - - run: mkdocs gh-deploy --force \ No newline at end of file + - run: mkdocs gh-deploy --force diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index dea50c1..29fb969 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,9 +12,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f1a6cf8..ab11422 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,11 +19,11 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.10" - name: Install 
dependencies run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d602a2..4923566 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags + rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags hooks: - id: black language_version: python3 # Should be a command that runs python3.6+ diff --git a/gopher/__init__.py b/gopher/__init__.py index 40f0410..cd134cb 100644 --- a/gopher/__init__.py +++ b/gopher/__init__.py @@ -1,4 +1,5 @@ """See the README for detailed documentation and examples.""" + try: from importlib.metadata import PackageNotFoundError, version diff --git a/gopher/annotations.py b/gopher/annotations.py index 53a7d2e..d490cbb 100644 --- a/gopher/annotations.py +++ b/gopher/annotations.py @@ -1,4 +1,5 @@ """Get GO annotations.""" + import uuid from pathlib import Path diff --git a/gopher/config.py b/gopher/config.py index d011d11..1e97c9b 100644 --- a/gopher/config.py +++ b/gopher/config.py @@ -1,4 +1,5 @@ """This module contains the configuration details for ppx""" + import logging import os from pathlib import Path diff --git a/gopher/enrichment.py b/gopher/enrichment.py index 07358af..4d50f80 100644 --- a/gopher/enrichment.py +++ b/gopher/enrichment.py @@ -1,4 +1,5 @@ """Calculate the enrichments for a collection of experiments.""" + import logging import numpy as np diff --git a/gopher/gopher.py b/gopher/gopher.py index 0102c47..2f3cba0 100644 --- a/gopher/gopher.py +++ b/gopher/gopher.py @@ -1,4 +1,5 @@ """The command line entry point for gopher-enrich""" + import logging from argparse import ArgumentParser diff --git a/gopher/ontologies.py b/gopher/ontologies.py index 546c7c6..726ae1b 100644 --- a/gopher/ontologies.py +++ b/gopher/ontologies.py @@ -1,4 +1,5 @@ """Download the GO ontologies""" + from collections import defaultdict from . 
import config, utils diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py index bef63f5..bb1a80e 100644 --- a/gopher/parsers/__init__.py +++ b/gopher/parsers/__init__.py @@ -1,2 +1,3 @@ """The parsers""" + from .tabular import read_encyclopedia, read_metamorpheus, read_diann diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index 7aae459..ce99cff 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -1,6 +1,10 @@ """Parse tabular result files from common tools""" + +import os +import io import pandas as pd import numpy as np +from cloudpathlib import AnyPath def read_encyclopedia(proteins_txt: str) -> pd.DataFrame: @@ -54,10 +58,19 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame: ) return proteins -def read_diann(proteins_tsv: str) -> pd.DataFrame: + +def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]: + with open(AnyPath(file)) as f: + firstcol = f.readline() + + return firstcol.strip().split("\t") + + +def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame: """ - Reads a DIANN-generated TSV file containing protein information, processes - it, and returns a cleaned Pandas DataFrame with relevant data. + Reads a DIANN-generated TSV file (pg_matrix) containing protein information. + + Also processes it, and returns a cleaned Pandas DataFrame with relevant data. The function: - Extracts the first protein accession from the "Protein.Ids" column to use @@ -73,40 +86,40 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: 'Protein.Names', 'Genes', 'First.Protein.Description', - + Returns: pd.DataFrame: A DataFrame with the processed protein data, indexed by the first protein accession. - The returned DataFrame has the "Protein.Ids" column as the - index and all columns are the MSR columns. + The returned DataFrame has the "Protein.Ids" column as the + index and all columns are the MSR columns. 
""" - proteins = pd.read_table(proteins_tsv) - accessions = proteins["Protein.Ids"].str.split(";").str[0] - proteins = proteins.set_index(accessions) - proteins = proteins.rename_axis("Protein", axis="index") - proteins = proteins.drop( - columns=[ - "Protein.Group", - "Protein.Ids", - "Protein.Names", - "Genes", - "First.Protein.Description", - ] + columns = _read_colnames(proteins_tsv) + + expect = [ + "Protein.Group", + "Protein.Ids", + "Protein.Names", + "Genes", + "First.Protein.Description", + ] + + if not all(c in columns for c in expect): + msg = f"Expected columns {expect}, got {columns}, make sure you are" + msg += " using the 'diann_report.pg_matrix.tsv' output." + raise ValueError(msg) + + schema: dict[str, type] = {k: float for k in columns if k not in expect} + schema["Protein.Ids"] = str + + proteins = pd.read_table( + AnyPath(proteins_tsv), dtype=schema, usecols=list(schema) ) + proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0] + + proteins = proteins.set_index("Protein.Ids", drop=True) + proteins = proteins.rename_axis("Protein", axis="index") - # Check data types - # (if loading from S3, default types are 'O' - if proteins.index.dtype not in ["O", "category", "str"]: - raise ValueError( - f"Protein index is incorrect type: {proteins.index.dtype}" - ) - if not all( - np.issubdtype(dtype, np.floating) or dtype == "O" - for dtype in proteins.dtypes - ): - raise ValueError("Non-numeric columns present") - return proteins diff --git a/gopher/stats.py b/gopher/stats.py index b760ac2..156c6b0 100644 --- a/gopher/stats.py +++ b/gopher/stats.py @@ -1,4 +1,5 @@ """Numba Mann-Whitney U test""" + import numba as nb import numpy as np from scipy import stats diff --git a/gopher/utils.py b/gopher/utils.py index 2f8126f..d8ec817 100644 --- a/gopher/utils.py +++ b/gopher/utils.py @@ -1,4 +1,5 @@ """Utility functions""" + import socket from pathlib import Path diff --git a/pyproject.toml b/pyproject.toml index d304b40..d26f6b6 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -13,10 +13,27 @@ classifiers = [ "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Bio-Informatics", ] -requires-python = ">=3.6" - +requires-python = ">=3.10" +dependencies = [ + "numpy > 2.0, < 3.0", + "pandas > 2.0, < 3.0", + "scipy", + "tqdm", + "statsmodels", + "biopython", # ... we can implement a fasta parser ... + "loguru", + "numba", + "requests", + "seaborn", + "matplotlib", + "cloudpathlib", +] dynamic = ["version"] +[project.scripts] +gopher = "gopher.gopher:main" + + [project.readme] file = "README.md" content-type = "text/markdown" @@ -40,6 +57,11 @@ docs = [ dev = [ "pre-commit>=2.7.1", "black>=19.10b0", + "pytest", +] +s3 = [ + "cloudpathlib[s3]", + "boto3", ] [tool.setuptools] @@ -52,7 +74,7 @@ find = {namespaces = false} [tool.black] line-length = 79 -target-version = ['py37'] +target-version = ['py310'] include = '\.pyi?$' exclude = ''' diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 95d723c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,49 +0,0 @@ -[metadata] -name = gopher-enrich -author = William E Fondrie -author_email = fondriew@gmail.com -description = Gene ontology enrichment analysis using protein expression. 
-long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/TalusBio/gopher -project_urls = - Documentation = https://TalusBio.github.io/gopher - Bug Tracker = https://github.com/TalusBio/gopher/issues - Discussion Board = https://github.com/TalusBio/gopher/discussions -license = Apache 2.0 -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved - Operating System :: OS Independent - Topic :: Scientific/Engineering :: Bio-Informatics - -[options] -packages = find: -python_requires = >=3.6 -install_requires = - numpy - pandas - scipy>=1.7.1 - statsmodels - requests - numba - seaborn - biopython - tqdm - loguru - -[options.extras_require] -docs = - numpydoc>=1.0.0 - sphinx-argparse>=0.2.5 - pydata-sphinx-theme>=0.4.3 - nbsphinx>=0.7.1 - ipykernel>=5.3.0 - recommonmark>=0.5.0 -dev = - pre-commit>=2.7.1 - black>=19.10b0 - -[options.entry_points] -console_scripts = - gopher = gopher.gopher:main diff --git a/setup.py b/setup.py deleted file mode 100644 index 10d9469..0000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Setup ppx""" -import setuptools - -setuptools.setup() diff --git a/tests/unit_tests/annotations_test.py b/tests/unit_tests/annotations_test.py index 9e22a36..908af22 100644 --- a/tests/unit_tests/annotations_test.py +++ b/tests/unit_tests/annotations_test.py @@ -1,4 +1,5 @@ """Test that the annotations functions are working correctly""" + import re import pandas as pd diff --git a/tests/unit_tests/enrichment_test.py b/tests/unit_tests/enrichment_test.py index dacd3d5..a6dda4b 100644 --- a/tests/unit_tests/enrichment_test.py +++ b/tests/unit_tests/enrichment_test.py @@ -1,4 +1,5 @@ """Test that the enrichment functions are working correctly""" + import random import numpy as np diff --git a/tests/unit_tests/normalize_test.py b/tests/unit_tests/normalize_test.py index 88e5e97..e896a09 100644 --- a/tests/unit_tests/normalize_test.py +++ b/tests/unit_tests/normalize_test.py @@ -6,12 
+6,14 @@ from gopher import normalize +CURRPATH = Path(__file__).parent + @pytest.fixture def real_data(tmp_path): """Test using small files.""" - fasta_df = Path("../data/small-yeast.fasta") - quant = pd.read_csv("../data/yeast_small.csv") + fasta_df = CURRPATH / "../data/small-yeast.fasta" + quant = pd.read_csv(CURRPATH / "../data/yeast_small.csv") quant = quant.set_index("Protein") return quant, fasta_df diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py index 954b5db..c31cab2 100644 --- a/tests/unit_tests/tabular_test.py +++ b/tests/unit_tests/tabular_test.py @@ -1,27 +1,91 @@ +from pathlib import Path import pandas as pd -from io import StringIO +import pytest from pandas.testing import assert_frame_equal +from cloudpathlib import CloudPath, implementation_registry +from cloudpathlib.local import ( + LocalS3Client, + LocalS3Path, + local_s3_implementation, +) from gopher.parsers.tabular import read_diann -def test_read_diann_removes_metadata_and_sets_index(): + +@pytest.fixture +def cloud_asset_file(monkeypatch): + """Fixture that patches CloudPath dispatch and also sets up test assets in LocalS3Client's + local storage directory.""" + + monkeypatch.setitem(implementation_registry, "s3", local_s3_implementation) + + # Option 1: Use LocalS3Path to set up test assets directly + local_cloud_path = LocalS3Path( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" + ) # Simulated DIANN output - mock_data = StringIO( - """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2 -PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000 -PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500 -""" + mock_data = ( + "Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2", + "PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000", + "PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500", ) + 
local_cloud_path.write_text("\n".join(mock_data)) + + local_cloud_path_genes = LocalS3Path( + "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv" + ) + # Simulated DIANN output + mock_data = ( + "Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2", + "GENE1\tDescription A\t1000\t2000", + "GENE2\tDescription B\t1500\t2500", + ) + local_cloud_path_genes.write_text("\n".join(mock_data)) + + cloud_path_1 = CloudPath( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" + ) + assert cloud_path_1.exists() # Expected DataFrame expected = pd.DataFrame( { - "Intensity.Sample1": [1000, 1500], - "Intensity.Sample2": [2000, 2500], - }, - index=["P12345", "P23456"] + # The real diann data has float values in the intensities. + "Intensity.Sample1": [1000.0, 1500.0], + "Intensity.Sample2": [2000.0, 2500.0], + }, + index=["P12345", "P23456"], ) expected.index.name = "Protein" - result = read_diann(mock_data) - assert_frame_equal(result, expected) + yield {"cloud_path": cloud_path_1, "expected": expected} + + LocalS3Client.reset_default_storage_dir() # clean up temp directory and replace with new one + + +def test_read_diann_removes_metadata_and_sets_index_cloud(cloud_asset_file): + result = read_diann( + "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv" + ) + assert_frame_equal(result, cloud_asset_file["expected"]) + + +def test_read_diann_removes_metadata_and_sets_index_local( + cloud_asset_file, tmpdir +): + local_path = Path(tmpdir) / "diann_report.pg_mat.tsv" + with open(local_path, "w") as f: + f.write(cloud_asset_file["cloud_path"].read_text()) + + result = read_diann(local_path) + assert_frame_equal(result, cloud_asset_file["expected"]) + + +def test_read_diann_fails_with_gg(cloud_asset_file): + + with pytest.raises(ValueError) as e: + result = read_diann( + "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv" + ) + + assert "Expected columns" in str(e.value.args[0]) diff --git a/tests/unit_tests/test_version.py 
b/tests/unit_tests/test_version.py index a6045b7..784c95e 100644 --- a/tests/unit_tests/test_version.py +++ b/tests/unit_tests/test_version.py @@ -1,4 +1,5 @@ """Test that setuptools-scm is working correctly""" + import gopher