From 8c173727366e2afc2a7f85ace2f14c181dcc3125 Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 09:51:16 -0700
Subject: [PATCH 1/6] feat: efficient data read and pyproject cleanup

---
 gopher/parsers/tabular.py          | 73 ++++++++++++++++++------------
 pyproject.toml                     | 18 +++++++-
 tests/unit_tests/normalize_test.py |  6 ++-
 tests/unit_tests/tabular_test.py   | 26 +++++++++--
 4 files changed, 85 insertions(+), 38 deletions(-)
diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index 7aae459..c66f231 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -1,4 +1,6 @@
 """Parse tabular result files from common tools"""
+import os
+import io
 import pandas as pd
 import numpy as np
 
@@ -54,10 +56,23 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
     )
     return proteins
 
-def read_diann(proteins_tsv: str) -> pd.DataFrame:
+
+def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]:
+    if isinstance(file, io.TextIOBase):
+        firstcol = file.readline()
+        file.seek(0)
+    else:
+        with open(file) as f:
+            firstcol = f.readline()
+
+    return firstcol.strip().split("\t")
+
+
+def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame:
     """
-    Reads a DIANN-generated TSV file containing protein information, processes
-    it, and returns a cleaned Pandas DataFrame with relevant data.
+    Reads a DIANN-generated TSV file (pg_matrix) containing protein information.
+
+    Also processes it, and returns a cleaned Pandas DataFrame with relevant data.
 
     The function:
     - Extracts the first protein accession from the "Protein.Ids" column to use
@@ -73,40 +88,38 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
                 'Protein.Names',
                 'Genes',
                 'First.Protein.Description',
-                <several MSR columns>
+                <several Intensity columns>
 
 
     Returns:
         pd.DataFrame: A DataFrame with the processed protein data, indexed by
             the first protein accession.
-            The returned DataFrame has the "Protein.Ids" column as the 
-            index and all columns are the MSR columns.          
+            The returned DataFrame has the "Protein.Ids" column as the
+            index and all remaining columns are the Intensity columns.
     """
-    proteins = pd.read_table(proteins_tsv)
-    accessions = proteins["Protein.Ids"].str.split(";").str[0]
 
-    proteins = proteins.set_index(accessions)
+    columns = _read_colnames(proteins_tsv)
+
+    expect = [
+        "Protein.Group",
+        "Protein.Ids",
+        "Protein.Names",
+        "Genes",
+        "First.Protein.Description",
+    ]
+
+    if not all(c in columns for c in expect):
+        msg = f"Expected columns {expect}, got {columns}, make sure you are"
+        msg += " using the 'diann_report.pg_matrix.tsv' output."
+        raise ValueError(msg)
+
+    schema: dict[str, type] = {k: float for k in columns if k not in expect}
+    schema["Protein.Ids"] = str
+
+    proteins = pd.read_table(proteins_tsv, dtype=schema, usecols=list(schema))
+    proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0]
+
+    proteins = proteins.set_index("Protein.Ids", drop=True)
     proteins = proteins.rename_axis("Protein", axis="index")
-    proteins = proteins.drop(
-        columns=[
-            "Protein.Group",
-            "Protein.Ids",
-            "Protein.Names",
-            "Genes",
-            "First.Protein.Description",
-        ]
-    )
 
-    # Check data types
-    # (if loading from S3, default types are 'O'
-    if proteins.index.dtype not in ["O", "category", "str"]:
-        raise ValueError(
-            f"Protein index is incorrect type: {proteins.index.dtype}"
-        )
-    if not all(
-        np.issubdtype(dtype, np.floating) or dtype == "O"
-        for dtype in proteins.dtypes
-    ):
-        raise ValueError("Non-numeric columns present")
-    
     return proteins
diff --git a/pyproject.toml b/pyproject.toml
index d304b40..38eb9eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,20 @@ classifiers = [
     "Operating System :: OS Independent",
     "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
-requires-python = ">=3.6"
+requires-python = ">=3.10"
+dependencies = [
+  "numpy > 2.0, < 3.0",
+  "pandas > 2.0, < 3.0",
+  "scipy",
+  "tqdm",
+  "statsmodels",
+  "biopython", # TODO: consider replacing with our own FASTA parser
+  "loguru",
+  "numba",
+  "requests",
+  "seaborn",
+  "matplotlib",
+]
 
 dynamic = ["version"]
 
@@ -40,6 +53,7 @@ docs = [
 dev = [
     "pre-commit>=2.7.1",
     "black>=19.10b0",
+    "pytest",
 ]
 
 [tool.setuptools]
@@ -52,7 +66,7 @@ find = {namespaces = false}
 
 [tool.black]
 line-length = 79
-target-version = ['py37']
+target-version = ['py310']
 include = '\.pyi?$'
 exclude = '''
 
diff --git a/tests/unit_tests/normalize_test.py b/tests/unit_tests/normalize_test.py
index 88e5e97..e896a09 100644
--- a/tests/unit_tests/normalize_test.py
+++ b/tests/unit_tests/normalize_test.py
@@ -6,12 +6,14 @@
 
 from gopher import normalize
 
+CURRPATH = Path(__file__).parent
+
 
 @pytest.fixture
 def real_data(tmp_path):
     """Test using small files."""
-    fasta_df = Path("../data/small-yeast.fasta")
-    quant = pd.read_csv("../data/yeast_small.csv")
+    fasta_df = CURRPATH / "../data/small-yeast.fasta"
+    quant = pd.read_csv(CURRPATH / "../data/yeast_small.csv")
     quant = quant.set_index("Protein")
 
     return quant, fasta_df
diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
index 954b5db..e084495 100644
--- a/tests/unit_tests/tabular_test.py
+++ b/tests/unit_tests/tabular_test.py
@@ -1,9 +1,11 @@
 import pandas as pd
+import pytest
 from io import StringIO
 from pandas.testing import assert_frame_equal
 
 from gopher.parsers.tabular import read_diann
 
+
 def test_read_diann_removes_metadata_and_sets_index():
     # Simulated DIANN output
     mock_data = StringIO(
@@ -16,12 +18,28 @@ def test_read_diann_removes_metadata_and_sets_index():
     # Expected DataFrame
     expected = pd.DataFrame(
         {
-            "Intensity.Sample1": [1000, 1500],
-            "Intensity.Sample2": [2000, 2500],
-        }, 
-        index=["P12345", "P23456"]
+            # The real diann data has float values in the intensities.
+            "Intensity.Sample1": [1000.0, 1500.0],
+            "Intensity.Sample2": [2000.0, 2500.0],
+        },
+        index=["P12345", "P23456"],
     )
     expected.index.name = "Protein"
 
     result = read_diann(mock_data)
     assert_frame_equal(result, expected)
+
+
+def test_read_diann_faile_with_gg():
+    # Simulated DIANN output
+    mock_data = StringIO(
+        """Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2
+GENE1\tDescription A\t1000\t2000
+GENE2\tDescription B\t1500\t2500
+"""
+    )
+
+    with pytest.raises(ValueError) as e:
+        result = read_diann(mock_data)
+
+    assert "Expected columns" in str(e.value.args[0])

From 3c4a0dc6ab174b33ee705b123b3d2b8a9b278364 Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 09:58:57 -0700
Subject: [PATCH 2/6] chore: deleted redundant config

---
 pyproject.toml |  7 ++++++-
 setup.cfg      | 49 -------------------------------------------------
 setup.py       |  4 ----
 3 files changed, 6 insertions(+), 54 deletions(-)
 delete mode 100644 setup.cfg
 delete mode 100644 setup.py

diff --git a/pyproject.toml b/pyproject.toml
index 38eb9eb..27d95e0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,9 +27,12 @@ dependencies = [
   "seaborn",
   "matplotlib",
 ]
-
 dynamic = ["version"]
 
+[project.scripts]
+gopher = "gopher.gopher:main"
+
+
 [project.readme]
 file = "README.md"
 content-type = "text/markdown"
@@ -56,6 +59,8 @@ dev = [
     "pytest",
 ]
 
+
+
 [tool.setuptools]
 include-package-data = false
 
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 95d723c..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,49 +0,0 @@
-[metadata]
-name = gopher-enrich
-author = William E Fondrie
-author_email = fondriew@gmail.com
-description = Gene ontology enrichment analysis using protein expression.
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/TalusBio/gopher
-project_urls =
-    Documentation = https://TalusBio.github.io/gopher
-    Bug Tracker = https://github.com/TalusBio/gopher/issues
-    Discussion Board = https://github.com/TalusBio/gopher/discussions
-license = Apache 2.0
-classifiers =
-    Programming Language :: Python :: 3
-    License :: OSI Approved
-    Operating System :: OS Independent
-    Topic :: Scientific/Engineering :: Bio-Informatics
-
-[options]
-packages = find:
-python_requires = >=3.6
-install_requires =
-    numpy
-    pandas
-    scipy>=1.7.1
-    statsmodels
-    requests
-    numba
-    seaborn
-    biopython
-    tqdm
-    loguru
-
-[options.extras_require]
-docs =
-    numpydoc>=1.0.0
-    sphinx-argparse>=0.2.5
-    pydata-sphinx-theme>=0.4.3
-    nbsphinx>=0.7.1
-    ipykernel>=5.3.0
-    recommonmark>=0.5.0
-dev =
-    pre-commit>=2.7.1
-    black>=19.10b0
-
-[options.entry_points]
-console_scripts =
-    gopher = gopher.gopher:main
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 10d9469..0000000
--- a/setup.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Setup ppx"""
-import setuptools
-
-setuptools.setup()

From b1c455f409e76c351e07ab915879a28c96b5e41d Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 10:22:44 -0700
Subject: [PATCH 3/6] chore: updated pre-commit

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d602a2..4923566 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags
+    rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags
     hooks:
       - id: black
         language_version: python3 # Should be a command that runs python3.6+

From c31ee0b378d27f6504b1ac189eea60dd7c1bc4fb Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 10:27:26 -0700
Subject: [PATCH 4/6] chore: updated gh actions

---
 .github/workflows/black.yml   | 8 ++++----
 .github/workflows/docs.yml    | 6 +++---
 .github/workflows/publish.yml | 4 ++--
 .github/workflows/tests.yml   | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index 96272c8..9ad20f3 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -6,11 +6,11 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python 3.8
-        uses: actions/setup-python@v2
+      - uses: actions/checkout@v4
+      - name: Setup Python 3.10
+        uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.10"
 
       - name: Run black
         uses: psf/black@stable
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 665e35a..c9976cb 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -10,8 +10,8 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: 3.x
 
@@ -25,4 +25,4 @@ jobs:
           fc-match Montserrat
 
       - run: pip install ".[docs]"
-      - run: mkdocs gh-deploy --force
\ No newline at end of file
+      - run: mkdocs gh-deploy --force
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index dea50c1..29fb969 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -12,9 +12,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: '3.x'
     - name: Install dependencies
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f1a6cf8..ab11422 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -19,11 +19,11 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest]
 
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
       with:
-        python-version: "3.8"
+        python-version: "3.10"
 
     - name: Install dependencies
       run: |

From 822c4321813d5a3379602633b57829f9b887087c Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 10:32:03 -0700
Subject: [PATCH 5/6] chore: black

---
 gopher/__init__.py                   | 1 +
 gopher/annotations.py                | 1 +
 gopher/config.py                     | 1 +
 gopher/enrichment.py                 | 1 +
 gopher/gopher.py                     | 1 +
 gopher/ontologies.py                 | 1 +
 gopher/parsers/__init__.py           | 1 +
 gopher/parsers/tabular.py            | 1 +
 gopher/stats.py                      | 1 +
 gopher/utils.py                      | 1 +
 tests/unit_tests/annotations_test.py | 1 +
 tests/unit_tests/enrichment_test.py  | 1 +
 tests/unit_tests/test_version.py     | 1 +
 13 files changed, 13 insertions(+)

diff --git a/gopher/__init__.py b/gopher/__init__.py
index 40f0410..cd134cb 100644
--- a/gopher/__init__.py
+++ b/gopher/__init__.py
@@ -1,4 +1,5 @@
 """See the README for detailed documentation and examples."""
+
 try:
     from importlib.metadata import PackageNotFoundError, version
 
diff --git a/gopher/annotations.py b/gopher/annotations.py
index 53a7d2e..d490cbb 100644
--- a/gopher/annotations.py
+++ b/gopher/annotations.py
@@ -1,4 +1,5 @@
 """Get GO annotations."""
+
 import uuid
 from pathlib import Path
 
diff --git a/gopher/config.py b/gopher/config.py
index d011d11..1e97c9b 100644
--- a/gopher/config.py
+++ b/gopher/config.py
@@ -1,4 +1,5 @@
 """This module contains the configuration details for ppx"""
+
 import logging
 import os
 from pathlib import Path
diff --git a/gopher/enrichment.py b/gopher/enrichment.py
index 07358af..4d50f80 100644
--- a/gopher/enrichment.py
+++ b/gopher/enrichment.py
@@ -1,4 +1,5 @@
 """Calculate the enrichments for a collection of experiments."""
+
 import logging
 
 import numpy as np
diff --git a/gopher/gopher.py b/gopher/gopher.py
index 0102c47..2f3cba0 100644
--- a/gopher/gopher.py
+++ b/gopher/gopher.py
@@ -1,4 +1,5 @@
 """The command line entry point for gopher-enrich"""
+
 import logging
 from argparse import ArgumentParser
 
diff --git a/gopher/ontologies.py b/gopher/ontologies.py
index 546c7c6..726ae1b 100644
--- a/gopher/ontologies.py
+++ b/gopher/ontologies.py
@@ -1,4 +1,5 @@
 """Download the GO ontologies"""
+
 from collections import defaultdict
 
 from . import config, utils
diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py
index bef63f5..bb1a80e 100644
--- a/gopher/parsers/__init__.py
+++ b/gopher/parsers/__init__.py
@@ -1,2 +1,3 @@
 """The parsers"""
+
 from .tabular import read_encyclopedia, read_metamorpheus, read_diann
diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index c66f231..650b382 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -1,4 +1,5 @@
 """Parse tabular result files from common tools"""
+
 import os
 import io
 import pandas as pd
diff --git a/gopher/stats.py b/gopher/stats.py
index b760ac2..156c6b0 100644
--- a/gopher/stats.py
+++ b/gopher/stats.py
@@ -1,4 +1,5 @@
 """Numba Mann-Whitney U test"""
+
 import numba as nb
 import numpy as np
 from scipy import stats
diff --git a/gopher/utils.py b/gopher/utils.py
index 2f8126f..d8ec817 100644
--- a/gopher/utils.py
+++ b/gopher/utils.py
@@ -1,4 +1,5 @@
 """Utility functions"""
+
 import socket
 from pathlib import Path
 
diff --git a/tests/unit_tests/annotations_test.py b/tests/unit_tests/annotations_test.py
index 9e22a36..908af22 100644
--- a/tests/unit_tests/annotations_test.py
+++ b/tests/unit_tests/annotations_test.py
@@ -1,4 +1,5 @@
 """Test that the annotations functions are working correctly"""
+
 import re
 
 import pandas as pd
diff --git a/tests/unit_tests/enrichment_test.py b/tests/unit_tests/enrichment_test.py
index dacd3d5..a6dda4b 100644
--- a/tests/unit_tests/enrichment_test.py
+++ b/tests/unit_tests/enrichment_test.py
@@ -1,4 +1,5 @@
 """Test that the enrichment functions are working correctly"""
+
 import random
 
 import numpy as np
diff --git a/tests/unit_tests/test_version.py b/tests/unit_tests/test_version.py
index a6045b7..784c95e 100644
--- a/tests/unit_tests/test_version.py
+++ b/tests/unit_tests/test_version.py
@@ -1,4 +1,5 @@
 """Test that setuptools-scm is working correctly"""
+
 import gopher
 
 

From 3cf875a50fdb1b8ce90452b00e0053b36598dd35 Mon Sep 17 00:00:00 2001
From: "J. Sebastian Paez" <jspaezp@gmail.com>
Date: Thu, 27 Mar 2025 12:06:09 -0700
Subject: [PATCH 6/6] feat: added explicit tests for s3 on diann data

---
 gopher/parsers/tabular.py        | 13 +++---
 pyproject.toml                   |  7 ++-
 tests/unit_tests/tabular_test.py | 80 +++++++++++++++++++++++++-------
 3 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index 650b382..ce99cff 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -4,6 +4,7 @@
 import io
 import pandas as pd
 import numpy as np
+from cloudpathlib import AnyPath
 
 
 def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
@@ -59,12 +60,8 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
 
 
 def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]:
-    if isinstance(file, io.TextIOBase):
-        firstcol = file.readline()
-        file.seek(0)
-    else:
-        with open(file) as f:
-            firstcol = f.readline()
+    with open(AnyPath(file)) as f:
+        firstcol = f.readline()
 
     return firstcol.strip().split("\t")
 
@@ -117,7 +114,9 @@ def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame:
     schema: dict[str, type] = {k: float for k in columns if k not in expect}
     schema["Protein.Ids"] = str
 
-    proteins = pd.read_table(proteins_tsv, dtype=schema, usecols=list(schema))
+    proteins = pd.read_table(
+        AnyPath(proteins_tsv), dtype=schema, usecols=list(schema)
+    )
     proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0]
 
     proteins = proteins.set_index("Protein.Ids", drop=True)
diff --git a/pyproject.toml b/pyproject.toml
index 27d95e0..d26f6b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
   "requests",
   "seaborn",
   "matplotlib",
+  "cloudpathlib",
 ]
 dynamic = ["version"]
 
@@ -58,8 +59,10 @@ dev = [
     "black>=19.10b0",
     "pytest",
 ]
-
-
+s3 = [
+  "cloudpathlib[s3]",
+  "boto3",
+]
 
 [tool.setuptools]
 include-package-data = false
diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
index e084495..c31cab2 100644
--- a/tests/unit_tests/tabular_test.py
+++ b/tests/unit_tests/tabular_test.py
@@ -1,19 +1,51 @@
+from pathlib import Path
 import pandas as pd
 import pytest
-from io import StringIO
 from pandas.testing import assert_frame_equal
 
+from cloudpathlib import CloudPath, implementation_registry
+from cloudpathlib.local import (
+    LocalS3Client,
+    LocalS3Path,
+    local_s3_implementation,
+)
 from gopher.parsers.tabular import read_diann
 
 
-def test_read_diann_removes_metadata_and_sets_index():
+@pytest.fixture
+def cloud_asset_file(monkeypatch):
+    """Fixture that patches CloudPath dispatch and also sets up test assets in LocalS3Client's
+    local storage directory."""
+
+    monkeypatch.setitem(implementation_registry, "s3", local_s3_implementation)
+
+    # Use LocalS3Path to set up the test assets directly
+    local_cloud_path = LocalS3Path(
+        "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv"
+    )
+    # Simulated DIANN output
+    mock_data = (
+        "Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2",
+        "PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000",
+        "PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500",
+    )
+    local_cloud_path.write_text("\n".join(mock_data))
+
+    local_cloud_path_genes = LocalS3Path(
+        "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv"
+    )
     # Simulated DIANN output
-    mock_data = StringIO(
-        """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2
-PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000
-PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500
-"""
+    mock_data = (
+        "Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2",
+        "GENE1\tDescription A\t1000\t2000",
+        "GENE2\tDescription B\t1500\t2500",
+    )
+    local_cloud_path_genes.write_text("\n".join(mock_data))
+
+    cloud_path_1 = CloudPath(
+        "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv"
     )
+    assert cloud_path_1.exists()
 
     # Expected DataFrame
     expected = pd.DataFrame(
@@ -26,20 +58,34 @@ def test_read_diann_removes_metadata_and_sets_index():
     )
     expected.index.name = "Protein"
 
-    result = read_diann(mock_data)
-    assert_frame_equal(result, expected)
+    yield {"cloud_path": cloud_path_1, "expected": expected}
 
+    LocalS3Client.reset_default_storage_dir()  # clean up temp directory and replace with new one
 
-def test_read_diann_faile_with_gg():
-    # Simulated DIANN output
-    mock_data = StringIO(
-        """Genes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2
-GENE1\tDescription A\t1000\t2000
-GENE2\tDescription B\t1500\t2500
-"""
+
+def test_read_diann_removes_metadata_and_sets_index_cloud(cloud_asset_file):
+    result = read_diann(
+        "s3://cloudpathlib-test-bucket/diann_report.pg_mat.tsv"
     )
+    assert_frame_equal(result, cloud_asset_file["expected"])
+
+
+def test_read_diann_removes_metadata_and_sets_index_local(
+    cloud_asset_file, tmpdir
+):
+    local_path = Path(tmpdir) / "diann_report.pg_mat.tsv"
+    with open(local_path, "w") as f:
+        f.write(cloud_asset_file["cloud_path"].read_text())
+
+    result = read_diann(local_path)
+    assert_frame_equal(result, cloud_asset_file["expected"])
+
+
+def test_read_diann_faile_with_gg(cloud_asset_file):
 
     with pytest.raises(ValueError) as e:
-        result = read_diann(mock_data)
+        result = read_diann(
+            "s3://cloudpathlib-test-bucket/diann_report.gg_mat.tsv"
+        )
 
     assert "Expected columns" in str(e.value.args[0])