Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gopher/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@
)
from .enrichment import test_enrichment
from .normalize import normalize_values
from .parsers import read_encyclopedia, read_metamorpheus
from .parsers import read_encyclopedia, read_metamorpheus, read_diann
2 changes: 1 addition & 1 deletion gopher/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""The parsers"""
from .tabular import read_encyclopedia, read_metamorpheus
from .tabular import read_encyclopedia, read_metamorpheus, read_diann
58 changes: 58 additions & 0 deletions gopher/parsers/tabular.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Parse tabular result files from common tools"""
import pandas as pd
import numpy as np


def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
Expand Down Expand Up @@ -52,3 +53,60 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
.fillna(0)
)
return proteins

def read_diann(proteins_tsv: str) -> pd.DataFrame:
    """Read a DIA-NN protein-group matrix TSV file.

    The function:
    - Extracts the first protein accession from the "Protein.Ids" column
      (a semicolon-delimited accession list) to use as the DataFrame index.
    - Renames the index axis to "Protein".
    - Drops the metadata columns, leaving only the quantitative
      measurement (MSR) columns.
    - Validates that the index is string-like and that every remaining
      column is numeric.

    Args:
        proteins_tsv (str): Path to (or file-like handle of) the
            DIA-NN-generated TSV file. Expected columns:
            'Protein.Group', 'Protein.Ids', 'Protein.Names', 'Genes',
            'First.Protein.Description', plus one column per measurement.

    Returns:
        pd.DataFrame: The processed protein data, indexed by the first
        protein accession (index name "Protein"), with only the
        measurement columns retained.

    Raises:
        ValueError: If the index is not a string-like dtype, or if any
            remaining column is neither numeric nor object dtype.
    """
    proteins = pd.read_table(proteins_tsv)

    # A protein group lists accessions separated by ";"; keep the first.
    accessions = proteins["Protein.Ids"].str.split(";").str[0]
    proteins = proteins.set_index(accessions).rename_axis(
        "Protein", axis="index"
    )
    proteins = proteins.drop(
        columns=[
            "Protein.Group",
            "Protein.Ids",
            "Protein.Names",
            "Genes",
            "First.Protein.Description",
        ]
    )

    # Check data types. NOTE(review): when loading from S3 the default
    # dtypes may come back as object ('O'), so object columns are
    # tolerated below — confirm this is still the desired behavior.
    if proteins.index.dtype not in ["O", "category", "str"]:
        raise ValueError(
            f"Protein index is incorrect type: {proteins.index.dtype}"
        )
    # Accept any numeric dtype (int or float): raw intensity/count
    # columns are often parsed as int64, which np.floating would reject.
    if not all(
        np.issubdtype(dtype, np.number) or dtype == "O"
        for dtype in proteins.dtypes
    ):
        raise ValueError("Non-numeric columns present")

    return proteins
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ install_requires =
numba
seaborn
biopython
tqdm
loguru

[options.extras_require]
docs =
Expand Down
27 changes: 27 additions & 0 deletions tests/unit_tests/tabular_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd
from io import StringIO
from pandas.testing import assert_frame_equal

from gopher.parsers.tabular import read_diann

def test_read_diann_removes_metadata_and_sets_index():
    """read_diann drops metadata columns and indexes by first accession."""
    columns = [
        "Protein.Group",
        "Protein.Ids",
        "Protein.Names",
        "Genes",
        "First.Protein.Description",
        "Intensity.Sample1",
        "Intensity.Sample2",
    ]
    rows = [
        ["PG1", "P12345;P67890", "Protein A", "GENE1", "Description A", "1000", "2000"],
        ["PG2", "P23456", "Protein B", "GENE2", "Description B", "1500", "2500"],
    ]
    lines = ["\t".join(columns)] + ["\t".join(row) for row in rows]
    mock_data = StringIO("\n".join(lines) + "\n")

    # Only the measurement columns survive, keyed by the first accession.
    expected = pd.DataFrame(
        [[1000, 2000], [1500, 2500]],
        columns=["Intensity.Sample1", "Intensity.Sample2"],
        index=pd.Index(["P12345", "P23456"], name="Protein"),
    )

    result = read_diann(mock_data)
    assert_frame_equal(result, expected)
Loading