From 59d52f6ded688c8dea2e5a6b16554f89108315bf Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Mon, 24 Mar 2025 13:55:12 -0700
Subject: [PATCH 1/6] add diann file support and tests

---
 gopher/__init__.py               |  2 +-
 gopher/parsers/__init__.py       |  2 +-
 gopher/parsers/tabular.py        | 36 ++++++++++++++++++++++++++++++++
 tests/unit_tests/tabular_test.py | 26 +++++++++++++++++++++++
 4 files changed, 64 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit_tests/tabular_test.py

diff --git a/gopher/__init__.py b/gopher/__init__.py
index 23049ff..40f0410 100644
--- a/gopher/__init__.py
+++ b/gopher/__init__.py
@@ -26,4 +26,4 @@
 )
 from .enrichment import test_enrichment
 from .normalize import normalize_values
-from .parsers import read_encyclopedia, read_metamorpheus
+from .parsers import read_encyclopedia, read_metamorpheus, read_diann
diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py
index 6054e45..bef63f5 100644
--- a/gopher/parsers/__init__.py
+++ b/gopher/parsers/__init__.py
@@ -1,2 +1,2 @@
 """The parsers"""
-from .tabular import read_encyclopedia, read_metamorpheus
+from .tabular import read_encyclopedia, read_metamorpheus, read_diann
diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index b026155..f663be0 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -52,3 +52,39 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
         .fillna(0)
     )
     return proteins
+
+def read_diann(proteins_tsv: str) -> pd.DataFrame:
+    """
+    Reads a DIANN-generated TSV file containing protein information, processes
+    it, and returns a cleaned Pandas DataFrame with relevant data.
+
+    The function:
+    - Extracts the first protein accession from the "Protein.Ids" column to use
+        as the DataFrame index.
+    - Renames the index axis to "Protein".
+    - Drops unnecessary metadata columns.
+
+    Args:
+        proteins_tsv (str): Path to the DIANN-generated TSV file.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the processed protein data, indexed by
+            the first protein accession.
+            The returned DataFrame excludes the following columns:
+            ["Protein.Group", "Protein.Ids", "Protein.Names", "Genes",
+            "First.Protein.Description"].
+    """
+    proteins = pd.read_table(proteins_tsv)
+    accessions = proteins["Protein.Ids"].str.split(";").str[0]
+
+    proteins = proteins.set_index(accessions)
+    proteins = proteins.rename_axis("Protein", axis="index")
+    return proteins.drop(
+        columns=[
+            "Protein.Group",
+            "Protein.Ids",
+            "Protein.Names",
+            "Genes",
+            "First.Protein.Description",
+        ]
+    )
\ No newline at end of file
diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
new file mode 100644
index 0000000..0592631
--- /dev/null
+++ b/tests/unit_tests/tabular_test.py
@@ -0,0 +1,26 @@
+import pandas as pd
+from io import StringIO
+from pandas.testing import assert_frame_equal
+
+from gopher.parsers.tabular import read_diann
+import os
+
+def test_read_diann_removes_metadata_and_sets_index():
+    # Simulated DIANN output
+    mock_data = StringIO(
+        """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2
+PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000
+PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500
+"""
+    )
+
+    # Expected DataFrame
+    expected = pd.DataFrame({
+        "Intensity.Sample1": [1000, 1500],
+        "Intensity.Sample2": [2000, 2500],
+    }, index=["P12345", "P23456"])
+    expected.index.name = "Protein"
+
+    result = read_diann(mock_data)
+
+    assert_frame_equal(result, expected)
\ No newline at end of file

From 27cce5b422fdaa496186e888fa72d8ddd84283da Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Tue, 25 Mar 2025 07:56:07 -0700
Subject: [PATCH 2/6] add tqdm to install_requires

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 1e10cb1..3ca0da9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,6 +29,7 @@ install_requires =
     numba
     seaborn
     biopython
+    tqdm
 
 [options.extras_require]
 docs =

From 803558d0593b8a65acbe38f525b631d9cc1a5f54 Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Tue, 25 Mar 2025 07:57:59 -0700
Subject: [PATCH 3/6] add loguru to install_requires

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 3ca0da9..95d723c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,7 @@ install_requires =
     seaborn
     biopython
     tqdm
+    loguru
 
 [options.extras_require]
 docs =

From 1beb214a127f8ee5feeebc2d2298dab1c3e5eee2 Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Tue, 25 Mar 2025 09:10:58 -0700
Subject: [PATCH 4/6] remove new line and unnecessary import

---
 tests/unit_tests/tabular_test.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
index 0592631..6f8bb1f 100644
--- a/tests/unit_tests/tabular_test.py
+++ b/tests/unit_tests/tabular_test.py
@@ -3,7 +3,6 @@
 from pandas.testing import assert_frame_equal
 
 from gopher.parsers.tabular import read_diann
-import os
 
 def test_read_diann_removes_metadata_and_sets_index():
     # Simulated DIANN output
@@ -15,12 +14,14 @@ def test_read_diann_removes_metadata_and_sets_index():
     )
 
     # Expected DataFrame
-    expected = pd.DataFrame({
-        "Intensity.Sample1": [1000, 1500],
-        "Intensity.Sample2": [2000, 2500],
-    }, index=["P12345", "P23456"])
+    expected = pd.DataFrame(
+        {
+            "Intensity.Sample1": [1000, 1500],
+            "Intensity.Sample2": [2000, 2500],
+        }, 
+        index=["P12345", "P23456"]
+    )
     expected.index.name = "Protein"
 
     result = read_diann(mock_data)
-
     assert_frame_equal(result, expected)
\ No newline at end of file

From 610e362334d20887c14867532993280b5314ff87 Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Tue, 25 Mar 2025 09:11:33 -0700
Subject: [PATCH 5/6] add doc for expected columns and datatype check

---
 gopher/parsers/tabular.py | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index f663be0..01bd864 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -1,5 +1,6 @@
 """Parse tabular result files from common tools"""
 import pandas as pd
+import numpy as np
 
 
 def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
@@ -66,20 +67,27 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
 
     Args:
         proteins_tsv (str): Path to the DIANN-generated TSV file.
+            Expected columns:
+                'Protein.Group',
+                'Protein.Ids',
+                'Protein.Names',
+                'Genes',
+                'First.Protein.Description',
+                <several MSR columns>
+
 
     Returns:
         pd.DataFrame: A DataFrame with the processed protein data, indexed by
             the first protein accession.
-            The returned DataFrame excludes the following columns:
-            ["Protein.Group", "Protein.Ids", "Protein.Names", "Genes",
-            "First.Protein.Description"].
+            The index holds the first accession from each "Protein.Ids"
+            entry and all remaining columns are the MSR columns.
     """
     proteins = pd.read_table(proteins_tsv)
     accessions = proteins["Protein.Ids"].str.split(";").str[0]
 
     proteins = proteins.set_index(accessions)
     proteins = proteins.rename_axis("Protein", axis="index")
-    return proteins.drop(
+    proteins = proteins.drop(
         columns=[
             "Protein.Group",
             "Protein.Ids",
@@ -87,4 +95,18 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
             "Genes",
             "First.Protein.Description",
         ]
-    )
\ No newline at end of file
+    )
+
+    # Validate dtypes: any numeric dtype (integer or float) is accepted.
+    # Object ('O') also passes (if loading from S3, default types are 'O').
+    if proteins.index.dtype not in ["O", "category", "str"]:
+        raise ValueError(
+            f"Protein index is incorrect type: {proteins.index.dtype}"
+        )
+    if not all(
+        np.issubdtype(dtype, np.number) or dtype == "O"
+        for dtype in proteins.dtypes
+    ):
+        raise ValueError("Non-numeric columns present")
+    
+    return proteins
\ No newline at end of file

From 909f8e885e805384aae4dddf7eedde82ba2b0303 Mon Sep 17 00:00:00 2001
From: Lillian Tatka <tatkalillian@gmail.com>
Date: Tue, 25 Mar 2025 12:32:20 -0700
Subject: [PATCH 6/6] add new line at end of file

---
 gopher/parsers/tabular.py        | 2 +-
 tests/unit_tests/tabular_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index 01bd864..7aae459 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -109,4 +109,4 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame:
     ):
         raise ValueError("Non-numeric columns present")
     
-    return proteins
\ No newline at end of file
+    return proteins
diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
index 6f8bb1f..954b5db 100644
--- a/tests/unit_tests/tabular_test.py
+++ b/tests/unit_tests/tabular_test.py
@@ -24,4 +24,4 @@ def test_read_diann_removes_metadata_and_sets_index():
     expected.index.name = "Protein"
 
     result = read_diann(mock_data)
-    assert_frame_equal(result, expected)
\ No newline at end of file
+    assert_frame_equal(result, expected)