From 59d52f6ded688c8dea2e5a6b16554f89108315bf Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Mon, 24 Mar 2025 13:55:12 -0700 Subject: [PATCH 1/6] add diann file support and tests --- gopher/__init__.py | 2 +- gopher/parsers/__init__.py | 2 +- gopher/parsers/tabular.py | 36 ++++++++++++++++++++++++++++++++ tests/unit_tests/tabular_test.py | 26 +++++++++++++++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/unit_tests/tabular_test.py diff --git a/gopher/__init__.py b/gopher/__init__.py index 23049ff..40f0410 100644 --- a/gopher/__init__.py +++ b/gopher/__init__.py @@ -26,4 +26,4 @@ ) from .enrichment import test_enrichment from .normalize import normalize_values -from .parsers import read_encyclopedia, read_metamorpheus +from .parsers import read_encyclopedia, read_metamorpheus, read_diann diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py index 6054e45..bef63f5 100644 --- a/gopher/parsers/__init__.py +++ b/gopher/parsers/__init__.py @@ -1,2 +1,2 @@ """The parsers""" -from .tabular import read_encyclopedia, read_metamorpheus +from .tabular import read_encyclopedia, read_metamorpheus, read_diann diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index b026155..f663be0 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -52,3 +52,39 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame: .fillna(0) ) return proteins + +def read_diann(proteins_tsv: str) -> pd.DataFrame: + """ + Reads a DIANN-generated TSV file containing protein information, processes + it, and returns a cleaned Pandas DataFrame with relevant data. + + The function: + - Extracts the first protein accession from the "Protein.Ids" column to use + as the DataFrame index. + - Renames the index axis to "Protein". + - Drops unnecessary metadata columns. + + Args: + proteins_tsv (str): Path to the DIANN-generated TSV file. + + Returns: + pd.DataFrame: A DataFrame with the processed protein data, indexed by + the first protein accession. + The returned DataFrame excludes the following columns: + ["Protein.Group", "Protein.Ids", "Protein.Names", "Genes", + "First.Protein.Description"]. + """ + proteins = pd.read_table(proteins_tsv) + accessions = proteins["Protein.Ids"].str.split(";").str[0] + + proteins = proteins.set_index(accessions) + proteins = proteins.rename_axis("Protein", axis="index") + return proteins.drop( + columns=[ + "Protein.Group", + "Protein.Ids", + "Protein.Names", + "Genes", + "First.Protein.Description", + ] + ) \ No newline at end of file diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py new file mode 100644 index 0000000..0592631 --- /dev/null +++ b/tests/unit_tests/tabular_test.py @@ -0,0 +1,26 @@ +import pandas as pd +from io import StringIO +from pandas.testing import assert_frame_equal + +from gopher.parsers.tabular import read_diann +import os + +def test_read_diann_removes_metadata_and_sets_index(): + # Simulated DIANN output + mock_data = StringIO( + """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2 +PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000 +PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500 +""" + ) + + # Expected DataFrame + expected = pd.DataFrame({ + "Intensity.Sample1": [1000, 1500], + "Intensity.Sample2": [2000, 2500], + }, index=["P12345", "P23456"]) + expected.index.name = "Protein" + + result = read_diann(mock_data) + + assert_frame_equal(result, expected) \ No newline at end of file From 27cce5b422fdaa496186e888fa72d8ddd84283da Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Tue, 25 Mar 2025 07:56:07 -0700 Subject: [PATCH 2/6] add tqdm to install_requires --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 1e10cb1..3ca0da9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,6 +29,7 @@ install_requires = numba seaborn biopython + tqdm [options.extras_require] docs = From 803558d0593b8a65acbe38f525b631d9cc1a5f54 Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Tue, 25 Mar 2025 07:57:59 -0700 Subject: [PATCH 3/6] add loguru to install_requires --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 3ca0da9..95d723c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = seaborn biopython tqdm + loguru [options.extras_require] docs = From 1beb214a127f8ee5feeebc2d2298dab1c3e5eee2 Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Tue, 25 Mar 2025 09:10:58 -0700 Subject: [PATCH 4/6] remove new line and unnecessary import --- tests/unit_tests/tabular_test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py index 0592631..6f8bb1f 100644 --- a/tests/unit_tests/tabular_test.py +++ b/tests/unit_tests/tabular_test.py @@ -3,7 +3,6 @@ from pandas.testing import assert_frame_equal from gopher.parsers.tabular import read_diann -import os def test_read_diann_removes_metadata_and_sets_index(): # Simulated DIANN output @@ -15,12 +14,14 @@ def test_read_diann_removes_metadata_and_sets_index(): ) # Expected DataFrame - expected = pd.DataFrame({ - "Intensity.Sample1": [1000, 1500], - "Intensity.Sample2": [2000, 2500], - }, index=["P12345", "P23456"]) + expected = pd.DataFrame( + { + "Intensity.Sample1": [1000, 1500], + "Intensity.Sample2": [2000, 2500], + }, + index=["P12345", "P23456"] + ) expected.index.name = "Protein" result = read_diann(mock_data) - assert_frame_equal(result, expected) \ No newline at end of file From 610e362334d20887c14867532993280b5314ff87 Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Tue, 25 Mar 2025 09:11:33 -0700 Subject: [PATCH 5/6] add doc for expected columns and datatype check --- gopher/parsers/tabular.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index f663be0..01bd864 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -1,5 +1,6 @@ """Parse tabular result files from common tools""" import pandas as pd +import numpy as np def read_encyclopedia(proteins_txt: str) -> pd.DataFrame: @@ -66,20 +67,27 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: Args: proteins_tsv (str): Path to the DIANN-generated TSV file. + Expected columns: + 'Protein.Group', + 'Protein.Ids', + 'Protein.Names', + 'Genes', + 'First.Protein.Description', + + Returns: pd.DataFrame: A DataFrame with the processed protein data, indexed by the first protein accession. - The returned DataFrame excludes the following columns: - ["Protein.Group", "Protein.Ids", "Protein.Names", "Genes", - "First.Protein.Description"]. + The returned DataFrame has the "Protein.Ids" column as the + index and all columns are the MSR columns. """ proteins = pd.read_table(proteins_tsv) accessions = proteins["Protein.Ids"].str.split(";").str[0] proteins = proteins.set_index(accessions) proteins = proteins.rename_axis("Protein", axis="index") - return proteins.drop( + proteins = proteins.drop( columns=[ "Protein.Group", "Protein.Ids", @@ -87,4 +95,18 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: "Genes", "First.Protein.Description", ] - ) \ No newline at end of file + ) + + # Check data types + # (if loading from S3, default types are 'O' + if proteins.index.dtype not in ["O", "category", "str"]: + raise ValueError( + f"Protein index is incorrect type: {proteins.index.dtype}" + ) + if not all( + np.issubdtype(dtype, np.floating) or dtype == "O" + for dtype in proteins.dtypes + ): + raise ValueError("Non-numeric columns present") + + return proteins \ No newline at end of file From 909f8e885e805384aae4dddf7eedde82ba2b0303 Mon Sep 17 00:00:00 2001 From: Lillian Tatka Date: Tue, 25 Mar 2025 12:32:20 -0700 Subject: [PATCH 6/6] add new line at end of file --- gopher/parsers/tabular.py | 2 +- tests/unit_tests/tabular_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py index 01bd864..7aae459 100644 --- a/gopher/parsers/tabular.py +++ b/gopher/parsers/tabular.py @@ -109,4 +109,4 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: ): raise ValueError("Non-numeric columns present") - return proteins \ No newline at end of file + return proteins diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py index 6f8bb1f..954b5db 100644 --- a/tests/unit_tests/tabular_test.py +++ b/tests/unit_tests/tabular_test.py @@ -24,4 +24,4 @@ def test_read_diann_removes_metadata_and_sets_index(): expected.index.name = "Protein" result = read_diann(mock_data) - assert_frame_equal(result, expected) \ No newline at end of file + assert_frame_equal(result, expected)