diff --git a/gopher/__init__.py b/gopher/__init__.py
index 23049ff..40f0410 100644
--- a/gopher/__init__.py
+++ b/gopher/__init__.py
@@ -26,4 +26,4 @@
 )
 from .enrichment import test_enrichment
 from .normalize import normalize_values
-from .parsers import read_encyclopedia, read_metamorpheus
+from .parsers import read_encyclopedia, read_metamorpheus, read_diann
diff --git a/gopher/parsers/__init__.py b/gopher/parsers/__init__.py
index 6054e45..bef63f5 100644
--- a/gopher/parsers/__init__.py
+++ b/gopher/parsers/__init__.py
@@ -1,2 +1,2 @@
 """The parsers"""
-from .tabular import read_encyclopedia, read_metamorpheus
+from .tabular import read_encyclopedia, read_metamorpheus, read_diann
diff --git a/gopher/parsers/tabular.py b/gopher/parsers/tabular.py
index b026155..7aae459 100644
--- a/gopher/parsers/tabular.py
+++ b/gopher/parsers/tabular.py
@@ -1,5 +1,6 @@
 """Parse tabular result files from common tools"""
 import pandas as pd
+import numpy as np
 
 
 def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
@@ -52,3 +53,59 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
         .fillna(0)
     )
     return proteins
+
+
+def read_diann(proteins_tsv: str) -> pd.DataFrame:
+    """Read a DIA-NN protein group matrix.
+
+    Parse the DIA-NN TSV file, index each row by the first accession
+    in the "Protein.Ids" column, and drop the metadata columns so
+    that only the quantitative sample columns remain.
+
+    Parameters
+    ----------
+    proteins_tsv : str
+        Path to (or file-like handle of) the DIA-NN TSV file.
+        Expected metadata columns: "Protein.Group", "Protein.Ids",
+        "Protein.Names", "Genes", "First.Protein.Description".
+
+    Returns
+    -------
+    pandas.DataFrame
+        Protein quantities indexed by the first protein accession
+        (axis named "Protein"); all remaining columns are the
+        per-sample measurement columns.
+
+    Raises
+    ------
+    ValueError
+        If the index is not string-like, or a measurement column is
+        neither numeric nor object dtype.
+    """
+    proteins = pd.read_table(proteins_tsv)
+    accessions = proteins["Protein.Ids"].str.split(";").str[0]
+
+    proteins = proteins.set_index(accessions)
+    proteins = proteins.rename_axis("Protein", axis="index")
+    proteins = proteins.drop(
+        columns=[
+            "Protein.Group",
+            "Protein.Ids",
+            "Protein.Names",
+            "Genes",
+            "First.Protein.Description",
+        ]
+    )
+
+    # Check data types
+    # (if loading from S3, default types are 'O')
+    if proteins.index.dtype not in ["O", "category", "str"]:
+        raise ValueError(
+            f"Protein index is incorrect type: {proteins.index.dtype}"
+        )
+    # np.number accepts both integer and float quantity columns;
+    # np.floating would wrongly reject integer intensities.
+    if not all(
+        np.issubdtype(dtype, np.number) or dtype == "O"
+        for dtype in proteins.dtypes
+    ):
+        raise ValueError("Non-numeric columns present")
+
+    return proteins
diff --git a/setup.cfg b/setup.cfg
index 1e10cb1..95d723c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -29,6 +29,8 @@ install_requires =
     numba
     seaborn
     biopython
+    tqdm
+    loguru
 
 [options.extras_require]
 docs =
diff --git a/tests/unit_tests/tabular_test.py b/tests/unit_tests/tabular_test.py
new file mode 100644
index 0000000..954b5db
--- /dev/null
+++ b/tests/unit_tests/tabular_test.py
@@ -0,0 +1,27 @@
+import pandas as pd
+from io import StringIO
+from pandas.testing import assert_frame_equal
+
+from gopher.parsers.tabular import read_diann
+
+def test_read_diann_removes_metadata_and_sets_index():
+    # Simulated DIANN output
+    mock_data = StringIO(
+        """Protein.Group\tProtein.Ids\tProtein.Names\tGenes\tFirst.Protein.Description\tIntensity.Sample1\tIntensity.Sample2
+PG1\tP12345;P67890\tProtein A\tGENE1\tDescription A\t1000\t2000
+PG2\tP23456\tProtein B\tGENE2\tDescription B\t1500\t2500
+"""
+    )
+
+    # Expected DataFrame
+    expected = pd.DataFrame(
+        {
+            "Intensity.Sample1": [1000, 1500],
+            "Intensity.Sample2": [2000, 2500],
+        },
+        index=["P12345", "P23456"]
+    )
+    expected.index.name = "Protein"
+
+    result = read_diann(mock_data)
+    assert_frame_equal(result, expected)