Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gopher/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@
)
from .enrichment import test_enrichment
from .normalize import normalize_values
from .parsers import read_encyclopedia, read_metamorpheus
from .parsers import read_encyclopedia, read_metamorpheus, read_diann
2 changes: 1 addition & 1 deletion gopher/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""The parsers"""
from .tabular import read_encyclopedia, read_metamorpheus
from .tabular import read_encyclopedia, read_metamorpheus, read_diann
58 changes: 58 additions & 0 deletions gopher/parsers/tabular.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Parse tabular result files from common tools"""
import pandas as pd
import numpy as np


def read_encyclopedia(proteins_txt: str) -> pd.DataFrame:
Expand Down Expand Up @@ -52,3 +53,60 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame:
.fillna(0)
)
return proteins

def read_diann(proteins_tsv: str) -> pd.DataFrame:
    """Read a DIA-NN protein-group matrix TSV file.

    The function:
    - Extracts the first protein accession from the "Protein.Ids" column
      (a semicolon-delimited accession list) to use as the DataFrame index.
    - Renames the index axis to "Protein".
    - Drops the metadata columns, leaving only the quantitative
      measurement (MSR) columns.
    - Validates that the index is string-like and that every remaining
      column is numeric.

    Args:
        proteins_tsv (str): Path to (or file-like handle of) the
            DIA-NN-generated TSV file. Expected columns:
            'Protein.Group', 'Protein.Ids', 'Protein.Names', 'Genes',
            'First.Protein.Description', plus one column per measurement.

    Returns:
        pd.DataFrame: The processed protein data, indexed by the first
        protein accession (index name "Protein"), with only the
        measurement columns retained.

    Raises:
        ValueError: If the index is not a string-like dtype, or if any
            remaining column is neither numeric nor object dtype.
    """
    proteins = pd.read_table(proteins_tsv)

    # A protein group lists accessions separated by ";"; keep the first.
    accessions = proteins["Protein.Ids"].str.split(";").str[0]
    proteins = proteins.set_index(accessions).rename_axis(
        "Protein", axis="index"
    )
    proteins = proteins.drop(
        columns=[
            "Protein.Group",
            "Protein.Ids",
            "Protein.Names",
            "Genes",
            "First.Protein.Description",
        ]
    )

    # Check data types. NOTE(review): when loading from S3 the default
    # dtypes may come back as object ('O'), so object columns are
    # tolerated below — confirm this is still the desired behavior.
    if proteins.index.dtype not in ["O", "category", "str"]:
        raise ValueError(
            f"Protein index is incorrect type: {proteins.index.dtype}"
        )
    # Accept any numeric dtype (int or float): raw intensity/count
    # columns are often parsed as int64, which np.floating would reject.
    if not all(
        np.issubdtype(dtype, np.number) or dtype == "O"
        for dtype in proteins.dtypes
    ):
        raise ValueError("Non-numeric columns present")

    return proteins
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ install_requires =
numba
seaborn
biopython
tqdm
loguru

[options.extras_require]
docs =
Expand Down
27 changes: 27 additions & 0 deletions tests/unit_tests/tabular_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd
from io import StringIO
from pandas.testing import assert_frame_equal

from gopher.parsers.tabular import read_diann

def test_read_diann_removes_metadata_and_sets_index():
    """read_diann drops metadata columns and indexes by first accession."""
    columns = [
        "Protein.Group",
        "Protein.Ids",
        "Protein.Names",
        "Genes",
        "First.Protein.Description",
        "Intensity.Sample1",
        "Intensity.Sample2",
    ]
    rows = [
        ["PG1", "P12345;P67890", "Protein A", "GENE1", "Description A", "1000", "2000"],
        ["PG2", "P23456", "Protein B", "GENE2", "Description B", "1500", "2500"],
    ]
    lines = ["\t".join(columns)] + ["\t".join(row) for row in rows]
    mock_data = StringIO("\n".join(lines) + "\n")

    # Only the measurement columns survive, keyed by the first accession.
    expected = pd.DataFrame(
        [[1000, 2000], [1500, 2500]],
        columns=["Intensity.Sample1", "Intensity.Sample2"],
        index=pd.Index(["P12345", "P23456"], name="Protein"),
    )

    result = read_diann(mock_data)
    assert_frame_equal(result, expected)
Loading