-
Notifications
You must be signed in to change notification settings - Fork 1
Chore/gen cleanup #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8c17372
3c4a0dc
b1c455f
c31ee0b
822c432
3cf875a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| repos: | ||
| - repo: https://github.com/psf/black | ||
| rev: 23.3.0 # Replace by any tag/version: https://github.com/psf/black/tags | ||
| rev: 25.1.0 # Replace by any tag/version: https://github.com/psf/black/tags | ||
| hooks: | ||
| - id: black | ||
| language_version: python3 # Should be a command that runs python3.6+ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Get GO annotations.""" | ||
|
|
||
| import uuid | ||
| from pathlib import Path | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """This module contains the configuration details for ppx""" | ||
|
|
||
| import logging | ||
| import os | ||
| from pathlib import Path | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Calculate the enrichments for a collection of experiments.""" | ||
|
|
||
| import logging | ||
|
|
||
| import numpy as np | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """The command line entry point for gopher-enrich""" | ||
|
|
||
| import logging | ||
| from argparse import ArgumentParser | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Download the GO ontologies""" | ||
|
|
||
| from collections import defaultdict | ||
|
|
||
| from . import config, utils | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,3 @@ | ||
| """The parsers""" | ||
|
|
||
| from .tabular import read_encyclopedia, read_metamorpheus, read_diann |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,10 @@ | ||
| """Parse tabular result files from common tools""" | ||
|
|
||
| import os | ||
| import io | ||
| import pandas as pd | ||
| import numpy as np | ||
| from cloudpathlib import AnyPath | ||
|
|
||
|
|
||
| def read_encyclopedia(proteins_txt: str) -> pd.DataFrame: | ||
|
|
@@ -54,10 +58,19 @@ def read_metamorpheus(proteins_txt: str) -> pd.DataFrame: | |
| ) | ||
| return proteins | ||
|
|
||
| def read_diann(proteins_tsv: str) -> pd.DataFrame: | ||
|
|
||
| def _read_colnames(file: os.PathLike | io.TextIOBase) -> list[str]: | ||
| with open(AnyPath(file)) as f: | ||
| firstcol = f.readline() | ||
|
|
||
| return firstcol.strip().split("\t") | ||
|
|
||
|
|
||
| def read_diann(proteins_tsv: os.PathLike) -> pd.DataFrame: | ||
| """ | ||
| Reads a DIANN-generated TSV file containing protein information, processes | ||
| it, and returns a cleaned Pandas DataFrame with relevant data. | ||
| Reads a DIANN-generated TSV file (pg_matrix) containing protein information. | ||
|
|
||
| Also processes it, and returns a cleaned Pandas DataFrame with relevant data. | ||
|
|
||
| The function: | ||
| - Extracts the first protein accession from the "Protein.Ids" column to use | ||
|
|
@@ -73,40 +86,40 @@ def read_diann(proteins_tsv: str) -> pd.DataFrame: | |
| 'Protein.Names', | ||
| 'Genes', | ||
| 'First.Protein.Description', | ||
| <several MSR columns> | ||
| <several Intensity columns> | ||
|
|
||
|
|
||
| Returns: | ||
| pd.DataFrame: A DataFrame with the processed protein data, indexed by | ||
| the first protein accession. | ||
| The returned DataFrame has the "Protein.Ids" column as the | ||
| index and all columns are the MSR columns. | ||
| The returned DataFrame has the "Protein.Ids" column as the | ||
| index and all columns are the MSR columns. | ||
| """ | ||
| proteins = pd.read_table(proteins_tsv) | ||
| accessions = proteins["Protein.Ids"].str.split(";").str[0] | ||
|
|
||
| proteins = proteins.set_index(accessions) | ||
| proteins = proteins.rename_axis("Protein", axis="index") | ||
| proteins = proteins.drop( | ||
| columns=[ | ||
| "Protein.Group", | ||
| "Protein.Ids", | ||
| "Protein.Names", | ||
| "Genes", | ||
| "First.Protein.Description", | ||
| ] | ||
| columns = _read_colnames(proteins_tsv) | ||
|
|
||
| expect = [ | ||
| "Protein.Group", | ||
| "Protein.Ids", | ||
| "Protein.Names", | ||
| "Genes", | ||
| "First.Protein.Description", | ||
| ] | ||
|
|
||
| if not all(c in columns for c in expect): | ||
| msg = f"Expected columns {expect}, got {columns}, make sure you are" | ||
| msg += " using the 'diann_report.pg_matrix.tsv' output." | ||
| raise ValueError(msg) | ||
|
|
||
| schema: dict[str, type] = {k: float for k in columns if k not in expect} | ||
| schema["Protein.Ids"] = str | ||
|
|
||
| proteins = pd.read_table( | ||
| AnyPath(proteins_tsv), dtype=schema, usecols=list(schema) | ||
| ) | ||
| proteins["Protein.Ids"] = proteins["Protein.Ids"].str.split(";").str[0] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The current version of gopher requires that the index column be the accessions and that all columns are intensity columns. If you don't drop "Genes", "Protein.Group", etc., downstream functions such as test_enrichment will not work (unless you are also planning on changing those).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They are not being read, so they don't have to be dropped.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But the returned data frame will have them, right? And then test_enrichment will throw an error, unless you've also modified that.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It does not; the test checks (and proves) that this is not the case. |
||
|
|
||
| proteins = proteins.set_index("Protein.Ids", drop=True) | ||
| proteins = proteins.rename_axis("Protein", axis="index") | ||
|
|
||
| # Check data types | ||
| # (if loading from S3, default types are 'O' | ||
| if proteins.index.dtype not in ["O", "category", "str"]: | ||
| raise ValueError( | ||
| f"Protein index is incorrect type: {proteins.index.dtype}" | ||
| ) | ||
| if not all( | ||
| np.issubdtype(dtype, np.floating) or dtype == "O" | ||
| for dtype in proteins.dtypes | ||
| ): | ||
| raise ValueError("Non-numeric columns present") | ||
|
|
||
| return proteins | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Numba Mann-Whitney U test""" | ||
|
|
||
| import numba as nb | ||
| import numpy as np | ||
| from scipy import stats | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Utility functions""" | ||
|
|
||
| import socket | ||
| from pathlib import Path | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,10 +13,27 @@ classifiers = [ | |
| "Operating System :: OS Independent", | ||
| "Topic :: Scientific/Engineering :: Bio-Informatics", | ||
| ] | ||
| requires-python = ">=3.6" | ||
|
|
||
| requires-python = ">=3.10" | ||
| dependencies = [ | ||
| "numpy > 2.0, < 3.0", | ||
| "pandas > 2.0, < 3.0", | ||
| "scipy", | ||
| "tqdm", | ||
| "statsmodels", | ||
| "biopython", # ... we can implement a fasta parser ... | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment: Indeed we could implement a FASTA parser, it just hasn't seemed worth it with how little use Gopher has gotten in recent times. Maybe worth it now though.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fair enough ... although I don't know ... biopython is a really heavy dependency and a FASTA parser is about 20 lines :P so I usually re-implement it if I am not already using pyteomics. I guess that since this project already depends on scipy+numpy+pandas+matplotlib+statsmodels ... biopython is not a big deal. |
||
| "loguru", | ||
| "numba", | ||
| "requests", | ||
| "seaborn", | ||
| "matplotlib", | ||
| "cloudpathlib", | ||
| ] | ||
| dynamic = ["version"] | ||
|
|
||
| [project.scripts] | ||
| gopher = "gopher.gopher:main" | ||
|
|
||
|
|
||
| [project.readme] | ||
| file = "README.md" | ||
| content-type = "text/markdown" | ||
|
|
@@ -40,6 +57,11 @@ docs = [ | |
| dev = [ | ||
| "pre-commit>=2.7.1", | ||
| "black>=19.10b0", | ||
| "pytest", | ||
| ] | ||
| s3 = [ | ||
| "cloudpathlib[s3]", | ||
| "boto3", | ||
| ] | ||
|
|
||
| [tool.setuptools] | ||
|
|
@@ -52,7 +74,7 @@ find = {namespaces = false} | |
|
|
||
| [tool.black] | ||
| line-length = 79 | ||
| target-version = ['py37'] | ||
| target-version = ['py310'] | ||
| include = '\.pyi?$' | ||
| exclude = ''' | ||
|
|
||
|
|
||
This file was deleted.
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Test that the annotations functions are working correctly""" | ||
|
|
||
| import re | ||
|
|
||
| import pandas as pd | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| """Test that the enrichment functions are working correctly""" | ||
|
|
||
| import random | ||
|
|
||
| import numpy as np | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.