Merged

Changes from all commits (27 commits)
fbbffd3
feature: start implementation of lib export with pyprophet
jcharkow Jun 17, 2025
02d5d65
more functionality to lib export
jcharkow Jun 18, 2025
1ef91f9
change default min frags to 4
jcharkow Jul 9, 2025
0c999a6
filter fragments with 0 library intensity
jcharkow Jul 9, 2025
0a6e7db
require a --out parameter
jcharkow Jul 9, 2025
5f41220
change config from 6 to 4
jcharkow Jul 9, 2025
d725eb8
fix bugs, update docs
jcharkow Jul 10, 2025
9b108aa
fix: export protein info in lib
jcharkow Jul 10, 2025
e36e23f
fix: lib export compute annotation col if empty
jcharkow Jul 11, 2025
d4c48a5
Merge branch 'feature/polars_explode' into feature/lib_export
jcharkow Jul 29, 2025
76bbff7
feature: option to keep significant decoys in lib refinement
jcharkow Jul 30, 2025
f8b3753
verbose: note that keep_decoys in lib gen is experimental feature
jcharkow Aug 7, 2025
4b9076b
test: add test for lib generation
jcharkow Aug 8, 2025
f455b10
minor refactor for better support across different i/o
jcharkow Aug 8, 2025
1667c25
add not implemented error for osw/parquet output
jcharkow Aug 8, 2025
609e84f
feature: add option to export rt unit in non iRT
jcharkow Aug 11, 2025
5c6bda2
remove debug line
jcharkow Aug 11, 2025
bb5607c
switch keep_decoys default to no-keep_decoys
jcharkow Aug 11, 2025
d959d70
bug fix: transitions with diff RT
jcharkow Aug 11, 2025
989ade1
update parameter descriptions
jcharkow Aug 12, 2025
0aae3a8
fix: error description
jcharkow Aug 12, 2025
ee3bb62
Merge branch 'feature/lib_export' of github.com:Roestlab/pyprophet in…
jcharkow Aug 12, 2025
c3daea1
apply suggestions from PR review
jcharkow Aug 12, 2025
3a8d203
test: update tests with new snapshots
jcharkow Aug 12, 2025
76ac52c
feature: sort by intensity if q value tie
jcharkow Aug 19, 2025
1c32cb6
apply copilot suggestions
jcharkow Aug 19, 2025
0c0d99d
fix: bug in sql query
jcharkow Aug 28, 2025
21 changes: 20 additions & 1 deletion pyprophet/_config.py
@@ -635,6 +635,7 @@ class ExportIOConfig(BaseIOConfig):
- "legacy_split": Split TSV files for each run.
- "parquet": Single Parquet file with merged results.
- "parquet_split": Split Parquet files for each run.
- "library" : .tsv library file
out_type (Literal["tsv", "csv"]): Output file type for exported results.
transition_quantification (bool): Report aggregated transition-level quantification.
max_transition_pep (float): Maximum PEP to retain scored transitions for quantification (requires transition-level scoring).
@@ -653,6 +654,7 @@ class ExportIOConfig(BaseIOConfig):
top_n (int): Number of top intense features to use for summarization
consistent_top (bool): Whether to use same top features across all runs
normalization (Literal["none", "median", "medianmedian", "quantile"]): Normalization method
test (bool): Whether to enable test mode with deterministic behavior; test mode sorts libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge

# OSW: Export to parquet
compression_method (Literal["none", "snappy", "gzip", "brotli", "zstd"]): Compression method for parquet files.
@@ -662,10 +664,18 @@ class ExportIOConfig(BaseIOConfig):

# SqMass: Export to parquet
pqp_file (Optional[str]): Path to PQP file for precursor/transition mapping.

# Export to library
rt_calibration (bool): If True, use empirical RT values instead of the original library RT values
im_calibration (bool): If True, use empirical IM values instead of the original library IM values
intensity_calibration (bool): If True, use empirical intensity values instead of the original library intensity values
min_fragments (int): Minimum number of fragments required to include a peak group in the library; only relevant if intensity_calibration is True
keep_decoys (bool): Whether to keep decoy entries in the library; only decoys that pass the specified thresholds are kept
rt_unit (Literal["iRT", "RT"]): Unit of retention time in the library; only relevant if rt_calibration is True. If "iRT" is selected, the retention times are scaled to the iRT scale (0-100) in the library
"""

export_format: Literal[
"matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split"
"matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split", "library"
] = "legacy_merged"
out_type: Literal["tsv", "csv"] = "tsv"
transition_quantification: bool = False
@@ -677,6 +687,7 @@ class ExportIOConfig(BaseIOConfig):
max_global_peptide_qvalue: float = 0.01
protein: bool = True
max_global_protein_qvalue: float = 0.01
test: bool = False

# Quantification matrix options
top_n: int = 3
@@ -691,3 +702,11 @@ class ExportIOConfig(BaseIOConfig):

# SqMass: Export to parquet
pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping

# Export to library options
rt_calibration: bool = True
im_calibration: bool = True
intensity_calibration: bool = True
min_fragments: int = 4
keep_decoys: bool = False # Whether to keep decoy entries in the library
rt_unit: Literal["iRT", "RT"] = "iRT"
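
For orientation, a minimal sketch of how the new library options compose into a config object; it mirrors the construction used by the export library CLI command further down and assumes ExportIOConfig is importable from pyprophet._config (the input path is hypothetical):

from pyprophet._config import ExportIOConfig

config = ExportIOConfig(
    infile="results.oswpq",          # hypothetical scored split-parquet input
    outfile="library.tsv",
    subsample_ratio=1.0,             # not used in export
    level="export",
    context="export",
    export_format="library",         # the new format added by this PR
    out_type="tsv",
    rt_calibration=True,             # empirical RT instead of library RT
    im_calibration=True,             # empirical IM instead of library IM
    intensity_calibration=True,      # empirical intensities, rescaled per precursor
    min_fragments=4,                 # drop precursors with fewer fragments
    keep_decoys=False,               # experimental: keep decoys passing thresholds
    rt_unit="iRT",                   # scale RTs to the 0-100 iRT range
)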
126 changes: 126 additions & 0 deletions pyprophet/cli/export.py
@@ -37,6 +37,7 @@ def export():
pass

export.add_command(export_tsv, name="tsv")
export.add_command(export_library, name="library")
export.add_command(export_matrix, name="matrix")
export.add_command(export_parquet, name="parquet")
export.add_command(export_compound, name="compound")
@@ -347,6 +348,131 @@ def export_matrix(
df = reader.read()
writer.export_quant_matrix(df)

# Export to a spectral library for use in OpenSWATH
@click.command(name="library", cls=AdvancedHelpCommand)
@click.option(
"--in",
"infile",
required=True,
type=click.Path(exists=True),
help="PyProphet OSW input file.",
)
@click.option(
"--out",
"outfile",
required=True,  # the library must be named, or else os.path.splitext (line 75, in __post_init__ in _base) raises an error
type=click.Path(exists=False),
help="Output tsv library.",
)
@click.option(
"--max_peakgroup_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum run-specific peak group-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates. If there are multiple runs with the same precursors, the run with the lowest q value is used",
)
@click.option(
"--max_global_peptide_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum global peptide-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
)
@click.option(
"--max_global_protein_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum global protein-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
)
@click.option(
"--rt_calibration/--no-rt_calibration",
default=True,
show_default=True,
help="Use empirical RT values as oppose to the original library RT values."
)
@click.option(
"--im_calibration/--no-im_calibration",
default=True,
show_default=True,
help="Use empirical IM values as oppose to the original library IM values."
)
@click.option(
"--intensity_calibration/--no-intensity_calibration",
default=True,
show_default=True,
help="Use empirical intensity values as oppose to the original library intensity values."
)
@click.option(
"--min_fragments",
default=4,
show_default=True,
type=int,
help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True."
)
@click.option(
"--keep_decoys/--no-keep_decoys",
default=False,
show_default=True,
type=bool,
help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
)
@click.option(
"--rt_unit",
default="iRT",
show_default=True,
type=click.Choice(["iRT", "RT"]),
help='Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library.',
hidden=True
)
@click.option(
"--test/--no-test",
default=False,
show_default=True,
help="Enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge")
@measure_memory_usage_and_time
def export_library(
infile,
outfile,
max_peakgroup_qvalue,
max_global_peptide_qvalue,
max_global_protein_qvalue,
rt_calibration,
im_calibration,
intensity_calibration,
min_fragments,
keep_decoys,
rt_unit,
test
):
"""
Export OSW results to TSV library format.
"""
config = ExportIOConfig(
infile=infile,
outfile=outfile,
subsample_ratio=1.0, # Not used in export
level="export",
context="export",
export_format="library",
out_type="tsv",
max_rs_peakgroup_qvalue=max_peakgroup_qvalue,
max_global_peptide_qvalue=max_global_peptide_qvalue,
max_global_protein_qvalue=max_global_protein_qvalue,
rt_calibration=rt_calibration,
im_calibration=im_calibration,
intensity_calibration=intensity_calibration,
min_fragments=min_fragments,
keep_decoys=keep_decoys,
rt_unit=rt_unit,
test=test
)

reader = ReaderDispatcher.get_reader(config)
writer = WriterDispatcher.get_writer(config)

df = reader.read()
writer.clean_and_export_library(df)
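
One way to exercise the new subcommand end to end is click's test runner. A hedged sketch, assuming a scored split-parquet input (the paths are hypothetical; per the reader checks in split_parquet.py below, the input must be scored and carry global peptide- and protein-level scores):

from click.testing import CliRunner

from pyprophet.cli.export import export_library

runner = CliRunner()
result = runner.invoke(
    export_library,
    [
        "--in", "scored_run.oswpq",       # hypothetical scored split-parquet input
        "--out", "openswath_library.tsv",
        "--max_peakgroup_qvalue", "0.01",
        "--min_fragments", "4",
        "--no-keep_decoys",               # default: decoys are filtered out
        "--test",                         # deterministic sort for snapshot tests
    ],
)
print(result.exit_code)
print(result.output)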

# Export to Parquet
@click.command(name="parquet", cls=AdvancedHelpCommand)
57 changes: 57 additions & 0 deletions pyprophet/io/_base.py
@@ -48,6 +48,7 @@
import duckdb
import pandas as pd
import polars as pl
import sklearn.preprocessing as preprocessing # For MinMaxScaler
Copilot AI (Aug 19, 2025): Import comment should follow PEP 8 guidelines; use two spaces before the '#' in '# For MinMaxScaler'.
from loguru import logger

from .._base import BaseIOConfig
@@ -619,6 +620,62 @@ def export_results(self, data: pd.DataFrame):
else:
raise ValueError(f"Unsupported export format: {cfg.export_format}")

def clean_and_export_library(self, data: pd.DataFrame) -> None:
"""
Clean the input DataFrame and export it as a TSV library.

Args:
data: Input DataFrame with library data

"""
cfg = self.config

# For precursors found in more than one run, select the run with the smallest q-value
# If q-values tie, break ties by intensity, then by run ID
data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1)
assert len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value, Intensity and RunId, duplicate transition IDs were found."

# Recompute the Annotation column if it is entirely missing (all NaN or "NA")
if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
logger.debug("Annotation column is empty, so computing it manually.")
data.drop(columns=['Annotation'], inplace=True)
data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str)

if cfg.rt_calibration and cfg.rt_unit == "iRT":
data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100
if cfg.intensity_calibration:
data['LibraryIntensity'] = (
data['LibraryIntensity'] /
data.groupby('Precursor')['LibraryIntensity'].transform('max') *
10000)
logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0])))
# Remove rows with zero intensity
data = data[data['LibraryIntensity'] > 0]

## Print Library statistics
logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors")

logger.info(f"Precursor Fragment Distribution (Before Filtering)")
num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count()
Copilot AI (Aug 19, 2025): This line is overly complex with multiple chained operations. Consider breaking it into multiple steps for better readability and debugging.
for frag, count in num_frags_per_prec.iterrows():
logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")

logger.info(f"Filter library to precursors containing {cfg.min_fragments} or more fragments")
ids_to_keep = data[['Precursor', 'Annotation']].groupby('Precursor').count()
ids_to_keep = ids_to_keep[ids_to_keep['Annotation'] >= cfg.min_fragments].index
data = data[data['Precursor'].isin(ids_to_keep)]

logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors")
if cfg.keep_decoys:
logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
Copilot AI (Aug 19, 2025): Use f-string formatting instead of .format() for consistency with the rest of the codebase and better performance.

data.drop(columns=['TransitionId', 'Q_Value', 'RunId', 'Intensity'], inplace=True)
if cfg.test:
data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz'])

logger.info("Exporting library to file.")
data.to_csv(cfg.outfile, sep='\t', index=False)

def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Export quantification matrix at specified level with optional normalization.
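The cleaning steps above are easiest to see on a toy frame. Below is a self-contained sketch of the iRT scaling, per-precursor intensity calibration, and min_fragments filtering using the same column names; the data itself is fabricated:

import pandas as pd
import sklearn.preprocessing as preprocessing

data = pd.DataFrame({
    "Precursor": ["PEPTIDEA_2"] * 5 + ["PEPTIDEB_3"] * 2,
    "Annotation": ["b2^1", "y3^1", "y4^1", "y5^2", "y6^1", "b2^1", "y2^1"],
    "NormalizedRetentionTime": [55.2] * 5 + [80.1] * 2,
    "LibraryIntensity": [100.0, 400.0, 250.0, 0.0, 320.0, 300.0, 150.0],
})

# iRT scaling: min-max over empirical RTs, stretched to the 0-100 range
data["NormalizedRetentionTime"] = (
    preprocessing.MinMaxScaler().fit_transform(data[["NormalizedRetentionTime"]]) * 100
)

# Intensity calibration: rescale to a per-precursor maximum of 10000,
# then drop fragments with zero intensity
data["LibraryIntensity"] = (
    data["LibraryIntensity"]
    / data.groupby("Precursor")["LibraryIntensity"].transform("max")
    * 10000
)
data = data[data["LibraryIntensity"] > 0]

# min_fragments filter: keep precursors with at least 4 surviving fragments;
# PEPTIDEA_2 keeps 4 fragments and survives, PEPTIDEB_3 is removed
counts = data.groupby("Precursor")["Annotation"].count()
data = data[data["Precursor"].isin(counts[counts >= 4].index)]
print(data)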
3 changes: 3 additions & 0 deletions pyprophet/io/export/osw.py
@@ -123,6 +123,9 @@ def _read_sqlite(self, con):
"""Main entry point for reading SQLite data, delegates to specific methods."""
cfg = self.config

if self.config.export_format == "library":
raise NotImplementedError("Library export from sqlite OSW files is not supported")

if self._is_unscored_file(con):
logger.info("Reading unscored data from Parquet file.")
return self._read_unscored_data(con)
3 changes: 3 additions & 0 deletions pyprophet/io/export/parquet.py
@@ -36,6 +36,9 @@ def read(self) -> pd.DataFrame:
try:
self._init_duckdb_views(con)

if self.config.export_format == "library":
raise NotImplementedError("Library export from non-split .parquet files is not supported")

if self._is_unscored_file():
logger.info("Reading unscored data from Parquet file.")
return self._read_unscored_data(con)
82 changes: 81 additions & 1 deletion pyprophet/io/export/split_parquet.py
@@ -68,6 +68,18 @@ def read(self) -> pd.DataFrame:
try:
self._init_duckdb_views(con)

if self.config.export_format == "library":
if self._is_unscored_file():
descr= "Files must be scored for library generation."
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'descr = "Files must be scored for library generation."'.
logger.exception(descr)
raise ValueError(descr)
if not self._has_peptide_protein_global_scores():
descr= "Files must have peptide and protein level global scores for library generation."
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'descr = "Files must have peptide and protein level global scores for library generation."'.
logger.exception(descr)
raise ValueError(descr)
logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
return self._read_library_data(con)

if self._is_unscored_file():
logger.info("Reading unscored data from split Parquet files.")
return self._read_unscored_data(con)
@@ -82,9 +94,17 @@ def read(self) -> pd.DataFrame:
logger.info("Reading standard OpenSWATH data from split Parquet files.")
data = self._read_standard_data(con)

return self._augment_data(data, con)
return self._augment_data(data, con)
finally:
con.close()

def _has_peptide_protein_global_scores(self) -> bool:
"""
Check if files contain peptide and protein global scores
"""
has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns)
has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns)
return has_peptide and has_protein

def _is_unscored_file(self) -> bool:
"""
@@ -257,6 +277,66 @@ def _read_augmented_data(self, con) -> pd.DataFrame:

return pd.merge(data, ipf_data, on="id", how="left")

def _read_library_data(self, con) -> pd.DataFrame:
"""
Read precursor-level data for library generation. This does not include all of the columns present in the standard export output.
"""
if self.config.rt_calibration:
rt_col = "p.EXP_RT"
else:
rt_col = "p.PRECURSOR_LIBRARY_RT"

if self.config.im_calibration:
im_col = "p.EXP_IM"
else:
im_col = "p.PRECURSOR_LIBRARY_DRIFT_TIME"

if self.config.intensity_calibration:
intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY'
else:
intensity_col = 't.TRANSITION_LIBRARY_INTENSITY'

if self.config.keep_decoys:
decoy_query = ""
else:
decoy_query ="p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'decoy_query = "p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"'.

query = f"""
SELECT
{rt_col} as NormalizedRetentionTime,
{im_col} as PrecursorIonMobility,
{intensity_col} as LibraryIntensity,
p.SCORE_MS2_Q_VALUE as Q_Value,
p.UNMODIFIED_SEQUENCE AS PeptideSequence,
p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
p.PRECURSOR_CHARGE AS PrecursorCharge,
p.FEATURE_MS2_AREA_INTENSITY AS Intensity,
p.RUN_ID AS RunId,
(p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
p.PRECURSOR_MZ AS PrecursorMz,
STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName,
t.ANNOTATION as Annotation,
t.PRODUCT_MZ as ProductMz,
t.TRANSITION_CHARGE as FragmentCharge,
t.TRANSITION_TYPE as FragmentType,
t.TRANSITION_ORDINAL as FragmentSeriesNumber,
t.TRANSITION_ID as TransitionId,
p.PRECURSOR_DECOY as Decoy
FROM precursors p
INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID
WHERE {decoy_query}
p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and
p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and
p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and
p.SCORE_MS2_PEAK_GROUP_RANK = 1

GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID, p.FEATURE_MS2_AREA_INTENSITY
"""
return con.execute(query).fetchdf()

def _read_standard_data(self, con) -> pd.DataFrame:
"""
Read standard OpenSWATH data without IPF from split files.
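To make the conditional WHERE clause concrete, here is a small sketch of how the decoy predicate splices into the query for both keep_decoys settings; the column names are those used by _read_library_data, and build_where_clause is a hypothetical helper, not part of this PR:

def build_where_clause(keep_decoys: bool, max_q: float = 0.01) -> str:
    # Mirrors the branch above: with keep_decoys enabled, the decoy
    # predicate is dropped and decoys are filtered by q-value alone.
    decoy_query = "" if keep_decoys else (
        "p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"
    )
    return f"""WHERE {decoy_query}
        p.SCORE_MS2_Q_VALUE < {max_q} and
        p.SCORE_MS2_PEAK_GROUP_RANK = 1"""

print(build_where_clause(keep_decoys=False))
print(build_where_clause(keep_decoys=True))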