From fbbffd34b8a359b54b743799083d7b0cbce756b0 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 17 Jun 2025 16:04:04 -0400 Subject: [PATCH 01/25] feature: start implementation of lib export with pyprophet --- pyprophet/cli/export.py | 73 ++++++++++++++++++++++++++++ pyprophet/io/_base.py | 11 +++++ pyprophet/io/export/split_parquet.py | 50 +++++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 9ca6a3e1..083ce272 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -37,6 +37,7 @@ def export(): pass export.add_command(export_tsv, name="tsv") + export.add_command(export_library, name='library') export.add_command(export_matrix, name="matrix") export.add_command(export_parquet, name="parquet") export.add_command(export_compound, name="compound") @@ -347,6 +348,78 @@ def export_matrix( df = reader.read() writer.export_quant_matrix(df) +# Export to Library to be used in OpenSWATH +@click.command(name="library", cls=AdvancedHelpCommand) +@click.option( + "--in", + "infile", + required=True, + type=click.Path(exists=True), + help="PyProphet OSW input file.", +) +@click.option( + "--out", + "outfile", + required=False, + type=click.Path(exists=False), + help="Output tsv library.", +) +@click.option( + "--max_peakgroup_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum run-specific peak group-level q-value, should not use values > 0.01.", +) +@click.option( + "--max_global_peptide_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum global peptide-level q-value, should not use values > 0.01.", +) +@click.option( + "--max_global_protein_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum global protein-level q-value, should not use values > 0.01.", +) +@measure_memory_usage_and_time +def export_library( + infile, + outfile, + max_peakgroup_qvalue, + 
max_global_peptide_qvalue, + max_global_protein_qvalue, +): + """ + Export OSW to tsv library format + """ + config = ExportIOConfig( + infile=infile, + outfile=outfile, + subsample_ratio=1.0, # Not used in export + level="export", + context="export", + export_format=format, + out_type="tsv", + max_rs_peakgroup_qvalue=max_peakgroup_qvalue, + max_global_peptide_qvalue=max_global_peptide_qvalue, + max_global_protein_qvalue=max_global_protein_qvalue, + ) + + reader = ReaderDispatcher.get_reader(config) + writer = WriterDispatcher.get_writer(config) + + df = reader.read_for_library() + logger.debug(df.columns) + logger.info(f"Library Contains {len(df['Precursor'].drop_duplicates())} Precursors") + logger.info(f"Precursor Fragment Distribution") + num_frags_per_prec = df[['Precursor', 'Annotation']].groupby("Precursor").count().reset_index(names='Precursor').groupby('Annotation').count() + for frag, count in num_frags_per_prec.iterrows(): + logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)") + writer.export_library(df) # Export to Parquet @click.command(name="parquet", cls=AdvancedHelpCommand) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 66588353..d1735fe2 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -617,6 +617,17 @@ def export_results(self, data: pd.DataFrame): else: raise ValueError(f"Unsupported export format: {cfg.export_format}") + def export_library(self, data: pd.DataFrame) -> pd.DataFrame: + """ + Export library data at specified level. + + Args: + data: Input DataFrame with library data + + """ + cfg = self.config + data.to_csv(cfg.outfile, sep='\t', index=False) + def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame: """ Export quantification matrix at specified level with optional normalization. 
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 6137d533..4e05b188 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -85,6 +85,23 @@ def read(self) -> pd.DataFrame: return self._augment_data(data, con) finally: con.close() + + def read_for_library(self) -> pd.DataFrame: + """ + Read data specifically for library generation, which may not include all features. + """ + con = duckdb.connect() + try: + self._init_duckdb_views(con) + + if self._is_unscored_file(): + raise logger.exception("Files must be scored for library generation.") + + logger.info("Reading standard OpenSWATH data for library from split Parquet files.") + return self._read_library_data(con) + finally: + con.close() + def _is_unscored_file(self) -> bool: """ @@ -257,6 +274,39 @@ def _read_augmented_data(self, con) -> pd.DataFrame: return pd.merge(data, ipf_data, on="id", how="left") + def _read_library_data(self, con) -> pd.DataFrame: + """ + Read data specifically for precursors for library generation. 
This does not include all output in standard output + """ + logger.debug("Reading library data!!!!!") + query = f""" + SELECT + p.EXP_RT AS RT, + p.UNMODIFIED_SEQUENCE AS PeptideSequence, + p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, + p.PRECURSOR_CHARGE AS Charge, + (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor, + p.PRECURSOR_MZ AS mz, + p.FEATURE_MS2_AREA_INTENSITY AS Intensity, + t.FEATURE_ID AS id, + t.FEATURE_TRANSITION_AREA_INTENSITY AS FragmentIonIntensity, + t.ANNOTATION as Annotation, + t.PRODUCT_MZ as ProductMz, + t.TRANSITION_CHARGE as FragmentCharge, + t.TRANSITION_TYPE as FragmentType, + t.TRANSITION_ORDINAL as FragmentSeriesNumber + FROM precursors p + INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID + WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and + p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and + p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and + p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and + p.SCORE_MS2_PEAK_GROUP_RANK = 1 + + ORDER BY p.FEATURE_ID + """ + return con.execute(query).fetchdf() + def _read_standard_data(self, con) -> pd.DataFrame: """ Read standard OpenSWATH data without IPF from split files. 
From 02d5d650272a38cd12355adec7fcdd2d050b7bd3 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 18 Jun 2025 09:30:03 -0400 Subject: [PATCH 02/25] more functionality to lib export Add options to calibrate/not calibrate IM,RT, MS2 Frag --- pyprophet/_config.py | 12 ++++++++ pyprophet/cli/export.py | 41 +++++++++++++++++++++++----- pyprophet/io/_base.py | 34 +++++++++++++++++++++-- pyprophet/io/export/split_parquet.py | 29 +++++++++++++++----- 4 files changed, 100 insertions(+), 16 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 41ec4b1e..acc0791d 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -662,6 +662,12 @@ class ExportIOConfig(BaseIOConfig): # SqMass: Export to parquet pqp_file (Optional[str]): Path to PQP file for precursor/transition mapping. + + # Export to library + rt_calibration (bool): If True, will use emperical RT values as oppose to the original library RT values + im_calibration (bool): If True, will use emperical IM values as oppose to the original library IM values + intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values + min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True """ export_format: Literal[ @@ -691,3 +697,9 @@ class ExportIOConfig(BaseIOConfig): # SqMass: Export to parquet pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping + + # Export to library + rt_calibration: bool = True + im_calibration: bool = True + intensity_calibration: bool = True + min_fragments: int = 6 \ No newline at end of file diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 083ce272..ecb59fff 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -385,6 +385,31 @@ def export_matrix( type=float, help="Filter results to maximum global protein-level q-value, should not use values > 0.01.", ) 
+@click.option( + "--rt_calibration/--no-rt_calibration", + default=True, + show_default=True, + help="Use empirical RT values as oppose to the original library RT values." +) +@click.option( + "--im_calibration/--no-im_calibration", + default=True, + show_default=True, + help="Use empirical IM values as oppose to the original library IM values." +) +@click.option( + "--intensity_calibration/--no-intensity_calibration", + default=True, + show_default=True, + help="Use empirical intensity values as oppose to the original library intensity values." +) +@click.option( + "--min_fragments", + default=3, + show_default=True, + type=int, + help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True." +) @measure_memory_usage_and_time def export_library( infile, @@ -392,6 +417,10 @@ def export_library( max_peakgroup_qvalue, max_global_peptide_qvalue, max_global_protein_qvalue, + rt_calibration, + im_calibration, + intensity_calibration, + min_fragments, ): """ Export OSW to tsv library format @@ -407,19 +436,17 @@ def export_library( max_rs_peakgroup_qvalue=max_peakgroup_qvalue, max_global_peptide_qvalue=max_global_peptide_qvalue, max_global_protein_qvalue=max_global_protein_qvalue, + rt_calibration=rt_calibration, + im_calibration=im_calibration, + intensity_calibration=intensity_calibration, + min_fragments=min_fragments, ) reader = ReaderDispatcher.get_reader(config) writer = WriterDispatcher.get_writer(config) df = reader.read_for_library() - logger.debug(df.columns) - logger.info(f"Library Contains {len(df['Precursor'].drop_duplicates())} Precursors") - logger.info(f"Precursor Fragment Distribution") - num_frags_per_prec = df[['Precursor', 'Annotation']].groupby("Precursor").count().reset_index(names='Precursor').groupby('Annotation').count() - for frag, count in num_frags_per_prec.iterrows(): - logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)") - 
writer.export_library(df) + writer.clean_and_export_library(df) # Export to Parquet @click.command(name="parquet", cls=AdvancedHelpCommand) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index d1735fe2..7ef01710 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -617,15 +617,45 @@ def export_results(self, data: pd.DataFrame): else: raise ValueError(f"Unsupported export format: {cfg.export_format}") - def export_library(self, data: pd.DataFrame) -> pd.DataFrame: + def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: """ - Export library data at specified level. + This function cleans the original dataframe and exports the library Args: data: Input DataFrame with library data """ cfg = self.config + + # For precursors found in more than one run, select the run with the smallest q value + data = data.sort_values(by='Q_Value').groupby("TransitionId").head(1) + assert (len(data['TransitionId'].drop_duplicates()) == len(data)) + + # Remove Annotation Column if all NAN + if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all(): + logger.debug("Annotation column is empty, dropping it.") + data.drop(columns=['Annotation'], inplace=True) + + import sklearn.preprocessing as preprocessing + if cfg.rt_calibration: + data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100 + if cfg.intensity_calibration: + data['LibraryIntensity'] = ( + data['LibraryIntensity'] / + data.groupby('Precursor')['LibraryIntensity'].transform('max') * + 10000) + + + ## Print Library statistics + logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors") + logger.info(f"Precursor Fragment Distribution") + num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count() + for frag, count in num_frags_per_prec.iterrows(): + logger.info(f"There are {count['Precursor']} 
precursors with {frag} fragment(s)") + + data.drop(columns=['TransitionId', 'Q_Value'], inplace=True) + + logger.info("Exporting library to file.") data.to_csv(cfg.outfile, sep='\t', index=False) def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame: diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 4e05b188..550b5591 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -278,23 +278,38 @@ def _read_library_data(self, con) -> pd.DataFrame: """ Read data specifically for precursors for library generation. This does not include all output in standard output """ - logger.debug("Reading library data!!!!!") + if self.config.rt_calibration: + rt_query = "p.Norm_RT as NormalizedRetentionTime" + else: + rt_query = "p.EXP_RT as NormalizedRetentionTime" + + if self.config.im_calibration: + im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility" + else: + im_query = "p.EXP_IM as PrecursorIonMobility" + + if self.config.intensity_calibration: + intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity' + else: + intensity_query = 't.TRANSITION_LIBRARY_INTENSITY AS LibraryIntensity' + query = f""" SELECT - p.EXP_RT AS RT, + {rt_query}, + {im_query}, + {intensity_query}, + p.SCORE_MS2_Q_VALUE as Q_Value, p.UNMODIFIED_SEQUENCE AS PeptideSequence, p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, p.PRECURSOR_CHARGE AS Charge, (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor, - p.PRECURSOR_MZ AS mz, - p.FEATURE_MS2_AREA_INTENSITY AS Intensity, - t.FEATURE_ID AS id, - t.FEATURE_TRANSITION_AREA_INTENSITY AS FragmentIonIntensity, + p.PRECURSOR_MZ AS PrecursorMz, t.ANNOTATION as Annotation, t.PRODUCT_MZ as ProductMz, t.TRANSITION_CHARGE as FragmentCharge, t.TRANSITION_TYPE as FragmentType, - t.TRANSITION_ORDINAL as FragmentSeriesNumber + t.TRANSITION_ORDINAL as FragmentSeriesNumber, + t.TRANSITION_ID as TransitionId FROM precursors p 
INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and From 1ef91f99ae851a05ca4d72291216adb0692ebf22 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 9 Jul 2025 11:50:24 -0400 Subject: [PATCH 03/25] change default min frags to 4 --- pyprophet/cli/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index ecb59fff..6acea357 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -405,7 +405,7 @@ def export_matrix( ) @click.option( "--min_fragments", - default=3, + default=4, show_default=True, type=int, help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True." From 0c999a665a03a53f7356bd93dbead9ff84fe33a5 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 9 Jul 2025 12:27:28 -0400 Subject: [PATCH 04/25] filter fragments with 0 library intensity --- pyprophet/io/_base.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 7ef01710..750114b6 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -644,15 +644,23 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data['LibraryIntensity'] / data.groupby('Precursor')['LibraryIntensity'].transform('max') * 10000) - + data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity ## Print Library statistics logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors") - logger.info(f"Precursor Fragment Distribution") + + logger.info(f"Precursor Fragment Distribution (Before Filtering)") num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count() for frag, count in num_frags_per_prec.iterrows(): logger.info(f"There are {count['Precursor']} 
precursors with {frag} fragment(s)") + logger.info(f"Filter library to precursors containing {cfg.min_fragments} or more fragments") + ids_to_keep = data[['Precursor', 'Annotation']].groupby('Precursor').count() + ids_to_keep = ids_to_keep[ ids_to_keep['Annotation'] >= cfg.min_fragments ].index + data = data[ data['Precursor'].isin(ids_to_keep) ] + + logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors") + data.drop(columns=['TransitionId', 'Q_Value'], inplace=True) logger.info("Exporting library to file.") From 0a6e7db0276963229b66b8780b9348107aed5ac6 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 9 Jul 2025 12:28:10 -0400 Subject: [PATCH 05/25] require a --out parameter --- pyprophet/cli/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 6acea357..3c0fc825 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -360,7 +360,7 @@ def export_matrix( @click.option( "--out", "outfile", - required=False, + required=True, # need to name the library or else get error in os.path.splittext line 75, in __post_init__in _base. 
type=click.Path(exists=False), help="Output tsv library.", ) From 5f41220b9e17854b913410e6aaa7ea2bc2a0e265 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 9 Jul 2025 12:28:27 -0400 Subject: [PATCH 06/25] change config from 6 to 4 --- pyprophet/_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index acc0791d..1cf5ec58 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -702,4 +702,4 @@ class ExportIOConfig(BaseIOConfig): rt_calibration: bool = True im_calibration: bool = True intensity_calibration: bool = True - min_fragments: int = 6 \ No newline at end of file + min_fragments: int = 4 \ No newline at end of file From d725eb83d10e8ef0cc403543781e7f553a7f0d01 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 10 Jul 2025 10:53:48 -0400 Subject: [PATCH 07/25] fix bugs, update docs --- pyprophet/io/export/split_parquet.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 550b5591..a4dc0ab1 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -88,7 +88,7 @@ def read(self) -> pd.DataFrame: def read_for_library(self) -> pd.DataFrame: """ - Read data specifically for library generation, which may not include all features. + Read data specifically for library generation """ con = duckdb.connect() try: @@ -279,14 +279,14 @@ def _read_library_data(self, con) -> pd.DataFrame: Read data specifically for precursors for library generation. 
This does not include all output in standard output """ if self.config.rt_calibration: - rt_query = "p.Norm_RT as NormalizedRetentionTime" - else: rt_query = "p.EXP_RT as NormalizedRetentionTime" + else: + rt_query = "p.PRECURSOR_LIBRARY_RT as NormalizedRetentionTime" if self.config.im_calibration: - im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility" - else: im_query = "p.EXP_IM as PrecursorIonMobility" + else: + im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility" if self.config.intensity_calibration: intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity' @@ -301,8 +301,8 @@ def _read_library_data(self, con) -> pd.DataFrame: p.SCORE_MS2_Q_VALUE as Q_Value, p.UNMODIFIED_SEQUENCE AS PeptideSequence, p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, - p.PRECURSOR_CHARGE AS Charge, - (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor, + p.PRECURSOR_CHARGE AS PrecursorCharge, + (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor, p.PRECURSOR_MZ AS PrecursorMz, t.ANNOTATION as Annotation, t.PRODUCT_MZ as ProductMz, From 9b108aafbb62cf3cfa1b778435ee3627fa5026a2 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 10 Jul 2025 12:10:24 -0400 Subject: [PATCH 08/25] fix: export protein info in lib --- pyprophet/io/export/split_parquet.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index a4dc0ab1..0e77133f 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -279,31 +279,32 @@ def _read_library_data(self, con) -> pd.DataFrame: Read data specifically for precursors for library generation. 
This does not include all output in standard output """ if self.config.rt_calibration: - rt_query = "p.EXP_RT as NormalizedRetentionTime" + rt_col = "p.EXP_RT" else: - rt_query = "p.PRECURSOR_LIBRARY_RT as NormalizedRetentionTime" + rt_col = "p.PRECURSOR_LIBRARY_RT" if self.config.im_calibration: - im_query = "p.EXP_IM as PrecursorIonMobility" + im_col = "p.EXP_IM" else: - im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility" + im_col = "p.PRECURSOR_LIBRARY_DRIFT_TIME" if self.config.intensity_calibration: - intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity' + intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY' else: - intensity_query = 't.TRANSITION_LIBRARY_INTENSITY AS LibraryIntensity' + intensity_col = 't.TRANSITION_LIBRARY_INTENSITY' query = f""" SELECT - {rt_query}, - {im_query}, - {intensity_query}, + {rt_col} as NormalizedRetentionTime, + {im_col} as PrecursorIonMobility, + {intensity_col} as LibraryIntensity, p.SCORE_MS2_Q_VALUE as Q_Value, p.UNMODIFIED_SEQUENCE AS PeptideSequence, p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, p.PRECURSOR_CHARGE AS PrecursorCharge, (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor, p.PRECURSOR_MZ AS PrecursorMz, + STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName, t.ANNOTATION as Annotation, t.PRODUCT_MZ as ProductMz, t.TRANSITION_CHARGE as FragmentCharge, @@ -318,7 +319,10 @@ def _read_library_data(self, con) -> pd.DataFrame: p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and p.SCORE_MS2_PEAK_GROUP_RANK = 1 - ORDER BY p.FEATURE_ID + GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE, + p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE, + p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ, + t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID """ return con.execute(query).fetchdf() From e36e23f03564e1621223f332384fe15b4f4e022b Mon Sep 17 00:00:00 2001 From: Joshua 
Charkow Date: Fri, 11 Jul 2025 14:00:57 -0400 Subject: [PATCH 09/25] fix: lib export compute annotation col if empty --- pyprophet/io/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 750114b6..bde41930 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -633,8 +633,9 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: # Remove Annotation Column if all NAN if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all(): - logger.debug("Annotation column is empty, dropping it.") + logger.debug("Annotation column is empty, so computing it manually.") data.drop(columns=['Annotation'], inplace=True) + data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str) import sklearn.preprocessing as preprocessing if cfg.rt_calibration: From 76bbff78dbb50f137c5373a928bb775ce582ed26 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Jul 2025 12:10:52 -0400 Subject: [PATCH 10/25] feature: option to keep significant decoys in lib refinement --- pyprophet/_config.py | 4 +++- pyprophet/cli/export.py | 9 +++++++++ pyprophet/io/_base.py | 4 +++- pyprophet/io/export/split_parquet.py | 12 +++++++++--- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 1cf5ec58..4f2375d2 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -668,6 +668,7 @@ class ExportIOConfig(BaseIOConfig): im_calibration (bool): If True, will use emperical IM values as oppose to the original library IM values intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True + keep_decoys (bool): Whether to keep decoy entries in the library, will 
only keep decoys that pass the thresholds specified """ export_format: Literal[ @@ -702,4 +703,5 @@ class ExportIOConfig(BaseIOConfig): rt_calibration: bool = True im_calibration: bool = True intensity_calibration: bool = True - min_fragments: int = 4 \ No newline at end of file + min_fragments: int = 4 + keep_decoys: bool = False # Whether to keep decoy entries in the library \ No newline at end of file diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 3c0fc825..439a2d40 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -410,6 +410,13 @@ def export_matrix( type=int, help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True." ) +@click.option( + "--keep_decoys/--no-keep_decoys", + default=True, + show_default=True, + type=bool, + help="Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" +) @measure_memory_usage_and_time def export_library( infile, @@ -421,6 +428,7 @@ def export_library( im_calibration, intensity_calibration, min_fragments, + keep_decoys ): """ Export OSW to tsv library format @@ -440,6 +448,7 @@ def export_library( im_calibration=im_calibration, intensity_calibration=intensity_calibration, min_fragments=min_fragments, + keep_decoys=keep_decoys ) reader = ReaderDispatcher.get_reader(config) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 26613e36..090330ec 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -663,7 +663,9 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data = data[ data['Precursor'].isin(ids_to_keep) ] logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors") - + if cfg.keep_decoys: + logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates()))) + 
data.drop(columns=['TransitionId', 'Q_Value'], inplace=True) logger.info("Exporting library to file.") diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 0e77133f..73c0f3e9 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -292,6 +292,11 @@ def _read_library_data(self, con) -> pd.DataFrame: intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY' else: intensity_col = 't.TRANSITION_LIBRARY_INTENSITY' + + if self.config.keep_decoys: + decoy_query = "" + else: + decoy_query ="p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and" query = f""" SELECT @@ -310,10 +315,11 @@ def _read_library_data(self, con) -> pd.DataFrame: t.TRANSITION_CHARGE as FragmentCharge, t.TRANSITION_TYPE as FragmentType, t.TRANSITION_ORDINAL as FragmentSeriesNumber, - t.TRANSITION_ID as TransitionId + t.TRANSITION_ID as TransitionId, + p.PRECURSOR_DECOY as Decoy FROM precursors p INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID - WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and + WHERE {decoy_query} p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and @@ -322,7 +328,7 @@ def _read_library_data(self, con) -> pd.DataFrame: GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE, p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE, p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ, - t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID + t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY """ return con.execute(query).fetchdf() From f8b3753c00063f8bf2bf35ae555bcc0b061ca2cf Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 7 Aug 2025 17:27:15 -0400 Subject: [PATCH 11/25] verbose: note that keep_decoys in lib gen 
is experimental feature --- pyprophet/cli/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 439a2d40..da6f30a5 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -415,7 +415,7 @@ def export_matrix( default=True, show_default=True, type=bool, - help="Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" + help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" ) @measure_memory_usage_and_time def export_library( From 4b9076bc2fd66646643b23d7eaff6d8f9ac0d400 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 8 Aug 2025 08:28:03 -0400 Subject: [PATCH 12/25] test: add test for lib generation --- pyprophet/_config.py | 2 ++ pyprophet/cli/export.py | 11 +++++++++-- pyprophet/io/_base.py | 2 ++ tests/test_pyprophet_export.py | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 4f2375d2..3ad1398b 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -653,6 +653,7 @@ class ExportIOConfig(BaseIOConfig): top_n (int): Number of top intense features to use for summarization consistent_top (bool): Whether to use same top features across all runs normalization (Literal["none", "median", "medianmedian", "quantile"]): Normalization method + test: bool = False: Whether to enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge # OSW: Export to parquet compression_method (Literal["none", "snappy", "gzip", "brotli", "zstd"]): Compression method for parquet files. 
@@ -684,6 +685,7 @@ class ExportIOConfig(BaseIOConfig): max_global_peptide_qvalue: float = 0.01 protein: bool = True max_global_protein_qvalue: float = 0.01 + test: bool = False # Quantification matrix options top_n: int = 3 diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index da6f30a5..19fb67ea 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -417,6 +417,11 @@ def export_matrix( type=bool, help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" ) +@click.option( + "--test/--no-test", + default=False, + show_default=True, + help="Enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge") @measure_memory_usage_and_time def export_library( infile, @@ -428,7 +433,8 @@ def export_library( im_calibration, intensity_calibration, min_fragments, - keep_decoys + keep_decoys, + test ): """ Export OSW to tsv library format @@ -448,7 +454,8 @@ def export_library( im_calibration=im_calibration, intensity_calibration=intensity_calibration, min_fragments=min_fragments, - keep_decoys=keep_decoys + keep_decoys=keep_decoys, + test=test ) reader = ReaderDispatcher.get_reader(config) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 090330ec..e4f3e6c6 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -667,6 +667,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates()))) data.drop(columns=['TransitionId', 'Q_Value'], inplace=True) + if cfg.test: + data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz']) logger.info("Exporting library to file.") data.to_csv(cfg.outfile, sep='\t', index=False) diff --git a/tests/test_pyprophet_export.py 
b/tests/test_pyprophet_export.py index 19e0bece..761247ec 100644 --- a/tests/test_pyprophet_export.py +++ b/tests/test_pyprophet_export.py @@ -147,6 +147,40 @@ def test_osw_analysis( f"{temp_folder}/test_data.tsv", ) +@pytest.mark.parametrize( + "calib", + [ True, False] +) +def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, calib +): + # TODO extend to other inputs as well, for now just use split_parquet + input_strategy = { + "path": test_data_split_parquet, + "reader": "parquet_split", + "cmd_prefix": f"--in={test_data_split_parquet}", + } + + cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && " + + # peptide-level + cmd += f"pyprophet infer peptide --pi0_lambda=0.001 0 0 {input_strategy['cmd_prefix']} --context=global && " + + # protein-level + cmd += f"pyprophet infer protein --pi0_lambda=0 0 0 {input_strategy['cmd_prefix']} --context=global && " + + # export + if calib: + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1" + else: + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration" + + run_pyprophet_command(cmd, temp_folder) + validate_export_results( + regtest, + input_strategy["path"], + input_strategy["reader"], + f"{temp_folder}/test_lib.tsv", + ) def test_osw_unscored(input_strategy, temp_folder, regtest): """Test export of unscored OSW data""" From f455b10fcb48ed114eb3a42dc0cd6ccf02105b7d Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 8 Aug 2025 08:48:56 -0400 Subject: [PATCH 13/25] minor refactor for better support across different i/o --- pyprophet/_config.py | 5 +++-- pyprophet/cli/export.py | 4 ++-- 
pyprophet/io/export/split_parquet.py | 30 ++++++++++++++-------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 3ad1398b..55c98a17 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -635,6 +635,7 @@ class ExportIOConfig(BaseIOConfig): - "legacy_split": Split TSV files for each run. - "parquet": Single Parquet file with merged results. - "parquet_split": Split Parquet files for each run. + - "library" : .tsv library file out_type (Literal["tsv", "csv"]): Output file type for exported results. transition_quantification (bool): Report aggregated transition-level quantification. max_transition_pep (float): Maximum PEP to retain scored transitions for quantification (requires transition-level scoring). @@ -673,7 +674,7 @@ class ExportIOConfig(BaseIOConfig): """ export_format: Literal[ - "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split" + "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split", "library" ] = "legacy_merged" out_type: Literal["tsv", "csv"] = "tsv" transition_quantification: bool = False @@ -701,7 +702,7 @@ class ExportIOConfig(BaseIOConfig): # SqMass: Export to parquet pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping - # Export to library + # Export to library options rt_calibration: bool = True im_calibration: bool = True intensity_calibration: bool = True diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 19fb67ea..bc4a1bc3 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -445,7 +445,7 @@ def export_library( subsample_ratio=1.0, # Not used in export level="export", context="export", - export_format=format, + export_format="library", out_type="tsv", max_rs_peakgroup_qvalue=max_peakgroup_qvalue, max_global_peptide_qvalue=max_global_peptide_qvalue, @@ -461,7 +461,7 @@ def export_library( reader = ReaderDispatcher.get_reader(config) writer = 
WriterDispatcher.get_writer(config) - df = reader.read_for_library() + df = reader.read() writer.clean_and_export_library(df) # Export to Parquet diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 73c0f3e9..767901fe 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -68,6 +68,14 @@ def read(self) -> pd.DataFrame: try: self._init_duckdb_views(con) + if self.config.export_format == "library": + if self._is_unscored_file(): + raise logger.exception("Files must be scored for library generation.") + if not self._has_peptide_protein_global_scores(): + raise logger.exception("Files must have peptide and protein level global scores for library generation.") + logger.info("Reading standard OpenSWATH data for library from split Parquet files.") + return self._read_library_data(con) + if self._is_unscored_file(): logger.info("Reading unscored data from split Parquet files.") return self._read_unscored_data(con) @@ -82,26 +90,18 @@ def read(self) -> pd.DataFrame: logger.info("Reading standard OpenSWATH data from split Parquet files.") data = self._read_standard_data(con) - return self._augment_data(data, con) + return self._augment_data(data, con) finally: con.close() - def read_for_library(self) -> pd.DataFrame: + def _has_peptide_protein_global_scores(self) -> bool: """ - Read data specifically for library generation + Check if files contain peptide and protein global scores """ - con = duckdb.connect() - try: - self._init_duckdb_views(con) - - if self._is_unscored_file(): - raise logger.exception("Files must be scored for library generation.") - - logger.info("Reading standard OpenSWATH data for library from split Parquet files.") - return self._read_library_data(con) - finally: - con.close() - + print(self._columns) + has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns) + has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns) + 
return has_peptide and has_protein def _is_unscored_file(self) -> bool: """ From 1667c2578377bba51da8b12d40576dc91f485aaf Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 8 Aug 2025 09:09:52 -0400 Subject: [PATCH 14/25] add not implemented error for osw/parquet output also add tests for osw/parquet to test for the not implemented error --- pyprophet/io/export/osw.py | 3 ++ pyprophet/io/export/parquet.py | 3 ++ ...nalysis_libExport[split_parquet-False].out | 14 ++++++++ ...analysis_libExport[split_parquet-True].out | 14 ++++++++ tests/test_pyprophet_export.py | 32 +++++++++---------- 5 files changed, 50 insertions(+), 16 deletions(-) create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py index 02d1669d..db6e2455 100644 --- a/pyprophet/io/export/osw.py +++ b/pyprophet/io/export/osw.py @@ -123,6 +123,9 @@ def _read_sqlite(self, con): """Main entry point for reading SQLite data, delegates to specific methods.""" cfg = self.config + if self.config.export_format == "library": + raise NotImplementedError("Library export from non-split .parquet files is not supported") + if self._is_unscored_file(con): logger.info("Reading unscored data from Parquet file.") return self._read_unscored_data(con) diff --git a/pyprophet/io/export/parquet.py b/pyprophet/io/export/parquet.py index 16026cdd..556abdd4 100644 --- a/pyprophet/io/export/parquet.py +++ b/pyprophet/io/export/parquet.py @@ -36,6 +36,9 @@ def read(self) -> pd.DataFrame: try: self._init_duckdb_views(con) + if self.config.export_format == "library": + raise NotImplementedError("Library export from non-split .parquet files is not supported") + if self._is_unscored_file(): logger.info("Reading unscored data from Parquet file.") return self._read_unscored_data(con) diff --git 
a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out new file mode 100644 index 00000000..f3e17a61 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 403.1646 AQUA4SWATH_HMLangeE +3 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 516.2486 AQUA4SWATH_HMLangeE +4 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
+95 y11^1 0 1 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1119.4173 AQUA4SWATH_Spyo +96 y11^2 0 2 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 548.2978 AQUA4SWATH_Spyo +97 y11^2 0 2 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 560.2123 AQUA4SWATH_Spyo +98 y12^1 0 1 12 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1152.6099 AQUA4SWATH_Spyo +99 y12^1 0 1 12 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1232.5013 AQUA4SWATH_Spyo + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out new file mode 100644 index 00000000..d37d89e1 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b4^1 0 1 4 b 704.8697 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 403.1646 AQUA4SWATH_HMLangeE +3 b5^1 0 1 5 b 1185.3327 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 516.2486 AQUA4SWATH_HMLangeE +4 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 
557.8153 534.2195 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 y11^1 0 1 11 y 7.9894 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1119.4173 AQUA4SWATH_Spyo +96 y11^2 0 2 11 y 220.1077 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 548.2978 AQUA4SWATH_Spyo +97 y11^2 0 2 11 y 107.0578 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 560.2123 AQUA4SWATH_Spyo +98 y12^1 0 1 12 y 1006.7430 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1152.6099 AQUA4SWATH_Spyo +99 y12^1 0 1 12 y 2.3968 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1232.5013 AQUA4SWATH_Spyo + +[100 rows x 15 columns] diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py index 761247ec..1a4910f8 100644 --- a/tests/test_pyprophet_export.py +++ b/tests/test_pyprophet_export.py @@ -91,7 +91,10 @@ def run_pyprophet_command(cmd, temp_folder): ).decode() except subprocess.CalledProcessError as error: print(f"Command failed: {cmd}\n{error.output.decode()}", file=sys.stderr) - raise + if "NotImplementedError" in error.output.decode(): # attempt to catch the specific error rather than the CalledProcessError + raise NotImplementedError + else: + raise def validate_export_results( @@ -151,15 +154,8 @@ def test_osw_analysis( "calib", [ True, False] ) -def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, calib +def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib ): - # TODO extend to other inputs as well, for now just use split_parquet - input_strategy = { - "path": test_data_split_parquet, - "reader": "parquet_split", - "cmd_prefix": f"--in={test_data_split_parquet}", - } - cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 
--ss_iteration_fdr=0.02 && " # peptide-level @@ -174,13 +170,17 @@ def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, c else: cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration" - run_pyprophet_command(cmd, temp_folder) - validate_export_results( - regtest, - input_strategy["path"], - input_strategy["reader"], - f"{temp_folder}/test_lib.tsv", - ) + if not input_strategy["reader"] == "parquet_split": + with pytest.raises(NotImplementedError): + run_pyprophet_command(cmd, temp_folder) + else: + run_pyprophet_command(cmd, temp_folder) + validate_export_results( + regtest, + input_strategy["path"], + input_strategy["reader"], + f"{temp_folder}/test_lib.tsv", + ) def test_osw_unscored(input_strategy, temp_folder, regtest): """Test export of unscored OSW data""" From 609e84f3d175c52822512fa7ac80febd412c5ecf Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 11 Aug 2025 16:46:58 -0400 Subject: [PATCH 15/25] feature: add option to export rt unit in non iRT --- pyprophet/_config.py | 4 +++- pyprophet/cli/export.py | 10 ++++++++++ pyprophet/io/_base.py | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 55c98a17..bec1da7f 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -671,6 +671,7 @@ class ExportIOConfig(BaseIOConfig): intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True keep_decoys (bool): Whether to keep decoy entries in the library, will only keep decoys that pass the thresholds specified + rt_unit (Literal["iRT", "RT"], default = 
'iRT') = "iRT": Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library """ export_format: Literal[ @@ -707,4 +708,5 @@ class ExportIOConfig(BaseIOConfig): im_calibration: bool = True intensity_calibration: bool = True min_fragments: int = 4 - keep_decoys: bool = False # Whether to keep decoy entries in the library \ No newline at end of file + keep_decoys: bool = False # Whether to keep decoy entries in the library + rt_unit: Literal["iRT", "RT"] = "iRT" \ No newline at end of file diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index bc4a1bc3..60694b12 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -417,6 +417,14 @@ def export_matrix( type=bool, help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" ) +@click.option( + "--rt_unit", + default="iRT", + show_default=True, + type=click.Choice(["iRT", "RT"]), + help='Unit of retention time in the library, only relevant if rt_calibration is True. 
If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library.', + hidden=True +) @click.option( "--test/--no-test", default=False, @@ -434,6 +442,7 @@ def export_library( intensity_calibration, min_fragments, keep_decoys, + rt_unit, test ): """ @@ -455,6 +464,7 @@ def export_library( intensity_calibration=intensity_calibration, min_fragments=min_fragments, keep_decoys=keep_decoys, + rt_unit=rt_unit, test=test ) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index e4f3e6c6..c4dcabfa 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -640,7 +640,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str) import sklearn.preprocessing as preprocessing - if cfg.rt_calibration: + if cfg.rt_calibration and cfg.rt_unit == "iRT": data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100 if cfg.intensity_calibration: data['LibraryIntensity'] = ( From 5c6bda2d634394ab968faf40836da4e861e00f93 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 11 Aug 2025 16:50:25 -0400 Subject: [PATCH 16/25] remove debug line --- pyprophet/io/export/split_parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 767901fe..035ae808 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -98,7 +98,6 @@ def _has_peptide_protein_global_scores(self) -> bool: """ Check if files contain peptide and protein global scores """ - print(self._columns) has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns) has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns) return has_peptide and has_protein From bb5607c1e962f7419dd51caba9252e4eabf90ccf Mon Sep 17 00:00:00 
2001 From: Joshua Charkow Date: Mon, 11 Aug 2025 16:53:14 -0400 Subject: [PATCH 17/25] switch keep_decoys default to no-keep_decoys --- pyprophet/cli/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 60694b12..2d95f8c3 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -412,7 +412,7 @@ def export_matrix( ) @click.option( "--keep_decoys/--no-keep_decoys", - default=True, + default=False, show_default=True, type=bool, help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" ) From d959d705e495466943dbd84e888212ae697280be Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Mon, 11 Aug 2025 17:38:57 -0400 Subject: [PATCH 18/25] bug fix: transitions with diff RT If the q values are the same for 2 different runs, then the behaviour is undefined as to which precursor is selected. This can mean that transitions that are part of the same transition group have different RT/IM. To address this, also sort by RunId.
If Q values are the same just take the first run --- pyprophet/io/_base.py | 5 +++-- pyprophet/io/export/split_parquet.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index c4dcabfa..4a6110e1 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -630,7 +630,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: cfg = self.config # For precursors found in more than one run, select the run with the smallest q value - data = data.sort_values(by='Q_Value').groupby("TransitionId").head(1) + # If q values are the same, select the first run + data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1) assert (len(data['TransitionId'].drop_duplicates()) == len(data)) # Remove Annotation Column if all NAN @@ -666,7 +667,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: if cfg.keep_decoys: logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates()))) - data.drop(columns=['TransitionId', 'Q_Value'], inplace=True) + data.drop(columns=['TransitionId', 'Q_Value', 'RunId'], inplace=True) if cfg.test: data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz']) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 035ae808..d17a75da 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -306,6 +306,7 @@ def _read_library_data(self, con) -> pd.DataFrame: p.UNMODIFIED_SEQUENCE AS PeptideSequence, p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, p.PRECURSOR_CHARGE AS PrecursorCharge, + p.RUN_ID AS RunId, (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor, p.PRECURSOR_MZ AS PrecursorMz, STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName, @@ -327,7 +328,7 @@ def _read_library_data(self, con) -> pd.DataFrame: GROUP BY {rt_col}, 
{im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE, p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE, p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ, - t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY + t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID """ return con.execute(query).fetchdf() From 989ade11e768f7359b6c7afcde05a99c09d4d706 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 15:03:24 -0400 Subject: [PATCH 19/25] update parameter descriptions --- pyprophet/cli/export.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 2d95f8c3..2162093c 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -369,21 +369,21 @@ def export_matrix( default=0.01, show_default=True, type=float, - help="Filter results to maximum run-specific peak group-level q-value, should not use values > 0.01.", + help="Filter results to maximum run-specific peak group-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates. If there are multiple runs with the same precursors, the run with the lowest q value is used", ) @click.option( "--max_global_peptide_qvalue", default=0.01, show_default=True, type=float, - help="Filter results to maximum global peptide-level q-value, should not use values > 0.01.", + help="Filter results to maximum global peptide-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates." 
) @click.option( "--max_global_protein_qvalue", default=0.01, show_default=True, type=float, - help="Filter results to maximum global protein-level q-value, should not use values > 0.01.", + help="Filter results to maximum global protein-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates." ) @click.option( "--rt_calibration/--no-rt_calibration", From 0aae3a884d0adf9f6a6372d9628d7f84597724cb Mon Sep 17 00:00:00 2001 From: Joshua Charkow <47336288+jcharkow@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:16:04 -0400 Subject: [PATCH 20/25] fix: error description Co-authored-by: Justin Sing <32938975+singjc@users.noreply.github.com> --- pyprophet/io/export/osw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py index db6e2455..d4520930 100644 --- a/pyprophet/io/export/osw.py +++ b/pyprophet/io/export/osw.py @@ -124,7 +124,7 @@ def _read_sqlite(self, con): cfg = self.config if self.config.export_format == "library": - raise NotImplementedError("Library export from non-split .parquet files is not supported") + raise NotImplementedError("Library export from sqlite OSW files is not supported") if self._is_unscored_file(con): logger.info("Reading unscored data from Parquet file.") From c3daea1cfb18fcac9fec258608ced40c88e3a8ad Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 15:24:46 -0400 Subject: [PATCH 21/25] apply suggestions from PR review --- pyprophet/io/_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 4a6110e1..0e146343 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -48,6 +48,7 @@ import duckdb import pandas as pd import polars as pl +import sklearn.preprocessing as preprocessing from loguru import logger from .._base import BaseIOConfig @@ -632,7 +633,7 @@ def 
clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: # For precursors found in more than one run, select the run with the smallest q value # If q values are the same, select the first run data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1) - assert (len(data['TransitionId'].drop_duplicates()) == len(data)) + assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.") # Remove Annotation Column if all NAN if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all(): @@ -640,7 +641,6 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data.drop(columns=['Annotation'], inplace=True) data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str) - import sklearn.preprocessing as preprocessing if cfg.rt_calibration and cfg.rt_unit == "iRT": data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100 if cfg.intensity_calibration: @@ -648,6 +648,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data['LibraryIntensity'] / data.groupby('Precursor')['LibraryIntensity'].transform('max') * 10000) + logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0]))) data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity ## Print Library statistics From 3a8d20333499cef72471d91236aac9166fed3def Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 15:45:58 -0400 Subject: [PATCH 22/25] test: update tests with new snapshots after minor changes in data manipulation update snapshot tests --- ...t.test_osw_analysis_libExport[osw-False-RT].out | 0 ....test_osw_analysis_libExport[osw-False-iRT].out | 0 ...rt.test_osw_analysis_libExport[osw-True-RT].out | 0 ...t.test_osw_analysis_libExport[osw-True-iRT].out | 0 
...st_osw_analysis_libExport[parquet-False-RT].out | 0 ...t_osw_analysis_libExport[parquet-False-iRT].out | 0 ...est_osw_analysis_libExport[parquet-True-RT].out | 0 ...st_osw_analysis_libExport[parquet-True-iRT].out | 0 ..._analysis_libExport[split_parquet-False-RT].out | 14 ++++++++++++++ ...analysis_libExport[split_parquet-False-iRT].out | 14 ++++++++++++++ ...osw_analysis_libExport[split_parquet-False].out | 14 -------------- ...w_analysis_libExport[split_parquet-True-RT].out | 14 ++++++++++++++ ..._analysis_libExport[split_parquet-True-iRT].out | 14 ++++++++++++++ ..._osw_analysis_libExport[split_parquet-True].out | 14 -------------- tests/test_pyprophet_export.py | 11 ++++++----- 15 files changed, 62 insertions(+), 33 deletions(-) create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out delete mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out 
create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out delete mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out 
b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out new file mode 100644 index 00000000..37c4cd9b --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
+95 b4^1 0 1 4 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out new file mode 100644 index 00000000..37c4cd9b --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b -1.0 AAEDFTLLVK(UniMod:259) 
58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out deleted file mode 100644 index f3e17a61..00000000 --- a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out +++ /dev/null @@ -1,14 +0,0 @@ - Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName -0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE -1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE -2 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 403.1646 AQUA4SWATH_HMLangeE -3 b5^1 0 1 5 b -1.0 
AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 516.2486 AQUA4SWATH_HMLangeE -4 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE -.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... -95 y11^1 0 1 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1119.4173 AQUA4SWATH_Spyo -96 y11^2 0 2 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 548.2978 AQUA4SWATH_Spyo -97 y11^2 0 2 11 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 560.2123 AQUA4SWATH_Spyo -98 y12^1 0 1 12 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1152.6099 AQUA4SWATH_Spyo -99 y12^1 0 1 12 y -1.0 AAGASAQVLGQEGK(UniMod:259) -5.3 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1232.5013 AQUA4SWATH_Spyo - -[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out new file mode 100644 index 00000000..5d75e9c9 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 3665.82 
AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b 763.6335 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b 619.6819 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b 4293.9906 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b 2245.5035 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b 1169.3817 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b 796.7460 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b 616.6858 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out new file mode 100644 index 00000000..8f3d8b08 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 
557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b 763.6335 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b 619.6819 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b 4293.9906 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b 2245.5035 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b 1169.3817 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b 796.7460 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b 616.6858 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out deleted file mode 100644 index d37d89e1..00000000 --- a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out +++ /dev/null @@ -1,14 +0,0 @@ - Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence 
NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName -0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE -1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE -2 b4^1 0 1 4 b 704.8697 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 403.1646 AQUA4SWATH_HMLangeE -3 b5^1 0 1 5 b 1185.3327 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 516.2486 AQUA4SWATH_HMLangeE -4 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE -.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... -95 y11^1 0 1 11 y 7.9894 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1119.4173 AQUA4SWATH_Spyo -96 y11^2 0 2 11 y 220.1077 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 548.2978 AQUA4SWATH_Spyo -97 y11^2 0 2 11 y 107.0578 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 560.2123 AQUA4SWATH_Spyo -98 y12^1 0 1 12 y 1006.7430 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1152.6099 AQUA4SWATH_Spyo -99 y12^1 0 1 12 y 2.3968 AAGASAQVLGQEGK(UniMod:259) 22.1652 AAGASAQVLGQEGK AAGASAQVLGQEGK(UniMod:259)_2 2 NaN 647.8457 1232.5013 AQUA4SWATH_Spyo - -[100 rows x 15 columns] diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py index 1a4910f8..6e05f8c3 100644 --- a/tests/test_pyprophet_export.py +++ b/tests/test_pyprophet_export.py @@ -151,10 +151,10 @@ def test_osw_analysis( ) @pytest.mark.parametrize( - "calib", - [ True, False] + "calib, rt_unit", + [ (True, 'iRT'), (False, 'iRT'), (True, 'RT'), (False, 
'RT')] ) -def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib +def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib, rt_unit ): cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && " @@ -164,11 +164,12 @@ def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib # protein-level cmd += f"pyprophet infer protein --pi0_lambda=0 0 0 {input_strategy['cmd_prefix']} --context=global && " + # export if calib: - cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1" + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --rt_unit={rt_unit}" else: - cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration" + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration --rt_unit={rt_unit}" if not input_strategy["reader"] == "parquet_split": with pytest.raises(NotImplementedError): From 76ac52c32eca2624e56ea92bc94b105ebadc0ae7 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 19 Aug 2025 16:07:46 -0400 Subject: [PATCH 23/25] feature: sort by intensity if q value tie if still a tie sort by runId --- pyprophet/io/_base.py | 4 ++-- pyprophet/io/export/split_parquet.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py 
index 0e146343..878075d7 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -632,7 +632,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: # For precursors found in more than one run, select the run with the smallest q value # If q values are the same, select the first run - data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1) + data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1) assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.") # Remove Annotation Column if all NAN @@ -668,7 +668,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: if cfg.keep_decoys: logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates()))) - data.drop(columns=['TransitionId', 'Q_Value', 'RunId'], inplace=True) + data.drop(columns=['TransitionId', 'Q_Value', 'RunId', 'Intensity'], inplace=True) if cfg.test: data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz']) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index d17a75da..2f6812b1 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -306,6 +306,7 @@ def _read_library_data(self, con) -> pd.DataFrame: p.UNMODIFIED_SEQUENCE AS PeptideSequence, p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, p.PRECURSOR_CHARGE AS PrecursorCharge, + p.FEATURE_MS2_AREA_INTENSITY AS Intensity, p.RUN_ID AS RunId, (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor, p.PRECURSOR_MZ AS PrecursorMz, From 1c32cb6f404135c3b37f1b866185413f0e73878b Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 19 Aug 2025 16:15:45 -0400 Subject: [PATCH 24/25] apply copilot suggestions --- pyprophet/io/_base.py | 7 ++++--- 
pyprophet/io/export/split_parquet.py | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 878075d7..23aa3680 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -48,7 +48,7 @@ import duckdb import pandas as pd import polars as pl -import sklearn.preprocessing as preprocessing +import sklearn.preprocessing as preprocessing # For MinMaxScaler from loguru import logger from .._base import BaseIOConfig @@ -633,7 +633,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: # For precursors found in more than one run, select the run with the smallest q value # If q values are the same, select the first run data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1) - assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.") + assert len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value Intensity and RunId, duplicate transition IDs found." 
# Remove Annotation Column if all NAN if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all(): @@ -649,7 +649,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: data.groupby('Precursor')['LibraryIntensity'].transform('max') * 10000) logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0]))) - data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity + # Remove rows with zero intensity + data = data[data['LibraryIntensity'] > 0] ## Print Library statistics logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors") diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 2f6812b1..0cbf9732 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -70,9 +70,13 @@ def read(self) -> pd.DataFrame: if self.config.export_format == "library": if self._is_unscored_file(): - raise logger.exception("Files must be scored for library generation.") + descr= "Files must be scored for library generation." + logger.exception(descr) + raise ValueError(descr) if not self._has_peptide_protein_global_scores(): - raise logger.exception("Files must have peptide and protein level global scores for library generation.") + descr= "Files must have peptide and protein level global scores for library generation." 
+ logger.exception(descr) + raise ValueError(descr) logger.info("Reading standard OpenSWATH data for library from split Parquet files.") return self._read_library_data(con) From 0c0d99dce09eeacba13a0fdde4dea9e8dc17d71c Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 28 Aug 2025 10:00:49 -0400 Subject: [PATCH 25/25] fix: bug in sql query --- pyprophet/io/export/split_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 0cbf9732..265130a8 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -333,7 +333,7 @@ def _read_library_data(self, con) -> pd.DataFrame: GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE, p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE, p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ, - t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID + t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID, p.FEATURE_MS2_AREA_INTENSITY """ return con.execute(query).fetchdf()